In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.metrics import accuracy_score
warnings.filterwarnings('ignore')

In [2]:
#To load dataset losses.xlsx 
#xlsx means excel file , to read data from .xlsx file : read_xlsx() 
#inbuilt method which define in pandas
df=pd.read_excel("Losses.xlsx")
#to display first 5 records
df.head()

Unnamed: 0,DAY,MONTH,MONTH_ID,YEAR,CAUSE,GROSS INCURRED AMOUNT
0,1.0,January,1.0,1999.0,WINDSTORM,477.88
1,1.0,January,1.0,1999.0,FIRE,700.0
2,1.0,January,1.0,1999.0,WINDSTORM,99.87
3,1.0,January,1.0,1999.0,WINDSTORM,139.8
4,1.0,January,1.0,1999.0,WINDSTORM,548.66


In [3]:
#to show no. of rows and columns 
df.shape

(47565, 6)

In [4]:
#here target /outpu variable : CAUSE 
#How many categories in Cause
df['CAUSE'].unique()

array(['WINDSTORM', 'FIRE', 'ESCAPE OF WATER', 'SUBSIDENCE     ',
       'FLOOD          ', 'ACCIDENTAL DAMAGE', 'THEFT', 'EARTHQUAKE     '],
      dtype=object)

In [5]:
#How many samples of all categories 
df['CAUSE'].value_counts()

ACCIDENTAL DAMAGE    17993
WINDSTORM             9867
THEFT                 7564
FLOOD                 5175
ESCAPE OF WATER       3964
FIRE                  2046
SUBSIDENCE             818
EARTHQUAKE             138
Name: CAUSE, dtype: int64

In [6]:
#to check null values
df.isnull().sum()

DAY                      0
MONTH                    0
MONTH_ID                 0
YEAR                     0
CAUSE                    0
GROSS INCURRED AMOUNT    0
dtype: int64

In [7]:
#to check datatypes
df.dtypes

DAY                      float64
MONTH                     object
MONTH_ID                 float64
YEAR                     float64
CAUSE                     object
GROSS INCURRED AMOUNT    float64
dtype: object

In [8]:
#to check duplicates rows 
df.duplicated().sum()

3264

In [9]:
#To remove duplicates rows permanently
df.drop_duplicates(inplace=True)

#to check duplicates rows 
df.duplicated().sum()

0

In [10]:
#To remove MONTH column permanently from given dataset
df.drop("MONTH",axis=1,inplace=True)
df.head()

Unnamed: 0,DAY,MONTH_ID,YEAR,CAUSE,GROSS INCURRED AMOUNT
0,1.0,1.0,1999.0,WINDSTORM,477.88
1,1.0,1.0,1999.0,FIRE,700.0
2,1.0,1.0,1999.0,WINDSTORM,99.87
3,1.0,1.0,1999.0,WINDSTORM,139.8
4,1.0,1.0,1999.0,WINDSTORM,548.66


In [11]:
df.dtypes

DAY                      float64
MONTH_ID                 float64
YEAR                     float64
CAUSE                     object
GROSS INCURRED AMOUNT    float64
dtype: object

In [12]:
#Apply LAbelEncoder on CAUSE column 
from sklearn.preprocessing import LabelEncoder
#create object of LabelEncoder class
le=LabelEncoder()
df['CAUSE']=le.fit_transform(df['CAUSE'])
df.dtypes

DAY                      float64
MONTH_ID                 float64
YEAR                     float64
CAUSE                      int32
GROSS INCURRED AMOUNT    float64
dtype: object

In [13]:
#select  input and output
X=df.drop('CAUSE',axis=1) #input
Y=df['CAUSE'] #output

In [14]:
# spliting the data of x and y using train_test_split
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.3,random_state=1)
#stratify pass output value for multiclass classifier

In [15]:
X_train.shape,X_test.shape

((31010, 4), (13291, 4))

In [16]:
Y_train.value_counts()

0    11894
7     5758
6     5124
4     3471
2     2710
3     1434
5      519
1      100
Name: CAUSE, dtype: int64

In [17]:
Y_test.value_counts()

0    5056
7    2450
6    2304
4    1445
2    1166
3     591
5     249
1      30
Name: CAUSE, dtype: int64

In [18]:
#clearly understanding , data are imbalance 
#Apply Balance technology for multi class classifier : use SMOTE inbuilt
#class which define in outer class over_sampling and over_sampling class
#are define in inbuilt package imblearn
#if multiclass classifier and unbalance then use SMOTE class
from imblearn.over_sampling import SMOTE

In [19]:
#full form of SMOTE class :- Synthetic Minority Oversampling Technique

In [20]:
#fit_resample() inbuilt method of SMOTE class 
X_train1,Y_train1=SMOTE().fit_resample(X_train,Y_train)

In [21]:
#After apply balanced on X_train,Y_train 
Y_train1.value_counts()

3    11894
4    11894
7    11894
2    11894
0    11894
6    11894
5    11894
1    11894
Name: CAUSE, dtype: int64

In [22]:
#same balance tech apply on testing data
X_test1,Y_test1=SMOTE().fit_resample(X_test,Y_test)

In [23]:
##apply apply SMOTE
Y_test1.value_counts()

7    5056
6    5056
0    5056
3    5056
2    5056
4    5056
5    5056
1    5056
Name: CAUSE, dtype: int64

In [24]:
#apply Standard Scaler on Training input X_train1 and testing input
#X_test1
from sklearn.preprocessing import StandardScaler
#create object of StandardScaler class
ss=StandardScaler()
X_train1=ss.fit_transform(X_train1)
X_test1=ss.transform(X_test1)

In [25]:
X_train1

array([[ 0.83907846, -1.33909224, -1.44340319, -0.28216954],
       [ 0.62890252, -1.02459216, -0.17653162, -0.17409123],
       [ 1.04925441,  1.80590847,  0.2457589 , -0.28801093],
       ...,
       [ 1.46518996,  1.19012524,  1.09033994, -0.22544767],
       [ 0.65949557,  1.58296518,  0.96740336, -0.22544767],
       [ 0.41872658, -1.09792343,  0.73281462, -0.14793865]])

In [26]:
#create function for train and test the model
def create_model(model):
    #train the model with70% data
    model.fit(X_train1,Y_train1)
    # test the model with 30% data
    Y_pred=model.predict(X_test1)
    #generate thye report
    print(classification_report(Y_test1,Y_pred))
    print(confusion_matrix(Y_test1,Y_pred))
    return model

In [27]:
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.metrics import accuracy_score

In [28]:
#first model perform with logistic regression 
#first call LogisticRegression class
from sklearn.linear_model import LogisticRegression

#create object of LogisticRegression class
lr=LogisticRegression()


In [29]:
lr= LogisticRegression()
lr.fit(X_train, Y_train)
Y_pred = lr.predict(X_test)

#  prediction Summary by species
print(classification_report(Y_test, Y_pred))

# Accuracy score
LRC = accuracy_score(Y_pred,Y_test)
print(f"{round(LRC*100,2)}% Accurate")

              precision    recall  f1-score   support

           0       0.41      0.98      0.57      5056
           1       0.00      0.00      0.00        30
           2       0.00      0.00      0.00      1166
           3       0.21      0.02      0.04       591
           4       0.26      0.04      0.07      1445
           5       0.67      0.01      0.02       249
           6       0.00      0.00      0.00      2304
           7       0.19      0.06      0.10      2450

    accuracy                           0.39     13291
   macro avg       0.22      0.14      0.10     13291
weighted avg       0.24      0.39      0.25     13291

38.89% Accurate


In [30]:
from sklearn.tree import DecisionTreeClassifier

In [31]:
#create the object of DecisionTreeClassifier class 
dt=DecisionTreeClassifier() #bydefault gini index method



In [32]:
dt = DecisionTreeClassifier()
dt.fit(X_train, Y_train)
Y_pred = dt.predict(X_test)

#  prediction Summary by species
print(classification_report(Y_test, Y_pred))

# Accuracy score
DTC = accuracy_score(Y_pred,Y_test)
print(f"{round(DTC*100,2)}% Accurate")

              precision    recall  f1-score   support

           0       0.45      0.45      0.45      5056
           1       0.68      0.83      0.75        30
           2       0.16      0.16      0.16      1166
           3       0.08      0.09      0.09       591
           4       0.25      0.26      0.26      1445
           5       0.22      0.18      0.20       249
           6       0.20      0.20      0.20      2304
           7       0.43      0.43      0.43      2450

    accuracy                           0.34     13291
   macro avg       0.31      0.33      0.32     13291
weighted avg       0.34      0.34      0.34     13291

33.52% Accurate


In [33]:
#show the information gain of all input features 
IG=dt.feature_importances_
print("information Gain :",IG)
dict={'Input':X.columns,'IG':IG}
df=pd.DataFrame(dict)
df.sort_values('IG',ascending=False)#descending order according to IG

information Gain : [0.28276596 0.16003885 0.09981358 0.45738161]


Unnamed: 0,Input,IG
3,GROSS INCURRED AMOUNT,0.457382
0,DAY,0.282766
1,MONTH_ID,0.160039
2,YEAR,0.099814


In [34]:
#create the tree 

from sklearn import tree
features=X.columns #input features
fig=plt.figure(figsize=(20,20))
#_=tree.plot_tree(dt,feature_names=features,filled=True)

<Figure size 1440x1440 with 0 Axes>

In [35]:
from sklearn.svm import LinearSVC

In [36]:
#create object of LinearSVC class
svc=LinearSVC(random_state=1) #hard margin- before adding error is known as hard margin#create object of LinearSVC class



In [37]:
svc= LinearSVC()
svc.fit(X_train, Y_train)
Y_pred = svc.predict(X_test)

#  prediction Summary by species
print(classification_report(Y_test, Y_pred))

# Accuracy score
SVC = accuracy_score(Y_pred,Y_test)
print(f"{round(SVC*100,2)}% Accurate")

              precision    recall  f1-score   support

           0       0.46      0.78      0.58      5056
           1       0.00      0.00      0.00        30
           2       0.00      0.00      0.00      1166
           3       0.01      0.00      0.00       591
           4       0.00      0.00      0.00      1445
           5       0.03      0.56      0.06       249
           6       0.00      0.00      0.00      2304
           7       0.00      0.00      0.00      2450

    accuracy                           0.31     13291
   macro avg       0.06      0.17      0.08     13291
weighted avg       0.17      0.31      0.22     13291

30.86% Accurate


In [38]:
#Use ENsembling Technique : -
#1. RandomForestTree : use DecisionTreeClassifier Algorithm 
from sklearn.ensemble import RandomForestClassifier

In [39]:
#create object of RandomForestClassifier class and passing some parameters
rfc=RandomForestClassifier(n_estimators=10,max_features=4,random_state=1)
#n_estimators : no. of algorithm (>=10 and <=100) 
#algorithm : DecisionTreeClassifer



In [40]:
rfc= RandomForestClassifier()
rfc.fit(X_train, Y_train)
Y_pred = rfc.predict(X_test)

#  prediction Summary by species
print(classification_report(Y_test, Y_pred))

# Accuracy score
RFC = accuracy_score(Y_pred,Y_test)
print(f"{round(RFC*100,2)}% Accurate")

              precision    recall  f1-score   support

           0       0.47      0.62      0.53      5056
           1       0.84      0.87      0.85        30
           2       0.18      0.13      0.15      1166
           3       0.13      0.08      0.10       591
           4       0.29      0.27      0.28      1445
           5       0.23      0.13      0.16       249
           6       0.23      0.17      0.19      2304
           7       0.50      0.46      0.48      2450

    accuracy                           0.40     13291
   macro avg       0.36      0.34      0.34     13291
weighted avg       0.37      0.40      0.38     13291

39.61% Accurate


In [41]:
'''for n in range(10,101):
    rfc=RandomForestClassifier(n_estimators=13,random_state=1)
    print("No. of trees : ",n)
    rfc=create_model(rfc)'''

'for n in range(10,101):\n    rfc=RandomForestClassifier(n_estimators=13,random_state=1)\n    print("No. of trees : ",n)\n    rfc=create_model(rfc)'

In [42]:
X_train.shape

(31010, 4)

In [43]:
# #show the information gain of all input features 
# IG=rfc.feature_importances_
# #print("information Gain :",IG)

# dict={'Input':X.columns,'IG':IG}
# df=pd.DataFrame(dict)
# df.sort_values('IG',ascending=False)#descending order according to IG

In [44]:
#Apply Ensembling Technique Boosting 
#1. first apply ADA Boost : call inbuilt class AdaBoostClassifier
from sklearn.ensemble import AdaBoostClassifier

In [45]:
#create the object of AdaBoostClassifier class
#ADA Boost creates decision stump (means one root node and 2 leaf node)
#leaf node : no any childs
ada=AdaBoostClassifier(n_estimators=3,random_state=1)
#call function
ada=create_model(ada)


              precision    recall  f1-score   support

           0       0.00      0.00      0.00      5056
           1       0.59      0.86      0.70      5056
           2       0.17      0.87      0.28      5056
           3       0.00      0.00      0.00      5056
           4       0.00      0.00      0.00      5056
           5       0.45      0.26      0.33      5056
           6       0.00      0.00      0.00      5056
           7       0.25      0.19      0.21      5056

    accuracy                           0.27     40448
   macro avg       0.18      0.27      0.19     40448
weighted avg       0.18      0.27      0.19     40448

[[   0  591 3445    0    0   17    0 1003]
 [   0 4350  295    0    0  338    0   73]
 [   0  383 4411    0    0   91    0  171]
 [   0  253 4041    0    0  540    0  222]
 [   0  441 3511    0    0  351    0  753]
 [   0  329 3353    0    0 1301    0   73]
 [   0  530 3905    0    0  122    0  499]
 [   0  543 3464    0    0  112    0  937]]


In [46]:
ada= AdaBoostClassifier()
ada.fit(X_train, Y_train)
Y_pred = ada.predict(X_test)

#  prediction Summary by species
print(classification_report(Y_test, Y_pred))

# Accuracy score
ADA= accuracy_score(Y_pred,Y_test)
print(f"{round(ADA*100,2)}% Accurate")

              precision    recall  f1-score   support

           0       0.45      0.83      0.58      5056
           1       0.57      0.83      0.68        30
           2       0.18      0.03      0.05      1166
           3       0.20      0.03      0.04       591
           4       0.25      0.35      0.29      1445
           5       0.82      0.06      0.11       249
           6       0.25      0.03      0.06      2304
           7       0.43      0.23      0.30      2450

    accuracy                           0.41     13291
   macro avg       0.39      0.30      0.26     13291
weighted avg       0.36      0.41      0.33     13291

40.75% Accurate


In [47]:
#check impotant features
dict={'Input':X.columns,'IG':ada.feature_importances_}
df=pd.DataFrame(dict)
df.sort_values('IG',ascending=False)

Unnamed: 0,Input,IG
2,YEAR,0.46
0,DAY,0.2
3,GROSS INCURRED AMOUNT,0.2
1,MONTH_ID,0.14


In [48]:
#call GradientBoostingClassifier class from following package
from sklearn.ensemble import GradientBoostingClassifier

In [49]:
#create the object of GradientBoostingClassifier class
gbc=GradientBoostingClassifier(random_state=1,n_estimators=4)
#n_estimators inbuilt parameter of GradientBoostingClassifier class
#means no. of iterations , min =10 and max=100 
#n_estimators >=10 and <=100 
#call function
gbc=create_model(gbc)


              precision    recall  f1-score   support

           0       0.42      0.43      0.42      5056
           1       0.96      0.86      0.91      5056
           2       0.30      0.66      0.41      5056
           3       0.21      0.01      0.02      5056
           4       0.28      0.65      0.39      5056
           5       0.45      0.40      0.42      5056
           6       0.24      0.01      0.01      5056
           7       0.50      0.28      0.36      5056

    accuracy                           0.41     40448
   macro avg       0.42      0.41      0.37     40448
weighted avg       0.42      0.41      0.37     40448

[[2173   73 1226   17  914  157   10  486]
 [   0 4350  127    0  172  407    0    0]
 [ 337    3 3320   15  738  337   20  286]
 [ 242   15 2014   40 1914  637   28  166]
 [ 819   39  246   28 3282  476   16  150]
 [ 211    6 1142   35 1570 2028   17   47]
 [ 918   28 1738   35 1817  220   36  264]
 [ 529   21 1389   24 1383  290   23 1397]]


In [50]:
gbc= GradientBoostingClassifier()
gbc.fit(X_train, Y_train)
Y_pred = gbc.predict(X_test)

#  prediction Summary by species
print(classification_report(Y_test, Y_pred))

# Accuracy score
GBC= accuracy_score(Y_pred,Y_test)
print(f"{round(GBC*100,2)}% Accurate")

              precision    recall  f1-score   support

           0       0.46      0.88      0.60      5056
           1       0.67      0.87      0.75        30
           2       0.26      0.04      0.07      1166
           3       0.37      0.05      0.08       591
           4       0.38      0.26      0.31      1445
           5       0.57      0.24      0.33       249
           6       0.35      0.06      0.10      2304
           7       0.56      0.44      0.50      2450

    accuracy                           0.47     13291
   macro avg       0.45      0.35      0.34     13291
weighted avg       0.43      0.47      0.39     13291

46.54% Accurate


In [51]:
#check impotant features
dict={'Input':X.columns,'IG':gbc.feature_importances_}
df=pd.DataFrame(dict)
df.sort_values('IG',ascending=False)

Unnamed: 0,Input,IG
3,GROSS INCURRED AMOUNT,0.448747
2,YEAR,0.248252
1,MONTH_ID,0.173022
0,DAY,0.129979


In [52]:
from xgboost import XGBClassifier

In [53]:
#create object of class XGBClassifier
xgb=XGBClassifier(n_estimators=10 ,reg_alpha=1)
#reg means regularation : lambda or alpha
#automatic overfitting : reg means regularation and alpha or lambda : 
#hyperparameter
#1 means True


In [54]:
xgb= XGBClassifier()
xgb.fit(X_train, Y_train)
Y_pred = xgb.predict(X_test)

#  prediction Summary by species
print(classification_report(Y_test, Y_pred))

# Accuracy score
XGB= accuracy_score(Y_pred,Y_test)
print(f"{round(XGB*100,2)}% Accurate")

              precision    recall  f1-score   support

           0       0.48      0.85      0.61      5056
           1       0.82      0.90      0.86        30
           2       0.24      0.08      0.12      1166
           3       0.31      0.06      0.09       591
           4       0.35      0.24      0.28      1445
           5       0.52      0.24      0.33       249
           6       0.34      0.10      0.16      2304
           7       0.60      0.48      0.53      2450

    accuracy                           0.47     13291
   macro avg       0.46      0.37      0.37     13291
weighted avg       0.43      0.47      0.41     13291

47.02% Accurate


In [55]:
'''for i in range(10,101):
    xgb=XGBClassifier(n_estimators=i ,reg_alpha=1)
    print("No. of iterations : ",i)
    #call function
    xgb=create_model(xgb)'''

'for i in range(10,101):\n    xgb=XGBClassifier(n_estimators=i ,reg_alpha=1)\n    print("No. of iterations : ",i)\n    #call function\n    xgb=create_model(xgb)'

In [56]:
from sklearn.neighbors import KNeighborsClassifier

In [57]:
#create the object of KNeighborsClassifier class and passing the 
#following parameter
knn=KNeighborsClassifier(n_neighbors=5,metric='minkowski',p=2)
#p=2 means Euclidean distance method

In [58]:
'''for k in range(5,10,2):  #k=5 next k=7 k=9
    knn=KNeighborsClassifier(n_neighbors=k,metric='minkowski',p=2)
    print("K=",k)
    #call function
    knn=create_model(knn)'''

'for k in range(5,10,2):  #k=5 next k=7 k=9\n    knn=KNeighborsClassifier(n_neighbors=k,metric=\'minkowski\',p=2)\n    print("K=",k)\n    #call function\n    knn=create_model(knn)'

In [59]:
knn= KNeighborsClassifier()
knn.fit(X_train, Y_train)
Y_pred = knn.predict(X_test)

#  prediction Summary by species
print(classification_report(Y_test, Y_pred))

# Accuracy score
KNN= accuracy_score(Y_pred,Y_test)
print(f"{round(KNN*100,2)}% Accurate")

              precision    recall  f1-score   support

           0       0.44      0.70      0.54      5056
           1       0.06      0.03      0.04        30
           2       0.15      0.08      0.10      1166
           3       0.11      0.05      0.07       591
           4       0.23      0.20      0.22      1445
           5       0.32      0.13      0.19       249
           6       0.22      0.13      0.16      2304
           7       0.36      0.24      0.29      2450

    accuracy                           0.37     13291
   macro avg       0.24      0.20      0.20     13291
weighted avg       0.32      0.37      0.33     13291

36.69% Accurate


In [60]:
score = [DTC,RFC,XGB,LRC,KNN,SVC,ADA,GBC]
Models = pd.DataFrame({'n_neighbors': ["Decision Tree","Random Forest","XGBoost", "Logistic Regression","K-Neighbors","Support vector","ADA BOOST","Gradient Boost"],'Score': score})
Models.sort_values(by='Score',ascending=False)

Unnamed: 0,n_neighbors,Score
2,XGBoost,0.470243
7,Gradient Boost,0.465428
6,ADA BOOST,0.407494
1,Random Forest,0.396057
3,Logistic Regression,0.38891
4,K-Neighbors,0.366865
0,Decision Tree,0.335189
5,Support vector,0.30863


In [None]:
#CONCLUSION FROM THE ABOVE ALGO XGBOOST GIVES BEST RESULT WITH 47.02% score

In [1]:
X='THE DATASET BASICALLY FOCUSS ON THE LOSSES OCCURED DUE TP THESE CALAMITIES OCCURED ACCIDENTAL ,DAMAGE ,WINDSTORM ,THEFT ,ESCAPE OF WATER , SUBSIDENCE, EARTHQUAKE' 

In [4]:
X.lower()

'the dataset basically focuss on the losses occured due tp these calamities occured accidental ,damage ,windstorm ,theft ,escape of water , subsidence, earthquake'