In [1]:
# importing the libs
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime as dt
import warnings
warnings.filterwarnings('ignore')
import matplotlib
sns.set_style('darkgrid')
matplotlib.rcParams['font.size']=14
matplotlib.rcParams['figure.figsize']= (25, 10)
matplotlib.rcParams['figure.facecolor']= '#000000'
%matplotlib inline

In [2]:
#reading the data
train_data = pd.read_csv('datasets/zindi/autoinland/Train.csv')
test_data = pd.read_csv('datasets/zindi/autoinland/Test.csv')

In [3]:
split = train_data.shape[0]
dataset = pd.concat((train_data, test_data), axis=0)

In [4]:
print(dataset.shape)
dataset.tail(2)

(13281, 14)


Unnamed: 0,ID,Policy Start Date,Policy End Date,Gender,Age,First Transaction Date,No_Pol,Car_Category,Subject_Car_Colour,Subject_Car_Make,LGA_Name,State,ProductName,target
1200,ID_ZWQRL8L,2010-02-16,2011-02-15,Male,44,2010-02-16,2,Saloon,,Nissan,Aba North,Aba-North,Car Classic,
1201,ID_ZWZ92GU,2010-03-18,2011-03-17,Male,28,2010-03-18,1,Saloon,Silver,TOYOTA,,,Car Classic,


In [40]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13281 entries, 0 to 1201
Data columns (total 21 columns):
 #   Column                       Non-Null Count  Dtype   
---  ------                       --------------  -----   
 0   Age                          13281 non-null  int64   
 1   No_Pol                       13281 non-null  int64   
 2   target                       12079 non-null  category
 3   Policy Start Dateyear        13281 non-null  int64   
 4   Policy Start Datemonth       13281 non-null  int64   
 5   Policy Start Dateday         13281 non-null  int64   
 6   Policy End Dateyear          13281 non-null  int64   
 7   Policy End Datemonth         13281 non-null  int64   
 8   Policy End Dateday           13281 non-null  int64   
 9   First Transaction Dateyear   13281 non-null  int64   
 10  First Transaction Datemonth  13281 non-null  int64   
 11  First Transaction Dateday    13281 non-null  int64   
 12  age_div_nopol                13281 non-null  float64 
 13  lg

### PreProcessing

In [6]:
dataset.drop(['ID'], axis=1, inplace=True)

In [7]:
fill_fet = ['Gender', 'Car_Category', 'Subject_Car_Colour', 'Subject_Car_Make', 'LGA_Name', 'State']
for fet in dataset.columns:
    for fet in fill_fet:
        dataset[fet] = dataset[fet].fillna(dataset[fet].mode()[0])

In [8]:
num_col = ['Policy Start Date', 'Policy End Date', 'First Transaction Date', 'Age', 'Target']
for col in dataset.columns:
    if col not in num_col:
        dataset[col] = dataset[col].astype('category')

In [9]:
dataset[fill_fet].nunique()

Gender                  7
Car_Category           16
Subject_Car_Colour     46
Subject_Car_Make       75
LGA_Name              270
State                 113
dtype: int64

In [10]:
# unique values
for col in fill_fet:
    print(col)
    print(dataset[col].unique(), '\n')

Gender
['Male', 'Female', 'Entity', 'Joint Gender', 'NO GENDER', 'NOT STATED', 'SEX']
Categories (7, object): ['Male', 'Female', 'Entity', 'Joint Gender', 'NO GENDER', 'NOT STATED', 'SEX'] 

Car_Category
['Saloon', 'JEEP', 'Motorcycle', 'Truck', 'Bus', ..., 'Wagon', 'Shape Of Vehicle Chasis', 'Sedan', 'Station 4 Wheel', 'Tipper Truck']
Length: 16
Categories (16, object): ['Saloon', 'JEEP', 'Motorcycle', 'Truck', ..., 'Shape Of Vehicle Chasis', 'Sedan', 'Station 4 Wheel', 'Tipper Truck'] 

Subject_Car_Colour
['Black', 'Grey', 'Red', 'As Attached', 'Blue', ..., 'Yellow & White', 'Beige Mitalic', 'Light Gray', 'Blue Sky', 'Red Maroon']
Length: 46
Categories (46, object): ['Black', 'Grey', 'Red', 'As Attached', ..., 'Beige Mitalic', 'Light Gray', 'Blue Sky', 'Red Maroon'] 

Subject_Car_Make
['TOYOTA', 'REXTON', 'Lexus', 'Hyundai', 'Iveco', ..., 'BRILLIANCE', 'Buik', 'COMMANDER', 'Bajaj', 'Datsun']
Length: 75
Categories (75, object): ['TOYOTA', 'REXTON', 'Lexus', 'Hyundai', ..., 'Buik', 'CO

In [11]:
# gender
dataset.Gender = dataset.Gender.replace({'Entity' : 'other',
                                        'Joint Gender' : 'other',
                                        'NO GENDER' : 'other',
                                        'NOT STATED' : 'other',
                                        'SEX' : 'other'})

In [12]:
# converting dates to datetime objects
date_col = ['Policy Start Date', 'Policy End Date', 'First Transaction Date']
for col in date_col:
    dataset[col] = pd.to_datetime(dataset[col])
    
for col in date_col:
    for date_fet in ['year', 'month', 'day']:
        dataset[col+date_fet] = getattr(dataset[col].dt, date_fet)

In [13]:
dataset.drop(['Policy Start Date', 'Policy End Date', 'First Transaction Date'], axis=1, inplace=True)

In [14]:
fill_fet

['Gender',
 'Car_Category',
 'Subject_Car_Colour',
 'Subject_Car_Make',
 'LGA_Name',
 'State']

In [15]:
dataset['age_div_nopol'] = dataset.Age.astype(int) / dataset.No_Pol.astype(int)

In [16]:
# frequency encoding
frq_enc = dataset.groupby('LGA_Name').size()/len(dataset)
dataset.loc[:, '{} _freq_encode'.format('lga_name')] = dataset['LGA_Name'].map(frq_enc)
dataset.drop(['LGA_Name'], axis=1, inplace=True)

In [17]:
frq_enc1 = dataset.groupby('State').size()/len(dataset)
dataset.loc[:, '{} _freq_encode'.format('state')] = dataset['State'].map(frq_enc1)
frq_enc2 = dataset.groupby('Subject_Car_Make').size()/len(dataset)
dataset.loc[:, '{} _freq_encode'.format('car_make')] = dataset['Subject_Car_Make'].map(frq_enc2)
frq_enc3 = dataset.groupby('Subject_Car_Colour').size()/len(dataset)
dataset.loc[:, '{} _freq_encode'.format('car_color')] = dataset['Subject_Car_Colour'].map(frq_enc3)
frq_enc4 = dataset.groupby('Car_Category').size()/len(dataset)
dataset.loc[:, '{} _freq_encode'.format('car_cat')] = dataset['Car_Category'].map(frq_enc4)
frq_enc5 = dataset.groupby('Gender').size()/len(dataset)
dataset.loc[:, '{} _freq_encode'.format('gender')] = dataset['Gender'].map(frq_enc5)

In [18]:
frq_enc6 = dataset.groupby('ProductName').size()/len(dataset)
dataset.loc[:, '{} _freq_encode'.format('prod_name')] = dataset['ProductName'].map(frq_enc6)
frq_enc7 = dataset.groupby('age_div_nopol').size()/len(dataset)
dataset.loc[:, '{} _freq_encode'.format('age_div_nopol')] = dataset['age_div_nopol'].map(frq_enc7)

In [19]:
dataset.drop(['ProductName', 'Car_Category', 'Subject_Car_Colour', 'Subject_Car_Make', 'Gender', 'State'],
            axis=1, inplace=True)

In [20]:
dataset['No_Pol'] = dataset['No_Pol'].astype(int)

In [21]:
dataset['prod_name _freq_encode'] = dataset['prod_name _freq_encode'].astype(float)
# dataset.drop(['prod _name_freq_encode'], axis=1)
# prod = dataset['prod_name _freq_encode'] 

### Training

In [22]:
train = dataset[:split]
test = dataset[split:]

In [23]:
train['target'] = train['target'].astype(int)

In [24]:
test.drop(['target'], axis = 1, inplace=True)

In [25]:
X, y = train.drop(['target'], axis=1), train['target']

In [26]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [27]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.25, random_state=42)

In [28]:
from xgboost import XGBClassifier
xgb = XGBClassifier(learning_rate=1e-4, n_estimators=6000, max_depth=6, n_thread=5, subsample=1.0, rg_alpha=0.007,
                   objective='binary:logistic', seed=42, n_jobs=-1)

xgb.fit(X_train, y_train)

from sklearn.metrics import f1_score
y_pred = xgb.predict(X_test)

f1_score(y_test, y_pred, average='macro')

Parameters: { n_thread, rg_alpha } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




0.5104859739643375

In [48]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=6000, criterion='entropy', max_depth=None, n_jobs=-1)
clf.fit(X_train, y_train)

preds = clf.predict(X_test)
f1_score(y_test, preds)

0.2058319039451115

In [46]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
l_reg = LogisticRegression(multi_class='multinomial', random_state=42)
r_for = RandomForestClassifier(n_estimators=500, criterion='entropy', n_jobs=-1, random_state=42)
xgb = XGBClassifier(learning_rate=1e-2, n_estimators=500, n_jobs=-1, objective='binary:logistic', seed=42)
voting = VotingClassifier(estimators=[('lr', l_reg), ('rf', r_for), ('xgb', xgb)], voting='soft')
voting.fit(X_train, y_train)

VotingClassifier(estimators=[('lr',
                              LogisticRegression(multi_class='multinomial',
                                                 random_state=42)),
                             ('rf',
                              RandomForestClassifier(criterion='entropy',
                                                     n_estimators=500,
                                                     n_jobs=-1,
                                                     random_state=42)),
                             ('xgb',
                              XGBClassifier(base_score=None, booster=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=None, gamma=None,
                                            gpu_id=None, importance_type='g...
                                            interaction_constraints=None,
                        

In [47]:
p = voting.predict(X_test)
f1_score(y_test, p)

0.10551558752997604

In [36]:
test1 = scaler.fit_transform(test)

In [37]:
predictions = clf.predict(test1)

In [38]:
test1 = pd.read_csv('datasets/zindi/autoinland/Test.csv')

In [39]:
submission = pd.DataFrame()
submission['ID'] = test1['ID']
submission['target'] = predictions.reshape((predictions.shape[0]))
submission.to_csv('submission2.csv', index=False)