In [1]:
# --- Data handling, paths and plotting ---
import pandas as pd
import numpy as np
from pathlib import Path
# Directory that holds the telematics CSV files, relative to the notebook's working directory.
path = Path('../input/telematics')
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's dark-grid theme to the figures drawn below.
sns.set_style('darkgrid')
import random
import os
# Used below to build a ProfileReport of the raw frame.
import pandas_profiling as pp

# --- Preprocessing ---
from sklearn.preprocessing import StandardScaler, LabelEncoder

# --- Data splitting / cross-validation ---
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split

# --- Candidate classifiers ---
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from mlxtend.classifier import StackingCVClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.pipeline import make_pipeline

# --- Class-imbalance handling ---
from imblearn.over_sampling import SMOTE

# Optuna's LightGBM integration module; used below through the alias `lgb`
# (`lgb.Dataset`, `lgb.train`).
import optuna.integration.lightgbm as lgb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

import warnings
# Silences every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the plotting and
# modelling libraries — consider narrowing the filter.
warnings.filterwarnings('ignore')

Let's take a look at the data!

In [2]:
# List the files available in the telematics input directory.
os.listdir(path)

In [3]:
# Read the per-vehicle summary table and preview its first rows.
summary_csv = path / 'simulated_summary_total.csv'
simulated_data = pd.read_csv(summary_csv)
simulated_data.head()

In [4]:
# Column names, dtypes and non-null counts of the raw frame.
simulated_data.info()

In [5]:
# Preview the first rows again as a reference for the column walkthrough below.
# NOTE(review): this repeats the preview shown right after loading the file.
simulated_data.head()

Besides the target variable **Loss**, the dataset has seven columns. **Vehicle** holds 30,000 unique vehicle IDs. **Days** is the total number of days over which the data was collected. **Distance** is the number of miles the vehicle was driven. This column could be particularly insightful for predicting whether a vehicle has had a collision: intuitively, the more miles a vehicle is driven, the more exposed it is to accidents, so we expect **Distance** to be positively correlated with **Loss**. **HardBrakes** is the number of hard-braking events detected in the 1 Hz data. Intuitively this column should matter as well: each hard brake may be a potential accident that the driver avoided, so we might expect a negative correlation with the target. **HardAccelerations**, on the other hand, points in the opposite direction: the more impulsive accelerations a driver makes, the more likely the vehicle is to have been in a collision. **NightTime_Pct** is the percentage of the total distance that was driven at night. This could be a very important indicator, since many accidents happen at night due to driver drowsiness or intoxication. Finally, **VehicleType** captures the kind of vehicle, whose size and power could make a collision more or less likely. All of these features are **intuitively** important for identifying which vehicles have had, or are likely to have, an accident, so that they can be segmented and charged a higher insurance premium.
Let's move on to the exploratory data analysis.

In [6]:
# Build a pandas-profiling report of the raw frame; as the last expression of
# the cell it is rendered as the cell output.
pp.ProfileReport(simulated_data)

## Visualisations

In [7]:
# `sns.displot` is a figure-level function: it creates its own figure, so a
# preceding `plt.figure(...)` only leaves an empty figure behind and its
# figsize is ignored. The size is set through `height`/`aspect` instead
# (12 x 12 inches, matching the originally requested figsize).
grid = sns.displot(simulated_data['Distance'], color = 'g', bins = 100, height = 12, aspect = 1)
grid.set(title = 'Distances Travelled by the Vehicles', xlabel = 'Distance')
plt.show()

In [64]:
# Histogram (with KDE overlay) of hard-brake counts, with bins restricted to
# the 0-100 range. `sns.distplot` is deprecated, so the axes-level `histplot`
# is used with `binrange` in place of `hist_kws={"range": ...}`.
fig, ax = plt.subplots(figsize = (6, 6))
sns.histplot(simulated_data['HardBrakes'], color = 'b', bins = 100, binrange = (0, 100),
             stat = 'density', kde = True, ax = ax)
# The title previously said "Distances Travelled", copied from the cell above.
ax.set(title = 'Hard Brakes Recorded per Vehicle', xlabel = 'HardBrakes')
plt.show()

In [69]:
# Distribution of per-column skewness. This uses `simulated_data` because
# `data` is only created further down, so the cell would fail on a fresh
# Run All. Skewness is restricted to numeric columns because it is undefined
# for the string-valued VehicleType.
skewness = simulated_data.select_dtypes(include = 'number').skew()
grid = sns.displot(skewness, color = 'r', bins = 100)
grid.set(title = 'Skewness in Dataset', xlabel = 'Skewness')
plt.show()

In [72]:
# Distribution of per-column kurtosis. This uses `simulated_data` because
# `data` is not defined yet at this point of the notebook, keeps numeric
# columns only, and replaces the deprecated `sns.distplot` with `histplot`.
kurtosis = simulated_data.select_dtypes(include = 'number').kurtosis()
fig, ax = plt.subplots()
sns.histplot(kurtosis, color = 'blue', stat = 'density', kde = True, ax = ax)
ax.set(title = 'Kurtosis in Dataset', xlabel = 'Kurtosis')
plt.show()

In [None]:
# NOTE(review): this cell repeats the Distance histogram drawn at the start of
# the visualisation section — consider removing one of them.
# `sns.displot` builds its own figure, so it is sized via `height`/`aspect`
# rather than a preceding `plt.figure`, which would only leave an empty figure.
grid = sns.displot(simulated_data['Distance'], color = 'g', bins = 100, height = 12, aspect = 1)
grid.set(title = 'Distances Travelled by the Vehicles', xlabel = 'Distance')
plt.show()

In [9]:
# Pairwise correlation of the numeric columns of the raw frame. The string
# column VehicleType is excluded explicitly: recent pandas versions raise
# instead of silently dropping non-numeric columns in `.corr()`.
corrmat = simulated_data.select_dtypes(include = 'number').corr()
fig, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=.8, square=True, ax=ax)
ax.set_title('Correlation Between Numeric Columns')
plt.show()

Now let's move on to the good part: creating and training a model on the data to get predictive insights!

In [10]:
# Second, independent copy of the summary table; this is the frame that the
# modelling cells transform, while `simulated_data` stays untouched.
data = pd.read_csv(path.joinpath('simulated_summary_total.csv'))
data.head(5)

Before designing our model, we have to do some cleaning of the data. On closer inspection, the columns **Vehicle** and **Days** won't contribute much to the prediction: one is simply a unique vehicle ID and the other is a constant that doesn't change across data points. We therefore choose the columns **[Distance, HardBrakes, HardAccelerations, NightTime_Pct, VehicleType]** for prediction.

The column **NightTime_Pct** is the percentage of miles driven at night. From it and the total distance we can derive the actual distance driven at night, which might be more helpful for prediction than a bare percentage.

In [11]:
# Replace the night-time share with the night-time distance (share x Distance).
# The product is computed from the untouched `simulated_data` frame (same CSV,
# same row order) so that re-running this cell does not multiply by Distance a
# second time.
# NOTE(review): the result is in miles only if NightTime_Pct is a 0-1 fraction;
# if it is stored as 0-100 the values are scaled by 100 — confirm.
data['NightTime_Pct'] = simulated_data['NightTime_Pct'] * simulated_data['Distance']

Next, if we check the dtype of the **VehicleType** column, we can see that it holds pandas string objects: categorical values denoting the type of the vehicle.

In [12]:
# Inspect the dtype of the VehicleType column before encoding it.
data['VehicleType'].dtype

Any predictive model that has to be trained to gain predictive insights needs its input data to be converted into a numerical value. Hence we shall use a label encoder to assign unique ids. 

In [13]:
# Map each vehicle-type string to an integer id. The encoder is fitted on the
# untouched `simulated_data` column (same CSV, same row order) so that
# re-running the cell never re-fits on already encoded integers, and
# `le.classes_` always holds the original category names.
le = LabelEncoder()
data['VehicleType'] = le.fit_transform(simulated_data['VehicleType'])
print(data['VehicleType'].head(), le.classes_)

Now all of our data is in the right format. Looks like we can use this data for our predictions but there is one major problem. If we look at the spread of the target variable, we can see that it is highly imbalanced.

In [65]:
# Covariance between the numeric columns. Covariances are not bounded to
# [-1, 1], so the colour scale is left data-driven (centred on zero) instead
# of being clamped with vmin/vmax = +/-1, which saturated most of the cells.
cov_matrix = data.select_dtypes(include = 'number').cov()
cmap = plt.cm.RdBu
fig, ax = plt.subplots(figsize = (10, 8))
sns.heatmap(cov_matrix, cmap = cmap, center = 0, linewidths = 0.2, ax = ax)
ax.set_title('Covariance Matrix')
plt.show()

In [66]:
# Annotated correlation heatmap of the processed frame on a fixed [-1, 1]
# diverging colour scale.
corr_matrix = data.corr()
cmap = plt.cm.RdBu
heatmap_options = dict(
    linewidths = 0.2,
    vmax = 1.0,
    vmin = -1.,
    square = True,
    cmap = cmap,
    linecolor = 'white',
    annot = True,
    fmt = '.2f',
)
sns.heatmap(corr_matrix, **heatmap_options)
plt.title('Correlation Matrix')
plt.show()

In [None]:
# Pearson (linear) and Spearman (rank) correlation between two telematics
# features. The cell previously read `sample['longitude']` / `sample['latitude']`,
# which do not exist anywhere in this notebook and raised a NameError.
# NOTE(review): Distance vs HardBrakes was picked as a representative pair —
# swap in whichever columns are of interest.
from scipy.stats import pearsonr, spearmanr
distance_values = data['Distance'].values
brake_values = data['HardBrakes'].values
corr, _ = pearsonr(distance_values, brake_values)
print('Pearsons correlation between Distance and HardBrakes: %.3f' % corr)
corr, _ = spearmanr(distance_values, brake_values)
print('Spearmans correlation between Distance and HardBrakes: %.3f' % corr)

In [14]:
# Class balance of the target. `countplot` does the counting itself, so it
# must receive the raw labels; passing `value_counts()` counted the two
# counts and drew two bars of height 1.
sns.countplot(x = data['Loss'], palette = 'Set2')
plt.title('Number of Vehicles per Loss Class')
plt.show()

Why should that be a problem? Let's find out.

In [15]:
# Model inputs (feature columns) and the binary target as a NumPy array.
cols = [
    'Distance',
    'HardBrakes',
    'HardAccelerations',
    'NightTime_Pct',
    'VehicleType',
]
X = data.loc[:, cols]
Y = data['Loss'].to_numpy()

In [16]:
# Hold out 20% of the rows for testing. `stratify=Y` keeps the share of the
# rare collision class the same in both splits, and the fixed seed makes the
# split (and every score reported below) reproducible.
xtrain, xtest, ytrain, ytest = train_test_split(
    X, Y, test_size = 0.2, shuffle = True, random_state = 786, stratify = Y
)

In [17]:
def evaluate(name, predictions, ytest):
    """Print the accuracy and the per-class report for one model.

    Parameters
    ----------
    name : str
        Label for the model; interpolated into the accuracy line.
    predictions : array-like
        Labels predicted by the model for the hold-out rows.
    ytest : array-like
        True labels of the hold-out rows.

    Returns
    -------
    None
        Both results are printed.
    """
    # scikit-learn metrics take (y_true, y_pred) in that order. Accuracy is
    # symmetric, but with the arguments swapped the classification report
    # shows precision and recall exchanged.
    print(f'Accuracy of {name} : {accuracy_score(ytest, predictions) * 100} %')
    print(classification_report(ytest, predictions, target_names = ['No Collision', 'Collision']))

In [18]:
# Baseline: standardised features feeding a cross-validated logistic regression.
log_reg = make_pipeline(
    StandardScaler(),
    LogisticRegressionCV(),
).fit(xtrain, ytrain)
predictions = log_reg.predict(xtest)
evaluate(name = 'Logistic Regression', predictions = predictions, ytest = ytest)

Let's try out some other models.

In [19]:
# RBF-kernel support vector machine on standardised features.
svm = make_pipeline(
    StandardScaler(),
    SVC(kernel = 'rbf', C = 3),
).fit(xtrain, ytrain)
predictions = svm.predict(xtest)
evaluate(name = 'Support Vector Machine', predictions = predictions, ytest = ytest)

In [20]:
# 15-nearest-neighbours classifier on standardised features.
knn = make_pipeline(
    StandardScaler(),
    KNeighborsClassifier(n_neighbors = 15),
).fit(xtrain, ytrain)
predictions = knn.predict(xtest)
evaluate(name = 'KNN Classifier', predictions = predictions, ytest = ytest)

In [21]:
# Entropy-based decision tree (depth capped at 100) on standardised features.
tree = make_pipeline(
    StandardScaler(),
    DecisionTreeClassifier(criterion = 'entropy', random_state = 0, max_depth = 100),
).fit(xtrain, ytrain)
predictions = tree.predict(xtest)
evaluate(name = 'Decision Tree Classifier', predictions = predictions, ytest = ytest)

So the problem here is that the class of vehicles that had a collision is under-represented: only about 4,000 of the 30,000 examples are vehicles that have had a collision, a small fraction of the data (roughly 13–14%). With such a split, even a model that always predicts "no collision" will score more than 85% accuracy on the test set. When the dataset is highly imbalanced, accuracy fails to capture the true performance of a model; precision, recall and the F1 score, reported per class, are more informative.

Another very useful way to deal with highly imbalanced data is to resample it. If we do not have enough data for a particular class, we try to even out the class frequencies. This can be done in two ways: (i) downsampling, which removes data from the other classes to match the size of the under-represented class, and (ii) upsampling, which uses a method such as a nearest-neighbour algorithm to fabricate new samples that are representative of the under-represented class. In this study we take the upsampling approach and use SMOTE (from the imbalanced-learn library), which uses a KNN-based procedure to create new examples of the under-represented class.

In [22]:
# Oversample the minority class on the TRAINING split only. Resampling the
# full `X, Y` put the test rows (and synthetic neighbours of them) into the
# training data, so every score computed on `xtest` afterwards was inflated
# by leakage. `SMOTE` is already imported in the first cell.
smote = SMOTE(random_state = 786)
xtrain, ytrain = smote.fit_resample(xtrain, ytrain)

In [23]:
# Report how many rows the training set holds after resampling.
print(f'Length of new training dataset is {len(xtrain)}')

In [24]:
# Count how often each target label occurs in the resampled training labels.
pd.DataFrame(ytrain).value_counts()

Now let's try some models on the resampled dataset.

In [25]:
# Logistic regression refitted on the resampled training data.
log_reg = make_pipeline(
    StandardScaler(),
    LogisticRegressionCV(),
).fit(xtrain, ytrain)
predictions = log_reg.predict(xtest)
evaluate(name = 'Logistic Regression', predictions = predictions, ytest = ytest)

In [26]:
# RBF-kernel SVM refitted on the resampled training data (C raised to 10).
svm = make_pipeline(
    StandardScaler(),
    SVC(kernel = 'rbf', C = 10),
).fit(xtrain, ytrain)
predictions = svm.predict(xtest)
evaluate(name = 'Support Vector Machine', predictions = predictions, ytest = ytest)

In [27]:
# 15-nearest-neighbours classifier refitted on the resampled training data.
knn = make_pipeline(
    StandardScaler(),
    KNeighborsClassifier(n_neighbors = 15),
).fit(xtrain, ytrain)
predictions = knn.predict(xtest)
evaluate(name = 'KNN Classifier', predictions = predictions, ytest = ytest)

In [28]:
# Entropy-based decision tree refitted on the resampled training data.
tree = make_pipeline(
    StandardScaler(),
    DecisionTreeClassifier(criterion = 'entropy', random_state = 0, max_depth = 100),
).fit(xtrain, ytrain)
predictions = tree.predict(xtest)
evaluate(name = 'Decision Tree Classifier', predictions = predictions, ytest = ytest)

In [29]:
# Random forest of 20 trees (depth capped at 60) on the resampled training data.
forest = make_pipeline(
    StandardScaler(),
    RandomForestClassifier(n_estimators = 20, random_state = 2, max_depth = 60),
).fit(xtrain, ytrain)
predictions = forest.predict(xtest)
evaluate(name = 'Random Forest Classifier', predictions = predictions, ytest = ytest)

In [30]:
# Gradient-boosted trees (DART booster) on the resampled training data.
# `XGBClassifier` is already imported in the first cell.
XGB = XGBClassifier(learning_rate = 0.01, n_estimators = 25,
                    max_depth = 25, gamma = 0.6,
                    subsample = 0.52, colsample_bytree = 0.6,
                    seed = 27, reg_lambda = 2, booster = 'dart',
                    colsample_bylevel = 0.6, colsample_bynode = 0.5,
                   )
XGB.fit(xtrain, ytrain)
predictions = XGB.predict(xtest)
# Pass a readable label: handing the estimator object to `evaluate` printed
# its entire repr inside the accuracy line.
evaluate('XGBoost Classifier', predictions, ytest)

Cross Validation

In [31]:
import optuna.integration.lightgbm as lgb

# Base LightGBM settings handed to the training call.
params = {
    'objective' : 'binary',
    'metric' : 'binary_logloss',
    'verbosity' : -1,
    'boosting_type' : 'gbdt',
}

# Wrap the training and hold-out splits as LightGBM datasets.
dtrain = lgb.Dataset(xtrain, label = ytrain)
dtest = lgb.Dataset(xtest, label = ytest)

# NOTE(review): the hold-out split doubles as the validation set that drives
# early stopping here, so scores later reported on `xtest` are optimistic.
model = lgb.train(
    params,
    dtrain,
    valid_sets = [dtest],
    verbose_eval = 100,
    early_stopping_rounds = 100,
)

In [32]:
# Show the parameter dict carried by the booster returned from `lgb.train`.
model.params

In [33]:
# Rebind `model` to a scikit-learn style LightGBM classifier built from the
# `params` dict defined in the tuning cell; the booster trained there is no
# longer reachable under this name.
# NOTE(review): this passes the hand-written `params`, not the `model.params`
# displayed in the previous cell — confirm whether the tuned values were
# meant to be used here.
from lightgbm import LGBMClassifier
model = LGBMClassifier(**params)

In [50]:
# Number of cross-validation folds, and a zero-filled buffer with one slot per
# training row that will receive the out-of-fold predictions.
num_folds = 5
oof_train = np.zeros(len(xtrain))

In [51]:
# Give the training labels positional (`.iloc`) indexing for the CV loop while
# keeping them one-dimensional: a single-column DataFrame is a 2-D column
# vector, which estimators have to flatten (with a warning). `np.ravel` also
# makes the cell safe to re-run.
ytrain = pd.Series(np.ravel(ytrain), name = 'Loss')

In [53]:
# K-fold cross-validation of the LightGBM classifier on the resampled training
# data: each fold fits on the remaining folds, writes its predictions into
# `oof_train`, and records the per-feature importances.
print('Training...')
folds = KFold(n_splits = num_folds, shuffle = True, random_state = 786)
fold_importances = []
for fold, (train_index, validation_index) in enumerate(folds.split(xtrain, ytrain)):
    print('Fold no %i/%i'%(fold+1, num_folds))
    trainX, trainY = xtrain.iloc[train_index], ytrain.iloc[train_index]
    testX, testY = xtrain.iloc[validation_index], ytrain.iloc[validation_index]
    model.fit(X = trainX, y = trainY, eval_set = [(trainX, trainY), (testX, testY)], verbose = 500, early_stopping_rounds = 100)
    oof_train[validation_index] = model.predict(testX)
    # Collect one importance table per fold and concatenate once after the
    # loop, instead of re-copying a growing DataFrame on every iteration.
    fold_importances.append(pd.DataFrame({
        'Feature' : cols,
        'importance' : model.feature_importances_,
        'fold' : fold + 1,
    }))

feature_importance_df_return = pd.concat(fold_importances, axis = 0)

# scikit-learn metrics expect (y_true, y_pred).
score = accuracy_score(ytrain, oof_train)
print('Training CV score : % .5f'%score)

In [54]:
# Per-class precision/recall/F1 of the out-of-fold predictions. The true
# labels go first: with the arguments swapped, precision and recall are
# exchanged in the report.
print(classification_report(ytrain, oof_train))

Let's make some predictions!

In [73]:
# Re-display the first rows of the processed frame, presumably as a reference
# for the hand-built example input below.
data.head()

In [79]:
# One hand-built vehicle to score. The keys must match the training columns:
# after the earlier transformation, the column named 'NightTime_Pct' holds the
# night-time distance, which was previously supplied under the non-existent
# name 'NightTimeMiles'. Selecting `cols` pins the feature order to the one
# used for training instead of relying on dict order in a bare array.
sample_vehicle = {'Distance' : 13114, 'HardBrakes' : 200, 'HardAccelerations' : 200, 'NightTime_Pct' : 400.0, 'VehicleType' : 2}
test = pd.DataFrame(sample_vehicle, index = [0])[cols]

In [80]:
# Predict the class of the example vehicle. `model` is the classifier as left
# by the cross-validation loop, i.e. fitted on the last fold's training rows.
pred = model.predict(test)
print(pred)

So, according to our model, a large SUV that has driven around 13,000 miles, with around 200 hard-braking events and around 200 hard accelerations, and with 400 of those miles driven at night, will most likely be involved in a collision and hence should be charged a higher insurance premium.