# Competition Details

Hosted by - HackerEarth (June 2021)

Dataset and Problem - https://www.kaggle.com/infernape/fast-furious-and-insured

__notebooks__:
- https://www.kaggle.com/ashuto7h/fast-furious-crash

- https://www.kaggle.com/ashuto7h/2-fast-furious-regression

My final Score - 46.875

Winner Final Score - 58.359  

In [None]:
import numpy # linear algebra
import pandas # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# List every directory below the Kaggle input mount to see which datasets are attached.
for folder, _subdirs, _files in os.walk('/kaggle/input'):
    print(folder)
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
%config Completer.use_jedi = False

In [None]:
# Read the competition's tabular train and test files (test is called valid_df here).
train_df, valid_df = (
    pandas.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/fast-furious-and-insured/Fast_Furious_Insured/train.csv',
        '/kaggle/input/fast-furious-and-insured/Fast_Furious_Insured/test.csv',
    )
)
train_df.shape, valid_df.shape

In [None]:
valid_df.columns

In [None]:
# Load the outputs of the companion notebook (input dataset "fast-furious-crash").
# NOTE(review): presumably predict.csv holds the image classifier's Condition
# predictions and imgs.csv the matching image paths — confirm against that notebook.
pred = pandas.read_csv('/kaggle/input/fast-furious-crash/predict.csv')
imgs = pandas.read_csv('/kaggle/input/fast-furious-crash/imgs.csv')
print(pred.columns, imgs.columns)
pred.columns = ['idx','Condition']
# Column assignment aligns on the row index, so both files must list rows in the same order.
pred['image'] = imgs['0']

In [None]:
pred

In [None]:
valid_df.columns

In [None]:
# Attach the predicted Condition to each test row by matching image paths.
valid_df = pandas.merge(
    left=valid_df,
    right=pred,
    left_on='Image_path',
    right_on='image',
)
# The submission frame starts with the row identifier and the predicted Condition.
submit = valid_df[['Image_path', 'Condition']].copy()
# Helper columns are not model features.
valid_df = valid_df.drop(columns=['Image_path', 'image', 'idx'])

In [None]:
# Swap the 0/1 Condition labels in one pass.
# A dict replace is applied simultaneously, so no temporary placeholder value is
# needed, and assigning the result back avoids calling an inplace method on a
# column selection (which does not reliably update the parent frame under
# pandas Copy-on-Write).
# NOTE(review): presumably the classifier's label encoding is inverted relative
# to the competition's — confirm against the image-model notebook.
submit['Condition'] = submit['Condition'].replace({0: 1, 1: 0})
submit['Condition'].value_counts()

In [None]:
# Force the test-frame column names positionally.
# NOTE(review): this assumes valid_df has exactly these six columns in this
# order after the merge/drop above — confirm, a mismatch raises or mislabels.
valid_df.columns = ['Insurance_company', 'Cost_of_vehicle', 'Min_coverage',
       'Expiry_date', 'Max_coverage', 'Condition']

In [None]:
train_df.columns

In [None]:
from sklearn.preprocessing import StandardScaler
# Scaler for the regression target; kept so predictions can be mapped back to
# the original Amount scale later.
y_scaler = StandardScaler()
train_y = train_df['Amount'].copy()
train_df.drop(columns = ['Amount','Image_path'], inplace = True)

import seaborn
# Quick look at the raw target (value on x, row label on y) before any cleaning.
seaborn.scatterplot(x= train_y, y = train_y.index)  
# Order matters below: each median() is recomputed on the already-modified series.
# NOTE(review): 15000 looks like a hand-picked outlier cutoff — confirm against the plot.
train_y.loc[(train_y > 15000)] = train_y.median()
# Negative amounts are replaced by the median as well.
train_y.loc[train_y< 0] = train_y.median()
# Remaining missing amounts are filled with the mean of the cleaned series.
train_y.fillna(train_y.mean(),inplace = True)
# Standardise the target; the scaler needs a 2-D column, hence the reshape.
train_y = y_scaler.fit_transform(train_y.values.reshape(-1,1))
# Displays the scaled target as a single row; the result is not assigned.
train_y.reshape(1,-1)

In [None]:
# Stack the train and test features so preprocessing is applied to both at once.
# ignore_index=True gives the combined frame unique row labels; without it the
# labels of both inputs are kept and repeat, which makes label-based selection
# ambiguous.
df = pandas.concat([train_df, valid_df], ignore_index=True)
df

In [None]:
# from pandas_profiling import ProfileReport
# pr = ProfileReport(df=df, title='Pandas Profiling Report', explorative=True)
# pr.to_file('report.html')
# pr

# EDA

### Missing values

In [None]:
# Impute missing numeric values: mean for cost and minimum coverage, median for
# maximum coverage. The result is assigned back to the column instead of calling
# fillna(inplace=True) on a column selection, which does not reliably update the
# parent frame under pandas Copy-on-Write.
df['Cost_of_vehicle'] = df['Cost_of_vehicle'].fillna(df['Cost_of_vehicle'].mean())
df['Min_coverage'] = df['Min_coverage'].fillna(df['Min_coverage'].mean())
df['Max_coverage'] = df['Max_coverage'].fillna(df['Max_coverage'].median())

In [None]:
# Break the policy expiry date into year / ISO week / day-of-month features,
# then discard the raw date column.
expiry = pandas.to_datetime(df['Expiry_date'])
df['year'] = expiry.dt.year
df['week'] = expiry.dt.isocalendar().week
df['day'] = expiry.dt.day
df.drop(columns=['Expiry_date'], inplace=True)


### Box plot/Outlier Removal

In [None]:
import matplotlib.pyplot as pyplot
import seaborn
numeric_cols = ['Cost_of_vehicle', 'Min_coverage', 'Max_coverage', 'week', 'day']

category_cols = ['year', 'Insurance_company', 'Condition']

# One box plot per numeric feature on a 2x3 grid.
pyplot.figure(figsize=(15, 11))
for position, column in enumerate(numeric_cols, start=1):
    pyplot.subplot(2, 3, position)
    seaborn.boxplot(data=df, y=df[column])
pyplot.show()


In [None]:
cols = ['Cost_of_vehicle', 'Min_coverage']

# Replace values outside the 1.5 * IQR whiskers with the column median.
for col in cols:
    lower_q, upper_q = df[col].quantile([0.25, 0.75])
    whisker = 1.5 * (upper_q - lower_q)
    is_outlier = (df[col] > (upper_q + whisker)) | (df[col] < (lower_q - whisker))
    df.loc[is_outlier, col] = df[col].median()

In [None]:
numeric_cols = ['Cost_of_vehicle', 'Min_coverage', 'Max_coverage', 'week', 'day']

category_cols = ['year', 'Insurance_company', 'Condition']

# One violin plot per numeric feature on a 2x3 grid; values are cast to float
# before plotting.
pyplot.figure(figsize=(15, 11))
for position, column in enumerate(numeric_cols, start=1):
    pyplot.subplot(2, 3, position)
    seaborn.violinplot(data=df, y=df[column].astype(float))
pyplot.show()

### Correlation

In [None]:
pyplot.figure(figsize=(8, 8))
# Correlate numeric columns only: df still contains the string column
# Insurance_company here, and DataFrame.corr() raises on non-numeric data in
# pandas >= 2.0 (older versions silently dropped such columns).
numeric_df = df.select_dtypes(include='number')
seaborn.heatmap(numeric_df.corr(), annot=True, fmt='.2f', square=True,
                vmax=1, vmin=-1, linewidths=0.5, cmap='Dark2')

In [None]:
# Minimum coverage against vehicle cost, coloured by Condition.
seaborn.scatterplot(data=df, x='Min_coverage', y='Cost_of_vehicle', hue='Condition')

In [None]:
# Drop Min_coverage from the feature set.
# NOTE(review): presumably because it is strongly correlated with
# Cost_of_vehicle (see the heatmap and scatter plot above) — confirm.
df.drop(columns = ['Min_coverage'],inplace =True)

In [None]:
category_cols = ['year', 'Insurance_company', 'Condition']

# Frequency of each level, one panel per categorical feature.
pyplot.figure(figsize=(15, 5))
for position, column in enumerate(category_cols, start=1):
    pyplot.subplot(1, 3, position)
    seaborn.countplot(x=df[column])
pyplot.show()

In [None]:
# One-hot encode the categorical features; drop_first=True omits the first
# level of each so the dummy columns are not perfectly collinear.
df = pandas.get_dummies(df,columns = category_cols,drop_first=True)
df

### Scaling

In [None]:
numeric_cols = ['Cost_of_vehicle', 'Max_coverage', 'week', 'day']

# Distribution of each remaining numeric feature on a 2x2 grid.
pyplot.figure(figsize=(15, 5))
for position, column in enumerate(numeric_cols, start=1):
    pyplot.subplot(2, 2, position)
    seaborn.histplot(x=df[column])
pyplot.show()

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise the numeric features; the dummy columns are left as they are.
scaler = StandardScaler()
scaled_numeric = scaler.fit_transform(df[numeric_cols].values)
df_std = df.copy()
df_std[numeric_cols] = scaled_numeric
df_std

## PCA

In [None]:
from sklearn.decomposition import PCA
# Keep as many principal components as needed to explain 95% of the variance
# and replace the feature frame with the projected data.
pca = PCA(n_components=0.95, random_state=42).fit(df_std)
df_std = pandas.DataFrame(pca.transform(df_std))
df_std

## RFE

In [None]:
# Flatten the scaled target (a single-column 2-D array from the scaler) into a 1-D Series.
train_y = pandas.Series(train_y.ravel())

In [None]:
# Undo the earlier train/test concatenation. The number of training rows is
# taken from the target instead of a hard-coded 1399, so the split stays correct
# if the input data changes. Explicit copies keep later in-place edits from
# acting on slices of df_std.
n_train = len(train_y)
train_df = df_std.iloc[:n_train].copy()
valid_df = df_std.iloc[n_train:].copy()
train_df.shape, valid_df.shape

In [None]:
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import PoissonRegressor
from sklearn.linear_model import BayesianRidge, ARDRegression
# Recursive feature elimination with 5-fold CV around a fixed ARD regressor.
rfe_estimator = ARDRegression(
    alpha_1=5.370317963702577e+72,
    alpha_2=114815.36214968817,
    lambda_1=0.7073636363636364,
    lambda_2=0.17254545454545456,
    normalize=True,
    threshold_lambda=1.0,
)
selector = RFECV(rfe_estimator, cv=5, verbose=5).fit(train_df, train_y)
selector.support_

In [None]:
# Columns the selector did not keep (support_ is a boolean mask over the features).
cols = train_df.columns[~selector.support_]
print(cols)
# Reassign instead of dropping in place: train_df / valid_df may be slices of
# another frame, where an in-place drop triggers SettingWithCopyWarning.
train_df = train_df.drop(columns=cols)
valid_df = valid_df.drop(columns=cols)

## Train, Test split

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the labelled rows for model comparison; the seed is fixed so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(train_df, train_y, test_size=0.25, random_state=42)

## Models

In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.linear_model import MultiTaskLasso, ElasticNet, MultiTaskElasticNet
from sklearn.linear_model import Lars, LassoLars, OrthogonalMatchingPursuit
from sklearn.linear_model import BayesianRidge, ARDRegression
from sklearn.linear_model import TweedieRegressor, PoissonRegressor,GammaRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import PLSRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVR
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import mean_squared_error, max_error, r2_score
from sklearn.metrics import mean_absolute_percentage_error

In [None]:
import numpy
def pcr(n_components=1, whiten=True, svd_solver='auto'):
    """Build a scale -> PCA -> ARD regression pipeline.

    Args:
        n_components: forwarded to ``PCA(n_components=...)``.
        whiten: forwarded to ``PCA(whiten=...)``.
        svd_solver: forwarded to ``PCA(svd_solver=...)``.

    Returns:
        An unfitted pipeline of ``StandardScaler``, ``PCA`` and an
        ``ARDRegression`` with fixed hyperparameters.
    """
    reducer = PCA(n_components=n_components, whiten=whiten, svd_solver=svd_solver)
    regressor = ARDRegression(
        alpha_1=5.370317963702577e+72,
        alpha_2=114815.36214968817,
        lambda_1=0.7073636363636364,
        lambda_2=0.17254545454545456,
        normalize=True,
        threshold_lambda=1.0,
    )
    return make_pipeline(StandardScaler(), reducer, regressor)
    
def polyreg(degree=2):
    """Build a polynomial-features -> ARD regression pipeline.

    Args:
        degree: polynomial degree passed to ``PolynomialFeatures``.

    Returns:
        An unfitted pipeline of ``PolynomialFeatures(degree)`` and an
        ``ARDRegression`` with fixed hyperparameters.
    """
    expander = PolynomialFeatures(degree)
    regressor = ARDRegression(
        alpha_1=5.370317963702577e+72,
        alpha_2=114815.36214968817,
        lambda_1=0.7073636363636364,
        lambda_2=0.17254545454545456,
        normalize=True,
        threshold_lambda=1.0,
    )
    return make_pipeline(expander, regressor)
# Candidate regressors to compare. The search loop indexes this list together
# with `params`, so entries must stay in the same order as the `params` entries.
# PoissonRegressor and GammaRegressor are left disabled.
models = [      
    LinearRegression(),
    Ridge(random_state = 42),
    Lasso(random_state = 42),
    ElasticNet(random_state = 42),
    OrthogonalMatchingPursuit(),
    BayesianRidge(),
    ARDRegression(),
    TweedieRegressor(),
    SGDRegressor(random_state = 42),
#     PoissonRegressor(max_iter = 1000),
#     GammaRegressor(),
    DecisionTreeRegressor(random_state = 42),
    RandomForestRegressor(random_state = 42),
    KNeighborsRegressor(),
    SVR(),
    GradientBoostingRegressor(),
    PLSRegression(),
    ExtraTreesRegressor(),
    XGBRegressor(random_state = 42),
    LGBMRegressor(random_state = 42),
    polyreg()
]
# Search spaces for RandomizedSearchCV: one [label, param_distributions] pair per
# entry of `models`, in the same order (the two lists are indexed together). The
# label is also used as the row name in the score tables.
# NOTE(review): numpy.logspace(a, b, n) samples 10**a .. 10**b, so
# numpy.logspace(0.01, 100, 100) spans roughly 1.02 to 1e100 — confirm this range
# is intended (numpy.logspace(-2, 2, 100) would cover 0.01 .. 100).
params = [
          ['linear', {'normalize' : [True,False]}],
          ['ridge', {'alpha': numpy.logspace(0.01,100,100),
                     'normalize' : [True, False],
                     'solver' : ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']}],
          ['lasso', {'alpha' : numpy.logspace(0.01, 100, 100),
                     'normalize' : [True, False],
                     'precompute' : ['auto', True, False],
                     'positive' : [True, False],
                     'selection' : ['cyclic', 'random']}],
          ['elasticnet', {'alpha' : numpy.logspace(0.01,100,100),
                          'l1_ratio' : numpy.linspace(0.01, 1, 50),
                          'normalize': [True, False],
                          'selection' : ['cyclic', 'random'],
                          'precompute' : [True, False],
                          'positive' : [True,False]}],
          ['omp', {'normalize': [True, False],
                   'precompute' : [True, False]}],
          ['bayesian_ridge', {'alpha_1' : numpy.linspace(0.01, 100,100),
                              'alpha_2' : numpy.linspace(0.01, 100,100),
                              'lambda_1' : numpy.linspace(0.001, 1, 100),
                              'lambda_2' : numpy.linspace(0.001, 1, 100),
                              'lambda_init' : numpy.linspace(0.1, 1, 10),
                              'normalize' : [True, False]}],
          ['ard',  {'alpha_1' : numpy.logspace(0.01, 100,100),
                    'alpha_2' : numpy.logspace(0.01, 100,100),
                    'lambda_1' : numpy.linspace(0.001, 1, 100),
                    'lambda_2' : numpy.linspace(0.001, 1, 100),
                    'threshold_lambda' : numpy.linspace(0.1, 1, 10),
                    'normalize' : [True, False]}],
          ['tweedie', {'power' : [0,1,2,3],
                       'alpha' : numpy.logspace(0.01,100,100),
                       'link' : ['auto', 'identity', 'log']}],
          ['SGD', {'loss' : ['squared_loss', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],
                   'penalty' : ['l2', 'l1', 'elasticnet'],
                   'alpha' : numpy.linspace(0.01,100,100),
                   'l1_ratio' : numpy.linspace(0.001, 1, 100),
                   'shuffle' : [True,False],
                   'epsilon' : numpy.linspace(0.01, 1,30),
                   'learning_rate' : ['constant', 'invscaling', 'optimal', 'adaptive'],
                   'power_t' : numpy.linspace(0.1, 2, 50),
                   'validation_fraction' : numpy.linspace(0.1, 0.9,10),
                   'average' : [True,False]}],
#           ['poisson', {'alpha' : numpy.linspace(0.01,100,100)}],
#           ['gamma', {'alpha' : numpy.linspace(0.01,100,100)}],
          ['dtr', {'criterion' : ['mse', 'friedman_mse', 'mae', 'poisson'],
                   'splitter' : ['best', 'random'],
                   'min_samples_split' : range(2, 30),
                   'min_samples_leaf' : range(1, 30),
                   'min_weight_fraction_leaf': numpy.linspace(0,0.5, 10),
                   'max_features' : ['auto', 'sqrt', 'log2'],
                   'max_leaf_nodes' : range(1,30)}],
          ['rfr', {'n_estimators' : range(2, 100,5),
                   'criterion' : ['mse', 'mae'],
                   'max_depth' : [None, 10,20,30,40,50,60,70,80,90,100],
                   'min_samples_split' : range(2, 30),
                   'min_samples_leaf' : range(1, 30),
                   'min_weight_fraction_leaf': numpy.linspace(0,0.5, 10),
                   'max_features' : ['auto', 'sqrt', 'log2'],
                   'max_leaf_nodes' : range(1,30),
                   'bootstrap' : [True, False],
                   'oob_score' : [True,False],
                   'max_samples' : numpy.linspace(0.1,1, 10),
                   'ccp_alpha' : numpy.linspace(0.01, 100, 100)}],
          ['knn' , {'n_neighbors' : range(2, 30),
                    'weights' : ['uniform', 'distance'],
                    'algorithm' : ['auto', 'ball_tree', 'kd_tree', 'brute'],
                    'p' : [1,2],
                    'metric' : ['euclidean','manhattan','chebyshev','minkowski','seuclidean','mahalanobis']}],
#           ['svr',{}],
          ['svr', {'kernel' : ['linear', 'poly', 'rbf', 'sigmoid'],
                   'degree' : [1,2,3,4,5,6,7,8,9],
                   'gamma' : ['scale', 'auto'],
                   'C' : numpy.linspace(1, 100, 100),
                   'epsilon' : numpy.linspace(0.01, 1, 20),
                   'shrinking' : [True,False]}],
          ['gbr', {'loss' : ['ls', 'lad','huber', 'quantile'],
                   'learning_rate': numpy.linspace(0.01, 1, 50),
                   'n_estimators': range(1,30, 2),
                   'subsample' : numpy.linspace(0.1, 1.0, 20),
                   'criterion' : ['friedman_mse', 'mse', 'mae'],
                   'max_depth' : [None, 10,20,30,40,50,60,70,80,90,100],
                   'min_samples_split' : range(2, 30),
                   'min_samples_leaf' : range(1, 30),
                   'min_weight_fraction_leaf': numpy.linspace(0,0.5, 10),
                   'ccp_alpha' : numpy.linspace(0.1, 10, 100)}],
          ['pls', {'n_components' : range(1,20),
                   'scale' : [True, False]}],
          ['extra_tree', {'n_estimators': range(1,200,2),
                          'criterion':['mse','mae'],
                          'max_depth':[None,10,20,30,40,50,60,70,80],
                          'min_samples_split' : range(2, 30),
                          'min_samples_leaf' : range(1, 30),
                          'min_weight_fraction_leaf': numpy.linspace(0,0.5, 10),
                          'max_features':["auto", "sqrt", "log2"],
                          'max_leaf_nodes' : range(1,30),}],
          # Empty search space: XGBoost is evaluated with its default settings.
          ['xgb', {}], 
#           ['xgb', { 'n_estimators': range(1,200,2),
#                     'learning_rate': numpy.linspace(0.01, 1, 50),
#                     'booster': ['gbtree', 'gblinear', 'dart'],
#                     'max_depth':[None,10,20,30,40,50,60,70,80],
#                     'reg_alpha':numpy.linspace(0.1,10,100),
#                     'reg_lambda':numpy.linspace(0.1,10,100),
#                     'importance_type':['gain', 'weight', 'cover', 'total_gain','total_cover']}],
          ['lgbm',{'boosting_type':['gbdt','dart','goss','rf'],
                   'num_leaves': range(1,100),
                   'learning_rate': numpy.linspace(0.1,100,100),
                   'max_depth':range(-1,80),
                   'reg_alpha':numpy.linspace(0.1,10,100),
                   'reg_lambda':numpy.linspace(0.1,10,100)}],
          # Pipeline step parameters are addressed as <step name>__<parameter>.
          ['polyreg',{'polynomialfeatures__degree':[2,3]}]
]

In [None]:
def _regression_metrics(y_true, y_hat):
    """Return the four evaluation metrics for one set of predictions."""
    return {
        'mean_sq_err': mean_squared_error(y_true, y_hat),
        'max_err': max_error(y_true, y_hat),
        'r2': r2_score(y_true, y_hat),
        'percent_err': mean_absolute_percentage_error(y_true, y_hat),
    }


# models and params are parallel lists; fail loudly instead of silently
# pairing an estimator with the wrong search space.
if len(models) != len(params):
    raise ValueError('models and params must have the same number of entries')

model_names = [name for name, _ in params]
train_rows = []
test_rows = []

for estimator, (name, search_space) in zip(models, params):
    print(name)
    # random_state makes the sampled hyperparameter candidates reproducible.
    clf = RandomizedSearchCV(estimator,
                             param_distributions=search_space,
                             cv=5,
                             scoring='r2',
                             n_jobs=5, verbose=100,
                             random_state=42)

    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    y_train_pred = clf.predict(x_train)

    # 'mean_sq_err' is a real MSE on the training split; the cross-validated
    # R^2 of the best candidate (clf.best_score_, since scoring='r2') is kept
    # under its own 'cv_r2' column instead of being stored as an MSE.
    train_metrics = _regression_metrics(y_train, y_train_pred)
    train_metrics['cv_r2'] = clf.best_score_
    train_rows.append(train_metrics)

    test_rows.append(_regression_metrics(y_test, y_pred))

    print()
    print('best_estimator :', clf.best_estimator_)
    print("---------------------------------------------------------------------------")

# One row per model, labelled with the name from params.
train_score = pandas.DataFrame(train_rows, index=model_names)
test_score = pandas.DataFrame(test_rows, index=model_names)
display(train_score)
display(test_score)

In [None]:
train_df.columns

In [None]:
# Optional: remove models from the comparison before plotting.
# index = ['lgbm']
# for idx in index:
#     try:
#         train_score.drop(index=[idx], inplace = True)
#         test_score.drop(index = [idx],inplace = True)
#     except:
#         print('except',idx)

# Side-by-side bar charts of the 'mean_sq_err' column: train scores on the
# left, test scores on the right.
pyplot.figure(figsize=(14, 5))
for slot, scores in ((121, train_score), (122, test_score)):
    pyplot.subplot(slot)
    pyplot.xticks(rotation=75)
    seaborn.barplot(x=scores.index, y=scores['mean_sq_err'])
pyplot.show()

In [None]:
# Refit the chosen ARD regressor on all labelled rows, predict the test set and
# map the predictions back from the standardised target to the Amount scale.
best_model = ARDRegression(
    alpha_1=4.570881896148864e+65,
    alpha_2=4.265795188015882e+62,
    lambda_1=0.20281818181818184,
    lambda_2=0.5257272727272727,
    normalize=True,
    threshold_lambda=0.2,
)
best_model.fit(train_df, train_y.values.ravel())
scaled_pred = best_model.predict(valid_df)
y_pred = y_scaler.inverse_transform(scaled_pred)

In [None]:
# Attach the predicted claim amounts to the submission frame.
submit['Amount'] = y_pred
submit

In [None]:
# Rows whose Condition is 0 get a claim amount of zero, overriding the regression output.
submit.loc[submit['Condition']==0, 'Amount'] = 0.0
submit

In [None]:
# index=False keeps the DataFrame's row index out of the file; otherwise it is
# written as an extra unnamed first column next to Image_path/Condition/Amount.
submit.to_csv('submission.csv', index=False)

## Neural network

In [None]:
# Randomized-search cross-validation over the neural network's hyperparameters

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasRegressor
from keras.metrics import MeanSquaredError
import numpy
def report(results, n_top=3):
    """Print the best-ranked candidates of a hyperparameter search.

    Args:
        results: mapping with the keys ``rank_test_score``, ``mean_test_score``,
            ``std_test_score`` and ``params`` (the layout of a scikit-learn
            search's ``cv_results_``).
        n_top: number of ranks to print, starting at rank 1. Candidates tied
            on a rank are all printed.
    """
    for rank in range(1, n_top + 1):
        # Positions of every candidate that holds this rank.
        for pos in numpy.flatnonzero(results['rank_test_score'] == rank):
            mean_score = results['mean_test_score'][pos]
            std_score = results['std_test_score'][pos]
            print(f"Model with rank: {rank}")
            print(f"Mean validation score: {mean_score:.3f} (std: {std_score:.3f})")
            print(f"Parameters: {results['params'][pos]}")
            print("")

def nn_model(activation='relu', neurons=17, optimizer='Adam', dropout=0.1,
             init_mode='uniform', input_dim=17):
    """Build and compile a three-hidden-layer regression network.

    The hidden layers shrink from ``neurons`` to two thirds and then four
    ninths of that width, followed by dropout and a single linear output.

    Args:
        activation: activation used by all hidden layers.
        neurons: width of the first hidden layer.
        optimizer: optimizer passed to ``model.compile``.
        dropout: dropout rate applied before the output layer.
        init_mode: kernel initializer used by every Dense layer.
        input_dim: number of input features (defaults to 17, the value that
            was previously hard-coded).

    Returns:
        The compiled Keras ``Sequential`` model (mean-squared-error loss).
    """
    # Integer division yields 0 for neurons < 3, which would create a
    # zero-width layer; clamp the narrower layers to at least one unit.
    second_width = max(1, (neurons * 2) // 3)
    third_width = max(1, (neurons * 4) // 9)

    model = Sequential()
    model.add(Dense(neurons, input_dim=input_dim, kernel_initializer=init_mode, activation=activation))
    model.add(Dense(second_width, kernel_initializer=init_mode, activation=activation))
    model.add(Dense(third_width, kernel_initializer=init_mode, activation=activation))
    model.add(Dropout(dropout))
    model.add(Dense(1, kernel_initializer=init_mode, activation='linear'))
    model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=[MeanSquaredError()])
    return model

# Defining grid parameters
# Candidate values for each hyperparameter of nn_model (plus the batch size).
activation = ['softmax', 'softplus', 'softsign', 'relu', 'selu', 'elu', 'tanh','sigmoid', 'linear']
neurons = range(1, 1000)
dropout = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5]
init_mode = ['uniform', 'lecun_uniform', 'normal', 'zero', 'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform']
optimizer = ['SGD', 'Adam', 'Adamax','RMSprop','Adagrad','Adadelta','Nadam','Ftrl']
batch_size = range(10, 101, 10)
param_grid = {
    'activation': activation,
    'neurons': neurons,
    'optimizer': optimizer,
    'dropout': dropout,
    'init_mode': init_mode,
    'batch_size': batch_size,
}

# Wrap the Keras builder so scikit-learn can drive it.
kr = KerasRegressor(build_fn=nn_model, epochs=5, batch_size=40, verbose=20)

# Try 10 random configurations with 5-fold cross-validation.
model = RandomizedSearchCV(
    estimator=kr,
    param_distributions=param_grid,
    n_iter=10,
    cv=5,
    n_jobs=5,
    verbose=50,
)
model.fit(x_train, y_train)

report(model.cv_results_)

In [None]:
# Train the final neural network (defaults presumably taken from the randomized search above) and predict on the test set

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasRegressor
from keras.metrics import MeanSquaredError

def report(results, n_top=3):
    """Print a summary of the top candidates from a hyperparameter search.

    Args:
        results: mapping with the keys ``rank_test_score``, ``mean_test_score``,
            ``std_test_score`` and ``params`` (the layout of a scikit-learn
            search's ``cv_results_``).
        n_top: how many ranks to print, starting from rank 1; ties on a rank
            are all listed.
    """
    for rank in range(1, n_top + 1):
        matches = numpy.flatnonzero(results['rank_test_score'] == rank)
        for idx in matches:
            summary = [
                "Model with rank: {0}".format(rank),
                "Mean validation score: {0:.3f} (std: {1:.3f})".format(
                    results['mean_test_score'][idx],
                    results['std_test_score'][idx],
                ),
                "Parameters: {0}".format(results['params'][idx]),
                "",  # blank separator line after each candidate
            ]
            print("\n".join(summary))

def nn_model(activation='linear', neurons=793, optimizer='Nadam', dropout=0.3,
             init_mode='lecun_uniform', input_dim=17):
    """Build and compile the final three-hidden-layer regression network.

    The hidden layers shrink from ``neurons`` to two thirds and then four
    ninths of that width, followed by dropout and a single linear output.

    Args:
        activation: activation used by all hidden layers.
        neurons: width of the first hidden layer.
        optimizer: optimizer passed to ``model.compile``.
        dropout: dropout rate applied before the output layer.
        init_mode: kernel initializer used by every Dense layer.
        input_dim: number of input features (defaults to 17, the value that
            was previously hard-coded).

    Returns:
        The compiled Keras ``Sequential`` model (mean-squared-error loss).
    """
    # Integer division yields 0 for neurons < 3, which would create a
    # zero-width layer; clamp the narrower layers to at least one unit.
    second_width = max(1, (neurons * 2) // 3)
    third_width = max(1, (neurons * 4) // 9)

    model = Sequential()
    model.add(Dense(neurons, input_dim=input_dim, kernel_initializer=init_mode, activation=activation))
    model.add(Dense(second_width, kernel_initializer=init_mode, activation=activation))
    model.add(Dense(third_width, kernel_initializer=init_mode, activation=activation))
    model.add(Dropout(dropout))
    model.add(Dense(1, kernel_initializer=init_mode, activation='linear'))
    model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=[MeanSquaredError()])
    return model

kr = KerasRegressor(build_fn=nn_model, epochs=200, batch_size=50, verbose=1)

kr.fit(train_df, train_y)
# The network is trained on the standardised target, so its raw output is in
# scaled units. Map it back to the original Amount scale with the same scaler
# that transformed the target, otherwise scaled values end up in the submission.
scaled_pred = numpy.asarray(kr.predict(valid_df)).reshape(-1, 1)
y_pred = y_scaler.inverse_transform(scaled_pred).ravel()
y_pred

In [None]:
# Overwrite the Amount column with the neural network's predictions.
submit['Amount'] = y_pred
# submit['Amount'] = submit['Amount'].abs()
# Rows whose Condition is 0 get a claim amount of zero, overriding the model output.
submit.loc[submit['Condition']==0, 'Amount'] = 0.0
submit

In [None]:
# index=False keeps the DataFrame's row index out of the file; otherwise it is
# written as an extra unnamed first column next to Image_path/Condition/Amount.
submit.to_csv('submission2.csv', index=False)