 # Please! Don't Overfit

In [None]:
import numpy as np 
import pandas as pd

In [None]:
# Load the training set from the working directory into a DataFrame.
data_df = pd.read_csv("train.csv")

In [None]:
# Preview the first rows of the frame (head() with its default row count).
data_df.head()

In [None]:
# (rows, columns) of the training frame.
data_df.shape

**There is a good possibility of overfitting due to 300 features but only 250 records to train.**

In [None]:
# Separating the features and label:
#   y <- column at position 1, X <- every column from position 2 onwards.
# Column 0 is left out of both (presumably a row id -- TODO confirm).
y = data_df.iloc[:, 1].values
X = data_df.iloc[:, 2:].values

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

**Exploratory Data Analysis**  

In [None]:
# Bar chart of how many training rows carry each label, with every bar
# annotated by its share of the data set in percent.

# Apply the font scale *before* drawing so all text on the figure picks it up.
sns.set(font_scale=1.5)

fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(x=y, palette="Set2", ax=ax)

# Leave head-room above the bars for the percentage annotations.
ax.set_ylim(top=300)

n_samples = len(y)
for bar in ax.patches:
    ax.annotate('{:.2f}%'.format(100 * bar.get_height() / n_samples),
                (bar.get_x() + 0.3, bar.get_height()))

ax.set_title('Distribution of labels/outputs')
ax.set_xlabel('Output Label')
# The axis shows raw counts; the percentages live in the bar annotations.
ax.set_ylabel('Count')
plt.show()

**Visualization to see the correlation between the different features.**

In [None]:
sns.set(style="white")

# Pairwise correlation of every column in the frame (not only the features).
corr = data_df.corr()

# Diverging colour map centred on zero correlation.
# NOTE(review): seaborn documents the anchor hues as values in [0, 359];
# 1000 is outside that range -- confirm the intended palette.
cmap = sns.diverging_palette(0, 1000, as_cmap=True)

plt.figure(figsize=(12, 10))
sns.heatmap(
    corr,
    cmap=cmap,
    vmax=.3,
    center=0,
    cbar_kws={"shrink": .5},
)

plt.show()

In [None]:
# Check for null data points.
# The null mask is reduced twice (per column, then across columns), so the
# result is a single boolean: True if any value in the frame is missing.
data_df.isnull().any().any()


**Standardizing the data** 

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize each feature column using statistics learned from the full
# training matrix X.
# NOTE(review): the scaler is fitted before any cross-validation split, so the
# validation folds contribute to the scaling statistics -- confirm acceptable.
stc = StandardScaler()
stc.fit(X)
X_scale = stc.transform(X)

**Splitting data into Training and Validation set**

In [None]:
# Stratified 5-fold splitter used to build train/validation folds from (X, y).
# It is read as a module-level global by out_cross_val.
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score
skf = StratifiedKFold(n_splits=5)

# Modeling

In [None]:
# Cross-validated ROC AUC for an estimator; returns (mean, std) over the folds.
def out_cross_val(model,X,y):
    """Cross-validate ``model`` with the module-level ``skf`` splitter.

    For every fold the model is re-fitted on the training part and ROC AUC is
    computed on the held-out part. The AUC is computed from continuous scores
    (``decision_function``, else the second ``predict_proba`` column, else
    ``predict``), because ranking hard class labels discards the ordering
    information that ROC AUC measures.

    Parameters
    ----------
    model : estimator with ``fit`` and at least one of ``decision_function``,
        ``predict_proba`` or ``predict``. It is fitted **in place**, so after
        the call it holds the fit from the last fold.
    X : array indexable by integer index arrays (rows are samples).
    y : array of labels aligned with ``X``; assumed binary -- the
        ``predict_proba`` branch takes column 1 as the positive-class score.

    Returns
    -------
    (mean, std) : tuple of floats
        Mean and standard deviation of the per-fold ROC AUC scores. The same
        two numbers are also printed.
    """
    fold_scores = []
    for train, test in skf.split(X, y):
        X_train, X_test = X[train], X[test]
        y_train, y_test = y[train], y[test]
        model.fit(X_train, y_train)

        # Prefer a continuous score over hard labels for ROC AUC.
        if hasattr(model, 'decision_function'):
            y_score = model.decision_function(X_test)
        elif hasattr(model, 'predict_proba'):
            y_score = model.predict_proba(X_test)[:, 1]
        else:
            # e.g. regressors such as Lasso, whose predict() is already continuous
            y_score = model.predict(X_test)

        fold_scores.append(roc_auc_score(y_test, y_score))

    mean_score, std_score = np.mean(fold_scores), np.std(fold_scores)
    print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(mean_score, std_score))

    return mean_score, std_score

***LOGISTIC REGRESSION***

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression (liblinear solver) scored with the shared CV helper.
# out_cross_val fits model_LR in place, so afterwards it holds the fit from
# the last fold.
model_LR = LogisticRegression(solver='liblinear')
log_score = out_cross_val(model_LR, X_scale, y)

In [None]:
# Confusion matrix of model_LR's predictions on the full scaled data set.
from sklearn.metrics import confusion_matrix

train_pred_LR = model_LR.predict(X_scale)
cm_LR = confusion_matrix(y, train_pred_LR)
cm_LR

***SUPPORT VECTOR CLASSIFIER***

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# Grid-search C and kernel for an SVC, ranking candidates by ROC AUC with
# 10-fold cross-validation.
classifier_svc = SVC(random_state=0, gamma='scale', probability=True)
param_svc = [{'C': [1, 5, 10], 'kernel': ['rbf', 'linear']}]
gs_svc = GridSearchCV(estimator=classifier_svc,
                      param_grid=param_svc,
                      scoring='roc_auc',
                      cv=10,
                      n_jobs=-1)
gs_svc = gs_svc.fit(X_scale, y)
print('the best score is:{}'.format(gs_svc.best_score_))
print('the best parameters are:{}'.format(gs_svc.best_params_))

# Report mean test score (+/- two standard deviations) for every candidate.
means = gs_svc.cv_results_['mean_test_score']
stds = gs_svc.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, gs_svc.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))

# Rebuild the SVC with the winning parameters, score it with the shared CV
# helper, then refit on the whole scaled training set for later use.
classifier_svc = SVC(random_state=0, gamma='scale', probability=True, **gs_svc.best_params_)
svc_score = out_cross_val(classifier_svc, X_scale, y)
classifier_svc = classifier_svc.fit(X_scale, y)

In [None]:
# Confusion matrix of the tuned SVC on the training data.
from sklearn.metrics import confusion_matrix

# classifier_svc was fitted on X_scale, so it must be given the scaled
# features here as well (not the raw X).
cm_svc = confusion_matrix(y, classifier_svc.predict(X_scale))
cm_svc

In the above confusion matrix, there are ***0 cases of incorrect predictions*** on the training data. The model is ***overfitted***.

***Gradient Boosting Classifier*** 

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Baseline: gradient boosting with library-default hyper-parameters and a
# fixed seed, scored with the shared CV helper.
gbm0 = GradientBoostingClassifier(random_state=10)
GBM_score = out_cross_val(gbm0, X_scale, y)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

# Grid-search the number of trees and the tree depth for gradient boosting.
# The search is ranked by ROC AUC (the metric used everywhere else in this
# notebook) and the estimator is seeded with the same random_state as the
# final model, so the selected parameters are reproducible and comparable.
param_gbm = [{'n_estimators': range(20, 81, 10), 'max_depth': range(5, 16, 2)}]
gsearch_gbm = GridSearchCV(estimator=GradientBoostingClassifier(random_state=10),
                           param_grid=param_gbm,
                           scoring='roc_auc',
                           cv=5,
                           n_jobs=-1)
gsearch_gbm.fit(X_scale, y)
print('the best score is:{}'.format(gsearch_gbm.best_score_))
print('the best parameters are:{}'.format(gsearch_gbm.best_params_))

# Report mean test score (+/- two standard deviations) for every candidate.
means = gsearch_gbm.cv_results_['mean_test_score']
stds = gsearch_gbm.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, gsearch_gbm.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))

# Rebuild the model with the winning parameters and score it with the shared
# CV helper.
gbm0 = GradientBoostingClassifier(random_state=10, **gsearch_gbm.best_params_)
GBM_score = out_cross_val(gbm0, X_scale, y)

Tried tuning various hyperparameters, without much improvement in the ROC score.

**Random Forest Classifier:** Modeling the dataset with a Random Forest Classifier using manually chosen (non-default) parameters

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 1000 trees, large leaves (>= 25 samples) and half of the
# features considered per split. random_state is fixed so the CV score is
# reproducible between runs, like the other seeded models in this notebook.
model_RFC = RandomForestClassifier(n_estimators=1000, min_samples_leaf=25, max_features=0.5, n_jobs=-1,
                                   oob_score=True, random_state=0)
score_RFC = out_cross_val(model_RFC, X_scale, y)

## Feature Extraction

**LASSO MODEL:** When a model overfits because of many features, L1 regularization is a good option, so we apply it here and calculate the ROC score.<br> A technical detail to note here is that once the threshold has been set to 0.01 (by trial and error), the L1 model has assigned a weight of 0 to some of the features. Hence the shape of the input dataset (X_transform) is reduced from (250,300) to (250,140), i.e. out of 300 features Lasso has discarded 160 by assigning them a weight of zero. The ROC score has definitely improved, from 0.77 to 0.9.

In [None]:
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

# Score an L1-regularized linear model on all (scaled) features.
model_lasso = Lasso(alpha=0.01)
lasso_score = out_cross_val(model_lasso, X_scale, y)

# Keep only the features whose Lasso coefficient magnitude reaches the threshold.
# NOTE(review): the original comment and the notebook text say the threshold is
# 0.01, but the code uses 0.001 -- confirm which value is intended.
sfm_lasso = SelectFromModel(model_lasso, threshold=0.001)
sfm_lasso.fit(X_scale, y)

# The selector was fitted on the scaled matrix, so the reduced matrix is taken
# from X_scale too; this keeps the selected features on the same scale as the
# data every other model in this notebook is trained on.
X_transform = sfm_lasso.transform(X_scale)
print(X_transform.shape)

# Score the same Lasso model again on the reduced feature set.
lasso_featurescore = out_cross_val(model_lasso, X_transform, y)

Plotting all the cross-validation scores calculated so far!

In [None]:
# Compare the cross-validated ROC AUC of every model.
# Each *_score value is the (mean, std) pair returned by out_cross_val, so the
# frame has exactly two rows. A box plot would treat the mean and the std as
# two samples of one distribution; instead, draw the mean as a bar and the std
# as its error bar.
scores_by_model = {
    'LogisticRegression': log_score,
    'SVC': svc_score,
    'GBM': GBM_score,
    'Lasso Regularization': lasso_featurescore,
    'RandomForestClassifier': score_RFC,
}
scores_df = pd.DataFrame(
    {name: list(mean_std) for name, mean_std in scores_by_model.items()},
    index=['mean', 'std'],
)

fig, ax = plt.subplots(figsize=(12, 8))
ax.bar(
    list(scores_df.columns),
    scores_df.loc['mean'].values,
    yerr=scores_df.loc['std'].values,
    capsize=6,
)
ax.set_ylabel('CV ROC AUC (mean, error bar = std)')
ax.set_title('Cross-validated ROC AUC by model')
plt.xticks(rotation=45);

**Conclusion:** As expected, the Lasso tool lived up to its name by internally assigning zero weights to the features that lead to overfitting of the data. The other feature selection techniques do not seem to have a significant effect on the ROC score in this particular scenario. Even Lasso-based univariate selection with 150 features also seems to give a decent result compared to the other models.