In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import boxcox, skew
from IPython.display import display
%matplotlib inline

In [None]:
## Read the train and test sets from CSV files in the working directory
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [None]:
## Set aside the test `id` column and the train `loss` column before dropping them
ID = test['id']
loss = train['loss']

## Keep only the remaining columns in both frames
train = train.drop(labels=['id', 'loss'], axis=1)
test = test.drop(labels=['id'], axis=1)

## Number of rows in each set
ntrain, ntest = len(train), len(test)

In [None]:
## Split the column names by dtype: object columns vs. all other columns
cat_cols = train.select_dtypes(include=['object']).columns.tolist()
cont_cols = train.select_dtypes(exclude=['object']).columns.tolist()

In [None]:
## Histograms of the non-object train columns (20 bins, shared axes).
## The trailing semi-colon keeps IPython from echoing the <matplotlib ....> description.
train.hist(column=cont_cols, figsize=(15, 15), sharex=True, sharey=True, bins=20);

In [None]:
## Skewness of each continuous column in the train dataset (missing values ignored)
cont_train = train.loc[:, cont_cols]
skewed_feats = cont_train.apply(lambda col: skew(col.dropna()), axis=0)
print(skewed_feats)

In [None]:
## Keep only the features whose skew is above 0.38
skewed_feats = skewed_feats.loc[skewed_feats > 0.38]

In [None]:
## Apply a Box-Cox transformation to the selected skewed train columns.
## Every column except 'cont9' is shifted by 1.0 before the transform;
## 'cont9' is passed to boxcox unchanged. The fitted lambda is printed per column.
for column in skewed_feats.index:
    shifted = train[column] if column == 'cont9' else train[column] + 1.0
    transformed, lam = boxcox(shifted)
    train[column] = transformed
    print(column + " : " + str(lam))

In [None]:
## Histograms of the continuous train columns after the Box-Cox step (10 bins, shared axes)
train.hist(column=cont_cols, figsize=(10, 10), sharex=True, sharey=True, bins=10);

In [None]:
## Overlay the kernel density estimate of every continuous train column
## on a single figure, after the skew-reduction step
sns.set_style('ticks')
with sns.color_palette('Reds_r'):
    ax = plt.figure(figsize=(8, 8)).gca()
    for column in cont_cols:
        sns.kdeplot(train[column], shade=True, ax=ax)
        ax.legend(loc=2)

In [None]:
## Skewness of each continuous column in the test dataset (missing values ignored)
cont_test = test.loc[:, cont_cols]
skewed_feats = cont_test.apply(lambda col: skew(col.dropna()), axis=0)
print(skewed_feats)

In [None]:
## Keep only the test features whose skew is above 0.36
## NOTE(review): the train set is filtered with 0.38, so the two sets may end up
## with different columns transformed -- confirm this is intended
skewed_feats = skewed_feats.loc[skewed_feats > 0.36]

In [None]:
## Apply a Box-Cox transformation to the selected skewed test columns.
## Every column except 'cont9' is shifted by 1.0 first; the fitted lambda is printed.
## NOTE(review): lambda is estimated here from the test column itself, independently
## of the lambda estimated for the same train column -- confirm this is intended
for column in skewed_feats.index:
    shifted = test[column] if column == 'cont9' else test[column] + 1.0
    transformed, lam = boxcox(shifted)
    test[column] = transformed
    print(column + " : " + str(lam))

In [None]:
## Overlay the kernel density estimate of every continuous test column
## on a single figure, after the skew-reduction step
sns.set_style('ticks')
with sns.color_palette('Reds_r'):
    ax = plt.figure(figsize=(8, 8)).gca()
    for column in cont_cols:
        sns.kdeplot(test[column], shade=True, ax=ax)
        ax.legend(loc=2)

In [None]:
## Detect the outliers in the train dataset.
## A value is an outlier for a feature when it is not inside
## [Q1 - 1.5*IQR, Q3 + 1.5*IQR] computed on that feature.

from collections import defaultdict
outliers_count = defaultdict(int)  ## row label -> number of features for which the row is an outlier
df = train[cont_cols]

for feature in cont_cols:
    values = df[feature]
    Q1, Q3 = np.percentile(values, [25.0, 75.0])
    step = 1.5 * (Q3 - Q1)

    ## Rows whose value for this feature is not inside the accepted interval
    inside = (values >= Q1 - step) & (values <= Q3 + step)
    outliers_df = df.loc[~inside]

    ## One more outlying feature for each of those rows
    for index in outliers_df.index.values:
        outliers_count[index] += 1

## Placeholder list; it is filled in by a later cell
max_outliers_count = []

## Distinct per-row outlier counts that are greater than one
max_features_count = {hits for hits in outliers_count.values() if hits > 1}
print(max_features_count)

In [None]:
## The previous cell prints which per-datapoint outlier counts (> 1) occur.
## For every count from 2 up to the largest one observed, report how many
## datapoints are outliers for exactly that many features.
## The upper bound is taken from the data: a hard-coded range(2,6) stops at 5
## and silently skips datapoints that are outliers for 6 (or more) features.

most_features_hit = max(outliers_count.values(), default=1)
for i in range(2, most_features_hit + 1):
    ## (row label, count) pairs for rows that are outliers for exactly i features
    max_outliers_count = [(key, hits) for key, hits in outliers_count.items() if hits == i]
    print("Datapoints which are outliers for " + str(i) + " features : " , len(max_outliers_count))

In [None]:
## Collect the train rows that are outliers for more than one feature;
## they are removed from the train set in the next cell
outliers_index = [key for key, hits in outliers_count.items() if hits > 1]

print("Total count : ", len(outliers_index))

In [None]:
## Drop the collected outlier rows from the train set and renumber the remaining rows.
## NOTE(review): outliers_index holds index labels gathered from train, yet they are
## used here as positions into train.index; the two agree only while train has a
## default 0..n-1 index -- confirm. Re-running this cell drops further rows.
rows_to_drop = train.index[outliers_index]
train = train.drop(rows_to_drop, axis=0).reset_index(drop=True)

print("New train size : ", train.shape)

In [None]:
## Refresh the row counts now that outlier rows were removed from train
ntrain, ntest = train.shape[0], test.shape[0]

In [None]:
## Fit a StandardScaler on the continuous train features only;
## the same fitted scaler is applied to train and test in the following cells
from sklearn.preprocessing import StandardScaler
new_cont_feats = train.loc[:, cont_cols]
scaler = StandardScaler()
scaler.fit(new_cont_feats)

In [None]:
## Replace the continuous train columns with their scaled values
scaled_train = scaler.transform(new_cont_feats)
train[cont_cols] = scaled_train

## Overlay the density of every continuous train column after scaling
sns.set_style('ticks')
with sns.color_palette('Reds_r'):
    ax = plt.figure(figsize=(8, 8)).gca()
    for column in cont_cols:
        sns.kdeplot(train[column], shade=True, ax=ax)
        ax.legend(loc=2)

In [None]:
## Scale the continuous test columns with the scaler that was fitted on train
scaled_test = scaler.transform(test[cont_cols])
test[cont_cols] = scaled_test

## Overlay the density of every continuous test column after scaling
sns.set_style('ticks')
with sns.color_palette('Reds_r'):
    ax = plt.figure(figsize=(8, 8)).gca()
    for column in cont_cols:
        sns.kdeplot(test[column], shade=True, ax=ax)
        ax.legend(loc=2)

In [None]:
## Continuous train features, used for the correlation heatmap below
cont_df = train.loc[:, cont_cols]

In [None]:
## Pairwise correlation of the continuous features in the train set
corr = cont_df.corr()

## Remove the 'cont1' row and the 'cont14' column from the matrix
## (presumably the first row and last column, so that self-correlations
## fall outside the plotted triangle -- verify against the column order)
corr = corr.drop(['cont1'], axis=0).drop(['cont14'], axis=1)

## Mask the cells strictly above the diagonal so each value is shown once
mask = np.triu(np.ones(corr.shape), k=1)

## Annotated heatmap of the unmasked cells
with sns.axes_style('white'):
    plt.figure(figsize=(12, 12))
    sns.heatmap(corr, mask=mask, annot=True, cmap='RdBu', fmt='+.2f', cbar=True)

In [None]:
## Continuous test features, used for the correlation heatmap below
cont_df = test.loc[:, cont_cols]

In [None]:
## Pairwise correlation of the continuous features in the test set
corr = cont_df.corr()

## Remove the 'cont1' row and the 'cont14' column from the matrix
## (presumably the first row and last column, so that self-correlations
## fall outside the plotted triangle -- verify against the column order)
corr = corr.drop(['cont1'], axis=0).drop(['cont14'], axis=1)

## Mask the cells strictly above the diagonal so each value is shown once
mask = np.triu(np.ones(corr.shape), k=1)

## Annotated heatmap of the unmasked cells
with sns.axes_style('white'):
    plt.figure(figsize=(12, 12))
    sns.heatmap(corr, mask=mask, annot=True, cmap='RdBu', fmt='+.2f', cbar=True)

In [None]:
## Stack train on top of test so every categorical column is encoded
## with one shared mapping
train_test = pd.concat([train, test], ignore_index=True)

## Encode the categorical data as integer codes (categories taken in sorted order)
for column in cat_cols:
    codes, uniques = pd.factorize(train_test[column], sort=True)
    train_test[column] = codes

## Separate the stacked frame back into its train and test parts:
## the first ntrain rows are train, the rest are test
x_train = train_test.iloc[:ntrain]
x_test = train_test.iloc[ntrain:]

## Target values, with the same outlier rows removed as were removed from train
y_train = loss.drop(loss.index[outliers_index]).reset_index(drop=True)

In [None]:
## Show the shapes of the train features, test features and train target
for part in (x_train, x_test, y_train):
    print(part.shape)

With the preprocessing above complete, let us start experimenting with our dataset by applying PCA to the continuous features to see whether it gives us an improved score.

In [None]:
from sklearn.decomposition import PCA

In [None]:
## Fit a 14-component PCA to inspect how much variance each component explains.
## Fit on the train features explicitly: `cont_df` was last rebound to the test
## features for the test-set correlation plot, while the 6-component PCA further
## down is fitted on the train set, so the variance analysis must use train too.
pca = PCA(n_components=14)
pca.fit(train[cont_cols])

In [None]:
## Variance explained by each principal component, two decimals per line
exp_variance = pca.explained_variance_
two_decimals = "{0:.2f}".format
for element in exp_variance:
    print(two_decimals(element))

In [None]:
## Fraction of the total variance explained by each principal component, two decimals per line
exp_variance_ratio = pca.explained_variance_ratio_
two_decimals = "{0:.2f}".format
for element in exp_variance_ratio:
    print(two_decimals(element))

In [None]:
## Continuous train features, used to fit the reduced PCA below
cont_df = train.loc[:, cont_cols]

In [None]:
## New PCA that keeps 6 components; it replaces the 14-component one bound to `pca`
pca = PCA(n_components=6)

In [None]:
## Fit the 6-component PCA on cont_df (the continuous train features at this point)
pca.fit(cont_df)

In [None]:
## Project the continuous train features onto the fitted components;
## cont_df is rebound to the transformed result
cont_df = pca.transform(cont_df)

In [None]:
## Shape of the PCA-transformed train features
cont_df.shape

In [None]:
## Wrap the transformed train features in a DataFrame with the default index
cont_df = pd.DataFrame(cont_df)

In [None]:
## First rows of the PCA-transformed train features
cont_df.head()

In [None]:
## Categorical columns of the train set
## NOTE(review): these come from `train`, not from the factorized `x_train`,
## so they hold the original category values -- confirm this is intended
cat_df = train.loc[:, cat_cols]

In [None]:
## Left-join the PCA components onto the categorical train columns by row index
new_train = cat_df.join(cont_df)

In [None]:
## Shape of the combined train frame (categorical columns + PCA components)
new_train.shape

In [None]:
## Project the continuous test features with the PCA that was fitted on train
cont_df = pca.transform(test.loc[:, cont_cols])

In [None]:
## Wrap the transformed test features in a DataFrame with the default index
cont_df = pd.DataFrame(cont_df)

## Categorical columns of the test set
## NOTE(review): taken from `test`, not from the factorized `x_test`,
## so they hold the original category values -- confirm this is intended
cat_df = test.loc[:, cat_cols]

## Left-join the PCA components onto the categorical test columns by row index
new_test = cat_df.join(cont_df)

In [None]:
## Shape of the combined test frame (categorical columns + PCA components)
new_test.shape