In [1]:
# Author : Uchenna Emechebe

# This is a Kaggle Competition from Mercedes benz that ran from May 30th 2017 to July 3rd 2017
# I stumbled into this competition on the 20th of June 2017 as my first Kaggle competition.

# Purpose: Use various features of various mercedes benz cars to predict the time each car will spend 
# on the test bench before it passes testing. This might help reveal features that lead to faster testing times
# and thus save the company money by not spending so much money on unneccessary features that does not help
# and also , maybe, help with thier scheduling times. 

# The challenge posed by this  data set is that it has lots of features (377 features); a mixture of
# categorical and continous data. 

# Pre processing the train and test data set together

import pandas as pd

# Load train data
Training = pd.read_csv('train.csv')

# Load test data
Test = pd.read_csv('test.csv')

In [2]:
Training.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


In [3]:
Test.head()

Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,1,az,v,n,f,d,t,a,w,0,...,0,0,0,1,0,0,0,0,0,0
1,2,t,b,ai,a,d,b,g,y,0,...,0,0,1,0,0,0,0,0,0,0
2,3,az,v,as,f,d,a,j,j,0,...,0,0,0,1,0,0,0,0,0,0
3,4,az,l,n,f,d,z,l,n,0,...,0,0,0,1,0,0,0,0,0,0
4,5,w,s,as,c,d,y,i,m,0,...,1,0,0,0,0,0,0,0,0,0


In [4]:
# Get the common columns
# Training names
Names_Training = Training.columns.values 
# Test names
Names_Test = Test.columns.values 

In [5]:
# Now compare the two and get the common features present in the two data set
Common = list(set(Names_Training).intersection(set(Names_Test)))

In [6]:
len(Common)

377

In [7]:
# 377 features that are common including the ID column.
# I dont need the ID column so remove that from the list 

Common.remove('ID')

In [8]:
len(Common)

376

In [9]:
# Now use these features to describe both train and test data set

Training_common_feature = Training[Common]
Test_common_feature = Test[Common]

In [10]:
Training_common_feature.shape

(4209, 376)

In [11]:
Test_common_feature.shape

(4209, 376)

In [12]:
# Lets use this function to get all the names of the categorical features in the dataset
Categorical_columns_Train = list(Training_common_feature.select_dtypes(include=['category','object']))

In [13]:
Categorical_columns_Train

['X8', 'X2', 'X3', 'X0', 'X1', 'X6', 'X4', 'X5']

In [14]:
# Confirm thats the same for the test data
Categorical_columns_Test = list(Test_common_feature.select_dtypes(include=['category','object']))
Categorical_columns_Test

['X8', 'X2', 'X3', 'X0', 'X1', 'X6', 'X4', 'X5']

In [15]:
# Lets use this function to get all the names of the continous features in the dataset
Continous_columns_Train = list(Training_common_feature.select_dtypes(exclude=['category','object']))
len(Continous_columns_Train)

368

In [16]:
# Test 
Continous_columns_Test = list(Test_common_feature.select_dtypes(exclude=['category','object']))
len(Continous_columns_Test)

368

In [17]:
# Everything seem the same

In [18]:
# One hot encoding for the categorical features

In [19]:
# Subset out the Categorical data in the train data set and then dummify them
# Training
CategoricalData_Train=Training_common_feature[Categorical_columns_Test]
CategoricalData_Train.head()

Unnamed: 0,X8,X2,X3,X0,X1,X6,X4,X5
0,o,at,a,k,v,j,d,u
1,o,av,e,k,t,l,d,y
2,x,n,c,az,w,j,d,x
3,e,n,f,az,t,l,d,x
4,n,n,f,az,v,d,d,h


In [20]:
# Dummify those features
Categorical_dummified_Train=pd.get_dummies(CategoricalData_Train)
Categorical_dummified_Train.head()

Unnamed: 0,X8_a,X8_b,X8_c,X8_d,X8_e,X8_f,X8_g,X8_h,X8_i,X8_j,...,X5_o,X5_p,X5_q,X5_r,X5_s,X5_u,X5_v,X5_w,X5_x,X5_y
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# Do the same for test data
CategoricalData_Test=Test_common_feature[Categorical_columns_Test]
Categorical_dummified_Test=pd.get_dummies(CategoricalData_Test)
Categorical_dummified_Test.head()

Unnamed: 0,X8_a,X8_b,X8_c,X8_d,X8_e,X8_f,X8_g,X8_h,X8_i,X8_j,...,X5_p,X5_q,X5_r,X5_s,X5_t,X5_v,X5_w,X5_x,X5_y,X5_z
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [22]:
# Already we see there is a difference between the two columns. 
# Lets use the same columns to create both train and test data set

# Get the common columns
# Training names
Categorical_Names_Training = Categorical_dummified_Train.columns.values 
# Test names
Categorical_Names_Test = Categorical_dummified_Test.columns.values 

# Now compare the two and get the common features present in the two data set
Common_Categorical = list(set(Categorical_Names_Training).intersection(set(Categorical_Names_Test)))
len(Common_Categorical)

185

In [23]:
# There are 185 dummified features that are common to both train and test data set.

# Now use these features to describe both the categorical part of train and test data set

Training_dummified_common = Categorical_dummified_Train[Common_Categorical]
Test_dummified_common = Categorical_dummified_Test[Common_Categorical]


In [24]:
Training_dummified_common.head()

Unnamed: 0,X2_as,X0_ak,X2_ap,X2_aw,X2_av,X2_au,X2_at,X2_ay,X0_ad,X0_af,...,X2_e,X2_d,X2_b,X2_a,X2_n,X2_m,X2_k,X2_j,X2_i,X2_h
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [25]:
Test_dummified_common.head()

Unnamed: 0,X2_as,X0_ak,X2_ap,X2_aw,X2_av,X2_au,X2_at,X2_ay,X0_ad,X0_af,...,X2_e,X2_d,X2_b,X2_a,X2_n,X2_m,X2_k,X2_j,X2_i,X2_h
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
# Go back to the main data and drop off the 8 categorical data that was dummified

# Train data
Training_common_feature_1=Training_common_feature.drop(Categorical_columns_Train,axis=1)

# Test data
Test_common_feature_1=Test_common_feature.drop(Categorical_columns_Train,axis=1)

In [27]:
Training_common_feature_1.shape

(4209, 368)

In [28]:
Test_common_feature_1.shape

(4209, 368)

In [29]:
# Ok everything seems consistent.
# Now add the common dummified features back to the data set

# Adding for Train data set
Training_common_feature_2 = pd.concat([Training_common_feature_1,Training_dummified_common], axis=1)

# Adding for Test data set
Test_common_feature_2 = pd.concat([Test_common_feature_1,Test_dummified_common], axis=1)

In [30]:
Training_common_feature_2.head()

Unnamed: 0,X189,X185,X184,X187,X186,X181,X180,X183,X182,X291,...,X2_e,X2_d,X2_b,X2_a,X2_n,X2_m,X2_k,X2_j,X2_i,X2_h
0,1,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [31]:
Test_common_feature_2.head()

Unnamed: 0,X189,X185,X184,X187,X186,X181,X180,X183,X182,X291,...,X2_e,X2_d,X2_b,X2_a,X2_n,X2_m,X2_k,X2_j,X2_i,X2_h
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
# I think the categorical part of the data has been dealt with accordingly

# Time to look at the continous data to make sure both train and test agree

In [37]:
ContinousDataTrain=Training_common_feature[Continous_columns_Train]

In [38]:
ContinousDataTest = Test_common_feature[Continous_columns_Test]

In [39]:
ContinousDataTrain.shape

(4209, 368)

In [40]:
ContinousDataTest.shape

(4209, 368)

In [41]:
# The continous data seem to match.

# So I think the data sets seem to be congruent with respect to features.

# So lets save the two data sets.

Training_common_feature_2.to_csv('TrainingDataSet_common_features.csv', sep=',')

In [42]:
Test_common_feature_2.to_csv('TestDataSet_common_features.csv', sep=',')

In [55]:
# Also save the target variable for the training data set

TargetColumns = ['ID','y']
Target_ID_Data = Training[TargetColumns]

In [57]:
# Tested if there is any linear relationship between target and ID
# No surprises there
TargetData.corr()

Unnamed: 0,y,ID
y,1.0,-0.055108
ID,-0.055108,1.0


In [59]:
# So lets save the y column
Target_Data = Training['y']
Target_Data.to_csv('Target_Variable.csv', sep=',')

In [None]:
# So now we can use the TrainingDataSet_common_features and Targer_Data to train a model

# Then use the model generated to make predictions on TestDataSet_common_features.csv

# I have two options now. The first option is to do a lasso regression on the data set to get the best alpha values
# and then use that alpha value to find features that are close to zero. Those features will be dropped

# Open a new notebook called Train_Common_FeatureSelectionWithLasso
