# Mercedes Benz Greener Manufacturing
The goal of this analysis is to lower the time needed on the test bench for cars manufactured

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the datasets
df = pd.read_csv("./train.csv")
df_test = pd.read_csv("./test.csv")

## Data exploration

In [3]:
# Check the first few values
df.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Check the data types and shape of the dataframe
# y is the float64 column, x0 till x8 are objects since they contain strings
# the rest are int64
df.info(), df.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4209 entries, 0 to 4208
Columns: 378 entries, ID to X385
dtypes: float64(1), int64(369), object(8)
memory usage: 12.1+ MB


(None, (4209, 378))

In [5]:
# Check for nan values
# There are no null values
df.isnull().sum().sort_values(ascending=False)

ID      0
X254    0
X263    0
X262    0
X261    0
       ..
X127    0
X126    0
X125    0
X124    0
X385    0
Length: 378, dtype: int64

In [6]:
# Check the test set data
# As expected no target values are included
# Contains the same number of columns and rows as test
df_test

Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,1,az,v,n,f,d,t,a,w,0,...,0,0,0,1,0,0,0,0,0,0
1,2,t,b,ai,a,d,b,g,y,0,...,0,0,1,0,0,0,0,0,0,0
2,3,az,v,as,f,d,a,j,j,0,...,0,0,0,1,0,0,0,0,0,0
3,4,az,l,n,f,d,z,l,n,0,...,0,0,0,1,0,0,0,0,0,0
4,5,w,s,as,c,d,y,i,m,0,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,8410,aj,h,as,f,d,aa,j,e,0,...,0,0,0,0,0,0,0,0,0,0
4205,8411,t,aa,ai,d,d,aa,j,y,0,...,0,1,0,0,0,0,0,0,0,0
4206,8413,y,v,as,f,d,aa,d,w,0,...,0,0,0,0,0,0,0,0,0,0
4207,8414,ak,v,as,a,d,aa,c,q,0,...,0,0,1,0,0,0,0,0,0,0


## Preprocessing the data

In [7]:
# Check for null values
for x, i in enumerate(df.isna().any()):
    if i:
        print(x, i)

In [8]:
# Exlude the y (target)
y_trained = df["y"]
df_x = df.drop("y", axis=1)
df_x

Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,k,v,at,a,d,u,j,o,0,...,0,0,1,0,0,0,0,0,0,0
1,6,k,t,av,e,d,y,l,o,0,...,1,0,0,0,0,0,0,0,0,0
2,7,az,w,n,c,d,x,j,x,0,...,0,0,0,0,0,0,1,0,0,0
3,9,az,t,n,f,d,x,l,e,0,...,0,0,0,0,0,0,0,0,0,0
4,13,az,v,n,f,d,h,d,n,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,8405,ak,s,as,c,d,aa,d,q,0,...,1,0,0,0,0,0,0,0,0,0
4205,8406,j,o,t,d,d,aa,h,h,0,...,0,1,0,0,0,0,0,0,0,0
4206,8412,ak,v,r,a,d,aa,g,e,0,...,0,0,1,0,0,0,0,0,0,0
4207,8415,al,r,e,f,d,aa,l,u,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Check the variance of each column
# Columns with letters are not considered
df_x.var(numeric_only=True)

ID      5.941936e+06
X10     1.313092e-02
X11     0.000000e+00
X12     6.945713e-02
X13     5.462335e-02
            ...     
X380    8.014579e-03
X382    7.546747e-03
X383    1.660732e-03
X384    4.750593e-04
X385    1.423823e-03
Length: 369, dtype: float64

### Dropping with columns with variance 0
- Find the variance of the relevent columns using iloc
- Get their index while looping through them using enumerate
- Use the columns function to get their names using the index (and fix the indexing if needed)
- Drop the columns in the generated array using the drop function

In [10]:
# Perform the above steps on both the test and train
columns_drop = df_x.columns[[x+9 for x, i in enumerate(df.iloc[:, 9:].var(numeric_only=True)) if i==0]]
df_x_var = df_x.drop(columns_drop, axis=1)
df_test_var = df_test.drop(columns_drop, axis=1)

df_x_var

Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,k,v,at,a,d,u,j,o,0,...,0,0,1,0,0,0,0,0,0,0
1,6,k,t,av,e,d,y,l,o,0,...,1,0,0,0,0,0,0,0,0,0
2,7,az,w,n,c,d,x,j,x,0,...,0,0,0,0,0,0,1,0,0,0
3,9,az,t,n,f,d,x,l,e,0,...,0,0,0,0,0,0,0,0,0,0
4,13,az,v,n,f,d,h,d,n,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,8405,ak,s,as,c,d,aa,d,q,0,...,1,0,0,0,0,0,0,0,0,0
4205,8406,j,o,t,d,d,aa,h,h,0,...,0,1,0,0,0,0,0,0,0,0
4206,8412,ak,v,r,a,d,aa,g,e,0,...,0,0,1,0,0,0,0,0,0,0
4207,8415,al,r,e,f,d,aa,l,u,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Drop the id column since it will skew the data
df_x_var = df_x_var.drop("ID", axis=1)
df_test_var = df_test_var.drop("ID", axis=1)

### Apply Label Encoding

In [12]:
from sklearn.preprocessing import LabelEncoder

In [13]:
# Initialize the encoder 
# All object columns have 0 and 1 as unique values
# Hence encode the ones that have a higher number of values
df_final = df_x_var.copy()
df_test_final = df_test_var.copy()
enc = LabelEncoder()

for col in df_final.columns:
    if len(np.unique(df_final[col])) > 2:
        df_final[col] = enc.fit_transform(df_final[col])

# Apply same steps to test data
for col in df_test_final.columns:
    if len(np.unique(df_test_final[col])) > 2:
        df_test_final[col] = enc.fit_transform(df_test_final[col])

In [14]:
df_final

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X8,X10,X12,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,32,23,17,0,3,24,9,14,0,0,...,0,0,1,0,0,0,0,0,0,0
1,32,21,19,4,3,28,11,14,0,0,...,1,0,0,0,0,0,0,0,0,0
2,20,24,34,2,3,27,9,23,0,0,...,0,0,0,0,0,0,1,0,0,0
3,20,21,34,5,3,27,11,4,0,0,...,0,0,0,0,0,0,0,0,0,0
4,20,23,34,5,3,12,3,13,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,8,20,16,2,3,0,3,16,0,0,...,1,0,0,0,0,0,0,0,0,0
4205,31,16,40,3,3,0,7,7,0,0,...,0,1,0,0,0,0,0,0,0,0
4206,8,23,38,0,3,0,6,4,0,1,...,0,0,1,0,0,0,0,0,0,0
4207,9,19,25,5,3,0,11,20,0,0,...,0,0,0,0,0,0,0,0,0,0


## Applying PCA

In [26]:
from sklearn.decomposition import PCA

n_comp = 20
pca = PCA(n_components = n_comp, random_state = 42)
pca_result_train = pca.fit_transform(df_final)
pca_result_test = pca.transform(df_test_final)

In [27]:
pca_result_train.shape, pca_result_test.shape

((4209, 20), (4209, 20))

## Apply xgboost

In [33]:
# ML Modeling with XGboost
import xgboost as xgb
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

# Defining train & test for model input
train_X = pca_result_train
train_y = y_trained

# Splitting
x_train, x_valid, y_train, y_valid = train_test_split(train_X, train_y, test_size=0.2, random_state=42)

# Defining feature set
d_train = xgb.DMatrix(x_train, label=y_train)
d_valid = xgb.DMatrix(x_valid, label=y_valid)
d_test = xgb.DMatrix(pca_result_test)
xgb_params = {
 'eta': 0.01,
 'max_depth': 4,
 'objective': 'reg:squarederror',
}

# Creating a function for the predicting score
def xgb_r2_score(preds, dtrain):
 labels = dtrain.get_label()
 return 'r2', r2_score(labels, preds)
watchlist = [(d_train, 'train'), (d_valid, 'valid')]
mdl = xgb.train(xgb_params, d_train, 1000, watchlist, early_stopping_rounds=50, feval=xgb_r2_score, maximize=True, verbose_eval=10)

[0]	train-rmse:99.99312	train-r2:-60.72097	valid-rmse:99.87946	valid-r2:-63.09193
[10]	train-rmse:90.55032	train-r2:-49.61422	valid-rmse:90.43404	valid-r2:-51.54300
[20]	train-rmse:82.01793	train-r2:-40.52504	valid-rmse:81.90256	valid-r2:-42.09691
[30]	train-rmse:74.30950	train-r2:-33.08641	valid-rmse:74.19589	valid-r2:-34.36802
[40]	train-rmse:67.34649	train-r2:-26.99771	valid-rmse:67.23296	valid-r2:-28.04127
[50]	train-rmse:61.05413	train-r2:-22.01032	valid-rmse:60.95432	valid-r2:-22.87042
[60]	train-rmse:55.37347	train-r2:-17.92762	valid-rmse:55.28781	valid-r2:-18.63857
[70]	train-rmse:50.24891	train-r2:-14.58640	valid-rmse:50.17174	valid-r2:-15.17221
[80]	train-rmse:45.62118	train-r2:-11.84770	valid-rmse:45.56302	valid-r2:-12.33755
[90]	train-rmse:41.45218	train-r2:-9.60687	valid-rmse:41.41132	valid-r2:-10.01765
[100]	train-rmse:37.69297	train-r2:-7.77028	valid-rmse:37.66723	valid-r2:-8.11545
[110]	train-rmse:34.30694	train-r2:-6.26535	valid-rmse:34.29880	valid-r2:-6.55804
[120]	tr

In [34]:
# Predicting on test set
p_test = mdl.predict(d_test)
p_test

array([ 77.79394 ,  95.5716  ,  81.06427 , ...,  93.406136, 109.33432 ,
        98.40287 ], dtype=float32)

In [35]:
# Predicted values in a dataframe
Predicted_Data = pd.DataFrame()
Predicted_Data['y'] = p_test
Predicted_Data.head()

Unnamed: 0,y
0,77.793938
1,95.571602
2,81.06427
3,76.868462
4,109.254913
