# Mercedes-Benz Greener Manufacturing 

In [1]:
## Importing all necessary packages  . 

import pandas as pd
import numpy as np
import warnings
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
import seaborn as sn
import matplotlib.pyplot as plt
import xgboost, time
from sklearn.metrics import accuracy_score
from sklearn.metrics import  mean_squared_error,mean_absolute_error
from sklearn.model_selection import GridSearchCV

In [2]:
## Loading train and test dataset

train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [3]:
### Viewing the top 5 records of train dataset

train.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


In [4]:
## Viewing the top 5 records of the test datadset

test.head()

Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,1,az,v,n,f,d,t,a,w,0,...,0,0,0,1,0,0,0,0,0,0
1,2,t,b,ai,a,d,b,g,y,0,...,0,0,1,0,0,0,0,0,0,0
2,3,az,v,as,f,d,a,j,j,0,...,0,0,0,1,0,0,0,0,0,0
3,4,az,l,n,f,d,z,l,n,0,...,0,0,0,1,0,0,0,0,0,0
4,5,w,s,as,c,d,y,i,m,0,...,1,0,0,0,0,0,0,0,0,0


In [5]:
### viewing the shape of train and test datset

print("The shape of the train dataset is :", train.shape)
print("The shape of the test dataset is :", test.shape)

The shape of the train dataset is : (4209, 378)
The shape of the test dataset is : (4209, 377)


In [6]:
### Viewing the datatypes of the columns in the train dataset

train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4209 entries, 0 to 4208
Columns: 378 entries, ID to X385
dtypes: float64(1), int64(369), object(8)
memory usage: 12.1+ MB


In [7]:
### Viewing the datatypes of the columns in the test dataset

test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4209 entries, 0 to 4208
Columns: 377 entries, ID to X385
dtypes: int64(369), object(8)
memory usage: 12.1+ MB


In [8]:
### Checking for the columns present only in the train dataset not in test dataset

set(train.columns).difference(set(test.columns))

{'y'}

In [9]:
### Checking for the columns present only in the test dataset not in train dataset

set(test.columns).difference(set(train.columns))

set()

## TASK : Checking for null Values 

In [10]:
#### Checking for the columns with null values in the train dataset . 

train.columns[train.isna().any()]

Index([], dtype='object')

In [11]:
##### sum of all null values in the train dataset .

np.sum(train.isnull().sum())

0

In [12]:
#### Checking for the columns with null values in the test dataset . 

test.columns[test.isna().any()]

Index([], dtype='object')

In [13]:
#### sum of all null values in the test dataset . 

np.sum(test.isnull().sum())

0

## TASK : Identifying the columns with zero variance and removing them  . 

In [14]:
### Checking for the columns with zero variance in train dataset

zero_var_cols=train.var()[train.var()==0].index.values

  zero_var_cols=train.var()[train.var()==0].index.values


In [15]:
## columns with zero variance

zero_var_cols

array(['X11', 'X93', 'X107', 'X233', 'X235', 'X268', 'X289', 'X290',
       'X293', 'X297', 'X330', 'X347'], dtype=object)

In [16]:
### Droping the columns with zero varaince from train dataset

for col in zero_var_cols:
    print(col)
    train.drop(col,axis=1,inplace=True)

X11
X93
X107
X233
X235
X268
X289
X290
X293
X297
X330
X347


In [17]:
### Droping the ID column

train.drop('ID',axis=1,inplace=True)

In [18]:
### Shape of the train dataset after removing the zero varaiance columns and ID

train.shape

(4209, 365)

In [19]:
### Collecting the columns which are present in test but not in train

diff_cols=set(test.columns).difference(set(train.columns))

In [20]:
### List of the difference columns

diff_cols

{'ID',
 'X107',
 'X11',
 'X233',
 'X235',
 'X268',
 'X289',
 'X290',
 'X293',
 'X297',
 'X330',
 'X347',
 'X93'}

In [21]:
### Dropping the columns with Zero variance and ID from test Datset .

test.drop(diff_cols,axis=1,inplace=True)

In [22]:
### Checking the shape of Test Dataset after dropping 

test.shape

(4209, 364)

In [23]:
## Checking the Columns with zero variance in the test dataset

test.var()[test.var()==0].index.values

  test.var()[test.var()==0].index.values


array(['X257', 'X258', 'X295', 'X296', 'X369'], dtype=object)

## TASK : Applying Label Encoding 

In [24]:
### Identifying categorical columns in train Dataset.

obj_cols=train.columns[train.dtypes=='object']

In [25]:
### Viewing the Categorical columns

obj_cols

Index(['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8'], dtype='object')

In [26]:
### Viewing categorical columns in test Dataset . 

test.columns[test.dtypes=='object']

Index(['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8'], dtype='object')

In [27]:
## converting the categorical columns in the train and test Dataset into numeric using labelEncoder

le = LabelEncoder()


for col in obj_cols:
    train[col]=le.fit_transform(train[col])
    test[col] = le.fit_transform(test[col])
  

In [28]:
### Viewing the train Dataset after Label Encoding

train.head()

Unnamed: 0,y,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,130.81,32,23,17,0,3,24,9,14,0,...,0,0,1,0,0,0,0,0,0,0
1,88.53,32,21,19,4,3,28,11,14,0,...,1,0,0,0,0,0,0,0,0,0
2,76.26,20,24,34,2,3,27,9,23,0,...,0,0,0,0,0,0,1,0,0,0
3,80.62,20,21,34,5,3,27,11,4,0,...,0,0,0,0,0,0,0,0,0,0
4,78.02,20,23,34,5,3,12,3,13,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
### Viewing the test Dataset after Label Encoding

test.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X8,X10,X12,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,21,23,34,5,3,26,0,22,0,0,...,0,0,0,1,0,0,0,0,0,0
1,42,3,8,0,3,9,6,24,0,0,...,0,0,1,0,0,0,0,0,0,0
2,21,23,17,5,3,0,9,9,0,0,...,0,0,0,1,0,0,0,0,0,0
3,21,13,34,5,3,31,11,13,0,0,...,0,0,0,1,0,0,0,0,0,0
4,45,20,17,2,3,30,8,12,0,0,...,1,0,0,0,0,0,0,0,0,0


###  Data Analysis 

In [30]:
### Dividing the train dataset into features and target . 

train_features = train.drop('y',axis=1)
train_target = train['y']


In [31]:
### Viewing the Shape of the features and target dataset created 

print("Shape of the features in train",train_features.shape)
print("Shape of the target in train",train_target.shape)

Shape of the features in train (4209, 364)
Shape of the target in train (4209,)


In [32]:
### Viewing the cardinality of the all columns in the features dataset.

print("Cardinal of the columns in features dataset : ") 

for col in train_features.columns:
    print(col, train_features[col].nunique() )
          
    

Cardinal of the columns in features dataset : 
X0 47
X1 27
X2 44
X3 7
X4 4
X5 29
X6 12
X8 25
X10 2
X12 2
X13 2
X14 2
X15 2
X16 2
X17 2
X18 2
X19 2
X20 2
X21 2
X22 2
X23 2
X24 2
X26 2
X27 2
X28 2
X29 2
X30 2
X31 2
X32 2
X33 2
X34 2
X35 2
X36 2
X37 2
X38 2
X39 2
X40 2
X41 2
X42 2
X43 2
X44 2
X45 2
X46 2
X47 2
X48 2
X49 2
X50 2
X51 2
X52 2
X53 2
X54 2
X55 2
X56 2
X57 2
X58 2
X59 2
X60 2
X61 2
X62 2
X63 2
X64 2
X65 2
X66 2
X67 2
X68 2
X69 2
X70 2
X71 2
X73 2
X74 2
X75 2
X76 2
X77 2
X78 2
X79 2
X80 2
X81 2
X82 2
X83 2
X84 2
X85 2
X86 2
X87 2
X88 2
X89 2
X90 2
X91 2
X92 2
X94 2
X95 2
X96 2
X97 2
X98 2
X99 2
X100 2
X101 2
X102 2
X103 2
X104 2
X105 2
X106 2
X108 2
X109 2
X110 2
X111 2
X112 2
X113 2
X114 2
X115 2
X116 2
X117 2
X118 2
X119 2
X120 2
X122 2
X123 2
X124 2
X125 2
X126 2
X127 2
X128 2
X129 2
X130 2
X131 2
X132 2
X133 2
X134 2
X135 2
X136 2
X137 2
X138 2
X139 2
X140 2
X141 2
X142 2
X143 2
X144 2
X145 2
X146 2
X147 2
X148 2
X150 2
X151 2
X152 2
X153 2
X154 2
X155 2
X156 2
X157 2
X158 2

In [33]:
### Checking for the columns which high correlation in the correlation Matrix.

train_features.corr()[(train_features.corr() < -0.8) | (train_features.corr() > 0.8)]

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X8,X10,X12,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
X0,1.0,,,,,,,,,,...,,,,,,,,,,
X1,,1.0,,,,,,,,,...,,,,,,,,,,
X2,,,1.0,,,,,,,,...,,,,,,,,,,
X3,,,,1.0,,,,,,,...,,,,,,,,,,
X4,,,,,1.0,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
X380,,,,,,,,,,,...,,,,,,1.0,,,,
X382,,,,,,,,,,,...,,,,,,,1.0,,,
X383,,,,,,,,,,,...,,,,,,,,1.0,,
X384,,,,,,,,,,,...,,,,,,,,,1.0,


In [34]:
### Copying the categorical data from test dataframe

test_obj_cols = test[obj_cols]

In [35]:
### viewing the shape and data of the categorical_test dataset

print("Shape of test categorical data",test_obj_cols.shape)
test_obj_cols.head()

Shape of test categorical data (4209, 8)


Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X8
0,21,23,34,5,3,26,0,22
1,42,3,8,0,3,9,6,24
2,21,23,17,5,3,0,9,9
3,21,13,34,5,3,31,11,13
4,45,20,17,2,3,30,8,12


In [36]:
### Droping the categorical data from test

test = test.drop(obj_cols,axis=1)
print("Shape of test data set after dropping categorical columns",test.shape)

Shape of test data set after dropping categorical columns (4209, 356)


In [37]:
### Copying the categorical data from train Dataframe

train_obj_cols = train_features[obj_cols]

In [38]:
### viewing the shape and data of the categorical_train dataset

print("Shape of train categorical Data",train_obj_cols.shape)
train_obj_cols.head()

Shape of train categorical Data (4209, 8)


Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X8
0,32,23,17,0,3,24,9,14
1,32,21,19,4,3,28,11,14
2,20,24,34,2,3,27,9,23
3,20,21,34,5,3,27,11,4
4,20,23,34,5,3,12,3,13


In [39]:
### Droping the categorical data from train

train_features = train_features.drop(obj_cols,axis=1)
print("Shape of train data set after dropping categorical columns",train_features.shape)

Shape of train data set after dropping categorical columns (4209, 356)


## TASK: PCA

In [40]:
### Creating a PCA instance 

pca = PCA(n_components=0.9)

In [41]:
### Fit it on the train Data

pca.fit(train_features)

PCA(n_components=0.9)

In [42]:
### Viewing the percentage of variance that is attributed by each of the selected components

pca.explained_variance_ratio_

array([0.13075463, 0.08895904, 0.08608808, 0.06848157, 0.05775222,
       0.04803243, 0.03839034, 0.03295792, 0.02810834, 0.02509342,
       0.02354912, 0.01974626, 0.01703296, 0.01658974, 0.0154622 ,
       0.0148002 , 0.0139528 , 0.01220624, 0.01040626, 0.01022129,
       0.00979087, 0.00890637, 0.0088512 , 0.00830484, 0.00803175,
       0.0074351 , 0.00733212, 0.0065182 , 0.00618283, 0.00584653,
       0.00531908, 0.00509227, 0.00471843, 0.00462325, 0.00440977,
       0.00421579, 0.00415108, 0.00404883, 0.00390537, 0.0036097 ,
       0.00355435, 0.00343368, 0.00338189, 0.00323801, 0.00316569,
       0.00299818, 0.00284821])

In [43]:
### Viewing the shape

pca.explained_variance_ratio_.shape

(47,)

In [44]:
### Calculating sum of variances ratio . 

np.sum(pca.explained_variance_ratio_)

0.9024984502675398

In [45]:
### Applying PCA tranformation on both train and test Dataset . 

train_features_transformed = pca.transform(train_features)
test_transformed = pca.transform(test)

In [46]:
### Shape of both train and test data after PCA

print("Shape of train Data after tranformation ",train_features_transformed.shape)
print("Shape of test Data after tranformation ",test_transformed.shape)

Shape of train Data after tranformation  (4209, 47)
Shape of test Data after tranformation  (4209, 47)


In [47]:
### Converting the numpy array to dataframe

train_features_transformed = pd.DataFrame(train_features_transformed)
test_transformed = pd.DataFrame(test_transformed)

In [48]:
### Concatinating categorical data to train and test datasets .

train_features_transformed = pd.concat((train_features_transformed,train_obj_cols),axis=1)
test_transformed = pd.concat((test_transformed,test_obj_cols),axis=1)

In [49]:
#### Shape of train features dataset after concatenation

train_features_transformed.shape

(4209, 55)

In [50]:
#### Shape of test dataset after concatenation

test_transformed.shape

(4209, 55)

### Train Test Split

In [51]:
### Splitting the train data into training data and validation data

X_train, X_val, y_train, y_val = train_test_split(train_features_transformed, train_target, random_state = 6)

In [52]:
### Viewing the shape of the resultant datasets

print(X_train.shape)
print(X_val.shape)
print(y_train.shape)
print(y_val.shape)

(3156, 55)
(1053, 55)
(3156,)
(1053,)


## TASK: XGBoost 

In [53]:
#### Creating an instance of  XGBoost Regressor. Here the target variable is continous so we are using regressor  . 

xgb = xgboost.XGBRegressor(objective= 'reg:squarederror')

In [54]:
#### Training the model using train dataset . 


xgb.fit(X_train, y_train)


XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, ...)

In [55]:
### Predicting the target varaible for the train data using trained model and calculating the metrics.

y_pred_train = xgb.predict(X_train)

print('MAE : ', mean_absolute_error(y_train, y_pred_train))
print('RMSE : ', np.sqrt(mean_squared_error(y_train, y_pred_train)))
print('R2 Score : ', xgb.score(X_train, y_train))

MAE :  2.0610499874448593
RMSE :  3.4694844232013025
R2 Score :  0.9278458378991739


In [56]:
### Predicting the target varaible for the validation data using trained model and calculating the metrics.

y_pred_val = xgb.predict(X_val)

print('MAE : ', mean_absolute_error(y_val, y_pred_val))
print('RMSE : ', np.sqrt(mean_squared_error(y_val, y_pred_val)))
print('R2 Score : ', xgb.score(X_val, y_val))

MAE :  5.812591117431409
RMSE :  8.347354764386541
R2 Score :  0.509990277795451


In [57]:
#### Predicted data for test dataframe .

test_pred_y = xgb.predict(test_transformed)

print("Predicted Target variable for Test DataFrame")
test['pred_y1'] = test_pred_y
test.head()

Predicted Target variable for Test DataFrame


Unnamed: 0,X10,X12,X13,X14,X15,X16,X17,X18,X19,X20,...,X376,X377,X378,X379,X380,X382,X383,X384,X385,pred_y1
0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,76.602242
1,0,0,0,0,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,97.85006
2,0,0,0,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,80.388412
3,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,75.581306
4,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,111.474243


In [58]:
### There is a huge variation between RMSE of Train and Validation ,This is due to Overfitting .
### Hence we are using grid search to find optimal XGBoost parameters .

start = time.time()

params = { 'n_estimators' :[100,200] ,'max_depth':[1,2,3,6,10], 'learning_rate' :[0.1, 0.2, 0.3, 0.5], 'min_child_weight' : [1, 2, 3, 4, 5], 'subsample' : [0.5, 0.6, 0.7, 0.8, 1.0]}
grid_search = GridSearchCV(xgb, params, cv = 3, n_jobs = -1)
grid_search.fit(X_train, y_train)

end = time.time()

time_elapsed = end - start
print("Time taken to find optimal paramters",time_elapsed)

KeyboardInterrupt: 

In [None]:
### Viewing the optimal parameters .

print("The optimal paramters after the Grid search ")

grid_search.best_params_


In [None]:
#### Creating a new model with the optimal parameters learned .

xgb= xgboost.XGBRegressor(objective= 'reg:squarederror',learning_rate = 0.1, max_depth = 3, min_child_weight =3 , n_estimators = 100, subsample = 0.8)

In [None]:
### Training the model using train data

xgb.fit(X_train, y_train)

In [None]:
### Predicting the target varaible for the train data using trained model and calculating the metrics.

y_pred_train = xgb.predict(X_train)

print('MAE : ', mean_absolute_error(y_train, y_pred_train))
print('RMSE : ', np.sqrt(mean_squared_error(y_train, y_pred_train)))
print('R2 Score : ', xgb.score(X_train, y_train))

In [None]:
### Predicting the target varaible for the validation data using trained model and calculating the metrics.

y_pred_val = xgb.predict(X_val)
print('MAE : ', mean_absolute_error(y_val, y_pred_val))
print('RMSE : ', np.sqrt(mean_squared_error(y_val, y_pred_val)))
print('R2 Score : ', xgb.score(X_val, y_val))

In [None]:
print("The difference in RMSE of Train and Validation is much reduced . Hence Overfitting problem is solved")

In [None]:
#### Predicted data for test dataframe .

test_pred_y = xgb.predict(test_transformed)

print("Predicted Target variable for Test DataFrame")
test['pred_y2'] = test_pred_y
test.head()