In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.model_selection import cross_val_score,cross_val_predict
from xgboost import XGBRegressor
%matplotlib inline

In [2]:
mb_train=pd.read_csv('train.csv')
mb_test=pd.read_csv('test.csv')

In [3]:
print(mb_train.shape)   #Having target variable column
print(mb_test.shape)    #Not having target variable column

(4209, 378)
(4209, 377)


In [4]:
mb_test.head(3)

Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,1,az,v,n,f,d,t,a,w,0,...,0,0,0,1,0,0,0,0,0,0
1,2,t,b,ai,a,d,b,g,y,0,...,0,0,1,0,0,0,0,0,0,0
2,3,az,v,as,f,d,a,j,j,0,...,0,0,0,1,0,0,0,0,0,0


In [5]:
mb_train.head(3)

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0


In [6]:
mb_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4209 entries, 0 to 4208
Columns: 378 entries, ID to X385
dtypes: float64(1), int64(369), object(8)
memory usage: 12.1+ MB


#### Observation : 
##### 1. 4209 datapoints & 378 features 
##### 2. 1 column of  float64 type, 369 column of int64 type and 8 column of object type.
##### 3. 'y' is the target variable, showing the time that the cars spend on bench.

# EDA and Data-preprocessing starts now.....




### 1. Dealing with zero variance columns.

In [7]:
#Checking columns for zero variance in train dataset :

variance=mb_train.var()
zero_var_col=variance[variance==0].index
zero_var_col

Index(['X11', 'X93', 'X107', 'X233', 'X235', 'X268', 'X289', 'X290', 'X293',
       'X297', 'X330', 'X347'],
      dtype='object')

In [8]:
#Removing zero variance column in train dataset as well as in test dataset :
mb_train_nonzero_var=mb_train.drop(zero_var_col,axis=1)
mb_test_nonzero_var=mb_test.drop(zero_var_col,axis=1)

print(f'Shape of modified dataset after removing zero variance columns in train -- {mb_train_nonzero_var.shape}')
print(f'Shape of modified dataset after removing zero variance columns in test -- {mb_test_nonzero_var.shape}')

print(f'Shape of original dataset is -- {mb_train.shape}')

Shape of modified dataset after removing zero variance columns in train -- (4209, 366)
Shape of modified dataset after removing zero variance columns in test -- (4209, 365)
Shape of original dataset is -- (4209, 378)


#### Succesfully removed the zero variance columns.

### 2. Checking for null values :

In [9]:
mb_train_nonzero_var.isna().sum()


ID      0
y       0
X0      0
X1      0
X2      0
       ..
X380    0
X382    0
X383    0
X384    0
X385    0
Length: 366, dtype: int64

In [10]:
print(len(mb_train_nonzero_var.columns[mb_train_nonzero_var.isna().any()]))

0


##### No null values in the dataset.

### 3. Checking for unique values :

In [11]:
mb_train_unique=pd.DataFrame(mb_train_nonzero_var.nunique(),columns=['Unique_Val'])
print(mb_train_unique)

      Unique_Val
ID          4209
y           2545
X0            47
X1            27
X2            44
...          ...
X380           2
X382           2
X383           2
X384           2
X385           2

[366 rows x 1 columns]


In [12]:
#Cheking for categorical values type features.
mb_train_unique[mb_train_unique.Unique_Val>2].unstack

<bound method DataFrame.unstack of     Unique_Val
ID        4209
y         2545
X0          47
X1          27
X2          44
X3           7
X4           4
X5          29
X6          12
X8          25>

In [13]:
#Checking for binary values(0/1) type features.
mb_train_unique[mb_train_unique.Unique_Val<=2].unstack

<bound method DataFrame.unstack of       Unique_Val
X10            2
X12            2
X13            2
X14            2
X15            2
...          ...
X380           2
X382           2
X383           2
X384           2
X385           2

[356 rows x 1 columns]>

* Out of 366 train dataset features, 10 features are having greater than 2 unique values and remaining features are having only 2 unique values (0 and 1).
* Since ID and y are not required for training  the dataset so we'll not use them.


In [14]:
X_train=mb_train_nonzero_var[mb_train_nonzero_var.columns[2:]]
X_test=mb_test_nonzero_var[mb_test_nonzero_var.columns[1:]]
y=mb_train_nonzero_var['y']

In [15]:
X_train.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X8,X10,X12,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,k,v,at,a,d,u,j,o,0,0,...,0,0,1,0,0,0,0,0,0,0
1,k,t,av,e,d,y,l,o,0,0,...,1,0,0,0,0,0,0,0,0,0
2,az,w,n,c,d,x,j,x,0,0,...,0,0,0,0,0,0,1,0,0,0
3,az,t,n,f,d,x,l,e,0,0,...,0,0,0,0,0,0,0,0,0,0
4,az,v,n,f,d,h,d,n,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
X_train.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X8,X10,X12,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,k,v,at,a,d,u,j,o,0,0,...,0,0,1,0,0,0,0,0,0,0
1,k,t,av,e,d,y,l,o,0,0,...,1,0,0,0,0,0,0,0,0,0
2,az,w,n,c,d,x,j,x,0,0,...,0,0,0,0,0,0,1,0,0,0
3,az,t,n,f,d,x,l,e,0,0,...,0,0,0,0,0,0,0,0,0,0
4,az,v,n,f,d,h,d,n,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
y.head()

0    130.81
1     88.53
2     76.26
3     80.62
4     78.02
Name: y, dtype: float64

In [18]:
catg_var = list(X_train.select_dtypes('object').columns)
catg_var

['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8']

### 4. Applying Label Encoder to the categorical features.

In [19]:
#Function to apply endoing to the object type features :
def label_encoder(df,x):
    catg_var=df.select_dtypes(include='object').columns
    le=LabelEncoder() 
    for i in catg_var:
        x[i]=le.fit_transform(x[i])

In [20]:
label_encoder(mb_train_nonzero_var,X_train)
label_encoder(mb_test_nonzero_var,X_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x[i]=le.fit_transform(x[i])


In [21]:
X_train

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X8,X10,X12,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,32,23,17,0,3,24,9,14,0,0,...,0,0,1,0,0,0,0,0,0,0
1,32,21,19,4,3,28,11,14,0,0,...,1,0,0,0,0,0,0,0,0,0
2,20,24,34,2,3,27,9,23,0,0,...,0,0,0,0,0,0,1,0,0,0
3,20,21,34,5,3,27,11,4,0,0,...,0,0,0,0,0,0,0,0,0,0
4,20,23,34,5,3,12,3,13,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,8,20,16,2,3,0,3,16,0,0,...,1,0,0,0,0,0,0,0,0,0
4205,31,16,40,3,3,0,7,7,0,0,...,0,1,0,0,0,0,0,0,0,0
4206,8,23,38,0,3,0,6,4,0,1,...,0,0,1,0,0,0,0,0,0,0
4207,9,19,25,5,3,0,11,20,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
X_test

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X8,X10,X12,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,21,23,34,5,3,26,0,22,0,0,...,0,0,0,1,0,0,0,0,0,0
1,42,3,8,0,3,9,6,24,0,0,...,0,0,1,0,0,0,0,0,0,0
2,21,23,17,5,3,0,9,9,0,0,...,0,0,0,1,0,0,0,0,0,0
3,21,13,34,5,3,31,11,13,0,0,...,0,0,0,1,0,0,0,0,0,0
4,45,20,17,2,3,30,8,12,0,0,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,6,9,17,5,3,1,9,4,0,0,...,0,0,0,0,0,0,0,0,0,0
4205,42,1,8,3,3,1,9,24,0,0,...,0,1,0,0,0,0,0,0,0,0
4206,47,23,17,5,3,1,3,22,0,0,...,0,0,0,0,0,0,0,0,0,0
4207,7,23,17,0,3,1,2,16,0,0,...,0,0,1,0,0,0,0,0,0,0


In [24]:
stdsclr=StandardScaler()

stdsclr.fit(X_train)
X_train = stdsclr.transform(X_train)

X_test=stdsclr.fit(X_test)

X_train


array([[ 0.16301209,  1.39348787, -0.02812155, ..., -0.04081511,
        -0.02180363, -0.03778296],
       [ 0.16301209,  1.15902093,  0.15538793, ..., -0.04081511,
        -0.02180363, -0.03778296],
       [-0.71055977,  1.51072134,  1.53170902, ..., -0.04081511,
        -0.02180363, -0.03778296],
       ...,
       [-1.58413164,  1.39348787,  1.89872798, ..., -0.04081511,
        -0.02180363, -0.03778296],
       [-1.51133398,  0.924554  ,  0.70591637, ..., -0.04081511,
        -0.02180363, -0.03778296],
       [ 1.18217927,  0.924554  , -1.31268791, ..., -0.04081511,
        -0.02180363, -0.03778296]])

### 5. Principal Component Analysis(PCA)

In [25]:
pca=PCA(0.98,svd_solver="full")
#X_train

In [26]:
x_train,x_test,y_train,y_test=train_test_split(X_train,y,random_state=10,train_size=0.75)

In [27]:
pca_fit_X = pca.fit(X_train)
pca.n_components_
#pca_fit_trans_X = pca_fit_X.transform(X_train)

178

In [28]:
var_per = pca_fit_X.explained_variance_ratio_
#cum_var_per = pca_fit_X.explained_variance_ratio_.cumsum()

In [29]:
var_per

array([0.06892669, 0.05688412, 0.04537457, 0.03426771, 0.03264309,
       0.03162661, 0.02862524, 0.02123751, 0.01970413, 0.01783195,
       0.01640066, 0.01564286, 0.01462742, 0.01448335, 0.01348286,
       0.01295162, 0.01243832, 0.01173103, 0.01121051, 0.01077278,
       0.00992513, 0.00969449, 0.00942523, 0.00909868, 0.00874224,
       0.00843069, 0.00790205, 0.00763218, 0.00733542, 0.00714906,
       0.00692957, 0.00676677, 0.00652534, 0.00641494, 0.00622692,
       0.00599196, 0.00588086, 0.00574694, 0.00563703, 0.0055341 ,
       0.00550641, 0.00540083, 0.00533911, 0.00524612, 0.00510194,
       0.00503232, 0.00495885, 0.00472692, 0.00464475, 0.00456521,
       0.00439624, 0.00432948, 0.00430322, 0.00423762, 0.00420212,
       0.00415473, 0.004067  , 0.00403463, 0.00391874, 0.00388747,
       0.00381759, 0.0037556 , 0.0037244 , 0.00365911, 0.00359567,
       0.0035523 , 0.00349701, 0.00346182, 0.00340472, 0.00334158,
       0.00330985, 0.00325522, 0.00324085, 0.00321045, 0.00316

In [None]:
#cum_var_per

In [None]:
plt.figure(figsize=(30,10))
ind = np.arange(len(var_per)) 
plt.bar(ind,var_per)
plt.xlabel('n_components')
plt.ylabel('Variance')

In [None]:
#len(cum_var_per[cum_var_per < 0.98])

In [None]:
#pca = PCA(n_components = len(cum_var_per[cum_var_per < 0.98]))
#X_trans = pca.fit_transform(X_train)
#X_trans.shape

In [30]:
x_train.shape

(3156, 364)

In [31]:
pca_x_train = pd.DataFrame(pca.transform(x_train))
pca_x_test = pd.DataFrame(pca.transform(x_test))
pca_X_test = pd.DataFrame(pca.transform(X_test))

ValueError: Expected 2D array, got scalar array instead:
array=StandardScaler().
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [None]:
plt.boxplot(y)

In [None]:
median=y.median()
median

In [None]:
median=y.median()
median

In [None]:
y=np.where(y > 120.806,median, y)

In [None]:
plt.boxplot(y)

In [None]:


model = xgb.XGBRegressor(objective='reg:linear',learning_rate=0.1,verbosity=0)
model.fit(pca_x_train,y_train)

pred_y_test=model.predict(pca_x_test)


In [None]:
r2_Score = cross_val_score(model,pca_x_train,y_train,scoring='r2',cv=10)
r2_Score

In [None]:
r2_Score.mean()