In [166]:
#importing relevant libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb

In [167]:
#importing data
merc_train=pd.read_csv('train.csv')
merc_test=pd.read_csv('test.csv')

In [168]:
merc_train.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


In [169]:
merc_test.head()

Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,1,az,v,n,f,d,t,a,w,0,...,0,0,0,1,0,0,0,0,0,0
1,2,t,b,ai,a,d,b,g,y,0,...,0,0,1,0,0,0,0,0,0,0
2,3,az,v,as,f,d,a,j,j,0,...,0,0,0,1,0,0,0,0,0,0
3,4,az,l,n,f,d,z,l,n,0,...,0,0,0,1,0,0,0,0,0,0
4,5,w,s,as,c,d,y,i,m,0,...,1,0,0,0,0,0,0,0,0,0


In [170]:
#Separating features and target
x=merc_train.drop(['ID','y'],axis=1) #ID column does not have any impact.
y=merc_train['y']

In [171]:
x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4209 entries, 0 to 4208
Columns: 376 entries, X0 to X385
dtypes: int64(368), object(8)
memory usage: 12.1+ MB


###### From the x.info(),it can be concluded that the number of categorical values are 8 ,variables raninging from X0-X8
###### --------

In [172]:
#Getting unique values from integer feature columns
integer_cols=list(set(x.columns)-set(['X0','X1','X2','X3','X4','X5','X6','X8'])) #Separating integers columns
unique_val=[]
for col in x[integer_cols]:
    unique_val.append(x[col].unique())
    
unique_val=set(frozenset(i) for i in unique_val) #Removing duplicate values
unique_val=list(map(list,unique_val)) #converting frozenset to list
print('unique values:',unique_val,end='\n--------\n') 


#printing feature columns
columns1=[]
columns2=[]
for col in x[list(integer_cols)]:
    if x[col].unique().tolist()==unique_val[0]: #feature columns with [0,1] as unique values
        columns1.append(col)
    if x[col].unique().tolist()==unique_val[1]: #feature columns with [0] as unique value
        columns2.append(col)

print('Feature columns containing unique values:{0}\n{1}'.format((unique_val[0]),(columns1)),end='\n\n')
print('Feature columns containing unique values:{0}\n{1}'.format((unique_val[1]),(columns2)))

    

unique values: [[0, 1], [0]]
--------
Feature columns containing unique values:[0, 1]
['X333', 'X228', 'X256', 'X164', 'X177', 'X42', 'X342', 'X109', 'X190', 'X212', 'X326', 'X374', 'X10', 'X224', 'X245', 'X310', 'X382', 'X352', 'X78', 'X160', 'X108', 'X379', 'X56', 'X20', 'X147', 'X183', 'X322', 'X277', 'X45', 'X304', 'X363', 'X159', 'X171', 'X343', 'X166', 'X123', 'X272', 'X54', 'X105', 'X60', 'X186', 'X216', 'X155', 'X66', 'X360', 'X351', 'X151', 'X202', 'X257', 'X355', 'X124', 'X176', 'X125', 'X43', 'X341', 'X158', 'X23', 'X328', 'X173', 'X113', 'X110', 'X197', 'X14', 'X299', 'X357', 'X271', 'X211', 'X77', 'X41', 'X76', 'X34', 'X200', 'X71', 'X251', 'X88', 'X81', 'X38', 'X247', 'X87', 'X282', 'X27', 'X221', 'X96', 'X332', 'X47', 'X94', 'X267', 'X248', 'X283', 'X17', 'X90', 'X252', 'X83', 'X15', 'X168', 'X331', 'X335', 'X210', 'X340', 'X338', 'X50', 'X174', 'X274', 'X62', 'X122', 'X117', 'X79', 'X246', 'X198', 'X308', 'X376', 'X40', 'X223', 'X80', 'X208', 'X380', 'X143', 'X163', 'X2

In [173]:
#Removing feature columns with zero variance


df=x[integer_cols].loc[:,x[integer_cols].var()==0] #Dataframe with feature columns showing no variance(variance=0)
print('subset dataframe with zero variance:\n',df)
no_variance=list(df.columns) #extracting columns'name with zero variance
print('columns with 0 variance:',no_variance)
x.drop(no_variance,axis=1,inplace=True)# Removing columns with zero variance



subset dataframe with zero variance:
       X93  X11  X107  X297  X293  X290  X330  X233  X268  X235  X347  X289
0       0    0     0     0     0     0     0     0     0     0     0     0
1       0    0     0     0     0     0     0     0     0     0     0     0
2       0    0     0     0     0     0     0     0     0     0     0     0
3       0    0     0     0     0     0     0     0     0     0     0     0
4       0    0     0     0     0     0     0     0     0     0     0     0
...   ...  ...   ...   ...   ...   ...   ...   ...   ...   ...   ...   ...
4204    0    0     0     0     0     0     0     0     0     0     0     0
4205    0    0     0     0     0     0     0     0     0     0     0     0
4206    0    0     0     0     0     0     0     0     0     0     0     0
4207    0    0     0     0     0     0     0     0     0     0     0     0
4208    0    0     0     0     0     0     0     0     0     0     0     0

[4209 rows x 12 columns]
columns with 0 variance: ['X93', 'X1

### Columns with zero variance removed
### ------

In [174]:
#Checking for null values in train and test data
print(x.isnull().sum().sum(),y.isna().sum(),merc_test.isnull().sum().sum(),sep='\n')

0
0
0


### No null values present in the train and test data

# ----

In [175]:
# label encoding categoprical values

for i in ['X0','X1','X2','X3','X4','X5','X6','X8']: #categorical columns
    LE=LabelEncoder()
    x[i]=LE.fit_transform(x[i]) #apply label encoding

print(x[['X0','X1','X2','X3','X4','X5','X6','X8']]) #printing label encoded values of categorical columns


      X0  X1  X2  X3  X4  X5  X6  X8
0     32  23  17   0   3  24   9  14
1     32  21  19   4   3  28  11  14
2     20  24  34   2   3  27   9  23
3     20  21  34   5   3  27  11   4
4     20  23  34   5   3  12   3  13
...   ..  ..  ..  ..  ..  ..  ..  ..
4204   8  20  16   2   3   0   3  16
4205  31  16  40   3   3   0   7   7
4206   8  23  38   0   3   0   6   4
4207   9  19  25   5   3   0  11  20
4208  46  19   3   2   3   0   6  22

[4209 rows x 8 columns]


### Applied Label Encoding on categorical feature columns
### --------

In [203]:
# For dimensionality reduction applying PCA(Principal Component Analysis)
from sklearn.decomposition import PCA
pca=PCA(n_components=10) #taking 10 principal components for dimensionality reduction
x_trans=pca.fit_transform(x) #fitting and transforming features

In [204]:
print(pca.explained_variance_ratio_) # explained variance of the 10 principal components

print('\nTotal variance on the output caused by all 10 principal components together is: {}%'.format(round((sum(pca.explained_variance_ratio_)*100),3)) )

[0.38334782 0.21388033 0.13261866 0.11826642 0.09206008 0.01590604
 0.0074454  0.00433701 0.00294021 0.00241796]

Total variance on the output caused by all 10 principal components together is: 97.322%


### Applied Dimenstionality reduction on the features
### --------

In [205]:
#Splitting training and testing data
x_train,x_test,y_train,y_test=train_test_split(x_trans,y,test_size=0.25,random_state=16)


In [206]:
###XGBRegressor parameters:
#objective:regularized linear regression
#evalutaion metric=rmse
#n_estimators=800
#early stopping rounds=80(10% of n_estimators)
#learning rate=10%

In [207]:
#Since the the target varaible(y) is continous,regression must be performed for prediction
#Finding optimal number of n_estimators
xgbmodel=xgb.XGBRegressor(objective='reg:squarederror',eval_metric='rmse',n_estimators=800,early_stopping_rounds=80,learning_rate=0.1,random_state=16)

xgbmodel.fit(x_train,y_train,eval_set=[(x_test,y_test)])

[0]	validation_0-rmse:90.97558
[1]	validation_0-rmse:82.00085
[2]	validation_0-rmse:73.96640
[3]	validation_0-rmse:66.73643
[4]	validation_0-rmse:60.24603
[5]	validation_0-rmse:54.41054
[6]	validation_0-rmse:49.17634
[7]	validation_0-rmse:44.48262
[8]	validation_0-rmse:40.27932
[9]	validation_0-rmse:36.50683
[10]	validation_0-rmse:33.15111
[11]	validation_0-rmse:30.14568
[12]	validation_0-rmse:27.46069
[13]	validation_0-rmse:25.07562
[14]	validation_0-rmse:22.97338
[15]	validation_0-rmse:21.12299
[16]	validation_0-rmse:19.46181
[17]	validation_0-rmse:17.99291
[18]	validation_0-rmse:16.73097
[19]	validation_0-rmse:15.59243
[20]	validation_0-rmse:14.62631
[21]	validation_0-rmse:13.78661
[22]	validation_0-rmse:13.06381
[23]	validation_0-rmse:12.45806
[24]	validation_0-rmse:11.92603
[25]	validation_0-rmse:11.49262
[26]	validation_0-rmse:11.11828
[27]	validation_0-rmse:10.80243
[28]	validation_0-rmse:10.53835
[29]	validation_0-rmse:10.34113
[30]	validation_0-rmse:10.16028
[31]	validation_0-

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=80, enable_categorical=False,
             eval_metric='rmse', gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.1, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=800, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=16,
             reg_alpha=0, reg_lambda=1, ...)

In [208]:
#Printing scores of optimal iteration
print('Best model score: {}\nBest iteration round: {}\nBest tree limit: {}'.format((xgbmodel.best_score),(xgbmodel.best_iteration),(xgbmodel.best_ntree_limit)))

Best model score: 9.417760147899532
Best iteration round: 51
Best tree limit: 52


In [209]:
optimal_tree_limit=xgbmodel.best_ntree_limit

In [210]:
#Making new xgboost regressor with best tree limit as n_estimators
xgbmodel=xgb.XGBRegressor(objective='reg:squarederror',n_estimators=optimal_tree_limit)
xgbmodel.fit(x_train,y_train)

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=52, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, ...)

In [211]:
y_pred=xgbmodel.predict(x_test) #predicting target values
print('rmse: ',np.sqrt(mean_squared_error(y_test,y_pred)))
print(0.1*y_train.mean())

rmse:  9.66621783299739
10.064689797211656


#### rmse of the model is less than 10% of the mean of the target.This indicates model is performing good.
#### -----

In [212]:
#predicted values
prediction_df=pd.DataFrame(data=y_pred,columns=['prediction'])

### Predicted values below

In [213]:
prediction_df

Unnamed: 0,prediction
0,95.692062
1,112.125778
2,100.421837
3,113.255753
4,97.838516
...,...
1048,96.949890
1049,98.842651
1050,90.835312
1051,92.625420


In [214]:
from sklearn.metrics import r2_score

In [215]:
r2_score(y_test,y_pred)

0.3498779293116404