In [1]:
#Load Libraries
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score
from sklearn.decomposition import NMF 

In [2]:
#Load Files
train_data=pd.read_csv("train.csv")
future_data=pd.read_csv("test.csv")
train_data.head(), future_data.head()

(   ID       y  X0 X1  X2 X3 X4 X5 X6 X8  ...  X375  X376  X377  X378  X379  \
 0   0  130.81   k  v  at  a  d  u  j  o  ...     0     0     1     0     0   
 1   6   88.53   k  t  av  e  d  y  l  o  ...     1     0     0     0     0   
 2   7   76.26  az  w   n  c  d  x  j  x  ...     0     0     0     0     0   
 3   9   80.62  az  t   n  f  d  x  l  e  ...     0     0     0     0     0   
 4  13   78.02  az  v   n  f  d  h  d  n  ...     0     0     0     0     0   
 
    X380  X382  X383  X384  X385  
 0     0     0     0     0     0  
 1     0     0     0     0     0  
 2     0     1     0     0     0  
 3     0     0     0     0     0  
 4     0     0     0     0     0  
 
 [5 rows x 378 columns],
    ID  X0 X1  X2 X3 X4 X5 X6 X8  X10  ...  X375  X376  X377  X378  X379  X380  \
 0   1  az  v   n  f  d  t  a  w    0  ...     0     0     0     1     0     0   
 1   2   t  b  ai  a  d  b  g  y    0  ...     0     0     1     0     0     0   
 2   3  az  v  as  f  d  a  j  j    0  ..

In [3]:
#Dimension of the datasets
train_data.shape, future_data.shape

((4209, 378), (4209, 377))

In [4]:
#Drop irrelevant column
train_data.drop(['ID'], axis=1, inplace=True)
future_data.drop(['ID'], axis=1, inplace=True)

In [5]:
#Task 1: If for any column(s), the variance is equal to zero, then you need to remove those variable(s).
#Checking data types
#Selecting columns having only numeric/catagorical data types
column=[]
for i in train_data.columns:
    if(train_data[i].dtypes!='O'):
        column.append(i)

In [6]:
#Selecting columns with zero variance
columns=[]
for i in column:
    if(train_data[i].std()==0):
        columns.append(i)
columns

['X11',
 'X93',
 'X107',
 'X233',
 'X235',
 'X268',
 'X289',
 'X290',
 'X293',
 'X297',
 'X330',
 'X347']

In [7]:
#Dropping Columns with zero variance
train_data.drop(columns, axis=1, inplace=True)
future_data.drop(columns, axis=1, inplace=True)
#Dimension of the datasets after dropping columns
train_data.shape, future_data.shape

((4209, 365), (4209, 364))

In [8]:
#Task 2:  Check for null and unique values for test and train sets.
#Checking null values on both train and test sets

In [9]:
#Checking null values for train set
column=[]
for i in train_data.columns:
    if((train_data[i].isna().sum()>0)==True):
        column.append(i)
if column==[]:
    print("No null Values in train set")

No null Values in train set


In [10]:
#Checking null values for test set
column=[]
for i in future_data.columns:
    if((future_data[i].isna().sum()>0)==True):
        column.append(i)
if column==[]:
    print("No null Values in test set")

No null Values in test set


In [11]:
#Check unique values for test and train sets.
#Checking unique values for train set
for i in train_data.columns:
    print("{}: {}".format(i, train_data[i].unique()))

y: [130.81  88.53  76.26 ...  85.71 108.77  87.48]
X0: ['k' 'az' 't' 'al' 'o' 'w' 'j' 'h' 's' 'n' 'ay' 'f' 'x' 'y' 'aj' 'ak' 'am'
 'z' 'q' 'at' 'ap' 'v' 'af' 'a' 'e' 'ai' 'd' 'aq' 'c' 'aa' 'ba' 'as' 'i'
 'r' 'b' 'ax' 'bc' 'u' 'ad' 'au' 'm' 'l' 'aw' 'ao' 'ac' 'g' 'ab']
X1: ['v' 't' 'w' 'b' 'r' 'l' 's' 'aa' 'c' 'a' 'e' 'h' 'z' 'j' 'o' 'u' 'p' 'n'
 'i' 'y' 'd' 'f' 'm' 'k' 'g' 'q' 'ab']
X2: ['at' 'av' 'n' 'e' 'as' 'aq' 'r' 'ai' 'ak' 'm' 'a' 'k' 'ae' 's' 'f' 'd'
 'ag' 'ay' 'ac' 'ap' 'g' 'i' 'aw' 'y' 'b' 'ao' 'al' 'h' 'x' 'au' 't' 'an'
 'z' 'ah' 'p' 'am' 'j' 'q' 'af' 'l' 'aa' 'c' 'o' 'ar']
X3: ['a' 'e' 'c' 'f' 'd' 'b' 'g']
X4: ['d' 'b' 'c' 'a']
X5: ['u' 'y' 'x' 'h' 'g' 'f' 'j' 'i' 'd' 'c' 'af' 'ag' 'ab' 'ac' 'ad' 'ae'
 'ah' 'l' 'k' 'n' 'm' 'p' 'q' 's' 'r' 'v' 'w' 'o' 'aa']
X6: ['j' 'l' 'd' 'h' 'i' 'a' 'g' 'c' 'k' 'e' 'f' 'b']
X8: ['o' 'x' 'e' 'n' 's' 'a' 'h' 'p' 'm' 'k' 'd' 'i' 'v' 'j' 'b' 'q' 'w' 'g'
 'y' 'l' 'f' 'u' 'r' 't' 'c']
X10: [0 1]
X12: [0 1]
X13: [1 0]
X14: [0 1]
X15: [0 1]
X16: [

In [12]:
#Checking unique values for test set
for i in future_data.columns:
    print("{}: {}".format(i, future_data[i].unique()))

X0: ['az' 't' 'w' 'y' 'x' 'f' 'ap' 'o' 'ay' 'al' 'h' 'z' 'aj' 'd' 'v' 'ak'
 'ba' 'n' 'j' 's' 'af' 'ax' 'at' 'aq' 'av' 'm' 'k' 'a' 'e' 'ai' 'i' 'ag'
 'b' 'am' 'aw' 'as' 'r' 'ao' 'u' 'l' 'c' 'ad' 'au' 'bc' 'g' 'an' 'ae' 'p'
 'bb']
X1: ['v' 'b' 'l' 's' 'aa' 'r' 'a' 'i' 'p' 'c' 'o' 'm' 'z' 'e' 'h' 'w' 'g' 'k'
 'y' 't' 'u' 'd' 'j' 'q' 'n' 'f' 'ab']
X2: ['n' 'ai' 'as' 'ae' 's' 'b' 'e' 'ak' 'm' 'a' 'aq' 'ag' 'r' 'k' 'aj' 'ay'
 'ao' 'an' 'ac' 'af' 'ax' 'h' 'i' 'f' 'ap' 'p' 'au' 't' 'z' 'y' 'aw' 'd'
 'at' 'g' 'am' 'j' 'x' 'ab' 'w' 'q' 'ah' 'ad' 'al' 'av' 'u']
X3: ['f' 'a' 'c' 'e' 'd' 'g' 'b']
X4: ['d' 'b' 'a' 'c']
X5: ['t' 'b' 'a' 'z' 'y' 'x' 'h' 'g' 'f' 'j' 'i' 'd' 'c' 'af' 'ag' 'ab' 'ac'
 'ad' 'ae' 'ah' 'l' 'k' 'n' 'm' 'p' 'q' 's' 'r' 'v' 'w' 'o' 'aa']
X6: ['a' 'g' 'j' 'l' 'i' 'd' 'f' 'h' 'c' 'k' 'e' 'b']
X8: ['w' 'y' 'j' 'n' 'm' 's' 'a' 'v' 'r' 'o' 't' 'h' 'c' 'k' 'p' 'u' 'd' 'g'
 'b' 'q' 'e' 'l' 'f' 'i' 'x']
X10: [0 1]
X12: [0 1]
X13: [0 1]
X14: [0 1]
X15: [0 1]
X16: [0 1]
X17: [0 1]
X18: [

In [13]:
#Task 3:  Apply label encoder.
train_data_new=pd.get_dummies(train_data)
future_data_new=pd.get_dummies(future_data)

In [14]:
train_data_new.head()

Unnamed: 0,y,X10,X12,X13,X14,X15,X16,X17,X18,X19,...,X8_p,X8_q,X8_r,X8_s,X8_t,X8_u,X8_v,X8_w,X8_x,X8_y
0,130.81,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,88.53,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,76.26,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
3,80.62,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,78.02,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
future_data_new.head()

Unnamed: 0,X10,X12,X13,X14,X15,X16,X17,X18,X19,X20,...,X8_p,X8_q,X8_r,X8_s,X8_t,X8_u,X8_v,X8_w,X8_x,X8_y
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
#Setting other column of train data.
train_data_new['other']=np.zeros(len(train_data_new))

In [17]:
#List column names in train data not in  test data
columns=np.setdiff1d(train_data_new.columns, future_data_new.columns)
columns

array(['X0_aa', 'X0_ab', 'X0_ac', 'X0_q', 'X2_aa', 'X2_ar', 'X2_c',
       'X2_l', 'X2_o', 'X5_u', 'other', 'y'], dtype=object)

In [18]:
for i in columns[0:len(columns)-1]:
    future_data_new[i]=np.zeros(len(future_data_new))

In [19]:
#List column names of test data not in  train data
columns=np.setdiff1d(future_data_new.columns, train_data_new.columns)
columns

array(['X0_ae', 'X0_ag', 'X0_an', 'X0_av', 'X0_bb', 'X0_p', 'X2_ab',
       'X2_ad', 'X2_aj', 'X2_ax', 'X2_u', 'X2_w', 'X5_a', 'X5_b', 'X5_t',
       'X5_z'], dtype=object)

In [20]:
#Setting other column of test data.
future_data_new['other']=[1 if (future_data_new.loc[i, columns].sum()!=0) else 0 for i in range(len(future_data_new))]
future_data_new

Unnamed: 0,X10,X12,X13,X14,X15,X16,X17,X18,X19,X20,...,X0_ab,X0_ac,X0_q,X2_aa,X2_ar,X2_c,X2_l,X2_o,X5_u,other
0,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0,0,0,0,0,0,0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0,0,0,1,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0,0,0,1,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,0,0,0,1,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4205,0,0,0,0,0,0,0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4206,0,0,0,1,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4207,0,0,1,1,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [21]:
#Drop columns in test data which is not in train data
future_data_new.drop(columns, axis=1, inplace=True)

In [22]:
#Features and Target
train_feature=train_data_new.drop('y', axis=1)
Target=train_data_new['y']

In [23]:
train_feature.shape, future_data_new.shape

((4209, 552), (4209, 552))

In [24]:
#Splitting the train data
x_train, x_test, y_train, y_test=train_test_split(train_feature, Target, test_size=0.20, random_state=10)
x_train.shape, x_test.shape

((3367, 552), (842, 552))

In [25]:
#Dimensionality Reduction
nmf = NMF(n_components=250, max_iter=3500, init='random', random_state=0).fit(x_train)

In [26]:
train_transform=nmf.transform(x_train)
test_transform=nmf.transform(x_test)
unseen_transform=nmf.transform(future_data_new)
train_transform.shape, test_transform.shape, unseen_transform.shape

((3367, 250), (842, 250), (4209, 250))

In [27]:
params = {'n_estimators': [50, 100, 150, 200], 'learning_rate':[0.1, 0.001, 0.3, 0.05], 'booster':['gbtree', 'gblinear', 'dart'], 'gamma':[0.01, 0.1, 0.3, 0.5]}
gs = GridSearchCV(XGBRegressor(), params, cv=10, n_jobs=-1)
gs.fit(train_transform, y_train)
gs.best_params_

{'booster': 'gbtree', 'gamma': 0.3, 'learning_rate': 0.1, 'n_estimators': 50}

In [32]:
xg_model=XGBRegressor(n_estimators=50, learning_rate=0.1, booster='gbtree', gamma=0.3).fit(train_transform, y_train)

In [33]:
#Prediction
y_pred=xg_model.predict(test_transform)

In [34]:
#R2 Score
r2_score(y_test, y_pred)

0.5426116953990849

In [35]:
#Prediction on future data
y_pred=xg_model.predict(unseen_transform)
y_pred

array([ 88.83326 ,  92.461685,  78.34908 , ...,  89.685326, 111.5986  ,
        92.548645], dtype=float32)