In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor
from sklearn.metrics import mean_squared_error



In [2]:
# Load the training and test CSVs from the current working directory.
train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')
# Show the first training rows.
train.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969


Concatenating the test and train data

In [3]:
# Stack train and test row-wise so both receive identical preprocessing.
# Rows [0, train.shape[0]) of the result are the training rows; later cells
# rely on that ordering when they slice `input` positionally.
frames = [train, test]
input = pd.concat(frames)  # NOTE(review): `input` shadows the Python builtin of the same name

print (input.shape)
input.head()  # not the last expression of the cell, so nothing is rendered for it
test.shape

(783667, 12)


(233599, 11)

In [4]:
# Replace every missing value with the sentinel 999. This also fills the
# Purchase column for the rows that came from `test`, which have no label.
input = input.fillna(999)

In [5]:
# Preview the frame after the 999 fill.
input.head()

Unnamed: 0,Age,City_Category,Gender,Marital_Status,Occupation,Product_Category_1,Product_Category_2,Product_Category_3,Product_ID,Purchase,Stay_In_Current_City_Years,User_ID
0,0-17,A,F,0,10,3,999.0,999.0,P00069042,8370.0,2,1000001
1,0-17,A,F,0,10,1,6.0,14.0,P00248942,15200.0,2,1000001
2,0-17,A,F,0,10,12,999.0,999.0,P00087842,1422.0,2,1000001
3,0-17,A,F,0,10,12,14.0,999.0,P00085442,1057.0,2,1000001
4,55+,C,M,0,16,8,999.0,999.0,P00285442,7969.0,4+,1000002


In [6]:
# Pull out the label column (covers train and test rows alike; only the
# first train.shape[0] entries are used as labels later on).
target = input["Purchase"]

In [7]:
# Materialise the labels as a NumPy array so they can be sliced by position.
target = np.array(target)

In [8]:
# Remove the label from the feature frame now that it lives in `target`.
input = input.drop("Purchase", axis=1)

In [9]:
# Convert every column to string so that all features can be label-encoded
# uniformly in the next step. `astype(str)` is used instead of the
# element-wise `applymap(str)`, which is deprecated in recent pandas releases.
input = input.astype(str)
input.dtypes

Age                           object
City_Category                 object
Gender                        object
Marital_Status                object
Occupation                    object
Product_Category_1            object
Product_Category_2            object
Product_Category_3            object
Product_ID                    object
Stay_In_Current_City_Years    object
User_ID                       object
dtype: object

In [10]:
# Keep a pandas copy of the all-string feature frame before `input` is
# rebound to a NumPy array below.
# NOTE(review): `input_pd` is not referenced again in the cells visible here.
input_pd = input.copy()

In [11]:
# Label-encode every column: each distinct string in a column is replaced by
# an integer code from a LabelEncoder fitted on that column alone.
# NOTE(review): the values are strings at this point, so numeric-looking
# columns are presumably coded in lexicographic order ("10" before "2").

input = np.array(input)

for col_idx in range(input.shape[1]):
    column_values = list(input[:, col_idx])
    input[:, col_idx] = preprocessing.LabelEncoder().fit_transform(column_values)

In [12]:
# The encoded codes were written into the array produced by np.array(input);
# cast the whole feature matrix to an integer dtype for the models.
input = input.astype(int)

In [13]:
# Load the sample submission file; its columns are overwritten with
# predictions and IDs before each submission CSV is written.
submission=pd.read_csv('Sample_Submission_Tm9Lura.csv')

# Applying the xgboost model.
i) Parameter "min_child_weight" is used to control over-fitting. Higher values prevent a model from learning relations which might be highly specific to the particular sample selected for a tree. It defines the minimum sum of weights of all observations required in a child.
ii) Parameter "subsample" denotes the fraction of observations to be randomly sampled for each tree. Lower values make the algorithm more conservative and prevent overfitting, but values that are too small might lead to underfitting. Typical values: 0.5-1.
iii) Parameter "colsample_bytree" denotes the fraction of columns to be randomly sampled for each tree.
iv) Parameter "silent" is 1 so that no running messages will be printed.
v) Parameter "nthread" is used for parallel processing; the number of cores in the system to be used should be entered.
vi) Parameter "objective" is reg:linear here.
vii) Parameter "eta" is analogous to learning rate and makes the model more robust by shrinking weight at each step.
viii) Parameter "eval_metric" is rmse here.
ix) Parameter "seed" can be used for generating reproducible results and also for parameter tuning.
x) Parameter "max_depth" used to control over-fitting as higher depth will allow model to learn relations very specific to a particular sample.

In [14]:
# Hyper-parameters of the baseline XGBoost regressor (depth 6, 3000 rounds).
# "gamma" is deliberately left unset.
# NOTE(review): "silent" and "reg:linear" are older XGBoost spellings, and
# "scale_pos_weight" is normally a classification setting - confirm against
# the installed XGBoost version.
params = {
    "min_child_weight": 10,
    "subsample": 0.7,
    "colsample_bytree": 0.7,
    "scale_pos_weight": 0.8,
    "silent": 1,
    "max_depth": 6,
    "nthread": 6,
    "objective": "reg:linear",
    "eta": 0.1,
    "base_score": 1800,
    "eval_metric": "rmse",
    "seed": 0,
}

# xgb.train is handed the parameters as a list of (name, value) pairs.
plst = [(name, value) for name, value in params.items()]
num_rounds = 3000

In [15]:
# Fit the baseline booster on the training rows only, i.e. the first
# train.shape[0] rows of the stacked feature matrix and label vector.
n_train_rows = train.shape[0]
xgtrain = xgb.DMatrix(input[:n_train_rows], label=target[:n_train_rows])

# Built here but not handed to xgb.train, so no per-round evaluation runs.
watchlist = [(xgtrain, 'train')]

model_1_xgboost = xgb.train(plst, xgtrain, num_rounds)

In [16]:
# Score the test rows: everything after the first train.shape[0] rows of `input`.
test_dmatrix = xgb.DMatrix(input[train.shape[0]:])
model_1_predict = model_1_xgboost.predict(test_dmatrix)

# Any negative prediction is overwritten with 25.
model_1_predict[model_1_predict < 0] = 25

# Fill the submission frame with predictions and the test identifiers, then save it.
submission["Purchase"] = model_1_predict
submission["User_ID"] = test["User_ID"]
submission["Product_ID"] = test["Product_ID"]
submission.to_csv("sub1.csv", index=False)


In [17]:
# Preview the submission frame that was just written to sub1.csv.
submission.head()

Unnamed: 0,User_ID,Product_ID,Purchase
0,1000004,P00128942,15122.963867
1,1000009,P00113442,10431.894531
2,1000010,P00288442,6955.890137
3,1000010,P00145342,3446.657227
4,1000011,P00053842,968.13446


In [18]:
# Number of training rows; this is the boundary between train and test rows in `input`.
train.shape[0]

550068

# Preparing data for stacking model
Split the training dataset into two parts. First-level models are fit on the first part and then create meta-features on the second part, which are fed into a second-level model.

In [19]:
# Choose the row positions of the first-stage half of the training data.
# A seeded permutation is used so that (a) no row is drawn twice, which
# `randint` sampling with replacement allowed, (b) the split is reproducible
# across kernel restarts, and (c) the removed `np.int` alias is not needed.
n_train_rows = train.shape[0]
first_stage_rows = np.random.RandomState(0).permutation(n_train_rows)[: n_train_rows // 2]

In [20]:
# Split the training rows into a first-stage set (used to fit the base models)
# and a disjoint second-stage set (used to fit the stacker).
train_np = input[:train.shape[0], :]
target_np = target[:train.shape[0]]

# Negating an integer index array (`-first_stage_rows`) selects rows counted
# from the end of the array, not the rows that were left out, so it lets
# first-stage rows leak into the second stage. A boolean membership mask gives
# the true complement, and it also collapses any repeated positions in
# `first_stage_rows` to a single row.
in_first_stage = np.zeros(train_np.shape[0], dtype=bool)
in_first_stage[first_stage_rows] = True

train_fs = train_np[in_first_stage]
target_fs = target_np[in_first_stage]
train_ss = train_np[~in_first_stage]
target_ss = target_np[~in_first_stage]

In [None]:
# Sanity-check the sizes of the first-stage and second-stage features and labels.
print (train_fs.shape, target_fs.shape, train_ss.shape, target_ss.shape)

(275034, 11) (275034,) (275034, 11) (275034,)


# Four different models of xgboost with altering max_depth and num_rounds 

In [None]:
# Fit four first-stage XGBoost regressors on the first-stage half. They share
# every hyper-parameter except tree depth and the number of boosting rounds.
xgtrain = xgb.DMatrix(train_fs, label=target_fs)
watchlist = [(xgtrain, 'train')]  # built but not passed to xgb.train

# (max_depth, num_rounds) for Model 1, Model 2, Model 3 and Model 4, in that order.
depth_and_rounds = [(6, 3000), (8, 1420), (10, 1200), (12, 800)]

first_stage_boosters = []
for max_depth, num_rounds in depth_and_rounds:
    # "gamma" is deliberately left unset for all four models.
    params = {
        "min_child_weight": 10,
        "subsample": 0.7,
        "colsample_bytree": 0.7,
        "scale_pos_weight": 0.8,
        "silent": 1,
        "max_depth": max_depth,
        "nthread": 6,
        "objective": "reg:linear",
        "eta": 0.1,
        "base_score": 1800,
        "eval_metric": "rmse",
        "seed": 0,
    }
    plst = list(params.items())
    first_stage_boosters.append(xgb.train(plst, xgtrain, num_rounds))

# Model 1: 6/3000, Model 2: 8/1420, Model 3: 10/1200, Model 4: 12/800
model_1, model_2, model_3, model_4 = first_stage_boosters

# Three different models of ExtraTreesRegressor with altering n_estimators and max_depth

In [None]:
# This set of models will be ExtraTrees.
# ExtraTreesRegressor comes from sklearn.ensemble and has to be imported in
# the notebook's import cell for this cell to run on a fresh kernel.

# Settings shared by all three forests; only n_estimators and max_depth vary.
# bootstrap=True is kept together with oob_score=True because out-of-bag
# scoring needs bootstrap sampling.
extra_trees_common = dict(
    min_samples_split=10,
    min_samples_leaf=10,
    oob_score=True,
    n_jobs=6,
    random_state=123,
    verbose=1,
    bootstrap=True,
)

# Model 5: 8/1450  (max_depth / n_estimators)
model_5 = ExtraTreesRegressor(n_estimators=1450, max_depth=8, **extra_trees_common)
model_5.fit(train_fs, target_fs)

# Model 6: 6/3000
model_6 = ExtraTreesRegressor(n_estimators=3000, max_depth=6, **extra_trees_common)
model_6.fit(train_fs, target_fs)

# Model 7: 12/800
model_7 = ExtraTreesRegressor(n_estimators=800, max_depth=12, **extra_trees_common)
model_7.fit(train_fs, target_fs)

# Similarly, three different models of RandomForest with altering n_estimators and max_depth.

In [None]:
# This set of models will be RandomForest.
# RandomForestRegressor comes from sklearn.ensemble and has to be imported in
# the notebook's import cell for this cell to run on a fresh kernel.

# Settings shared by all three forests; only n_estimators and max_depth vary.
random_forest_common = dict(
    oob_score=True,
    n_jobs=6,
    random_state=123,
    min_samples_split=10,
    min_samples_leaf=10,
)

# Model 8: 6/3000  (max_depth / n_estimators)
model_8 = RandomForestRegressor(n_estimators=3000, max_depth=6, **random_forest_common)
model_8.fit(train_fs, target_fs)

# Model 9: 8/1500
model_9 = RandomForestRegressor(n_estimators=1500, max_depth=8, **random_forest_common)
model_9.fit(train_fs, target_fs)

# Model 10: 12/800
model_10 = RandomForestRegressor(n_estimators=800, max_depth=12, **random_forest_common)
model_10.fit(train_fs, target_fs)

# Predicting on the next level of training dataset with all the 10 models separately

In [None]:
# Produce the meta-features: every first-stage model scores the second-stage rows.
# The boosters need a DMatrix; the scikit-learn forests take the array directly.
second_stage_dmatrix = xgb.DMatrix(train_ss)

model_1_predict, model_2_predict, model_3_predict, model_4_predict = [
    booster.predict(second_stage_dmatrix)
    for booster in (model_1, model_2, model_3, model_4)
]

(model_5_predict, model_6_predict, model_7_predict,
 model_8_predict, model_9_predict, model_10_predict) = [
    forest.predict(train_ss)
    for forest in (model_5, model_6, model_7, model_8, model_9, model_10)
]

# Concatenating all the models ( stacking them ) with numpy vstack function and concatenate.

In [None]:
# Append the ten prediction vectors as extra columns to the right of the
# original second-stage features (one column per model, in model order).
first_stage_outputs = (
    model_1_predict, model_2_predict, model_3_predict, model_4_predict,
    model_5_predict, model_6_predict, model_7_predict, model_8_predict,
    model_9_predict, model_10_predict,
)
train_ss_w_meta = np.column_stack((train_ss,) + first_stage_outputs)

# Applying the new and final xgboost model on the concatenated stack training data

In [None]:
# Hyper-parameters of the second-stage (stacking) XGBoost regressor:
# depth 8, 1400 boosting rounds. "gamma" is deliberately left unset.
params = {
    "min_child_weight": 10,
    "subsample": 0.7,
    "colsample_bytree": 0.7,
    "scale_pos_weight": 0.8,
    "silent": 1,
    "max_depth": 8,
    "nthread": 6,
    "objective": "reg:linear",
    "eta": 0.1,
    "base_score": 1800,
    "eval_metric": "rmse",
    "seed": 0,
}

# xgb.train is handed the parameters as a list of (name, value) pairs.
plst = [(name, value) for name, value in params.items()]
num_rounds = 1400

In [None]:
# Fit the stacker on the second-stage rows, whose features now include the
# ten first-stage prediction columns.
xgtrain = xgb.DMatrix(data=train_ss_w_meta, label=target_ss)

# Built here but not handed to xgb.train, so no per-round evaluation runs.
watchlist = [(xgtrain, 'train')]

model_ss_xgboost = xgb.train(plst, xgtrain, num_rounds)

# Applying the 10 models on the test data in the similar manner

In [None]:
# Build the same meta-features for the test rows (everything after the first
# train.shape[0] rows of `input`), using the already-fitted first-stage models.
test_np = input[train.shape[0]:, :]
test_dmatrix = xgb.DMatrix(test_np)

model_1_predict, model_2_predict, model_3_predict, model_4_predict = [
    booster.predict(test_dmatrix)
    for booster in (model_1, model_2, model_3, model_4)
]

(model_5_predict, model_6_predict, model_7_predict,
 model_8_predict, model_9_predict, model_10_predict) = [
    forest.predict(test_np)
    for forest in (model_5, model_6, model_7, model_8, model_9, model_10)
]

# Original test features first, then one prediction column per model, in model order.
test_meta_columns = np.vstack((
    model_1_predict, model_2_predict, model_3_predict, model_4_predict,
    model_5_predict, model_6_predict, model_7_predict, model_8_predict,
    model_9_predict, model_10_predict,
)).T
test_ss_w_meta = np.concatenate((test_np, test_meta_columns), axis=1)

# The last xgboost model, which was trained on the concatenated training data, now predicts on the concatenated stacked test data

In [None]:

# Score the stacked test matrix with the second-stage model.
model_ss_predict = model_ss_xgboost.predict(xgb.DMatrix(test_ss_w_meta))

# Apply the same floor the baseline submission uses: a negative prediction
# is overwritten with 25.
model_ss_predict[model_ss_predict < 0] = 25

# Overwrite the prediction column and write the stacked submission.
submission["Purchase"] = model_ss_predict
submission.to_csv("sub2.csv", index=False)