In [1]:
import pickle
from math import sqrt
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor 
import matplotlib.pyplot as plt
# from sklearn.externals import joblib

In [2]:
input=pd.read_csv("./dataset_00_with_header.csv")

In [3]:
# print (input.sample(2))
# print (input.shape)
print (input.dtypes.value_counts())
# print (input.dtypes)
# print (input.isnull().any())
print (input.isnull().any().value_counts())

int64      264
float64     41
dtype: int64
False    264
True      41
dtype: int64


In [4]:
#calculating the corelation
corr_matrix=input.corr(method ='pearson')#['y'][:]
print(corr_matrix["y"].sort_values(ascending=False))

y       1.000000
x235    0.620394
x005    0.575970
x236    0.572875
x022    0.568884
x227    0.537190
x249    0.515316
x226    0.514055
x228    0.512067
x046    0.507595
x244    0.503321
x237    0.499302
x225    0.488159
x025    0.487376
x239    0.487305
x002    0.485744
x023    0.480639
x014    0.474342
x245    0.467735
x246    0.459512
x250    0.447270
x247    0.446191
x262    0.445821
x229    0.443279
x261    0.439065
x260    0.434920
x224    0.432531
x030    0.425470
x027    0.424206
x004    0.419683
          ...   
x276   -0.307428
x099   -0.307428
x277   -0.308842
x172   -0.309105
x036   -0.312055
x278   -0.316596
x056   -0.336567
x173   -0.337363
x168   -0.348083
x063   -0.358450
x304   -0.368138
x293   -0.370417
x297   -0.374772
x162   -0.403556
x064   -0.411101
x065   -0.413885
x059   -0.415029
x253   -0.473823
x148   -0.483754
x302   -0.483852
x155   -0.542090
x242   -0.547844
x287   -0.562191
x058   -0.586811
x057   -0.636510
x041   -0.690840
x067         NaN
x094         N

In [5]:
# because suspicious corelation find the values in the column
print (input["x067"].value_counts())
print (input["x094"].value_counts())
print (input["x095"].value_counts())
print (input["x096"].value_counts())

0    100000
Name: x067, dtype: int64
0    100000
Name: x094, dtype: int64
0    100000
Name: x095, dtype: int64
0    100000
Name: x096, dtype: int64


In [6]:
# columns with more than 75% null values
percent_missing = input.isnull().sum() * 100 / len(input)
missing_value_df = pd.DataFrame({'column_name': input.columns,
                                 'percent_missing': percent_missing})

missing_value_df.sort_values('percent_missing', inplace=True)
nullDeleteList=list(missing_value_df.loc[missing_value_df["percent_missing"]>75]["column_name"])


In [7]:
# add all the features in removal list 

deleteCell=['x067','x094','x095','x096']
# nullDeleteList+deleteCell

nullDeleteList.extend(deleteCell)


In [8]:
col = [c for c in input.columns if c not in nullDeleteList]
numclasses=[]
for c in col:
    numclasses.append(len(np.unique(input[[c]])))

#create categorical features which has only 2-3 values in 
threshold=3
categorical_variables = list(np.array(col)[np.array(numclasses)<=threshold])

In [9]:
# remove all removal columns
input1=input[col]
# find the nunll values in categorical feature if any
len(categorical_variables)
input1[categorical_variables].isnull().any().value_counts()

False    41
dtype: int64

In [10]:
# split the data into train test 
X_train, X_test, train_y, test_y = train_test_split(input1.drop(['y'],1), input1.y , test_size=0.3)
X_test, X_val, test_y, val_y = train_test_split(X_test, test_y , test_size=0.2)

In [11]:
train_cat = X_train[categorical_variables]
test_cat = X_test[categorical_variables]
val_cat=X_val[categorical_variables]
# Do the one Hot encoding of the categorical features

train_cat_encoded = pd.get_dummies(train_cat, columns = train_cat.columns.tolist())
test_cat_encoded = pd.get_dummies(test_cat, columns = test_cat.columns.tolist())
val_cat_encoded = pd.get_dummies(val_cat, columns = val_cat.columns.tolist())

In [12]:
# drop the ids as unique identifier from the contious feature set

train_cont = X_train.drop(categorical_variables,1)
test_cont = X_test.drop(categorical_variables,1)
val_cont = X_val.drop(categorical_variables,1)

X_train.drop(['x001'], 1,inplace=True)
X_test.drop(['x001'], 1,inplace=True)
X_val.drop(['x001'], 1,inplace=True)

train_cont.fillna(train_cont.mean(), inplace=True)
test_cont.fillna(test_cont.mean(), inplace=True)
val_cont.fillna(val_cont.mean(), inplace=True)


In [13]:
# Do normalization of the feature set and learn that norm values(mean and std dev) 
#                                     on train set and apply it on test and train

norm_values= train_cont.agg(["mean","std"])

train_cont_normalized = train_cont.apply(lambda x: (x- x.mean())/x.std())
train_cont_normalized.head()
# pickle.dump(norm_values, open("/data/analytics/norm_values.pkl", 'wb'))
######## normailising using train normalisers 
test_cont_normalized = test_cont - norm_values.loc["mean"]
test_cont_normalized = test_cont_normalized/norm_values.loc["std"]
test_cont_normalized.head()

######## normailising using train normalisers 
val_cont_normalized = val_cont - norm_values.loc["mean"]
val_cont_normalized = val_cont_normalized/norm_values.loc["std"]
val_cont_normalized.head()

Unnamed: 0,x001,x002,x003,x004,x005,x007,x008,x009,x010,x011,...,x289,x290,x291,x292,x293,x294,x296,x297,x302,x303
76483,0.7746,1.376446,-0.057928,0.43232,0.737089,-0.496204,-0.599437,-0.094936,-0.016809,0.056668,...,-0.015118,-0.017308,-0.380129,0.104372,0.013534,-0.148742,-0.36922,0.01629,0.000254,-0.265911
47554,0.429904,-0.031094,-0.007722,-0.024532,-0.666091,-0.496204,-0.599437,-0.589502,0.563374,-0.679617,...,-0.015118,-0.017308,-0.380129,-0.350807,0.013534,-0.148742,-0.36922,0.01629,0.000254,-0.265911
28147,1.513359,-0.031094,-0.007722,-0.024532,-0.583551,-0.496204,-0.599437,-0.589502,-0.596992,-0.679617,...,-0.015118,-0.017308,-0.380129,-0.350807,0.013534,-0.148742,-0.36922,0.01629,0.000254,-0.265911
46327,1.329072,0.329569,3.088769,1.668051,-0.154343,-0.496204,-0.599437,-0.589502,-0.596992,-0.679617,...,5.423473,4.38516,-0.376181,-0.349898,-3.490871,-0.148742,-0.36446,-4.124685,-1.763056,-0.259598
31200,-2.72808,-0.862707,-0.563237,-0.891677,-0.699107,1.641466,8.434803,39.470413,4.044474,-0.311475,...,0.300862,-0.135015,1.57719,0.100034,1.237362,-0.148742,1.990378,1.594966,1.855582,2.864152


In [14]:
# concat the both cat and cont features
train_X= pd.concat([train_cat_encoded,train_cont_normalized],axis =1)
test_X = pd.concat([test_cat_encoded,test_cont_normalized],axis =1)
val_X = pd.concat([val_cat_encoded,val_cont_normalized],axis =1)

In [15]:
# check if any ferature from training set missing in test set and remove them
columns=train_X.columns
missing_cols=set(columns)-set( test_X.columns )     
feature_difference_df = pd.DataFrame(data=np.zeros((test_X.shape[0], len(missing_cols))),
                                 columns=list(missing_cols))

# add "missing" features back to `test
test_X = test_X.join(feature_difference_df)
test_X=test_X[columns]

In [16]:
# check any null values left 
print(test_X.isnull().any().value_counts())
test_X.fillna(0, inplace=True)
train_X.fillna(0, inplace=True)
val_X.fillna(0, inplace=True)

False    332
dtype: int64


In [17]:
############ XGBoost##############
xgb= XGBRegressor(seed=12,
            n_estimators=1000, max_depth=10,nthread =55,learning_rate=0.03001, subsample=0.65, colsample_bytree=0.6
           ,min_child_weight=1, max_delta_step=0, 
                  colsample_bylevel=1, objective="reg:tweedie",
                 reg_alpha=0.8, reg_lambda=1, base_score=0.5,  missing=None
        )

xgb.fit(train_X, train_y)
prediction_xgbp=xgb.predict(test_X)
print("---XgBoost-MSE--", sqrt(mean_squared_error(test_y, prediction_xgbp))  )
print("---XgBoost-R2--", xgb.score(test_X,test_y))

---XgBoost-MSE-- 25.071598959593953
---XgBoost-R2-- 0.955163125418


In [18]:
regressor = LinearRegression()
regressor.fit(train_X, train_y)#predicting the test set results
y_pred = regressor.predict(test_X)

print("---Logistic-MSE--", sqrt(mean_squared_error(test_y, y_pred))  )
print("---Logistic-R2--", regressor.score(test_X,test_y))

---Logistic-MSE-- 47.67096097397547
---Logistic-R2-- 0.837901468394


In [19]:

def scoring(clf):
    scores = cross_val_score(clf, train_X, train_y, cv=15, n_jobs=55, scoring = 'neg_median_absolute_error')
    print (np.median(scores) * -1)
gbr = GradientBoostingRegressor(learning_rate = 0.12,
                                n_estimators = 150,
                                max_depth = 8,
                                min_samples_leaf = 1,
                                random_state = 2)
# clf = GradientBoostingRegressor(n_estimators = 400, max_depth = 5, min_samples_split = 2,
#           learning_rate = 0.1, loss = 'ls')
scoring(gbr)

gbr.fit(train_X, train_y)
prediction_gbr=gbr.predict(test_X)
print("---gbr-MSE--", sqrt(mean_squared_error(test_y, prediction_gbr))  )
print("---gbr-R2--", gbr.score(test_X,test_y))

12.9326783763
---gbr-MSE-- 26.548938672922223
---gbr-R2-- 0.949723435259


In [20]:

# clfN2 = MLPRegressor(solver='adam', alpha=.001,activation ="relu",verbose=True,
#                     hidden_layer_sizes=(128,64,32,16,8), random_state=2,max_iter=5000,learning_rate_init=0.0005)


clfN2 = MLPRegressor(solver='adam', alpha=.001,activation ="relu",
                    hidden_layer_sizes=(128,64,32,16), random_state=2,max_iter=5000)
clfN2.fit(train_X, train_y)
prediction_neuralNet2 = clfN2.predict(test_X)
print("---Neural-MSE--",sqrt(mean_squared_error(test_y, prediction_neuralNet2)))
print("---Neural-R2--", clfN2.score(test_X,test_y))

---Neural-MSE-- 28.88725433440957
---Neural-R2-- 0.940477139013


In [21]:
rfr = RandomForestRegressor(n_estimators = 55,
                            min_samples_leaf = 3,
                            random_state = 2)
scoring(rfr)

rfr.fit(train_X, train_y)
prediction_rfr=rfr.predict(test_X)
print("---rfr-MSE--", sqrt(mean_squared_error(test_y, prediction_rfr))  )
print("---rfr-R2--", rfr.score(test_X,test_y))

13.7536626
---rfr-MSE-- 29.021922675037537
---rfr-R2-- 0.939920870895


In [22]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
kfold = KFold(n_splits=4, random_state=7)
results = cross_val_score(xgb, val_X, val_y, cv=kfold)

In [23]:
results

array([ 0.9391913 ,  0.94386338,  0.93509151,  0.94596736])

In [24]:
# ############ Tried adding polynomial features
# from sklearn.preprocessing import PolynomialFeatures
# poly = PolynomialFeatures(2)
# train_poly=poly.fit_transform(train_X)
# test_poly=poly.fit_transform(test_X)

# regressor = LinearRegression()
# regressor.fit(train_poly, train_y)#predicting the test set results

# y_pred = regressor.predict(test_poly)

# print("---Logistic-MSE--", mean_squared_error(test_y, y_pred))  
# print("---Logistic-R2--", regressor.score(test_poly,test_y))


# ---Logistic-MSE-- 7.99538721465e+22
# ---Logistic-R2-- -5.68078419922e+18

In [25]:

# joblib.dump(xgb, 'model.pkl')
pickle.dump(xgb, open("/data/analytics/anuj/model.pkl", 'wb'))
# gbr = joblib.load('model.pkl')

pickle.dump(norm_values, open("/data/analytics/anuj/norm_values.pkl", 'wb'))
pickle.dump(columns, open("/data/analytics/anuj/columnsTrain.pkl", 'wb'))
pickle.dump(col, open("/data/analytics/anuj/genuineCol.pkl", 'wb'))
pickle.dump(categorical_variables, open("//data/analytics/anuj/catvariables.pkl", 'wb'))

In [26]:
otpt=pd.DataFrame(test_y.values, prediction_xgbp).reset_index()
otpt.columns =['actual','pred']
otpt["diff"]=abs(otpt["actual"]-otpt["pred"])
otpt["class"]=0
otpt.loc[otpt["diff"]<=3.0,"class"]=1
final=otpt["class"].value_counts()*100/otpt.shape[0] 
print("---Accuracy --", final[1])

---Accuracy -- 15.3


In [27]:
#Create a plot that ranks the features by importance.
def plot_importances(model, model_name):
    importances = model.feature_importances_
    std = np.std([model.feature_importances_ for feature in model.estimators_],
                 axis=0)
    indices = np.argsort(importances)[::-1]    

    # Plot the feature importances of the forest
    plt.figure(figsize = (8,5))
    plt.title("Feature importances of " + model_name)
    plt.bar(range(X_train.shape[1]), importances[indices], color="r", align="center")
    plt.xticks(range(X_train.shape[1]), indices)
    plt.xlim([-1, X_train.shape[1]])
    plt.show()



In [28]:
print("Feature ranking:")

i = 0
for feature in train_X:
    print (i, feature)
    i += 1
    

# plot_importances(gbr, "Gradient Boosting Regressor")

Feature ranking:
0 x006_0
1 x006_1
2 x025_0
3 x025_1
4 x026_0
5 x026_1
6 x027_0
7 x027_1
8 x060_0
9 x060_1
10 x082_0
11 x082_1
12 x083_0
13 x083_1
14 x084_0
15 x084_1
16 x085_0
17 x085_1
18 x086_0
19 x086_1
20 x087_0
21 x087_1
22 x088_0
23 x088_1
24 x089_0
25 x089_1
26 x090_0
27 x090_1
28 x091_0
29 x091_1
30 x092_0
31 x092_1
32 x093_0
33 x093_1
34 x147_0
35 x147_1
36 x154_0
37 x154_1
38 x161_0
39 x161_1
40 x180_0
41 x180_1
42 x244_0
43 x244_1
44 x245_0
45 x245_1
46 x246_0
47 x246_1
48 x247_0
49 x247_1
50 x248_0
51 x248_1
52 x249_0
53 x249_1
54 x260_0
55 x260_1
56 x261_0
57 x261_1
58 x262_0
59 x262_1
60 x263_0
61 x263_1
62 x269_0
63 x269_1
64 x270_0
65 x270_1
66 x271_0
67 x271_1
68 x282_0
69 x282_1
70 x283_0
71 x283_1
72 x284_0
73 x284_1
74 x298_0
75 x298_1
76 x299_0
77 x299_1
78 x300_0
79 x300_1
80 x301_0
81 x301_1
82 x001
83 x002
84 x003
85 x004
86 x005
87 x007
88 x008
89 x009
90 x010
91 x011
92 x012
93 x013
94 x014
95 x015
96 x016
97 x017
98 x018
99 x019
100 x020
101 x021
102 x022
10