In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

test = pd.read_csv("test.csv")
train = pd.read_csv("train.csv")

train.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969


In [2]:
new_train = train.replace({
    'M':0,
    'F':1,
    'A':0,
    'B':1,
    'C':2,
    '0-17':0,
    '18-25':1,
    '26-35':2,
    '36-45':3,
    '46-50':4,
    '51-55':5,
    '55+':6,
    '4+':4,
})
new_train=new_train.fillna(0)
new_train.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,1,0,10,0,2,0,3,0.0,0.0,8370
1,1000001,P00248942,1,0,10,0,2,0,1,6.0,14.0,15200
2,1000001,P00087842,1,0,10,0,2,0,12,0.0,0.0,1422
3,1000001,P00085442,1,0,10,0,2,0,12,14.0,0.0,1057
4,1000002,P00285442,0,6,16,2,4,0,8,0.0,0.0,7969


In [3]:
new_train['Stay_In_Current_City_Years'] = new_train['Stay_In_Current_City_Years'].astype(int)

In [4]:
new_train.dtypes

User_ID                         int64
Product_ID                     object
Gender                          int64
Age                             int64
Occupation                      int64
City_Category                   int64
Stay_In_Current_City_Years      int32
Marital_Status                  int64
Product_Category_1              int64
Product_Category_2            float64
Product_Category_3            float64
Purchase                        int64
dtype: object

In [5]:
feature_cols=[
    'Gender',
    'Age',
    'Occupation',
    'City_Category',
    'Stay_In_Current_City_Years',
    'Marital_Status',
    'Product_Category_1',
    'Product_Category_2',
    'Product_Category_3'
]
X=new_train[feature_cols]
y=new_train['Purchase']

**Linear Regression**

In [6]:
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test=train_test_split(X,y)



In [7]:
from sklearn.linear_model import LinearRegression
linreg = LinearRegression()
linreg.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [8]:
print(linreg.intercept_)
print(linreg.coef_)
zip(feature_cols, linreg.coef_)

9959.209084709943
[-475.39760956  112.20363357    5.35091458  315.96915564    8.15205396
  -43.08421468 -346.45285475   13.23325314  144.8334964 ]


<zip at 0x16dd016d248>

In [9]:
y_pred=linreg.predict(X_test)
df=pd.DataFrame({
    'Actual':y_test,
    'Predicted':y_pred
})
df.head()

Unnamed: 0,Actual,Predicted
140467,11941,12190.768102
283697,8066,13010.868516
144234,12081,10677.556294
497586,15437,9750.501704
36142,775,6237.237522


In [10]:
from sklearn import metrics
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))

Mean Absolute Error: 3530.6176555047005
Mean Squared Error: 21357527.999469902


In [11]:
def rmse(y,ypred):
    return(np.sqrt(sum((y-ypred)*(y-ypred))/len(y)))

In [12]:
print('Root Mean Squared Error:',rmse(y_test,y_pred))

Root Mean Squared Error: 4621.420560765885


**Decision Tree**

In [13]:
from sklearn.tree import DecisionTreeRegressor
DT = DecisionTreeRegressor()
DT.fit(X_train, y_train)
y_pred = DT.predict(X_test)
df = pd.DataFrame({
    'Actual':y_test,
    'Predicted':y_pred
})
df.head()

Unnamed: 0,Actual,Predicted
140467,11941,15735.0
283697,8066,11613.5
144234,12081,16523.0
497586,15437,15440.0
36142,775,775.0


In [14]:
from sklearn import metrics
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

RMSE: 3338.2847459247278


**XGBoost**

In [15]:
from xgboost import XGBRegressor
XG = XGBRegressor()
XG.fit(X_train, y_train)
y_pred = XG.predict(X_test)
df = pd.DataFrame({
    'Actual':y_test,
    'Predicted':y_pred
})
df.head()

Unnamed: 0,Actual,Predicted
140467,11941,13347.441406
283697,8066,14160.706055
144234,12081,15198.211914
497586,15437,13009.420898
36142,775,1066.083618


In [16]:
from sklearn import metrics
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

RMSE: 2980.712809014717


**Predictions**

In [17]:
new_test = test.replace({
    'M':0,
    'F':1,
    'A':0,
    'B':1,
    'C':2,
    '0-17':0,
    '18-25':1,
    '26-35':2,
    '36-45':3,
    '46-50':4,
    '51-55':5,
    '55+':6,
    '4+':4,
})
new_test=new_test.fillna(0)
new_test.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3
0,1000004,P00128942,0,4,7,1,2,1,1,11.0,0.0
1,1000009,P00113442,0,2,17,2,0,0,3,5.0,0.0
2,1000010,P00288442,1,3,1,1,4,1,5,14.0,0.0
3,1000010,P00145342,1,3,1,1,4,1,4,9.0,0.0
4,1000011,P00053842,1,2,1,2,1,0,4,5.0,12.0


In [19]:
new_test['Stay_In_Current_City_Years'] = new_test['Stay_In_Current_City_Years'].astype(int)

In [20]:
Xtest=new_test[feature_cols]

In [21]:
y_sol=XG.predict(Xtest)

In [22]:
y_sol

array([13140.55  ,  9989.979 ,  6102.2676, ..., 12993.105 , 19526.055 ,
        2590.2712], dtype=float32)

In [24]:
solution = pd.DataFrame({
        "User_ID": test["User_ID"],
        "Product_ID": test["Product_ID"],
         "Purchase" : y_sol
    })

In [25]:
solution.head()

Unnamed: 0,User_ID,Product_ID,Purchase
0,1000004,P00128942,13140.549805
1,1000009,P00113442,9989.978516
2,1000010,P00288442,6102.267578
3,1000010,P00145342,3496.717041
4,1000011,P00053842,2916.094971


In [26]:
solution.to_csv("solution.csv", index=False)