In [1]:
# Generic inputs for most ML tasks
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn import tree
#import graphviz
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier

pd.options.display.float_format = '{:,.2f}'.format

# setup interactive notebook mode
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from IPython.display import display, HTML

In [2]:
proj_data = pd.read_csv("data_set.csv")

proj_data.head()

Unnamed: 0,item_sku_id,dc_id,date,quantity,vendibility,original_price,discount
0,6,4,2017-03-12,0.0,1.0,,
1,6,4,2017-04-29,0.0,1.0,,
2,6,4,2017-09-01,0.0,1.0,,
3,6,4,2017-09-13,0.0,1.0,,
4,6,4,2017-12-01,1.0,1.0,0.01,10.0


In [3]:
proj_data.isna().sum()

item_sku_id           0
dc_id                 0
date                  0
quantity              0
vendibility           0
original_price    34036
discount          34036
dtype: int64

In [4]:
new_proj_data_drooped = proj_data.drop(columns = ['original_price', 'discount','date'])

new_proj_data_droped.head()

Unnamed: 0,item_sku_id,dc_id,quantity,vendibility
0,6,4,0.0,1.0
1,6,4,0.0,1.0
2,6,4,0.0,1.0
3,6,4,0.0,1.0
4,6,4,1.0,1.0


In [5]:
new_proj_data_droped.isna().sum()

item_sku_id    0
dc_id          0
quantity       0
vendibility    0
dtype: int64

In [6]:
#spliting test and train data
X_train, X_test, y_train, y_test = train_test_split(new_proj_data_droped.drop(columns=['item_sku_id']), new_proj_data_droped['item_sku_id'],
                                                    test_size=0.2,random_state=26)

X_train
X_test
y_train
y_test

Unnamed: 0,dc_id,quantity,vendibility
42133,1,2.00,1.00
34105,2,1.00,1.00
68817,0,0.00,1.00
268,4,0.00,1.00
29497,2,1.00,1.00
...,...,...,...
24275,1,0.00,1.00
58470,0,2.00,1.00
55046,2,1.00,0.00
30014,0,8.00,1.00


Unnamed: 0,dc_id,quantity,vendibility
23073,1,0.00,1.00
54386,4,53.00,1.00
33004,1,6.00,0.00
26880,3,0.00,0.00
65278,1,0.00,0.00
...,...,...,...
33124,1,8.00,1.00
69203,1,2.00,1.00
20584,2,0.00,0.00
26091,4,0.00,0.00


42133    26
34105     6
68817    23
268      21
29497    23
         ..
24275     8
58470    28
55046    28
30014    10
54069    37
Name: item_sku_id, Length: 57326, dtype: int64

23073     8
54386     7
33004    20
26880    38
65278    23
         ..
33124    13
69203    32
20584    17
26091     8
17424    36
Name: item_sku_id, Length: 14332, dtype: int64

In [7]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = pd.DataFrame(sc.fit_transform(X_train), columns = X_train.columns)
X_test = pd.DataFrame(sc.transform(X_test), columns = X_test.columns)
X_train
X_test
y_train
y_test

Unnamed: 0,dc_id,quantity,vendibility
0,-0.71,-0.19,0.46
1,-0.07,-0.22,0.46
2,-1.35,-0.26,0.46
3,1.21,-0.26,0.46
4,-0.07,-0.22,0.46
...,...,...,...
57321,-0.71,-0.26,0.46
57322,-1.35,-0.19,0.46
57323,-0.07,-0.22,-2.19
57324,-1.35,0.02,0.46


Unnamed: 0,dc_id,quantity,vendibility
0,-0.71,-0.26,0.46
1,1.21,1.61,0.46
2,-0.71,-0.05,-2.19
3,0.57,-0.26,-2.19
4,-0.71,-0.26,-2.19
...,...,...,...
14327,-0.71,0.02,0.46
14328,-0.71,-0.19,0.46
14329,-0.07,-0.26,-2.19
14330,1.21,-0.26,-2.19


42133    26
34105     6
68817    23
268      21
29497    23
         ..
24275     8
58470    28
55046    28
30014    10
54069    37
Name: item_sku_id, Length: 57326, dtype: int64

23073     8
54386     7
33004    20
26880    38
65278    23
         ..
33124    13
69203    32
20584    17
26091     8
17424    36
Name: item_sku_id, Length: 14332, dtype: int64

In [8]:
clf = DecisionTreeClassifier(random_state=50)

clf = clf.fit(X_train, y_train) 

In [9]:
test_output = pd.DataFrame(clf.predict(X_test), index = X_test.index, columns = ['pred_item_sku_id'])
test_output.head()

Unnamed: 0,pred_item_sku_id
0,2
1,7
2,4
3,38
4,6


In [10]:
test_output = test_output.merge(y_test, left_index = True, right_index = True)
test_output.head()
mean_absolute_error = abs(test_output['pred_item_sku_id'] - test_output['item_sku_id']).mean()
print('Mean absolute error is ')
print(mean_absolute_error)

Unnamed: 0,pred_item_sku_id,item_sku_id
25,32,2
26,37,2
34,21,4
37,21,4
41,2,13


Mean absolute error is 
13.698506425842305


In [11]:
abs(test_output['pred_item_sku_id'] - test_output['item_sku_id']).mean()/test_output['item_sku_id'].mean()


0.6196851136042236

In [12]:
clf.score(X_test, y_test)

0.15371197320680993

In [13]:
student_data = pd.read_csv("student_test.csv")
student_data.head()

new_student_data = student_data[['dc_id','quantity','vendibility']].copy()
new_student_data.head()

Unnamed: 0,serial_number,dc_id,date,quantity,vendibility,original_price,discount
0,0,0,12/23/17,0,0,,
1,1,3,12/30/17,0,1,,
2,2,0,12/18/17,0,0,,
3,3,3,12/18/17,0,0,,
4,4,1,12/25/17,5,0,0.0,2.87


Unnamed: 0,dc_id,quantity,vendibility
0,0,0,0
1,3,0,1
2,0,0,0
3,3,0,0
4,1,5,0


In [14]:
student_test = pd.DataFrame(sc.transform(new_student_data), columns = new_student_data.columns)

In [16]:
pred = clf.predict(student_test)

In [17]:
print(pred)

[38 32 38 38 27  4  2  2 32 13 29 39  2 26  3  7  6 23  2 38 38  7 38  2
 32  2 19 28  4 38  7 38  6 23 32 32  6 21 21 38 32 21  2 21  2  2  2 38
 29  8 39 14 38  2 29 13  7 32  2 32  6  7  7  2 22  7 38  7 26 38  2  2
 40 32 21 32  2  6  4  6  2  2  7  2  7 21 38 40  8 38 28 38 32 34  2  6
 32  8  7 38 13 21 21 32 18 38  8  2 28 26  2 34 21  2 32 38 23 38 32 28
  6 23  8  4 21  4 23 38 40 29  6  2 28  8  7 29  8 29  7  7  7 23 38 21
 32  7 32  7  2 22  6 21 13  2  7 13 32  2 38 32  8 38 32  2  7  2  2 32
 32 28  8 18 20 22 23  8 34 32 34  8 21 26  6  7  7  7  2  2 32 37  6  2
 38  2 32  2 26  7  7 38 32 19 32  8 38  2 32 34 21 13 32  2 23  4 38 38
 38 21 26  6 32 34  7 32  2  2  7 26  4  8  2 23 32 29 18  2 38 32 34 38
 19 28 38 32  7  2 29 23  8  6  2  8 26 10  8 21 21 23 28  2 32 38  6 13
  2 40 29 32 38 32 38 18 21 34  6 40 28  2 34  8 26  2  2  6  7  6 39  2
 38 34 29 32 38 22 22  2 38 28 26  2  7 18 38  7  3 32 32 29 18 26  7  6
  7 13  6  6  6  6 22  2  6 29 32 32 29  7  8 18 29

In [20]:
student_data = pd.read_csv("student_test.csv")
student_data = student_data[['serial_number']]
student_data.head()

Unnamed: 0,serial_number
0,0
1,1
2,2
3,3
4,4


In [21]:
df2=student_data.assign(prediction_sku = pred)
df2.head()

Unnamed: 0,serial_number,prediction_sku
0,0,38
1,1,32
2,2,38
3,3,38
4,4,27


In [22]:
df2.to_csv('pred_sku.csv')