# Machine Learning Documentation

## Load Data

In [1]:
#import relevant packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
#loading data
# Read the raw order records; the path is relative, so it is resolved against
# the kernel's current working directory.
df = pd.read_csv('MockDataV1.csv')
# Preview the first rows to check that the file parsed into the expected columns.
df.head()

Unnamed: 0,Index,TempIndex,ItemNumber,StationeryType,StationeryName,ReorderLevel,ReorderQty,UOM,Price,SupplierName,Dates,Time,QuantityOrdered,TotalPrice
0,1,64,P044,Pen,Pencil 4H,100,50,Dozen,$2.30,APLHA,"Monday, January 2, 2017",17:00:07,70,$161.00
1,2,46,P015,Pad,"Pad Postit Memo 2""x4""",100,60,Packet,$6.20,ISS,"Monday, January 2, 2017",14:30:46,60,$372.00
2,3,78,S022,Stapler,Stapler No. 28,50,20,Box,$3.60,APLHA,"Monday, January 2, 2017",13:27:54,36,$129.60
3,4,76,S020,Stapler,Stapler No. 28,50,20,Each,$3.60,CHEAP,"Monday, January 2, 2017",13:42:54,22,$79.20
4,5,52,P032,Pen,Pen Ballpoint Red,100,50,Dozen,$2.50,CHEAP,"Monday, January 2, 2017",11:38:24,85,$212.50


In [3]:
#checking datatypes
# Price and TotalPrice come back as object (text) columns - their values carry
# a '$' prefix - so they need converting before any numeric use.
df.dtypes

Index               int64
TempIndex           int64
ItemNumber         object
StationeryType     object
StationeryName     object
ReorderLevel        int64
ReorderQty          int64
UOM                object
Price              object
SupplierName       object
Dates              object
Time               object
QuantityOrdered     int64
TotalPrice         object
dtype: object

## Logistic Regression

### Data Pre-processing

In [14]:
#adding a new column(TotalPrice_new) for converting TotalPrice into datatype float64.
# TotalPrice is text such as '$161.00'. Strip the currency symbol and any
# thousands separators with the vectorised .str accessor, then cast to float64.
# regex=False forces a literal match ('$' is a regex metacharacter).
# Compared with a per-row str.replace through apply(), missing values stay NaN
# instead of raising AttributeError.
df['TotalPrice_new'] = (
    df['TotalPrice']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype(np.float64)
)
# Show a preview only, rather than rendering the whole frame.
df.head(10)

Unnamed: 0,Index,TempIndex,ItemNumber,StationeryType,StationeryName,ReorderLevel,ReorderQty,UOM,Price,SupplierName,Dates,Time,QuantityOrdered,TotalPrice,TotalPrice_new
0,1,64,P044,Pen,Pencil 4H,100,50,Dozen,$2.30,APLHA,"Monday, January 2, 2017",17:00:07,70,$161.00,161.0
1,2,46,P015,Pad,"Pad Postit Memo 2""x4""",100,60,Packet,$6.20,ISS,"Monday, January 2, 2017",14:30:46,60,$372.00,372.0
2,3,78,S022,Stapler,Stapler No. 28,50,20,Box,$3.60,APLHA,"Monday, January 2, 2017",13:27:54,36,$129.60,129.6
3,4,76,S020,Stapler,Stapler No. 28,50,20,Each,$3.60,CHEAP,"Monday, January 2, 2017",13:42:54,22,$79.20,79.2
4,5,52,P032,Pen,Pen Ballpoint Red,100,50,Dozen,$2.50,CHEAP,"Monday, January 2, 2017",11:38:24,85,$212.50,212.5
5,6,87,T024,Tparency,Transparency Reverse Blue,100,200,Box,$1.40,FANTASTIC,"Monday, January 2, 2017",13:35:41,220,$308.00,308.0
6,7,73,S010,Shorthand,Shorthand Book (100 pg),100,80,Each,$0.50,FANTASTIC,"Monday, January 2, 2017",14:46:24,128,$64.00,64.0
7,8,82,T003,Tacks,Thumb Tacks Small,10,10,Box,$1.00,LOTCASH,"Tuesday, January 3, 2017",17:00:16,12,$12.00,12.0
8,9,63,P043,Pen,Pencil 2B with Eraser End,100,50,Dozen,$1.80,FANTASTIC,"Tuesday, January 3, 2017",11:41:32,75,$135.00,135.0
9,10,6,C006,Clip,Clips Paper Small,50,30,Box,$1.20,BANES,"Tuesday, January 3, 2017",11:08:43,60,$72.00,72.0


In [6]:
#extract the necessary columns
# Select by column name instead of by position so the selection keeps working
# if the CSV gains or reorders columns. The order below is the same as the
# positional selection [3, 4, 9, 12, 13, 14] it replaces, so later positional
# lookups on df_extracted are unaffected.
# TotalPrice_new must already have been added to df by the conversion cell.
# .copy() makes df_extracted an independent frame, so adding columns to it
# later does not trigger pandas' SettingWithCopyWarning.
df_extracted = df[[
    'StationeryType',
    'StationeryName',
    'SupplierName',
    'QuantityOrdered',
    'TotalPrice',
    'TotalPrice_new',
]].copy()
df_extracted.head()

Unnamed: 0,StationeryType,StationeryName,SupplierName,QuantityOrdered,TotalPrice,TotalPrice_new
0,Pen,Pencil 4H,APLHA,70,$161.00,161.0
1,Pad,"Pad Postit Memo 2""x4""",ISS,60,$372.00,372.0
2,Stapler,Stapler No. 28,APLHA,36,$129.60,129.6
3,Stapler,Stapler No. 28,CHEAP,22,$79.20,79.2
4,Pen,Pen Ballpoint Red,CHEAP,85,$212.50,212.5


In [7]:
#list down all the stationery types
# Distinct StationeryType labels in order of first appearance. A label's
# position in this array, plus one, is the integer code it is given by
# Trans_StationeryType.
# Bracket access is used so the lookup cannot collide with a DataFrame
# attribute. No bare `TypeArray` expression follows: mid-cell expressions are
# not rendered, only the last expression of a cell is.
TypeArray = df_extracted['StationeryType'].unique()

#replace stationery type name with an integer 
def Trans_StationeryType(x, types=None):
    """Return the 1-based integer code of stationery type ``x``.

    Parameters
    ----------
    x : object
        Stationery type label to encode.
    types : sequence, optional
        Ordered collection of known labels. When omitted, the notebook-level
        ``TypeArray`` is used, which keeps existing one-argument calls working.

    Returns
    -------
    int or None
        Position (counted from 1) of the first entry of ``types`` that equals
        ``x``; ``None`` when no entry matches.
    """
    if types is None:
        # Resolved at call time so the function always sees the current array.
        types = TypeArray
    for code, known in enumerate(types, start=1):
        if x == known:
            return code
    # Unknown label: made explicit instead of falling off the end of the function.
    return None

#adding a new column for the stationery type in terms of integer
# assign() builds a new frame with the extra column instead of writing into the
# existing one. In-place column assignment on a frame that was sliced from
# another frame makes pandas emit SettingWithCopyWarning; rebinding the name to
# the assign() result avoids that and keeps the cell safe to re-run.
df_extracted = df_extracted.assign(
    TransStationeryType=df_extracted['StationeryType'].apply(Trans_StationeryType)
)
df_extracted.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


Unnamed: 0,StationeryType,StationeryName,SupplierName,QuantityOrdered,TotalPrice,TotalPrice_new,TransStationeryType
0,Pen,Pencil 4H,APLHA,70,$161.00,161.0,1
1,Pad,"Pad Postit Memo 2""x4""",ISS,60,$372.00,372.0,2
2,Stapler,Stapler No. 28,APLHA,36,$129.60,129.6,3
3,Stapler,Stapler No. 28,CHEAP,22,$79.20,79.2,3
4,Pen,Pen Ballpoint Red,CHEAP,85,$212.50,212.5,1


In [8]:
#groupby Stationery Name & Type & Supplier and sum the quantity and total price
# One row per (name, type, supplier) combination, holding the summed quantity
# ordered and the summed numeric total price.
df_new = (
    df_extracted
    .groupby(['StationeryName', 'StationeryType', 'SupplierName'])
    [['QuantityOrdered', 'TotalPrice_new']]
    .sum()
)
df_new

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,QuantityOrdered,TotalPrice_new
StationeryName,StationeryType,SupplierName,Unnamed: 3_level_1,Unnamed: 4_level_1
"Clips Double 1""",Clip,FIEND,1683,1683.00
"Clips Double 2""",Clip,BANES,3987,4784.40
"Clips Double 3/4""",Clip,RANDOM,2379,2141.10
Clips Paper Large,Clip,APLHA,3000,6000.00
Clips Paper Medium,Clip,APLHA,2304,3456.00
Clips Paper Small,Clip,BANES,2745,3294.00
"Envelope Brown (3""x6"")",Envelope,ISS,40320,40320.00
"Envelope Brown (3""x6"") w/ Window",Envelope,OMEGA,42600,46860.00
"Envelope Brown (5""x7"")",Envelope,LOTCASH,35480,56768.00
"Envelope Brown (5""x7"") w/ Window",Envelope,ISS,38200,64940.00


### Model

In [9]:
# Feature: quantity ordered, kept as a one-column DataFrame so the estimator
# receives 2-D input. Selected by name rather than by position 3, so the model
# input cannot silently change if df_extracted's column order does.
X = df_extracted[['QuantityOrdered']]
# Target: the integer-encoded stationery type.
y = df_extracted['TransStationeryType']
# No test_size is passed, so train_test_split's default split applies; the
# fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)
logReg = LogisticRegression(solver = 'lbfgs', multi_class = 'multinomial', random_state = 42)
# Last expression of the cell: fit() returns the estimator, so its repr is shown.
logReg.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [10]:
# Predict the encoded stationery type for every held-out row.
y_pred = logReg.predict(X_test)
# print() shows the array in its abbreviated form (leading and trailing labels).
print(y_pred)

[1 1 8 ... 1 1 8]


In [11]:
# Fraction of held-out rows whose predicted type code equals the true code.
print(accuracy_score(y_test, y_pred))

0.36814814814814817


In [12]:
# Same experiment as the quantity-based model, but with the numeric total price
# as the single feature. Selected by name rather than by position 5, so the
# model input cannot silently change if df_extracted's column order does.
X2 = df_extracted[['TotalPrice_new']]
y2 = df_extracted['TransStationeryType']
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, random_state = 42)
# NOTE: this rebinds logReg, so the previously fitted quantity-based model is
# no longer reachable under that name once this cell has run.
logReg = LogisticRegression(solver = 'lbfgs', multi_class = 'multinomial', random_state = 42)
logReg.fit(X2_train, y2_train)
y2_pred = logReg.predict(X2_test)
print(y2_pred)
print(accuracy_score(y2_test, y2_pred))
# NOTE(review): the recorded run prints only class 12 in the visible part of
# the predictions and an accuracy of about 0.036, far below the quantity-based
# model. Possibly the unscaled price feature keeps lbfgs from converging within
# the default max_iter - TODO confirm (look for a ConvergenceWarning and try
# standardising the feature).

[12 12 12 ... 12 12 12]
0.035555555555555556


