In [1]:
import joblib
import numpy as np
import matplotlib.pyplot as plt

### Performance of multiple linear regression without lasso to predict gross income.

In [2]:
# load test data for regression

df_test = joblib.load("test_data_for_regression.pkl")
df_test

Unnamed: 0,Branch A,Branch B,Branch C,City Yangon,City Mandalay,City Naypyitaw,Fashion accessories,Electronic accessories,Food and beverages,Sports and travel,...,Evening,Night,Gender,Customer type,Quantity,Total,cogs,Rating,Unit price,gross income
0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.186379,2.156980,2.156980,-1.054138,90.63,40.7835
1,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.501847,-0.741485,-0.741485,-0.010222,19.32,6.7620
2,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,1.528645,1.389926,1.389926,-1.518101,63.56,31.7800
3,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,-0.867216,-0.044760,-0.044760,-0.648171,99.60,14.9400
4,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,1.0,1.0,1.0,-1.551748,-0.903697,-0.903697,0.163764,97.16,4.8580
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.159581,0.988827,0.988827,-0.416189,90.24,27.0720
196,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,1.528645,0.893664,0.893664,0.743718,51.91,25.9550
197,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.159581,-0.896370,-0.896370,1.729639,16.48,4.9440
198,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,-1.551748,-1.202263,-1.202263,-0.938148,27.07,1.3535


In [3]:
# load columns for predicting gross income

gross_income_attributes = joblib.load("gross_income_attributes.pkl")

In [4]:
# get X_test and t_test

X_test = df_test[gross_income_attributes[:-1]]
t_test = df_test[gross_income_attributes[-1]]
X_test.shape,t_test.shape

((200, 30), (200,))

In [5]:
# evaluate performance

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score
from scipy import stats

lin_reg = joblib.load("lin_reg_without_lasso_gi.pkl")
y_test = lin_reg.predict(X_test)

scores = cross_val_score(lin_reg, 
                         X_test, t_test,
                         n_jobs=-1,
                         cv=10, 
                         scoring='r2')

confidence = 0.95
print('r2 scores: ', r2_score(t_test, y_test))
print('95% CI: ', stats.t.interval(confidence, len(scores) - 1,
                 loc=scores.mean(),
                 scale=scores.std(ddof=1)/np.sqrt(len(scores))))

r2 scores:  0.8913088363324457
95% CI:  (0.8141377647513237, 0.8855238116286008)


### Performance of multiple linear regression with lasso to predict gross income.

In [6]:
# evaluate performance

lin_reg = joblib.load("lin_reg_with_lasso_gi.pkl")
y_test = lin_reg.predict(X_test)

scores = cross_val_score(lin_reg, 
                         X_test, t_test,
                         n_jobs=-1,
                         cv=10, 
                         scoring='r2')

confidence = 0.95
print('r2 scores: ', r2_score(t_test, y_test))
print('95% CI: ', stats.t.interval(confidence, len(scores) - 1,
                 loc=scores.mean(),
                 scale=scores.std(ddof=1)/np.sqrt(len(scores))))

r2 scores:  0.892171966785714
95% CI:  (0.828293735812967, 0.8982480777484378)


### Performance of multiple linear regression without lasso to predict Unit price.

In [7]:
# load columns for predicting unit price

unit_price_attributes = joblib.load("unit_price_attributes.pkl")

In [8]:
# get X_test and t_test

X_test = df_test[unit_price_attributes[:-1]]

t_test = df_test[unit_price_attributes[-1]]

X_test.shape, t_test.shape

((200, 30), (200,))

In [9]:
# evaluate performance

lin_reg = joblib.load("lin_reg_without_lasso_up.pkl")
y_test = lin_reg.predict(X_test)

scores = cross_val_score(lin_reg, 
                         X_test, t_test,
                         n_jobs=-1,
                         cv=10, 
                         scoring='r2')

confidence = 0.95
print('r2 scores: ', r2_score(t_test, y_test))
print('95% CI: ', stats.t.interval(confidence, len(scores) - 1,
                 loc=scores.mean(),
                 scale=scores.std(ddof=1)/np.sqrt(len(scores))))

r2 scores:  0.7848505164456147
95% CI:  (0.6844845503325383, 0.7970756896745368)


### Performance of multiple linear regression with lasso to predict Unit price.

In [10]:
# evaluate performance

lin_reg = joblib.load("lin_reg_with_lasso_up.pkl")
y_test = lin_reg.predict(X_test)

scores = cross_val_score(lin_reg, 
                         X_test, t_test,
                         n_jobs=-1,
                         cv=10, 
                         scoring='r2')

confidence = 0.95
print('r2 scores: ', r2_score(t_test, y_test))
print('95% CI: ', stats.t.interval(confidence, len(scores) - 1,
                 loc=scores.mean(),
                 scale=scores.std(ddof=1)/np.sqrt(len(scores))))

r2 scores:  0.7879557586887668
95% CI:  (0.7100104690324487, 0.8186926811842625)


### Performance of logistic regression to classify gender.

In [11]:
# load test data for classification

df_test_clf = joblib.load("test_data_for_classification.pkl")
df_test_clf

Unnamed: 0,Branch A,Branch B,Branch C,City Yangon,City Mandalay,City Naypyitaw,Fashion accessories,Electronic accessories,Food and beverages,Sports and travel,...,Evening,Night,Gender,Customer type,Quantity,Total,cogs,Rating,Unit price,gross income
0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.186379,2.156980,2.156980,-1.054138,1.322946,2.156980
1,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.501847,-0.741485,-0.741485,-0.010222,-1.368753,-0.741485
2,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,1.528645,1.389926,1.389926,-1.518101,0.301150,1.389926
3,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,-0.867216,-0.044760,-0.044760,-0.648171,1.661532,-0.044760
4,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,1.0,1.0,1.0,-1.551748,-0.903697,-0.903697,0.163764,1.569430,-0.903697
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.159581,0.988827,0.988827,-0.416189,1.308225,0.988827
196,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,1.528645,0.893664,0.893664,0.743718,-0.138596,0.893664
197,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.159581,-0.896370,-0.896370,1.729639,-1.475953,-0.896370
198,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,-1.551748,-1.202263,-1.202263,-0.938148,-1.076218,-1.202263


In [12]:
# get X_test and t_test

X_test = df_test_clf.drop(['Gender'],axis=1)
t_test = df_test_clf['Gender']

X_test.shape,t_test.shape

((200, 33), (200,))

In [13]:
# evaluate performance

from sklearn.metrics import accuracy_score

log_reg = joblib.load("log_reg_gender.pkl")
y_test= log_reg.predict(X_test)

scores= cross_val_score(log_reg, 
                        X_test, t_test,
                        n_jobs=-1,
                        cv=10, 
                        scoring='accuracy')


confidence = 0.95
print('Test accuracy: ', accuracy_score(t_test, y_test))
print('95% CI: ', stats.t.interval(confidence, len(scores) - 1,
                 loc=scores.mean(),
                 scale=scores.std(ddof=1)/np.sqrt(len(scores))))

Test accuracy:  0.505
95% CI:  (0.4773059442237334, 0.6526940557762665)


### Performance of logistic regression to classify customer type.

In [14]:
# get X_test and t_test

X_test = df_test_clf.drop(['Customer type'],axis=1)
t_test = df_test_clf['Customer type']

X_test.shape,t_test.shape

((200, 33), (200,))

In [15]:
# evaluate performance

log_reg = joblib.load("log_reg_customer.pkl")
y_test= log_reg.predict(X_test)

scores= cross_val_score(log_reg, 
                        X_test, t_test,
                        n_jobs=-1,
                        cv=10, 
                        scoring='accuracy')


confidence = 0.95
print('Test accuracy: ', accuracy_score(t_test, y_test))
print('95% CI: ', stats.t.interval(confidence, len(scores) - 1,
                 loc=scores.mean(),
                 scale=scores.std(ddof=1)/np.sqrt(len(scores))))

Test accuracy:  0.525
95% CI:  (0.39385717996713043, 0.6161428200328696)


### Performance of logistic regression to predict day of purchase.

In [16]:
# get X_test amd t_test

t_test = df_test_clf[['Monday', 'Tuesday', 'Wednesday', 
                      'Thursday', 'Friday', 'Saturday', 'Sunday']]

X_test = df_test_clf.drop(['Monday', 'Tuesday', 'Wednesday', 
                           'Thursday', 'Friday', 'Saturday', 'Sunday'], 
                                         axis=1)
X_test.shape, t_test.shape

((200, 27), (200, 7))

In [17]:
# one hot to integer encoding

t_test = np.where(t_test==1)[1]
t_test.shape

(200,)

In [18]:
# evaluate performance

log_reg = joblib.load("log_reg_multiclass.pkl")
y_test= log_reg.predict(X_test)

scores = cross_val_score(log_reg, 
                        X_test, t_test,
                        n_jobs=-1,
                        cv=10, 
                        scoring='accuracy')

confidence = 0.95
print('Test accuracy: ', accuracy_score(t_test, y_test))
print('95% CI: ', stats.t.interval(confidence, len(scores) - 1,
                 loc=scores.mean(),
                 scale=scores.std(ddof=1)/np.sqrt(len(scores))))

Test accuracy:  0.155
95% CI:  (0.08458869233302448, 0.19541130766697556)


In [19]:
# evaluate performance

log_reg = joblib.load("random_forest_clf.pkl")
y_test= log_reg.predict(X_test)

scores = cross_val_score(log_reg, 
                        X_test, t_test,
                        n_jobs=-1,
                        cv=10, 
                        scoring='accuracy')

confidence = 0.95
print('Test accuracy: ', accuracy_score(t_test, y_test))
print('95% CI: ', stats.t.interval(confidence, len(scores) - 1,
                 loc=scores.mean(),
                 scale=scores.std(ddof=1)/np.sqrt(len(scores))))

Test accuracy:  0.13
95% CI:  (0.11362458231259578, 0.2363754176874042)
