# Classification (2)
## Iris Data

In [5]:
# The loaders in sklearn.datasets return a dictionary-like object (Bunch) by default.
from sklearn import datasets

In [7]:
# Load the iris dataset and show both its type and its full contents
iris = datasets.load_iris()
type(iris), iris                            # iris is a Bunch (dictionary-like) object

(sklearn.utils.Bunch,
 {'data': array([[5.1, 3.5, 1.4, 0.2],
         [4.9, 3. , 1.4, 0.2],
         [4.7, 3.2, 1.3, 0.2],
         [4.6, 3.1, 1.5, 0.2],
         [5. , 3.6, 1.4, 0.2],
         [5.4, 3.9, 1.7, 0.4],
         [4.6, 3.4, 1.4, 0.3],
         [5. , 3.4, 1.5, 0.2],
         [4.4, 2.9, 1.4, 0.2],
         [4.9, 3.1, 1.5, 0.1],
         [5.4, 3.7, 1.5, 0.2],
         [4.8, 3.4, 1.6, 0.2],
         [4.8, 3. , 1.4, 0.1],
         [4.3, 3. , 1.1, 0.1],
         [5.8, 4. , 1.2, 0.2],
         [5.7, 4.4, 1.5, 0.4],
         [5.4, 3.9, 1.3, 0.4],
         [5.1, 3.5, 1.4, 0.3],
         [5.7, 3.8, 1.7, 0.3],
         [5.1, 3.8, 1.5, 0.3],
         [5.4, 3.4, 1.7, 0.2],
         [5.1, 3.7, 1.5, 0.4],
         [4.6, 3.6, 1. , 0.2],
         [5.1, 3.3, 1.7, 0.5],
         [4.8, 3.4, 1.9, 0.2],
         [5. , 3. , 1.6, 0.2],
         [5. , 3.4, 1.6, 0.4],
         [5.2, 3.5, 1.5, 0.2],
         [5.2, 3.4, 1.4, 0.2],
         [4.7, 3.2, 1.6, 0.2],
         [4.8, 3.1, 1.6, 0.2],
         

In [9]:
# Features go into x, class labels into y; then check both shapes
x, y = iris.data, iris.target
x.shape, y.shape

((150, 4), (150,))

In [11]:
# List the distinct class labels that occur in y
import numpy as np
np.unique(y)

array([0, 1, 2])

In [14]:
# Split into train & test data: 85% of the rows are used for training.
# random_state is fixed so the same split is produced on every run.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.85, random_state=5)
x_train.shape, x_test.shape

((127, 4), (23, 4))

In [19]:
# Create the logistic regression model object
from sklearn.linear_model import LogisticRegression
logi = LogisticRegression(solver='newton-cg')           # Solver changed because the fit below raised a warning otherwise

In [20]:
# Fit the model on the training data, then inspect the class labels,
# the coefficients (one row per class) and the intercepts
logi.fit(x_train, y_train)
logi.classes_, logi.coef_, logi.intercept_

(array([0, 1, 2]),
 array([[-0.43399638,  0.8941057 , -2.35467719, -1.02093544],
        [ 0.68432277, -0.5119369 , -0.16568917, -1.05106566],
        [-0.25032639, -0.3821688 ,  2.52036636,  2.07200109]]),
 array([  9.41459576,   1.74974005, -11.16433581]))

In [24]:
# Predict the class probabilities for the test data (first 5 rows shown)
# The softmax function is used for multinomial (multi-class) problems
import pandas as pd
pd.DataFrame(logi.predict_proba(x_test), columns=['Cat_0', 'Cat_1', 'Cat_2']).head()

Unnamed: 0,Cat_0,Cat_1,Cat_2
0,0.027951,0.949961,0.02208835
1,0.000108,0.242999,0.7568931
2,9.7e-05,0.038322,0.9615807
3,0.987919,0.012081,4.52586e-08
4,1e-06,0.006586,0.9934129


In [32]:
# Predict the class label for the test data (first 5 rows shown)
pd.DataFrame(logi.predict(x_test), columns=['Category']).head()

Unnamed: 0,Category
0,1
1,2
2,2
3,0
4,2


In [27]:
# Show the actual y_test labels (first 5 rows) to compare with the predictions
pd.DataFrame(y_test, columns=['Category']).head()

Unnamed: 0,Category
0,1
1,2
2,2
3,0
4,2


In [28]:
# Confirm with the score value: mean accuracy on the test data
logi.score(x_test, y_test)                        # Pretty high value

0.9130434782608695

---
---
## Classification Model Evaluation

In [52]:
from sklearn.metrics import classification_report, confusion_matrix, r2_score

In [50]:
# Print per-class precision / recall / f1-score comparing the actual and predicted labels
y_pred = logi.predict(x_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         8
           1       0.83      0.83      0.83         6
           2       0.89      0.89      0.89         9

    accuracy                           0.91        23
   macro avg       0.91      0.91      0.91        23
weighted avg       0.91      0.91      0.91        23



#### Confusion matrix
Confusion matrix shows the answer to 'How many elements were predicted correctly?'
* (i, j) entry: the number of samples with **true label being i-th class** and **predicted label being j-th class**
* Diagonal line: shows how many were predicted correctly

In [54]:
# Rows are the true classes, columns are the predicted classes
confusion_matrix(y_test, y_pred)

array([[8, 0, 0],
       [0, 5, 1],
       [0, 1, 8]], dtype=int64)

---
---
## Classification with Advertising Data
This dataset records whether a user clicked on an advertisement they were shown, together with several features describing that user.

In [67]:
# Read the advertising data from the Excel file and show the last 3 rows
ad = pd.read_excel('./files/advertising.xls')
ad.tail(3)

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Male,Country,Timestamp,Clicked on Ad
997,51.63,51,42415.72,120.37,Expanded intangible solution,South Jessica,1,Mongolia,2016-02-01 17:24:57,1
998,55.55,19,41920.79,187.95,Proactive bandwidth-monitored policy,West Steven,0,Guatemala,2016-03-24 02:35:54,0
999,45.01,26,29875.8,178.35,Virtual 5thgeneration emulation,Ronniemouth,0,Brazil,2016-06-03 21:43:21,1


In [63]:
# Check column names, non-null counts and dtypes
ad.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   Daily Time Spent on Site  1000 non-null   float64       
 1   Age                       1000 non-null   int64         
 2   Area Income               1000 non-null   float64       
 3   Daily Internet Usage      1000 non-null   float64       
 4   Ad Topic Line             1000 non-null   object        
 5   City                      1000 non-null   object        
 6   Male                      1000 non-null   int64         
 7   Country                   1000 non-null   object        
 8   Timestamp                 1000 non-null   datetime64[ns]
 9   Clicked on Ad             1000 non-null   int64         
dtypes: datetime64[ns](1), float64(3), int64(3), object(3)
memory usage: 78.2+ KB


In [62]:
# Summary statistics for every column, used here to check whether all topic lines are different
ad.describe(include='all')                 # unique: 1000 -> all different

  


Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Male,Country,Timestamp,Clicked on Ad
count,1000.0,1000.0,1000.0,1000.0,1000,1000,1000.0,1000,1000,1000.0
unique,,,,,1000,969,,237,1000,
top,,,,,Networked logistical info-mediaries,Williamsport,,France,2016-05-07 15:16:07,
freq,,,,,1,3,,9,1,
first,,,,,,,,,2016-01-01 02:52:10,
last,,,,,,,,,2016-07-24 00:22:16,
mean,65.0002,36.009,55000.00008,180.0001,,,0.481,,,0.5
std,15.853615,8.785562,13414.634022,43.902339,,,0.499889,,,0.50025
min,32.6,19.0,13996.5,104.78,,,0.0,,,0.0
25%,51.36,29.0,47031.8025,138.83,,,0.0,,,0.0


In my opinion, the column 'Ad Topic Line' is the most relevant variable. However, its datatype is object (text), which makes it difficult to use directly in the regression. As a first step, all numeric columns will be used as the independent variables to predict 'Clicked on Ad'. The ML model will be logistic regression.

In [65]:
# List the distinct values of the target column 'Clicked on Ad'
import numpy as np
np.unique(ad['Clicked on Ad'])

array([0, 1], dtype=int64)

In [68]:
# Independent variables: the numeric columns; dependent variable: the click indicator
feature_cols = ['Daily Time Spent on Site', 'Age', 'Area Income', 'Daily Internet Usage', 'Male']
x = ad[feature_cols]
y = ad['Clicked on Ad']

In [72]:
# Split into train & test data: 85% of the rows are used for training.
# random_state is fixed so the same split is produced on every run.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.85, random_state=5)
x_train.shape, y_test.shape

((850, 5), (150,))

In [71]:
# Model object: logistic regression with default settings
# NOTE(review): the features are on very different scales (e.g. 'Area Income' vs 'Male');
# consider scaling them or raising max_iter if the fit warns about convergence - TODO confirm
from sklearn.linear_model import LogisticRegression
logi = LogisticRegression()

In [74]:
# Fit the model on the training data, then inspect the class labels,
# the coefficients (one per feature) and the intercept
logi.fit(x_train, y_train)
logi.classes_, logi.coef_, logi.intercept_

(array([0, 1], dtype=int64),
 array([[-6.66573702e-02,  2.67295583e-01, -1.96761444e-05,
         -2.23789211e-02,  1.47652071e-03]]),
 array([0.00536353]))

In [84]:
# Predict the probability of each class (0 = not clicked, 1 = clicked) for the test data; first 5 rows shown
pd.DataFrame(logi.predict_proba(x_test), columns=['Click_0', 'Click_1']).head()

Unnamed: 0,Click_0,Click_1
0,0.968118,0.031882
1,0.04738,0.95262
2,0.000792,0.999208
3,0.884592,0.115408
4,0.973254,0.026746


In [83]:
# Predict the class label for the test data (first 5 rows shown)
pd.DataFrame(logi.predict(x_test), columns=['Category']).head()

Unnamed: 0,Category
0,0
1,1
2,1
3,0
4,0


In [85]:
# Show the actual labels (first 5 rows) to compare with the predictions
y_test.head()

544    0
515    1
193    1
11     0
279    0
Name: Clicked on Ad, dtype: int64

In [91]:
# Confirm the score: mean accuracy on the test data
logi.score(x_test, y_test)

0.9

### Model Evaluation

In [90]:
# Print per-class precision / recall / f1-score comparing the actual and predicted labels
from sklearn.metrics import classification_report, confusion_matrix
y_pred = logi.predict(x_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.86      0.94      0.90        70
           1       0.95      0.86      0.90        80

    accuracy                           0.90       150
   macro avg       0.90      0.90      0.90       150
weighted avg       0.90      0.90      0.90       150



In [92]:
# Rows are the true classes, columns are the predicted classes
confusion_matrix(y_test, y_pred)

array([[66,  4],
       [11, 69]], dtype=int64)

### Service to the Public
Save the fitted model as a pickle file so that it can be loaded later and used for serving right away.

In [99]:
# Dump the fitted model into a pickle file.
# The context manager guarantees the file is flushed and closed even if
# pickling fails, instead of leaving an open handle for the garbage collector.
import pickle
with open('./storage/advertising.pkl', 'wb') as model_file:
    pickle.dump(logi, model_file)

In [105]:
# Load the pickled model back from disk.
# NOTE: pickle.load can execute arbitrary code, so only load files from a trusted source.
# The context manager closes the file handle as soon as loading is done.
with open('./storage/advertising.pkl', 'rb') as model_file:
    logi2 = pickle.load(model_file)
logi2

LogisticRegression()

If a user enters the 5 independent variables, we can predict the outcome with the fitted model.

In [106]:
# Get inputs from the user & predict
time = float(input('Daily time spent on site-----------'))
age = int(input('Age-----------------------------------'))
income = float(input('Area income----------------------'))
daily = float(input('Daily internet usage--------------'))
male = int(input('Male?--------------------------------'))

# Build a one-row DataFrame with the same column names (and order) the model
# was fitted on. Passing a bare ndarray to a model fitted on a DataFrame makes
# scikit-learn >= 1.0 warn that X has no valid feature names, and relies on
# the positional order silently matching the training columns.
user_vo = pd.DataFrame(
    [[time, age, income, daily, male]],
    columns=['Daily Time Spent on Site', 'Age', 'Area Income', 'Daily Internet Usage', 'Male'],
)
logi2.predict(user_vo)

Daily time spent on site-----------60.08
Age-----------------------------------32
Area income----------------------68450.16
Daily internet usage--------------101.94
Male?--------------------------------0


array([1], dtype=int64)

In conclusion, if the user data were as entered above, **the user would be expected to click the Ad** according to this logistic model.