<b><font size="6">Logistic Regression</font></b><a class="anchor" id="toc"></a><br>

**Step 1:** Import the data and pandas

In [1]:
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from math import ceil
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from regressors import stats

In [2]:
# Load the training set and use the session id as the row index.
purchase = pd.read_csv('train.csv').set_index('Access_ID')
#fix data types: cast the integer-coded columns to strings so they are handled as labels
purchase = purchase.astype({"Type_of_Traffic": "str", "Browser": "str"})
# Reproducible row shuffle.
purchase = shuffle(purchase, random_state=0)

In [3]:
# Preview the first three rows of the shuffled training frame.
purchase.head(3)

Unnamed: 0_level_0,Date,AccountMng_Pages,AccountMng_Duration,FAQ_Pages,FAQ_Duration,Product_Pages,Product_Duration,GoogleAnalytics_BounceRate,GoogleAnalytics_ExitRate,GoogleAnalytics_PageValue,OS,Browser,Country,Type_of_Traffic,Type_of_Visitor,Buy
Access_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
763494975,6-May-20,7,170.8333,1,39.0,4,45.3333,0.0,0.0111,0.0,MacOSX,2,Portugal,2,Returner,0
165226052,18-Sep-20,2,30.2,0,0.0,29,630.3267,0.0067,0.0233,0.0,Windows,2,Other,1,Returner,0
267110026,14-May-20,9,443.2917,1,0.0,74,3615.8892,0.007,0.0169,1.1198,Windows,2,Spain,1,Returner,0


In [4]:
# Column dtypes and non-null counts of the raw training frame.
purchase.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9999 entries, 763494975 to 289272922
Data columns (total 16 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Date                        9999 non-null   object 
 1   AccountMng_Pages            9999 non-null   int64  
 2   AccountMng_Duration         9999 non-null   float64
 3   FAQ_Pages                   9999 non-null   int64  
 4   FAQ_Duration                9999 non-null   float64
 5   Product_Pages               9999 non-null   int64  
 6   Product_Duration            9999 non-null   float64
 7   GoogleAnalytics_BounceRate  9999 non-null   float64
 8   GoogleAnalytics_ExitRate    9999 non-null   float64
 9   GoogleAnalytics_PageValue   9999 non-null   float64
 10  OS                          9999 non-null   object 
 11  Browser                     9999 non-null   object 
 12  Country                     9999 non-null   object 
 13  Type_of_Traffic     

In [5]:
# Numeric columns only; iterating over this frame yields the column names
# that outliers() filters on.
num_vars = (
    purchase
    .select_dtypes(include=np.number)
    .set_index(purchase.index)
)

def outliers(purchase, num_vars, upper_k=7, lower_k=5):
    """Drop rows that lie far from the column mean, one column at a time.

    For each column name yielded by iterating ``num_vars``, the mean and
    standard deviation are computed on the rows that survived the previous
    columns, and only rows strictly inside
    ``(mean - lower_k * std, mean + upper_k * std)`` are kept.

    Parameters
    ----------
    purchase : pandas.DataFrame
        Frame to filter. It is not modified; a filtered frame is returned.
    num_vars : iterable of column labels
        Columns to filter on. Passing a DataFrame iterates its column names.
    upper_k : float, default 7
        Number of standard deviations allowed above the mean.
    lower_k : float, default 5
        Number of standard deviations allowed below the mean.

    Returns
    -------
    pandas.DataFrame
        The rows of ``purchase`` that passed every column's filter.
    """
    for variable in num_vars:
        column = purchase[variable]
        var_mean = column.mean()
        var_std = column.std()
        # A constant column (std == 0) or a single-row/empty frame (std is NaN)
        # has no spread to measure; the strict comparisons below would reject
        # every row, so such columns are skipped instead.
        if not var_std > 0:
            continue
        within_upper = column < var_mean + (upper_k * var_std)
        within_lower = column > var_mean - (lower_k * var_std)
        purchase = purchase.loc[within_upper & within_lower]
    return purchase
# Filter on every numeric column. The result replaces `purchase`, so re-running
# this cell filters again using statistics of the already-filtered data.
purchase = outliers(purchase,num_vars)

In [6]:
#purchase = outliers(purchase,num_vars)

In [7]:
#remove outliers
# Second, tighter pass on page value only: keep rows below mean + 5 standard
# deviations (upper tail only; there is no lower bound here).
page_value = purchase["GoogleAnalytics_PageValue"]
var_mean = page_value.mean()
var_std = page_value.std()
purchase = purchase.loc[page_value < var_mean + (5 * var_std)]

In [8]:
#balance our data
# Keep every buyer and at most as many non-buyers, taken from the top of the
# current row order, then reshuffle the combined frame.
positive = purchase.loc[purchase.Buy == 1]
negative = purchase.loc[purchase.Buy == 0]
negative_shrunk = negative.iloc[:len(positive)]
purchase = shuffle(
    pd.concat([positive, negative_shrunk], axis=0),
    random_state=15,
)

In [9]:
#create dummy variables
# One-hot encode each categorical column, then drop one level of it.
purchase = (
    pd.get_dummies(purchase, columns=["Type_of_Traffic"])
    .drop(columns=['Type_of_Traffic_4'])
)
purchase = (
    pd.get_dummies(purchase, columns=["OS"])
    .drop(columns=['OS_Ubuntu'])
)

In [10]:
# Inspect the frame after outlier removal, balancing and one-hot encoding.
purchase.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2988 entries, 787527047 to 180141442
Data columns (total 34 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Date                        2988 non-null   object 
 1   AccountMng_Pages            2988 non-null   int64  
 2   AccountMng_Duration         2988 non-null   float64
 3   FAQ_Pages                   2988 non-null   int64  
 4   FAQ_Duration                2988 non-null   float64
 5   Product_Pages               2988 non-null   int64  
 6   Product_Duration            2988 non-null   float64
 7   GoogleAnalytics_BounceRate  2988 non-null   float64
 8   GoogleAnalytics_ExitRate    2988 non-null   float64
 9   GoogleAnalytics_PageValue   2988 non-null   float64
 10  Browser                     2988 non-null   object 
 11  Country                     2988 non-null   object 
 12  Type_of_Visitor             2988 non-null   object 
 13  Buy                 

**Step 2:** Data partition
- Assign all the variables excluding the DepVar to the object `data`
- Assign the dependent variable to the object `target`
- Import the needed library to make the partition of the dataset
- Split the data and the target into X_train, X_test, y_train, y_test, where `test_size` should be equal to 0.2, `random_state` equal to 5, and `stratify` equal to `target`

In [11]:
# Dependent variable.
target = purchase['Buy']
#feature selection
# Everything except the target and the hand-picked exclusions becomes a predictor.
# Remaining object-typed columns stay in `data` and are only separated out later
# by the numeric/non-numeric dtype selection.
excluded_features = ['AccountMng_Duration',"FAQ_Duration","Product_Duration","Browser","Country"]
data = purchase.drop(columns=['Buy'] + excluded_features)

In [12]:
#make the split here
# NOTE(review): the Step 2 description asks for test_size=0.2, random_state=5 and
# stratify=target; this cell uses a 35% hold-out, seed 6 and no stratification.
X_train, X_val, y_train, y_val = train_test_split(
    data,
    target,
    test_size=0.35,
    random_state=6,
)

In [13]:
#separate numeric and non-numeric
X_train_num, X_train_cat = (
    X_train.select_dtypes(include=np.number).set_index(X_train.index),
    X_train.select_dtypes(exclude=np.number).set_index(X_train.index),
)
# DO IT for validation
X_val_num, X_val_cat = (
    X_val.select_dtypes(include=np.number).set_index(X_val.index),
    X_val.select_dtypes(exclude=np.number).set_index(X_val.index),
)

In [14]:
from sklearn.preprocessing import MinMaxScaler
# Learn the min/max on the training split only, then apply the same
# transformation to both splits.
# NOTE(review): the fit/predict cells in this notebook use X_train_num and
# X_val_num, not the scaled frames built here.
scaler = MinMaxScaler().fit(X_train_num)
X_train_num_scaled = pd.DataFrame(
    scaler.transform(X_train_num),  # transform returns an array
    columns=X_train_num.columns,
    index=X_train.index,
)
X_val_num_scaled = pd.DataFrame(
    scaler.transform(X_val_num),
    columns=X_val_num.columns,
    index=X_val.index,
)

**Step 3:** Import the model and create an instance

In [15]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with an explicit cap of 1000 solver iterations.
modelLog = LogisticRegression(
    max_iter=1000,
)

**Step 4:** Fit the model to the train data

In [16]:
# Train on the unscaled numeric training features; the prediction cells use the
# same unscaled representation.
modelLog.fit(X_train_num,y_train)

LogisticRegression(max_iter=1000)

**Step 5:** Use the model to predict the labels of the training and validation data. Assign the validation predictions to **y_pred**.

In [17]:
# In-sample predictions; compared with y_train in the training F1 cell.
labels_train = modelLog.predict(X_train_num)
labels_train

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [18]:
# Hard class labels for the validation split.
y_pred = modelLog.predict(X_val_num)
# Per-class probabilities for the same rows; this is the value the cell displays.
pred_prob = modelLog.predict_proba(X_val_num)
pred_prob

array([[0.25834855, 0.74165145],
       [0.50427683, 0.49572317],
       [0.69164874, 0.30835126],
       ...,
       [0.27545172, 0.72454828],
       [0.51037959, 0.48962041],
       [0.75160778, 0.24839222]])

**Step 6:** Evaluate the model

In [None]:
#accuracy_score(y_val,y_pred)
#precision_score(y_val,y_pred)
#recall_score(y_val,y_pred)

The confusion matrix in sklearn is presented in the following format: <br>
[ [ TN  FP  ] <br>
    [ FN  TP ] ]

In [19]:
# Validation confusion matrix, laid out as [[TN, FP], [FN, TP]].
confusion_matrix(y_val,y_pred)

array([[447,  58],
       [130, 411]], dtype=int64)

 # The F1 Scores

In [20]:
# train data
# F1 of the in-sample predictions against the training labels.
f1_score(y_train,labels_train)

0.7794871794871796

In [21]:
# validation data
# F1 of the validation predictions; y_pred must still hold the validation
# labels here (a later cell reassigns y_pred to the test-set predictions).
f1_score(y_val,y_pred)

0.8138613861386139

# Submitting to Kaggle

In [37]:
# Load the test set and use the session id as the row index.
purchase_test = pd.read_csv('test.csv').set_index('Access_ID')
#fix data types: same string casts as applied to the training frame
purchase_test = purchase_test.astype({"Type_of_Traffic": "str", "Browser": "str"})

In [38]:
# Column dtypes and non-null counts of the raw test frame.
purchase_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2300 entries, 798519314 to 962218682
Data columns (total 15 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Date                        2300 non-null   object 
 1   AccountMng_Pages            2300 non-null   int64  
 2   AccountMng_Duration         2300 non-null   float64
 3   FAQ_Pages                   2300 non-null   int64  
 4   FAQ_Duration                2300 non-null   float64
 5   Product_Pages               2300 non-null   int64  
 6   Product_Duration            2300 non-null   float64
 7   GoogleAnalytics_BounceRate  2300 non-null   float64
 8   GoogleAnalytics_ExitRate    2300 non-null   float64
 9   GoogleAnalytics_PageValue   2300 non-null   float64
 10  OS                          2300 non-null   object 
 11  Browser                     2300 non-null   object 
 12  Country                     2300 non-null   object 
 13  Type_of_Traffic     

In [39]:
# One-hot encode the same two columns as for the training frame and drop the
# same reference levels.
purchase_test = (
    pd.get_dummies(purchase_test, columns=["Type_of_Traffic"])
    .drop(columns=['Type_of_Traffic_4'])
)
# NOTE(review): 'OS_Other' is dropped only on the test side; presumably that
# level is absent from the training dummies - confirm against the training columns.
purchase_test = (
    pd.get_dummies(purchase_test, columns=["OS"])
    .drop(columns=['OS_Ubuntu'])
    .drop(columns=['OS_Other'])
)

In [40]:
#feature selection
# Same exclusions as applied to the training predictors.
purchase_test = purchase_test.drop(
    columns=['AccountMng_Duration',"FAQ_Duration","Product_Duration","Browser","Country"],
)

In [41]:
# Numeric model inputs for the test set, forced to exactly the columns (and the
# column order) of the training matrix the model was fit on. Dummy levels that
# occur only in the test data are discarded, and levels seen only in training
# are added as all-zero columns, so the feature layout cannot drift between
# training and prediction.
X_test_num = (
    purchase_test
    .select_dtypes(include=np.number)
    .reindex(columns=X_train_num.columns, fill_value=0)
)

In [42]:
# Display the test feature matrix to check its columns before predicting.
X_test_num

Unnamed: 0_level_0,AccountMng_Pages,FAQ_Pages,Product_Pages,GoogleAnalytics_BounceRate,GoogleAnalytics_ExitRate,GoogleAnalytics_PageValue,Type_of_Traffic_1,Type_of_Traffic_10,Type_of_Traffic_11,Type_of_Traffic_12,...,Type_of_Traffic_6,Type_of_Traffic_7,Type_of_Traffic_8,Type_of_Traffic_9,OS_Android,OS_Chrome OS,OS_Fedora,OS_MacOSX,OS_Windows,OS_iOS
Access_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
798519314,0,0,86,0.0139,0.0654,0.0000,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
798663101,2,0,55,0.0012,0.0058,0.0000,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
798663221,0,0,36,0.0000,0.0250,0.0000,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
798760918,0,0,2,0.0000,0.1000,0.0000,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
798856982,12,1,129,0.0014,0.0185,1.0353,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
962042726,10,2,220,0.0048,0.0221,0.0000,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
962063064,0,0,3,0.0667,0.1000,0.0000,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
962147749,2,0,22,0.0000,0.0250,9.6503,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
962182167,8,0,9,0.0167,0.0500,0.0000,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [43]:
# Predict labels for the test set. This reuses the name y_pred, replacing the
# validation predictions; the validation metric cells should not be re-run
# after this cell without recomputing y_pred first.
y_pred =  modelLog.predict(X_test_num)
y_pred

array([0, 0, 0, ..., 1, 0, 0], dtype=int64)

In [44]:
# One row per test session: the session id is the (named) index and the
# predicted label is the single column, so writing the frame with to_csv's
# defaults produces an "Access_ID,Buy" header instead of unnamed 0/1 columns
# plus a row counter.
# NOTE(review): column names follow the training file - confirm they match the
# competition's sample submission.
submission = pd.DataFrame(
    {"Buy": y_pred},
    index=X_test_num.index.rename("Access_ID"),
)

In [37]:
# Write the predictions to the working directory; with to_csv's defaults the
# frame's index is written as the first column.
submission.to_csv("submissiontest.csv")

In [None]:
#the submission to Kaggle scored an F1 of only .66