In [20]:
# Importing the required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# To ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [21]:
# Reading the csv file and putting it into 'df' object.
df = pd.read_csv('ecommerce_consumers.csv')
df.head()

Unnamed: 0,ratio,time,label
0,0.54,17.2,female
1,0.93,18.2,male
2,0.84,13.6,female
3,0.19,6.0,male
4,0.89,13.2,female


In [22]:
df['label'] = df.label.map({'female':0,'male':1})
df.head()

Unnamed: 0,ratio,time,label
0,0.54,17.2,0
1,0.93,18.2,1
2,0.84,13.6,0
3,0.19,6.0,1
4,0.89,13.2,0


In [23]:
# Importing test_train_split from sklearn library
from sklearn.model_selection import train_test_split

In [24]:
# Putting feature variable to X
X = df.drop('label',axis=1)

# Putting response variable to y
y = df['label']

# Splitting the data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=101)

In [43]:
# Importing decision tree classifier from sklearn library
from sklearn.tree import DecisionTreeClassifier

# Fitting the decision tree with default hyperparameters, apart from
# max_depth which is 5 so that we can plot and read the tree.
dt_default = DecisionTreeClassifier(max_depth=3)
dt_default.fit(X_train, y_train)

# Importing classification report and confusion matrix from sklearn metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Making predictions
y_pred_default = dt_default.predict(X_test)

# Printing classification report
print(classification_report(y_test, y_pred_default))

              precision    recall  f1-score   support

           0       0.93      1.00      0.96        25
           1       1.00      0.94      0.97        35

   micro avg       0.97      0.97      0.97        60
   macro avg       0.96      0.97      0.97        60
weighted avg       0.97      0.97      0.97        60



In [44]:
# Printing confusion matrix and accuracy
print(confusion_matrix(y_test,y_pred_default))
print(accuracy_score(y_test,y_pred_default))

[[25  0]
 [ 2 33]]
0.9666666666666667


In [40]:
# Model building
from sklearn.svm import SVC

# instantiate an object of class SVC()
# note that we are using cost C=1
model = SVC(C = 1, kernel='sigmoid')

# fit
model.fit(X_train, y_train)

# predict
y_pred = model.predict(X_test)

confusion_matrix(y_true=y_test, y_pred=y_pred)

# print other metrics

# accuracy
print("accuracy", accuracy_score(y_test, y_pred))

accuracy 0.5833333333333334


In [29]:
import statsmodels.api as sm
X_train_sm = sm.add_constant(X_train)
logm1 = sm.GLM(y_train,X_train_sm, family = sm.families.Binomial())
res = logm1.fit()
res.summary()

# Getting the predicted values on the train set
y_train_pred = res.predict(X_train_sm)
y_train_pred[:10]

85     0.719528
183    0.690158
127    0.733822
53     0.702300
100    0.737243
93     0.715162
154    0.733267
114    0.679224
118    0.742473
64     0.713200
dtype: float64

In [30]:
y_train_pred = y_train_pred.values.reshape(-1)
y_train_pred[:10]

array([0.71952785, 0.69015794, 0.73382214, 0.70230009, 0.73724319,
       0.71516181, 0.73326672, 0.6792243 , 0.74247264, 0.71320012])

In [31]:
y_train_pred_final = pd.DataFrame({'gender_actual':y_train.values, 'gender_pred':y_train_pred})
y_train_pred_final.head()

Unnamed: 0,gender_actual,gender_pred
0,1,0.719528
1,1,0.690158
2,0,0.733822
3,1,0.7023
4,1,0.737243


In [33]:
y_train_pred_final['predicted'] = y_train_pred_final.gender_pred.map(lambda x: 1 if x > 0.5 else 0)

# Let's see the head
y_train_pred_final.head()

Unnamed: 0,gender_actual,gender_pred,predicted
0,1,0.719528,1
1,1,0.690158,1
2,0,0.733822,1
3,1,0.7023,1
4,1,0.737243,1


In [34]:
# Let's check the overall accuracy.
print(accuracy_score(y_train_pred_final.gender_actual, y_train_pred_final.predicted))

0.7142857142857143


In [38]:
help(SVC)

Help on class SVC in module sklearn.svm.classes:

class SVC(sklearn.svm.base.BaseSVC)
 |  SVC(C=1.0, kernel='rbf', degree=3, gamma='auto_deprecated', coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, decision_function_shape='ovr', random_state=None)
 |  
 |  C-Support Vector Classification.
 |  
 |  The implementation is based on libsvm. The fit time complexity
 |  is more than quadratic with the number of samples which makes it hard
 |  to scale to dataset with more than a couple of 10000 samples.
 |  
 |  The multiclass support is handled according to a one-vs-one scheme.
 |  
 |  For details on the precise mathematical formulation of the provided
 |  kernel functions and how `gamma`, `coef0` and `degree` affect each
 |  other, see the corresponding section in the narrative documentation:
 |  :ref:`svm_kernels`.
 |  
 |  Read more in the :ref:`User Guide <svm_classification>`.
 |  
 |  Parameters
 |  ----------
 | 