# Configuration

In [None]:
import os
import numpy as np
import pandas as pd
import math
import matplotlib.pylab as plt
import seaborn as sns

%matplotlib inline
sns.set(style='ticks', palette='Set2')

# Data ETL

In [None]:
# Load the raw data from the CSV file into a pandas DataFrame.
df = pd.read_csv('./Simmons-data-raw.csv')
print(df.shape)
# Preview only the first rows; the shape printed above already reports the full size.
df.head()

In [None]:
# Inspect the dtype pandas inferred for each column of the loaded frame.
df.dtypes

# Main Block
## Define Inputs & Target

In [None]:
# Target: the "Coupon-Usage-Indicator" column.
target_col = df["Coupon-Usage-Indicator"]
# Predictors: every remaining column except "Customer".
predictor_cols = df.drop(columns=["Coupon-Usage-Indicator", "Customer"])
predictor_cols[:]

In [None]:
from sklearn import linear_model

# Build the logistic regression estimator with a raised iteration cap
# (max_iter=10000 instead of the library default), then fit it on the
# full predictor frame against the target column.
linlog_model = linear_model.LogisticRegression(max_iter=10000)
linlog_model.fit(X=predictor_cols, y=target_col)

In [None]:
# Logistic Regression exposing all default parameters.
# sklearn.linear_model.LogisticRegression(penalty='l2', *, dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver='lbfgs', max_iter=100, multi_class='auto', verbose=0, warm_start=False, n_jobs=None, l1_ratio=None)

## Evaluate Accuracy

In [None]:
from sklearn import metrics

# In-sample accuracy: the model is scored on the same rows it was fitted on.
# accuracy_score expects (y_true, y_pred) in that order; the true labels come
# from target_col, the same series the model was trained against.
print ("Accuracy = %.3f" % (metrics.accuracy_score(target_col, linlog_model.predict(predictor_cols))))

In [None]:
# Report the fitted parameters of the plain logistic regression model.
print("Number of columns in data frame = ", len(df.columns))
# Pair each coefficient with the predictor it belongs to. Iterate over
# predictor_cols (the frame the model was fitted on) rather than df.columns:
# df still holds "Customer" and the target, so positions in df.columns do not
# line up with positions in coef_.
for col, coef in zip(predictor_cols.columns, linlog_model.coef_[0]):
    print(col, "  ", coef, "\n")
print("Beta0 = ", linlog_model.intercept_)
print("Beta1 = ", linlog_model.coef_)
print("Classes = ", linlog_model.classes_)
print("Number of features = ", linlog_model.n_features_in_)
#print("get_params", linlog_model.get_params(deep=True))

# Cross Validation

In [None]:
# Logistic Regression with Cross Validation exposing all parameters
# sklearn.linear_model.LogisticRegressionCV(*, Cs=10, fit_intercept=True, cv=None, dual=False, penalty='l2', scoring=None, solver='lbfgs', tol=0.0001, max_iter=100, class_weight=None, n_jobs=None, verbose=0, refit=True, intercept_scaling=1.0, multi_class='auto', random_state=None, l1_ratios=None)

In [None]:
# Logistic regression with built-in 10-fold cross-validation (cv=10) and a
# raised iteration cap.
linlog_modelcv = linear_model.LogisticRegressionCV(cv=10,max_iter=10000)
linlog_modelcv.fit(predictor_cols, target_col)
# In-sample accuracy of the fitted CV model. accuracy_score expects
# (y_true, y_pred) in that order.
print ("Accuracy = %.3f" % (metrics.accuracy_score(target_col, linlog_modelcv.predict(predictor_cols))))

In [None]:
# Report the fitted parameters of the cross-validated logistic regression model.
print("Number of columns in data frame = ", len(df.columns))
# Pair each coefficient with the predictor it belongs to. Iterate over
# predictor_cols (the frame the model was fitted on) rather than df.columns:
# df still holds "Customer" and the target, so positions in df.columns do not
# line up with positions in coef_.
for col, coef in zip(predictor_cols.columns, linlog_modelcv.coef_[0]):
    print(col, "  ", coef, "\n")
print("CV Beta0 = ", linlog_modelcv.intercept_)
print("CV Beta1 = ", linlog_modelcv.coef_)
# Read these from the CV model itself, not from the plain linlog_model.
print("CV Classes = ", linlog_modelcv.classes_)
print("CV Number of features = ", linlog_modelcv.n_features_in_)
#print("get_params", linlog_modelcv.get_params(deep=True))

# Predictive Code

I used the following references to eliminate the 'feature name' error from the two prediction functions below.

Problem Identification: https://stackoverflow.com/questions/69326639/sklearn-warning-valid-feature-names-in-version-1-0?answertab=trending#tab-top

Pandas Data Frame constructor: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html#pandas.DataFrame

In [None]:
# Define the prediction function
def Predict_for_new_customer(X_new):
    """Predict coupon usage for one new customer with ``linlog_model``.

    Parameters
    ----------
    X_new : a single-row input accepted by ``linlog_model.predict``
        (the notebook passes a one-row DataFrame with the predictor columns).

    Returns
    -------
    str
        "Yes" when the predicted class is 1, "No" when it is 0.

    Raises
    ------
    ValueError
        If ``X_new`` does not yield exactly one prediction, or the predicted
        class is neither 0 nor 1.
    """
    pred_val = linlog_model.predict(X_new)
    print("Prediction for new value = ", pred_val)

    # predict() returns an array; comparing the whole array in an `if` is only
    # well-defined for a single element, so require exactly one and unwrap it.
    if len(pred_val) != 1:
        raise ValueError(
            "Expected exactly one row to predict, got %d" % len(pred_val)
        )
    label = pred_val[0]

    if label == 1:
        return "Yes"
    if label == 0:
        return "No"
    # Previously an unexpected label left the result variable unbound.
    raise ValueError("Unexpected predicted class: %r" % (label,))

In [None]:
# Jack: Spending(000) = 2, Card = 1
X_new1 = pd.DataFrame({"Spending(000)": [2], "Card": [1]})
print("Shape:", X_new1.shape)
print("Predicted value for Jack = " , Predict_for_new_customer(X_new1))
# Probability of class 0, computed as the complement of the class-1 column.
print(
    "Predicted probability of class 0 (Coupon_Usage_Indicator = 0) = ",
    (1 - linlog_model.predict_proba(X_new1)[:, 1]),
)

In [None]:
# Jill: Spending(000) = 4, Card = 0
X_new2 = pd.DataFrame({"Spending(000)": [4], "Card": [0]})
print("Shape:", X_new2.shape)
print("Predicted value for Jill = " , Predict_for_new_customer(X_new2))
# Probability of class 0, computed as the complement of the class-1 column.
print(
    "Predicted probability of class 0 (Coupon_Usage_Indicator = 0) = ",
    (1 - linlog_model.predict_proba(X_new2)[:, 1]),
)

## Predictive Cross-Validated Code

In [None]:
# Define the prediction function
def CVPredict_for_new_customer(X_new):
    """Predict coupon usage for one new customer with ``linlog_modelcv``.

    Parameters
    ----------
    X_new : a single-row input accepted by ``linlog_modelcv.predict``
        (the notebook passes a one-row DataFrame with the predictor columns).

    Returns
    -------
    str
        "Yes" when the predicted class is 1, "No" when it is 0.

    Raises
    ------
    ValueError
        If ``X_new`` does not yield exactly one prediction, or the predicted
        class is neither 0 nor 1.
    """
    pred_val = linlog_modelcv.predict(X_new)
    print("Prediction for new value = ", pred_val)

    # predict() returns an array; comparing the whole array in an `if` is only
    # well-defined for a single element, so require exactly one and unwrap it.
    if len(pred_val) != 1:
        raise ValueError(
            "Expected exactly one row to predict, got %d" % len(pred_val)
        )
    label = pred_val[0]

    if label == 1:
        return "Yes"
    if label == 0:
        return "No"
    # Previously an unexpected label left the result variable unbound.
    raise ValueError("Unexpected predicted class: %r" % (label,))

In [None]:
# Jack, scored with the cross-validated model (reuses X_new1 from the earlier cell).
print("Shape:", X_new1.shape)
jack_cv_label = CVPredict_for_new_customer(X_new1)
print("Predicted value for Jack = " , jack_cv_label)
# Probability of class 0, computed as the complement of the class-1 column.
jack_cv_p0 = 1 - linlog_modelcv.predict_proba(X_new1)[:, 1]
print("Predicted probability of class 0 (Coupon_Usage_Indicator = 0) = ", jack_cv_p0)

In [None]:
# Jill, scored with the cross-validated model (reuses X_new2 from the earlier cell).
print("Shape:", X_new2.shape)
jill_cv_label = CVPredict_for_new_customer(X_new2)
print("Predicted value for Jill = " , jill_cv_label)
# Probability of class 0, computed as the complement of the class-1 column.
jill_cv_p0 = 1 - linlog_modelcv.predict_proba(X_new2)[:, 1]
print("Predicted probability of class 0 (Coupon_Usage_Indicator = 0) = ", jill_cv_p0)

# Establishing the cut-off

In [None]:
# Import the libraries we will be using
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# Our custom libraries!
import sys
sys.path.append("..")
# from ds_utils.sample_data import *

import matplotlib.pylab as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = 15, 12

In [None]:
# Predicted probability of the positive class for every training row.
# predict() returns hard class labels, so thresholding its output gives the
# same confusion matrix for every cut-off between 0 and 1. predict_proba()
# returns one column per class in the order of linlog_model.classes_; column 1
# is the second class (assumed to be label 1 for this 0/1 target).
probabilities = linlog_model.predict_proba(predictor_cols)[:, 1]
probabilities.shape

In [None]:
def confusion_matrix_iter(prediction):
    """Print raw and row-normalised confusion matrices for one cut-off.

    ``prediction`` is the cut-off value: entries of the notebook-level
    ``probabilities`` array strictly greater than it count as predicted
    positive. The result is compared against the notebook-level
    ``target_col``. Nothing is returned; both matrices are printed.
    """
    predicted_positive = probabilities > prediction

    # Build the count matrix (normalize=None) and the per-true-class rate
    # matrix (normalize='true') from the same thresholded predictions.
    counts, rates = (
        pd.DataFrame(
            metrics.confusion_matrix(target_col, predicted_positive, normalize=mode)
        )
        for mode in (None, 'true')
    )

    print("Count")
    print(counts)
    print()
    print("Normalized")
    print(rates.round(decimals=3))

In [None]:
# Confusion matrices at a cut-off of 0.5.
confusion_matrix_iter(0.5)

In [None]:
# Sweep a range of cut-offs and print the confusion matrices for each one.
for cutoff in (0.5, 0.6, 0.7, 0.8, 0.9):
    print("probabilities > ", cutoff)
    confusion_matrix_iter(cutoff)
    print()