In [None]:
# Exercise 1: Logistic Regression with Python

Load the dataset `logregdata.csv` as a DataFrame and display the first rows of the dataset.

In [None]:
import pandas as pd

# Relative path: resolved against the working directory the notebook runs in.
csv_path = "../../data/logregdata.csv"
dataset = pd.read_csv(csv_path)

# Trailing expression, so the notebook renders the first rows of the frame.
dataset.head()

Plot a scatter plot of the data using matplotlib.pyplot

In [None]:
import matplotlib.pyplot as plt

# Explicit figure/axes pair instead of the implicit pyplot state machine.
fig, ax = plt.subplots()

# One red marker per row: income on the x axis, purchase label on the y axis.
ax.scatter(dataset['Income'], dataset['Purchase(Y/N)'], color='red')
ax.set_title('Purchase vs Income')
ax.set_xlabel('Income in €')
ax.set_ylabel('Purchase (Y/N)')
plt.show()

Scale the data using the `StandardScaler` from the sklearn package (https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html#sklearn.preprocessing.StandardScaler).

In [None]:
from sklearn.preprocessing import StandardScaler
import numpy as np

# Double brackets select a one-column DataFrame (2-D), not a 1-D Series.
income_features = dataset[['Income']]

# NOTE(review): the scaler is fitted on every row, including the rows that
# are later assigned to the test split.
scaler = StandardScaler().fit(income_features)
X_sc = scaler.transform(income_features)

# Print the mean and standard deviation of the scaled column.
print(X_sc.mean())
print(X_sc.std())

Split the data into a training and test data set with a fraction of 80%/20%, respectively.

In [None]:
from sklearn.model_selection import train_test_split

# Target column: the purchase label of every row.
purchase_labels = dataset['Purchase(Y/N)']

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train_sc, X_test_sc, y_train, y_test = train_test_split(
    X_sc,
    purchase_labels,
    test_size=0.2,
    random_state=0,
)

Instantiate a logistic regression classifier, train it, reverse the scaling of the X values and plot the fitted curve together with the data in a scatter plot. Work out how to draw the classifier's curve as a line plot; sorting the values can be useful here.

In [None]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression

def _plot_probability_curve(X_scaled, X_original, y_true, point_color):
    """Scatter the observed labels and overlay the fitted probability curve.

    Relies on the module-level ``classifier`` having been fitted already.

    Parameters
    ----------
    X_scaled : 2-D array with one column
        Scaled feature values, in the form the classifier was trained on.
    X_original : 2-D array with one column
        The same rows as ``X_scaled`` on the original (unscaled) axis.
    y_true : array-like
        Observed labels, one per row of ``X_scaled``.
    point_color : str
        Matplotlib colour used for the scatter markers.

    Returns
    -------
    numpy.ndarray
        Predicted probability of the second class in ``classifier.classes_``
        for the rows of ``X_scaled`` in ascending feature order.
    """
    # A single ordering, taken from the scaled column, is applied to both the
    # scaled and the unscaled values so every x value stays paired with the
    # probability predicted for it.
    order = np.argsort(X_scaled[:, 0])

    # classifier returns output for each class! Column 1 is the second class
    # (presumably "purchase" -- confirm against classifier.classes_).
    proba_second_class = classifier.predict_proba(X_scaled[order])[:, 1]

    fig, ax = plt.subplots()
    ax.scatter(X_original, y_true, color=point_color)
    ax.plot(X_original[order], proba_second_class, color='green')
    ax.set_xlabel('Income in €')
    ax.set_ylabel('Purchase (Y/N)')
    ax.set_title('Purchase vs Income')
    plt.show()
    return proba_second_class


classifier = LogisticRegression()
classifier.fit(X_train_sc, y_train)

# Undo the standardisation so the x axis is shown in the original income units.
X_train = scaler.inverse_transform(X_train_sc)
X_test = scaler.inverse_transform(X_test_sc)

# Training rows in red, test rows in blue; the fitted curve is green in both.
y_train_pred = _plot_probability_curve(X_train_sc, X_train, y_train, 'red')
y_test_pred = _plot_probability_curve(X_test_sc, X_test, y_test, 'blue')



# Exercise 2: Performance metrics for classifiers

Take the classifier you trained above, predict the test data and calculate the accuracy, precision, sensitivity (recall) and specificity of the classifier.

Remark: you can again use the heatmap function from the seaborn package to plot the confusion matrix in a nice way.

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Hard class predictions for the held-out test rows.
y_pred = classifier.predict(X_test_sc)

# Confusion matrix: rows are the true classes, columns the predicted classes.
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n " + str(cm))

# Draw the matrix as an annotated heatmap on an explicit axes object.
fig, ax = plt.subplots()
df_cm = cm  # alias kept in case other cells refer to it
sns.heatmap(df_cm, annot=True, annot_kws={"size": 30}, fmt='d', cmap="Blues", ax=ax)
ax.set_title('Confusion Matrix')
plt.show()

# Name the four cells once, treating the class in row/column 1 as "positive".
tn = cm[0, 0]  # true negatives
fp = cm[0, 1]  # false positives
fn = cm[1, 0]  # false negatives
tp = cm[1, 1]  # true positives

# Sensitivity (recall): share of actual positives that were predicted positive.
print("Sensitivity: " + str(tp / (tp + fn)))

# Specificity: share of actual negatives that were predicted negative.
print("Specificity: " + str(tn / (tn + fp)))

# Accuracy: share of all test rows that were classified correctly.
print("Accuracy: " + str((tn + tp) / (tn + fp + fn + tp)))

# Precision: share of predicted positives that are actual positives.
# The exercise asks for it, but the cell did not compute it before.
print("Precision: " + str(tp / (tp + fp)))

Calculate the AUC score and plot the ROC curve of the classifier using the roc_curve function from the sklearn package. Look into the documentation to see which parameters the functions need.

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# Continuous scores for the test rows: probability of the second class in
# classifier.classes_ (presumably "purchase" -- confirm the label encoding).
y_test_proba = classifier.predict_proba(X_test_sc)[:, 1]

# The AUC is computed from the same scores as the ROC curve below. Hard
# predict() labels collapse the curve to one operating point, so the area
# reported in the legend would not be the area under the plotted curve.
logit_roc_auc = roc_auc_score(y_test, y_test_proba)

print("AUC: ", logit_roc_auc)
fpr, tpr, thresholds = roc_curve(y_test, y_test_proba)

plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
# Red dashed diagonal: the ROC of a classifier that guesses at random.
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([-0.05, 1.0])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
# Save before show(): the figure may be closed once it has been displayed.
plt.savefig('Log_ROC.png', dpi=200)
plt.show()