In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score

from sklearn.metrics import confusion_matrix

from sklearn.metrics import classification_report

In [None]:
# Importing the method needed to apply LDA classification

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

Scikit-learn documentation about discriminant analysis:

https://scikit-learn.org/stable/modules/lda_qda.html#lda-qda

# LDA applied to the Default dataset

In [None]:
# Load the Default dataset into a DataFrame.
# NOTE(review): this is an absolute, machine-specific Windows path; the cell only
# runs on a machine where that exact file exists.
Default_df = pd.read_csv(
    filepath_or_buffer='C:\\Users\\jheredi2\\Documents\\PythonDataAnalytics\\1-Datasets\\Default.csv'
)

In [None]:
# Dummy-encode the 'student' column; drop_first=True drops the first category's
# indicator so the remaining dummy column(s) are not redundant.
Default_df_dummies = pd.get_dummies(
    data=Default_df,
    columns=['student'],
    drop_first=True,
)

In [None]:
# Split the Default data into train/test sets.
# Predictors: every column except the target; target: the 'default' column.
# The predictors are selected by name (not by the positional iloc[:, 1:]) so that
# 'default' can never leak into X, wherever that column sits in the frame.
X_train_def, X_test_def, y_train_def, y_test_def = train_test_split(
    Default_df_dummies.drop(columns=['default']),
    Default_df_dummies['default'],
    test_size=0.2,   # hold out 20% of the rows for testing
    random_state=1,  # fixed seed so the split is reproducible
)

Use CV via GridSearch to select the values of two hyperparameters: shrinkage level and the solver

Choose based on overall accuracy

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Candidate shrinkage levels: 21 evenly spaced values from 0 to 1 inclusive.
shrinkage_parameter = np.linspace(start=0, stop=1, num=21)
shrinkage_parameter

In [None]:
# Search space for the grid search: every shrinkage level paired with each solver.
# NOTE(review): 'svd' is presumably left out because it does not accept shrinkage;
# confirm against the scikit-learn LDA documentation.
hyperparam_grid = dict(
    shrinkage=shrinkage_parameter,
    solver=['lsqr', 'eigen'],
)

In [None]:
# Grid search over every (shrinkage, solver) combination in hyperparam_grid,
# ranking candidates by accuracy with cv=10.
grid_search = GridSearchCV(
    estimator=LinearDiscriminantAnalysis(),
    param_grid=hyperparam_grid,
    scoring='accuracy',
    cv=10,
)

In [None]:
# Run the accuracy-based grid search using only the training split.
grid_search.fit(X_train_def, y_train_def)

In [None]:
# Display the hyperparameter combination selected by the accuracy-based search.
grid_search.best_params_

In [None]:
# Build the final LDA model from the hyperparameters selected by the accuracy-based
# grid search. Reading them from best_params_ (instead of hand-copying the displayed
# values) keeps the model in sync if the data, grid, or library version changes.
lda_model_default = LinearDiscriminantAnalysis(**grid_search.best_params_)

In [None]:
# Fit the LDA model on the training split.
lda_model_default.fit(X_train_def, y_train_def)

In [None]:
# Predicted class labels for the held-out test predictors.
y_pred_default= lda_model_default.predict(X_test_def)

In [None]:
# Confusion matrix on the test set (first argument: true labels, second: LDA predictions).
confusion_matrix(y_test_def, y_pred_default)

In [None]:
# Classification report for the LDA test predictions; print() is used so the
# report string is rendered with its line breaks.
print (classification_report(y_test_def, y_pred_default))

Use CV via GridSearch to select the values of two hyperparameters: shrinkage level and the solver

Choose based on f1-score

In [None]:
from sklearn.metrics import f1_score

In [None]:
from sklearn.metrics import make_scorer

In [None]:
# Wrap f1_score as a scorer usable by GridSearchCV, with 'Yes' as the positive class.
f1_scorer = make_scorer(f1_score, pos_label='Yes')

In [None]:
# Grid search over hyperparam_grid, ranking candidates with f1_scorer and cv=10.
grid_search2 = GridSearchCV(
    estimator=LinearDiscriminantAnalysis(),
    param_grid=hyperparam_grid,
    scoring=f1_scorer,
    cv=10,
)

In [None]:
# Run the F1-based grid search using only the training split.
grid_search2.fit(X_train_def, y_train_def)

In [None]:
# Display the hyperparameter combination selected by the F1-based search.
grid_search2.best_params_

__No change in the recommended hyperparameter values when compared to the selection based on overall accuracy; therefore, no need to continue__

### Checking that the linear discriminant function is actually used to make the classification

When discussing the slides we saw that, for a given test observation, the prediction of Y is the class for which the __linear discriminant function is maximum.__

There are separate discriminant functions for Y=0 and Y=1 (in this dataset, Y=No and Y=Yes, respectively)

In Python, there is NO method that directly returns the linear discriminant function for each class, but there is one method that returns something that serves the same purpose!

Scikit-learn has a method that returns __the difference between__ the linear discriminant function for Y=1 __and__ the linear discriminant function for Y=0. This method is called 'decision_function()'.

The output of decision_function() is the following difference:

decision_function() = Linear discriminant function for Y=Yes - Linear discriminant function for Y=No

Let's get __the first 10 values__ returned by decision_function() applied on the test data

In [None]:
# First 10 decision-function values for the test observations.
lda_model_default.decision_function(X_test_def)[:10]

If the decision function is negative for the first 10 test observations, the prediction for those observations should be 'No'. As we can see next, that is the case:

In [None]:
# Predicted labels for the same first 10 test observations, for comparison with
# the sign of the decision-function values.
lda_model_default.predict(X_test_def)[:10]

Answer the following question: __What needs to happen for the decision_function to be positive?__


Based on your answer to that question, __SHOW__ that Y is classified as Y= Yes only when the decision_function value is positive

__Work on this independently !!!!!__

### Changing the probability threshold to see if we get better results

##### Find a threshold based on f1-score (since what we want to enhance is class-specific performance rather than overall accuracy)

In [None]:
# Candidate probability thresholds: 0.05, 0.10, ..., 0.50.
# np.arange with a float step accumulates rounding error (e.g. 0.15000000000000002);
# these values are later used as dictionary keys, so round them to 2 decimals to get
# clean, predictable thresholds.
array_prob = np.round(np.arange(0.05, 0.51, 0.05), 2)

In [None]:
# Create an array with all the probabilities of Yes

# Take column 1 of the predicted class probabilities for the test set.
# NOTE(review): column 1 is presumably the 'Yes' class; confirm with
# lda_model_default.classes_.
prob_yes= lda_model_default.predict_proba(X_test_def)[:,1]

In [None]:
# Maps each probability threshold to the array of labels predicted with it.
dict_predictions = {}

In [None]:
# Maps each probability threshold to the F1-score obtained with it.
dict_f1_scores = {}

In [None]:
# Notice that I had to change the name from y_test to y_test_def

# For every candidate threshold j: predict 'Yes' when the estimated probability of
# 'Yes' exceeds j (otherwise 'No'), then record the F1-score for the 'Yes' class.
for j in array_prob:
    # Vectorized thresholding replaces the per-observation Python loop.
    # astype(object) keeps the same array dtype as the original
    # np.empty(..., dtype=object) buffer.
    dict_predictions[j] = np.where(prob_yes > j, 'Yes', 'No').astype(object)
    dict_f1_scores[j] = np.round(
        f1_score(y_test_def, dict_predictions[j], pos_label='Yes'), 3
    )

In [None]:
# Display the F1-score obtained at each probability threshold.
dict_f1_scores

In [None]:
# Threshold (dictionary key) with the highest F1-score; on ties, max() returns
# the first such key in insertion order.
max(dict_f1_scores, key= dict_f1_scores.get)

In [None]:
# Compute the prediction of Y (No or Yes) for each test observation
# The predictions of Y are stored in an array called 'y_predicted_prob040'
# The prediction uses a prob threshold of 0.40

# Vectorized thresholding replaces the per-observation Python loop; astype(object)
# keeps the same array dtype as the original np.empty(..., dtype=object) buffer.
y_predicted_prob040 = np.where(prob_yes > 0.40, 'Yes', 'No').astype(object)

In [None]:
# Classification report for the predictions made with the 0.40 probability threshold.
print (classification_report (y_test_def, y_predicted_prob040))

# QDA applied to the Default dataset

In [None]:
# Importing the method needed to apply QDA classification

from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [None]:
# QDA model created with the constructor's default settings (no arguments passed).
qda_model_default= QuadraticDiscriminantAnalysis()

In [None]:
# Fit the QDA model on the same training split used for LDA.
qda_model_default.fit(X_train_def, y_train_def)

In [None]:
# Predicted class labels from QDA for the held-out test predictors.
y_pred_default_qda= qda_model_default.predict(X_test_def)

In [None]:
# Confusion matrix on the test set (first argument: true labels, second: QDA predictions).
confusion_matrix(y_test_def, y_pred_default_qda)

In [None]:
# Classification report for the QDA test predictions.
print(classification_report(y_test_def, y_pred_default_qda))

Now, we could change the probability threshold to see if we get better results. However, we will not do it since we have practiced that task a lot !!!

## SONAR DATASET

Dataset description:

https://raw.githubusercontent.com/jbrownlee/Datasets/master/sonar.names

It looks like class-specific performance is not important in this case (i.e., predicting both classes is equally important); therefore, our goal will be to get a high overall accuracy

In [None]:
# Remote location of the sonar CSV file that is read in the next cell.
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/sonar.csv"

In [None]:
# Read the CSV from the URL; header=None means the first row is treated as data,
# so the columns get integer labels.
sonar_df = pd.read_csv(url, header=None)

In [None]:
# Summary of the frame: columns, non-null counts and dtypes.
sonar_df.info()

In [None]:
# Frequency of each value in the last column (the column used as the target below).
sonar_df.iloc[:,-1].value_counts()

### LDA applied to the sonar dataset

In [None]:
# Split the sonar data: predictors are all columns except the last one,
# and the target is the last column.
X_train_sonar, X_test_sonar, y_train_sonar, y_test_sonar = train_test_split(
    sonar_df.iloc[:, :-1],
    sonar_df.iloc[:, -1],
    test_size=0.2,   # hold out 20% of the rows for testing
    random_state=1,  # fixed seed so the split is reproducible
)

In [None]:
# Grid search over hyperparam_grid for the sonar data, ranking candidates
# by accuracy with cv=10.
grid_search_sonar = GridSearchCV(
    estimator=LinearDiscriminantAnalysis(),
    param_grid=hyperparam_grid,
    scoring='accuracy',
    cv=10,
)

In [None]:
# Run the grid search using only the sonar training split.
grid_search_sonar.fit(X_train_sonar, y_train_sonar)

In [None]:
# Display the hyperparameter combination selected for the sonar data.
grid_search_sonar.best_params_

In [None]:
# Build the sonar LDA model from the hyperparameters selected by the grid search.
# Reading them from best_params_ (instead of hand-copying the displayed values)
# keeps the model in sync if the data, grid, or library version changes.
lda_model_sonar = LinearDiscriminantAnalysis(**grid_search_sonar.best_params_)

In [None]:
# Fit the sonar LDA model on the training split.
lda_model_sonar.fit(X_train_sonar, y_train_sonar)

In [None]:
# Predicted class labels for the held-out sonar test predictors.
y_pred_sonar= lda_model_sonar.predict(X_test_sonar)

In [None]:
# Confusion matrix on the sonar test set (first argument: true labels, second: LDA predictions).
confusion_matrix (y_test_sonar, y_pred_sonar)

In [None]:
# Classification report for the sonar LDA test predictions.
print(classification_report (y_test_sonar, y_pred_sonar))

### QDA applied to the sonar dataset

WORK ON IT INDEPENDENTLY FOR THREE MINUTES

### Logistic regression applied to the sonar dataset

WORK ON IT INDEPENDENTLY FOR FIVE MINUTES

Apply logistic regression following these requirements:

a) Of course, standardize the predictors

b) Select the hyperparameters using CV via Grid Search

c) Select the best hyperparameters based on accuracy

In [None]:
from sklearn.linear_model import LogisticRegression

### KNN applied to the sonar dataset

WORK ON IT INDEPENDENTLY FOR FIVE MINUTES

Apply KNN following these requirements:

a) Of course, standardize the predictors

b) Select the hyperparameters using CV via Grid Search

c) Select the best hyperparameters based on accuracy

In [None]:
from sklearn.neighbors import KNeighborsClassifier