In [None]:
#importing
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import f1_score
from sklearn import preprocessing
from sklearn import svm
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeClassifier
from mpl_toolkits import mplot3d
from sklearn.neural_network import MLPClassifier
plt.rcParams["figure.figsize"] = [7,7]

In the following project we will examine a German Credit Risk dataset. This dataset has 1000 entries, with each entry representing an individual seeking credit. Each individual has been classified as a good (+1) or bad (-1) credit risk according to a set of features. Our goal here is to train a Support Vector Machine classifier on the dataset to predict credit risk.

In this notebook we will first import and prepare the data to be used in the SVMs. We will then investigate different types of SVMs and different parameters, and choose the set of parameters offering the best level of correct prediction of credit risk. Finally, we will run these classifiers on test data to see how well our final classifier performs.

### Importing and Preparing Data

In [None]:
# Load the credit dataset from a CSV file located next to the notebook (relative path).
ger = pd.read_csv("german.data-numeric-withheader.csv")

In [None]:
# Sanity check: display the loaded DataFrame to confirm the CSV was read in correctly
ger

In [None]:
# First 5 rows
ger.head()

In [None]:
# Extract the design matrix X and the labels y.
# The label column is removed by name rather than by position, so the target can
# never leak into X regardless of where CREDITRATING sits in the CSV.
X = ger.drop(columns=["CREDITRATING"])
y = ger["CREDITRATING"]

In [None]:
# Shapes of the design matrix and the label vector, one per line
print(X.shape, y.shape, sep="\n")

In [None]:
# Histograms of a few selected features, one per panel of a 2x2 grid

fig, ax = plt.subplots(2, 2)
panels = [
    ("Loan_NurnMonth", 'Loan Duration Months'),
    ("CreditAmt", 'Credit Amount'),
    ("AgeInYears", 'Age'),
    ("ForeignWorker", 'Foriegn Worker Status'),
]
# ax.flat walks the grid row by row, which matches the order of `panels`
for axis, (column, panel_title) in zip(ax.flat, panels):
    axis.hist(X[column])
    axis.set_title(panel_title)
fig.suptitle('Histograms of certain features')
plt.show()
# Re-apply the default figure size for the figures that follow
plt.rcParams["figure.figsize"] = [7, 7]

In [None]:
# Histogram of the loan duration, converted from months to years
LoanDurYr = X["Loan_NurnMonth"].div(12)
plt.hist(LoanDurYr)
plt.title("Loan Duration in Years")
plt.show()

In [None]:
# Hold out 25% of the rows as test data (the same fraction train_test_split uses by default).
# The fixed random_state keeps the split identical between runs.

split_options = dict(test_size=0.25, random_state=7)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# Baseline: SVC with default settings on the unscaled features.
# NOTE: the number printed here is a single hold-out accuracy on the test split,
# not a cross-validation mean.

clf = svm.SVC().fit(X_train, y_train)
baseline_accuracy = clf.score(X_test, y_test)
print("The mean classifier score is", baseline_accuracy)



In [None]:
# Scaling with StandardScaler
scaler_std = preprocessing.StandardScaler()

# Learn the scaling statistics from the training split and scale it in one call
X_train_scaled_std = scaler_std.fit_transform(X_train)

# Apply the training-split statistics to the test split (the scaler is not refitted)
X_test_scaled_std = scaler_std.transform(X_test)

In [None]:
# Same procedure with MinMaxScaler: fit on the training split only,
# then apply the fitted scaler to both splits.

scaler_mm = preprocessing.MinMaxScaler().fit(X_train)
X_train_scaled_mm = scaler_mm.transform(X_train)
X_test_scaled_mm = scaler_mm.transform(X_test)

### Testing different kernels and different scaling techniques
-------

### No Scaling

In [None]:

# Compare the four SVC kernels on the unscaled training data using the mean
# cross-validation score.
# cross_val_score fits its own clones of the estimator on every fold, so the separate
# clf.fit(...) on the whole training set that used to precede it was wasted work
# and has been removed; the printed scores are unchanged.
for kernel in ("linear", "poly", "rbf", "sigmoid"):
    clf = svm.SVC(kernel=kernel)
    scores = cross_val_score(clf, X_train, y_train)
    mean = scores.mean()
    print("The mean classifier score for", kernel, "is", '{0:.4g}'.format(mean))

### Standard Scaling

In [None]:
# Compare the four SVC kernels on the standard-scaled training data using the mean
# cross-validation score.
# cross_val_score fits its own clones of the estimator on every fold, so the separate
# clf.fit(...) on the whole training set that used to precede it was wasted work
# and has been removed; the printed scores are unchanged.
for kernel in ("linear", "poly", "rbf", "sigmoid"):
    clf = svm.SVC(kernel=kernel)
    scores = cross_val_score(clf, X_train_scaled_std, y_train)
    mean = scores.mean()
    print("The mean classifier score for", kernel, "is", '{0:.4g}'.format(mean))

### MinMaxScaling

In [None]:


# Compare the four SVC kernels on the min-max scaled training data using the mean
# cross-validation score.
# cross_val_score fits its own clones of the estimator on every fold, so the separate
# clf.fit(...) on the whole training set that used to precede it was wasted work
# and has been removed; the printed scores are unchanged.
for kernel in ("linear", "poly", "rbf", "sigmoid"):
    clf = svm.SVC(kernel=kernel)
    scores = cross_val_score(clf, X_train_scaled_mm, y_train)
    mean = scores.mean()
    print("The mean classifier score for", kernel, "is", '{0:.4g}'.format(mean))

It's clear that both methods of scaling the data performed better overall than the unscaled data.

And RBF and linear seem to perform the best purely based off of mean classifier scores so we will only consider those moving forward.

We will move forward using MinMax scaling. Although it performed marginally worse than standard scaling for the rbf kernel, standard scaling may not perform as well as we would like if the underlying data isn't normally distributed, and normality may not hold for this dataset. Although standard scaling performed fine for us here, if one of the features has an extremely large variance, it may affect the ability of the estimator to learn from the other features correctly.

The motivations for min-max scaling include robustness to very small standard deviations of features and preserving zero entries in sparse data.


### Hyperparameter Tuning

In [None]:
# Rebuild both min-max scaled splits from the already-fitted scaler_mm
# (training split first, then test split).
X_train_scaled_mm, X_test_scaled_mm = (
    scaler_mm.transform(split) for split in (X_train, X_test)
)

#### Tuning for Linear kernel

In [None]:
# Hyperparameter tuning for the linear kernel: sweep C and cross-validate each value

# Grid of C values to try
C_vals = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100, 500, 1000]

# One slot per C value for the mean and the standard deviation of the CV scores
Ncases = len(C_vals)
score_mean = np.zeros(Ncases)
score_std = np.zeros(Ncases)
for k, C in enumerate(C_vals):
    # Classifier for this grid point
    clf = svm.SVC(kernel='linear', C=C)
    # The cross-validation is the expensive step of this cell
    scores = cross_val_score(clf, X_train_scaled_mm, y_train)
    score_mean[k], score_std[k] = scores.mean(), scores.std()
    print("C = ", C, ", Avg Score = ", format(score_mean[k], '.4g'))

# Mean score against C (log x-axis) with a shaded band of one standard deviation either side
band_low = score_mean - score_std
band_high = score_mean + score_std
plt.semilogx(C_vals, score_mean, 'r', label='Cross Val Score')
plt.fill_between(C_vals, band_low, band_high, alpha=0.2, label='Score +/- std')
plt.xlabel("C", fontsize="14")
plt.ylabel("mean score +/- std", fontsize="14")
plt.title('Cross Validation Scores for varying values of C, linear kernel')
plt.legend()
plt.show()

##### We note how the scores for C = 0.1, 0.5 and 1 are essentially the same, but the spread of the cross-validation scores (the ±1 standard deviation band in the plot) for C = 1 is slightly smaller, hence we will move forward using C = 1 with the linear kernel.

### Tuning for rbf kernel

In [None]:
# Hyperparameter tuning of C for the rbf kernel (gamma left at its default).
# The redundant re-import of cross_val_score was dropped: it is already imported in the
# notebook's import cell and used by earlier cells.

# Grid of C values to try
C_vals = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100, 500, 1000]
Ncases = len(C_vals)
# One slot per C value for the mean and the standard deviation of the CV scores
score_mean = np.zeros(Ncases)
score_std = np.zeros(Ncases)

for k, C in enumerate(C_vals):
    # Classifier for this grid point
    clf = svm.SVC(kernel='rbf', C=C)
    # The cross-validation is the expensive step of this cell
    scores = cross_val_score(clf, X_train_scaled_mm, y_train)
    score_mean[k] = scores.mean()
    score_std[k] = scores.std()
    print("C = ", C, ", Avg Score = ", '{0:.4g}'.format(score_mean[k]))

# Mean score against C (log x-axis) with a shaded band of one standard deviation either side
plt.semilogx(C_vals, score_mean, 'r', label='Cross Val Score')
plt.fill_between(C_vals, score_mean - score_std, score_mean + score_std, alpha=0.2, label='Score +/- std')
plt.xlabel("C", fontsize="14")
plt.ylabel("mean score +/- std", fontsize="14")
plt.title('Cross Validation Scores for varying values of C, rbf kernel')
plt.legend()
plt.show()

In [None]:
# Hyperparameter tuning of gamma for the rbf kernel, with C held at 1.
# The redundant re-import of cross_val_score was dropped: it is already imported in the
# notebook's import cell and used by earlier cells.

# Grid of gamma values to try
gam_vals = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 10, 50, 100, 500, 1000]
Ncases = len(gam_vals)
# One slot per gamma value for the mean and the standard deviation of the CV scores
score_mean = np.zeros(Ncases)
score_std = np.zeros(Ncases)

for k, gamma in enumerate(gam_vals):
    # Classifier for this grid point
    clf = svm.SVC(kernel='rbf', C=1, gamma=gamma)
    # The cross-validation is the expensive step of this cell
    scores = cross_val_score(clf, X_train_scaled_mm, y_train)
    score_mean[k] = scores.mean()
    score_std[k] = scores.std()
    print("Gamma = ", gamma, ", Avg Score = ", '{0:.4g}'.format(score_mean[k]))
print('\n')

# Mean score against gamma (log x-axis) with a shaded band of one standard deviation either side
plt.semilogx(gam_vals, score_mean, 'r', label='Cross Val Score')
plt.fill_between(gam_vals, score_mean - score_std, score_mean + score_std, alpha=0.2, label='Score +/- std')
plt.xlabel("Gamma", fontsize="14")
plt.ylabel("mean score +/- std", fontsize="14")
plt.title('Cross Validation Scores for varying values of gamma, rbf kernel')
plt.legend()
plt.show()

From these two tests it seems the average score is maximised when C is one of our values between 0.5 and 10, and gamma is either 0.1 or 1.
Further testing for these parameters is below.

In [None]:
# Narrowed grid of C values for the follow-up rbf search, plus zeroed buffers
# (one slot per C value) for the mean and standard deviation of the CV scores.
C_vals = [0.5, 1, 5, 10]
Ncases = len(C_vals)
score_mean, score_std = np.zeros(Ncases), np.zeros(Ncases)

In [None]:
# Sweep the narrowed C grid for the rbf kernel with gamma held at 0.1
for k, C in enumerate(C_vals):
    # Classifier for this grid point
    clf = svm.SVC(kernel='rbf', C=C, gamma=0.1)
    # The cross-validation is the expensive step of this cell
    scores = cross_val_score(clf, X_train_scaled_mm, y_train)
    # Store the mean and spread of the fold scores for this C
    score_mean[k], score_std[k] = scores.mean(), scores.std()
    print("C = ", C, ", Avg Score = ", format(score_mean[k], '.4g'))

In [None]:
# Sweep the narrowed C grid for the rbf kernel with gamma held at 1
for k, c_value in enumerate(C_vals):
    # Classifier for this grid point
    clf = svm.SVC(kernel='rbf', C=c_value, gamma=1)
    # The cross-validation is the expensive step of this cell
    scores = cross_val_score(clf, X_train_scaled_mm, y_train)
    # Store the mean and spread of the fold scores for this C
    score_mean[k], score_std[k] = scores.mean(), scores.std()
    print("C = ", c_value, ", Avg Score = ", format(score_mean[k], '.4g'))

Hence, basing our decision purely on mean cross-validation scores, we will set C = 5 and gamma = 0.1 for the rbf kernel. However, it is possible that this pair of parameters has a higher variance than another pair of parameters with a slightly lower CV score.

### Testing and comparing classifiers

#### Default SVC classifier

In [None]:

# Reference model: SVC with default hyperparameters, trained and tested on the
# min-max scaled splits.
clf = svm.SVC().fit(X_train_scaled_mm, y_train)
y_predict = clf.predict(X_test_scaled_mm)

test_accuracy = clf.score(X_test_scaled_mm, y_test)
print("The classifier score is", test_accuracy)

print("\nThe confusion matrix is")
# Confusion matrix on the test split
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2; on newer versions
# this call (and its import) must become ConfusionMatrixDisplay.from_estimator.
plot_confusion_matrix(clf, X_test_scaled_mm, y_test)
plt.title('confusion matrix for Default SVC')
plt.show()

report = classification_report(y_test, y_predict)
print("\n", "Classification Report:", "\n", report)

#### Tuned Linear classifier

In [None]:
# Linear-kernel SVC with the tuned value C = 1, trained and tested on the
# min-max scaled splits.
clf = svm.SVC(kernel='linear', C=1).fit(X_train_scaled_mm, y_train)
y_predict = clf.predict(X_test_scaled_mm)

test_accuracy = clf.score(X_test_scaled_mm, y_test)
print("The classifier score is", test_accuracy, "\n")

# Confusion matrix on the test split
plot_confusion_matrix(clf, X_test_scaled_mm, y_test)
plt.title('confusion matrix for tuned linear SVC')
plt.show()

report = classification_report(y_test, y_predict)
print("\n", "Classification Report:", "\n", report)

#### Tuned Rbf Classifier

In [None]:
# rbf-kernel SVC with the tuned values C = 5 and gamma = 0.1, trained and tested
# on the min-max scaled splits.
clf = svm.SVC(kernel='rbf', C=5, gamma=0.1).fit(X_train_scaled_mm, y_train)
y_predict = clf.predict(X_test_scaled_mm)

test_accuracy = clf.score(X_test_scaled_mm, y_test)
print("The classifier score is", test_accuracy, "\n")

# Confusion matrix on the test split
plot_confusion_matrix(clf, X_test_scaled_mm, y_test)
plt.title('confusion matrix for tuned rbf SVC')
plt.show()

report = classification_report(y_test, y_predict)
print("\n", "Classification Report:", "\n", report)

#### Decision Tree classifier

In [None]:
# Untuned decision tree (seeded with random_state=7) for comparison,
# trained and tested on the min-max scaled splits.
tree = DecisionTreeClassifier(random_state=7).fit(X_train_scaled_mm, y_train)
y_predict = tree.predict(X_test_scaled_mm)

test_accuracy = tree.score(X_test_scaled_mm, y_test)
print("The classifier score is", test_accuracy, "\n")

# Confusion matrix on the test split
plot_confusion_matrix(tree, X_test_scaled_mm, y_test)
plt.title('Confusion Matrix for Decision Tree Classifier')
plt.show()

report = classification_report(y_test, y_predict)
print("\n", "Classification Report:", "\n", report)

#### MLP classifier

In [None]:
# Multi-layer perceptron classifier for comparison, trained and tested on the
# min-max scaled splits.
# random_state is now fixed (7, the seed already used for the train/test split and the
# decision tree): without it the MLP's random initialisation made the reported score
# change from run to run.
# The name `regr` is kept for compatibility, although the model is a classifier.
regr = MLPClassifier(learning_rate_init=0.01, max_iter=5000, random_state=7)
regr.fit(X_train_scaled_mm, y_train)

y_predict = regr.predict(X_test_scaled_mm)

print("The classifier score is", regr.score(X_test_scaled_mm, y_test), "\n")

# Confusion matrix on the test split
plot_confusion_matrix(regr, X_test_scaled_mm, y_test)
plt.title('Confusion Matrix for MLP Classifier')
plt.show()

print("\n", "Classification Report:", "\n", classification_report(y_test, y_predict))

### Discussion 

We note how the default classifier performs pretty well; however, we can improve upon it thanks to our hyperparameter tuning, which shows how important carrying out the hyperparameter search is. The default decision tree performs the worst, but we have not tuned any of the parameters for the decision tree, so it is possible we could improve its predictive power. The MLP classifier performs slightly worse than the default SVC classifier.

Finally, after our hyperparameter search we were able to predict credit ratings from our training data with ~80% accuracy using an SVM and hyperparameter tuning. This could potentially be improved by increasing the size of our training data if we were given a bigger overall sample. This study could also be improved by performing a grid search over the hyperparameters, but that may take significantly more time and be more computationally intensive. We could also examine the underlying data before carrying out the study and correct for outliers, or see if the data follows any underlying distributions, which may affect our choice of data scaling or lead us to choose another method not mentioned here.

We also note how, in general, all of the different classifiers perform better at correctly classifying someone who has bad credit in reality into the bad credit category, given the information from the 24 features. This isn't too surprising, as there are 700 people with bad credit in the dataset and 300 with good credit, so we will logically have a much smaller training and test sample for people with good credit ratings. A dataset with an equal number of people with good and bad credit would therefore possibly help us improve our algorithm's accuracy.