In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix


from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read
# by path (this import is only available inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the raw patient dataset from the mounted Drive and preview the first rows.
cancer_set = pd.read_csv('/content/drive/MyDrive/Data_Science/cancer_set.csv')
cancer_set.head()

Unnamed: 0,index,Patient Id,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,...,Fatigue,Weight Loss,Shortness of Breath,Wheezing,Swallowing Difficulty,Clubbing of Finger Nails,Frequent Cold,Dry Cough,Snoring,Level
0,0,P1,33,1,2,4,5,4,3,2,...,3,4,2,2,3,1,2,3,4,Low
1,1,P10,17,1,3,1,5,3,4,2,...,1,3,7,8,6,2,1,7,2,Medium
2,2,P100,35,1,4,5,6,5,5,4,...,8,7,9,2,1,4,6,7,2,High
3,3,P1000,37,1,7,7,7,7,6,7,...,4,2,3,1,4,5,6,7,5,High
4,4,P101,46,1,6,8,7,7,7,6,...,3,2,4,1,4,2,4,2,3,High


In [None]:
# (rows, columns) of the raw dataset
cancer_set.shape

(1000, 26)

In [None]:
# Keep only patients aged 30 or over, renumbering the rows from 0 so that
# positional and label-based indexing agree afterwards.
filtered_data = cancer_set.loc[cancer_set['Age'] >= 30].reset_index(drop=True)
filtered_data.head()

Unnamed: 0,index,Patient Id,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,...,Fatigue,Weight Loss,Shortness of Breath,Wheezing,Swallowing Difficulty,Clubbing of Finger Nails,Frequent Cold,Dry Cough,Snoring,Level
0,0,P1,33,1,2,4,5,4,3,2,...,3,4,2,2,3,1,2,3,4,Low
1,2,P100,35,1,4,5,6,5,5,4,...,8,7,9,2,1,4,6,7,2,High
2,3,P1000,37,1,7,7,7,7,6,7,...,4,2,3,1,4,5,6,7,5,High
3,4,P101,46,1,6,8,7,7,7,6,...,3,2,4,1,4,2,4,2,3,High
4,5,P102,35,1,4,5,6,5,5,4,...,8,7,9,2,1,4,6,7,2,High


In [None]:
# Binary target: 1 where Level is exactly 'High', 0 for every other level.
filtered_data['High Risk'] = filtered_data['Level'].eq('High').astype(int)

Convert the values in the final column (`Level`) into a binary target column, `High Risk`:
`High` becomes 1; everything else becomes 0.

In [None]:
# (rows, columns) after the age filter and the added 'High Risk' column
filtered_data.shape

(699, 27)

In [None]:
# Features are picked by column position; per the preview of X these are
# Air Pollution, Dust Allergy, Passive Smoker and Shortness of Breath.
X = filtered_data.iloc[:, [4, 6, 13, 18]]
# Target is the last column ('High Risk'). The list index [-1] keeps Y as a
# single-column DataFrame rather than a Series.
Y = filtered_data.iloc[:, [-1]]
X.head()

Unnamed: 0,Air Pollution,Dust Allergy,Passive Smoker,Shortness of Breath
0,2,5,2,2
1,4,6,3,9
2,7,7,7,3
3,6,7,7,4
4,4,6,3,9


In [None]:
# Preview the binary target column
Y.head()

Unnamed: 0,High Risk
0,0
1,1
2,1
3,1
4,1


Check for imbalance

In [None]:
# Step 1: hold out 20% of the rows as the final test set.
X_trainval, X_test, Y_trainval, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0
)

# Step 2: take 25% of the remaining 80% as validation data
# (0.25 * 0.8 = 0.2 of the total), which leaves 60% for training.
X_train, X_val, Y_train, Y_val = train_test_split(
    X_trainval, Y_trainval, test_size=0.25, random_state=0
)

Split the data into training, validation and test sets (60/20/20).

In [None]:
# Report the (rows, columns) shape of each feature split.
for split_label, split_features in (
    ('Training Data: ', X_train),
    ('Test Data: ', X_test),
    ('Validation Data: ', X_val),
):
    print(split_label, split_features.shape)

Training Data:  (419, 4)
Test Data:  (140, 4)
Validation Data:  (140, 4)


Check the class imbalance in the training set

In [None]:
# Count high-risk (1) versus other (0) labels in the training split.
is_high_risk = Y_train['High Risk'] == 1
high_vals = int(is_high_risk.sum())
low_vals = len(is_high_risk) - high_vals
print('High vals: ', high_vals)
print('Low vals: ', low_vals)

High vals:  160
Low vals:  259


In [None]:
# Balance the training split by randomly undersampling; the fixed
# random_state keeps the selection reproducible. Only the training data is
# resampled; the validation and test splits are left untouched.
undersampler = RandomUnderSampler(random_state=0)
X_train_resampled, Y_train_resampled = undersampler.fit_resample(X_train, Y_train)
print('Balanced Training Data Shape: ', X_train_resampled.shape)

Balanced Training Data Shape:  (320, 4)


In [None]:
# Fit one SVC per kernel on the balanced (undersampled) training data and
# score each model on the validation split.
#
# The targets are single-column DataFrames. A DataFrame has no .ravel()
# method, so calling Y_val.ravel() raises AttributeError, and passing a
# column-vector y to SVC.fit triggers a DataConversionWarning. np.ravel
# flattens a DataFrame, Series or array alike into the 1-D form sklearn expects.
y_train_1d = np.ravel(Y_train_resampled)
y_val_1d = np.ravel(Y_val)

# gamma is only used by the rbf, sigmoid and poly kernels; it is passed to the
# linear model as well purely to keep the four constructors parallel.
model_linear = SVC(C = 1, kernel = 'linear', gamma = 0.005)
model_linear = model_linear.fit(X_train_resampled, y_train_1d)

model_rbf = SVC(C = 1, kernel = 'rbf', gamma = 0.005)
model_rbf = model_rbf.fit(X_train_resampled, y_train_1d)

model_sig = SVC(C = 1, kernel = 'sigmoid', gamma = 0.005)
model_sig = model_sig.fit(X_train_resampled, y_train_1d)

model_poly = SVC(C = 1, kernel = 'poly', gamma = 0.005)
model_poly = model_poly.fit(X_train_resampled, y_train_1d)

# predictions on the validation split
ypred_linear_val = model_linear.predict(X_val)
ypred_rbf_val = model_rbf.predict(X_val)
ypred_sig_val = model_sig.predict(X_val)
ypred_poly_val = model_poly.predict(X_val)

# confusion matrices for the validation predictions
cm_linear_val = confusion_matrix(y_val_1d, ypred_linear_val)
cm_rbf_val = confusion_matrix(y_val_1d, ypred_rbf_val)
cm_sig_val = confusion_matrix(y_val_1d, ypred_sig_val)
cm_poly_val = confusion_matrix(y_val_1d, ypred_poly_val)

# accuracy, precision and recall on the validation split
acc_linear_val = accuracy_score(y_val_1d, ypred_linear_val)
prec_linear_val = precision_score(y_val_1d, ypred_linear_val)
rec_linear_val = recall_score(y_val_1d, ypred_linear_val)

acc_rbf_val = accuracy_score(y_val_1d, ypred_rbf_val)
prec_rbf_val = precision_score(y_val_1d, ypred_rbf_val)
rec_rbf_val = recall_score(y_val_1d, ypred_rbf_val)

acc_sig_val = accuracy_score(y_val_1d, ypred_sig_val)
prec_sig_val = precision_score(y_val_1d, ypred_sig_val)
rec_sig_val = recall_score(y_val_1d, ypred_sig_val)

acc_poly_val = accuracy_score(y_val_1d, ypred_poly_val)
prec_poly_val = precision_score(y_val_1d, ypred_poly_val)
rec_poly_val = recall_score(y_val_1d, ypred_poly_val)

In [None]:
# Training-set counterpart of the validation metrics: score every fitted
# kernel on the balanced data it was trained on, so training and validation
# performance can be compared for signs of over- or under-fitting.

# predictions on the (resampled) training data, one per kernel
ypred_linear_train, ypred_rbf_train, ypred_sig_train, ypred_poly_train = (
    fitted.predict(X_train_resampled)
    for fitted in (model_linear, model_rbf, model_sig, model_poly)
)

# accuracy, precision and recall (in that order) for each kernel
acc_linear_train, prec_linear_train, rec_linear_train = (
    metric(Y_train_resampled, ypred_linear_train)
    for metric in (accuracy_score, precision_score, recall_score)
)

acc_rbf_train, prec_rbf_train, rec_rbf_train = (
    metric(Y_train_resampled, ypred_rbf_train)
    for metric in (accuracy_score, precision_score, recall_score)
)

acc_sig_train, prec_sig_train, rec_sig_train = (
    metric(Y_train_resampled, ypred_sig_train)
    for metric in (accuracy_score, precision_score, recall_score)
)

acc_poly_train, prec_poly_train, rec_poly_train = (
    metric(Y_train_resampled, ypred_poly_train)
    for metric in (accuracy_score, precision_score, recall_score)
)



In [None]:
# Validation confusion matrices in kernel order: linear, rbf, sigmoid, poly.
# Each matrix except the last is followed by a blank separator line.
for matrix in (cm_linear_val, cm_rbf_val, cm_sig_val):
    print(matrix, "\n")
print(cm_poly_val)

In [None]:
# Compare validation and training accuracy across the four kernels.
kernel_names = ['linear', 'rbf', 'sigmoid', 'poly']
accuracies_val = [acc_linear_val, acc_rbf_val, acc_sig_val, acc_poly_val]
accuracies_train = [acc_linear_train, acc_rbf_train, acc_sig_train, acc_poly_train]
plt.plot(kernel_names, accuracies_val, marker='o', label= 'validation accuracy')
plt.plot(kernel_names, accuracies_train, marker='o', label= 'training accuracy')

plt.xlabel('Kernel Types')
plt.ylabel('Accuracy')
plt.title('Accuracy of Different Kernels')
plt.grid(True)
plt.legend()
# Render explicitly so the cell output is the figure alone rather than the
# repr of the Legend object returned by the last statement.
plt.show()

In [None]:
# Compare validation and training precision across the four kernels.
kernel_names = ['linear', 'rbf', 'sigmoid', 'poly']
precisions_val = [prec_linear_val, prec_rbf_val, prec_sig_val, prec_poly_val]
precisions_train = [prec_linear_train, prec_rbf_train, prec_sig_train, prec_poly_train]
plt.plot(kernel_names, precisions_val, marker='o', label = 'validation precision')
plt.plot(kernel_names, precisions_train, marker='o', label = 'training precision')

plt.xlabel('Kernel Types')
plt.ylabel('Precision')
plt.title('Precision of Different Kernels')
plt.grid(True)
plt.legend()
# Render explicitly so the cell output is the figure alone rather than the
# repr of the Legend object returned by the last statement.
plt.show()

In [None]:
# Compare validation and training recall across the four kernels.
kernel_names = ['linear', 'rbf', 'sigmoid', 'poly']
recall_val = [rec_linear_val, rec_rbf_val, rec_sig_val, rec_poly_val]
recall_train = [rec_linear_train, rec_rbf_train, rec_sig_train, rec_poly_train]
plt.plot(kernel_names, recall_val, marker='o', label = 'validation recall')
plt.plot(kernel_names, recall_train, marker='o', label = 'training recall')

plt.xlabel('Kernel Types')
plt.ylabel('Recall')
plt.title('Recall of Different Kernels')
plt.grid(True)
plt.legend()
# Render explicitly so the cell output is the figure alone rather than the
# repr of the Legend object returned by the last statement.
plt.show()

In [None]:
# Candidate values for the SVC regularisation parameter C, in increasing
# order. The grid is log-spaced: each value is 10x the previous one.
#
# Alternative kept for reference (disabled): a dense linear grid,
#   c_training = list(np.arange(0.1, 7, 0.005))

c_training = [0.001, 0.01, 0.1, 1, 10, 100]

In [None]:
# Sweep the regularisation strength C (SVC's default kernel) and record the
# accuracy of each model on the data it was fitted to.
#
# The sweep is fitted on the undersampled training set so that it matches the
# data used for the kernel-comparison models; fitting on the unbalanced
# X_train / Y_train would describe a different training problem.
# The target is flattened with np.ravel because a single-column DataFrame
# makes SVC.fit emit a DataConversionWarning on every iteration.
y_train_sweep = np.ravel(Y_train_resampled)
accuracy_c = []

for c in c_training:
  svm_model = SVC(C = c)
  svm_model.fit(X_train_resampled, y_train_sweep)
  train_accuracy = svm_model.score(X_train_resampled, y_train_sweep)
  accuracy_c.append(train_accuracy)

plt.plot(c_training, accuracy_c, marker='o')
plt.xlabel('C values')
plt.ylabel('Training Accuracy')
plt.title('Effect of C on Training Accuracy')
# C spans several orders of magnitude, so use a logarithmic x-axis
plt.xscale('log')
plt.show()


In [None]:
# Final evaluation on the held-out test split using an RBF-kernel SVC.
#
# The model is fitted through model_test itself. Calling model_rbf.fit(...)
# here would throw away the SVC constructed on the line above and instead
# refit model_rbf, leaving model_test as just another name for model_rbf.
model_test = SVC(C = 1, kernel = 'rbf', gamma = 0.005)
model_test = model_test.fit(X_train_resampled, np.ravel(Y_train_resampled))

ypred_test = model_test.predict(X_test)

# Flatten the single-column target to the 1-D form the metrics expect
y_test_1d = np.ravel(Y_test)

acc_test = accuracy_score(y_test_1d, ypred_test)
prec_test = precision_score(y_test_1d, ypred_test)
rec_test = recall_score(y_test_1d, ypred_test)

print(acc_test)
print(prec_test)
print(rec_test)


In [None]:
# RBF kernel on the training data: accuracy, precision, recall (one per line)
print(acc_rbf_train, prec_rbf_train, rec_rbf_train, sep="\n")

In [None]:
# RBF kernel on the validation data: accuracy, precision, recall (one per line)
print(acc_rbf_val, prec_rbf_val, rec_rbf_val, sep="\n")

0.9785714285714285
0.9411764705882353
1.0


In [None]:
# Linear kernel on the validation data: accuracy, precision, recall (one per line)
print(acc_linear_val, prec_linear_val, rec_linear_val, sep="\n")

0.9785714285714285
0.9411764705882353
1.0


In [None]:
# Preview the validation features (row labels are the original row indices
# of filtered_data, shuffled by the split)
X_val.head()

Unnamed: 0,Air Pollution,Dust Allergy,Passive Smoker,Shortness of Breath
57,6,7,2,6
326,3,1,2,2
15,4,6,3,9
563,2,5,4,2
412,1,7,4,6


In [None]:
# Preview the training features before undersampling
X_train.head()

Unnamed: 0,Air Pollution,Dust Allergy,Passive Smoker,Shortness of Breath
431,6,7,2,6
59,3,4,2,6
19,2,5,4,2
463,8,7,7,4
254,3,1,2,1


Modelling how increasing the values of C affects results