In [None]:
# importing libraries
import numpy as np
import pandas as pd

# data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# modelling and evaluation
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the diabetes table from the CSV expected in the working directory.
diabetes = pd.read_csv('diabetes.csv')

# Columns 0-7 form the feature matrix, column 8 is the target vector.
X = diabetes.iloc[:, :8].values
Y = diabetes.iloc[:, 8].values

# Hold-out split (80 % train / 20 % test, fixed seed) shared by problems 1 and 2.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    train_size=0.8,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Problem 1
# Logistic regression on standardized features, evaluated on the hold-out split.

# Create a scaler object
sc = StandardScaler()

# Fit the scaler on the training split only, then transform that split
X_train_std = sc.fit_transform(X_train)

# Re-use the training-split statistics for the test split
X_test_std = sc.transform(X_test)

# Train and predict on the standardized arrays so the scaling above is
# actually used by the model (the raw X_train / X_test stay untouched).
model = LogisticRegression(solver='liblinear')
model.fit(X_train_std, Y_train)
predicted = model.predict(X_test_std)

In [None]:
# Summaries of the hold-out predictions; `matrix` feeds the heatmap cell below.
matrix = confusion_matrix(y_true=Y_test, y_pred=predicted)
report = classification_report(y_true=Y_test, y_pred=predicted)

# Headline scores, printed as percentages with three decimals.
print('For Problem 1:')
print('')
print("Accuracy: %.3f%%" % (100.0 * accuracy_score(Y_test, predicted)))
print("Precision: %.3f%%" % (100.0 * precision_score(Y_test, predicted)))
print("Recall: %.3f%%" % (100.0 * recall_score(Y_test, predicted)))

In [None]:
# Draw the confusion matrix held in `matrix` as an annotated heatmap.
class_names=[0,1] #name of classes
fig, ax =plt.subplots()
# One tick position per class name.
# NOTE(review): these ticks are set before sns.heatmap draws on the axes;
# seaborn presumably applies its own tick labels afterwards - confirm these
# two calls still have a visible effect.
tick_marks = np.arange(len(class_names))
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)
#create heatmap
# annot=True writes the value into each cell; fmt='g' is the annotation format.
sns.heatmap(pd.DataFrame(matrix), annot=True, cmap="YlGnBu" ,fmt='g')
# Put the x-axis label above the plot.
ax.xaxis.set_label_position("top")
print('For Problem 1:')
# tight_layout runs before the title and axis labels are set; the statement
# order is kept as-is because it can affect the final layout.
plt.tight_layout()
plt.title('Confusion matrix', y=1.1)
plt.ylabel('Actual label')
# The trailing semicolon suppresses the text repr of the last expression.
plt.xlabel('Predicted label');

In [None]:
# Problem 2
# Standardize the hold-out split without overwriting the raw arrays.
# Create a scaler object
sc = StandardScaler()

# Fit the scaler to the training data and transform.
# The result is stored under its own name so that X_train keeps the raw
# values; overwriting it would make earlier cells behave differently when
# they are re-run after this one.
X_train_std = sc.fit_transform(X_train)

# Apply the scaler to the test data
X_test_std = sc.transform(X_test)

In [None]:
# 5 folds selected
# Problem 2: 5-fold cross-validation of logistic regression on (X, Y).
kfold = KFold(n_splits=5, random_state=42, shuffle=True)
metrics = ['accuracy', 'precision', 'recall']
# X is the raw feature matrix, so standardization is done inside a pipeline:
# each fold fits its scaler on that fold's training part only, which both
# applies the scaling this section asks for and avoids leaking test-fold
# statistics into training.
model = make_pipeline(StandardScaler(), LogisticRegression(solver='liblinear'))
results = cross_validate(model, X, Y,scoring=metrics, cv=kfold)
# Mean of each metric over the folds, as a percentage.
print('For Problem 2:')
print('')
print("Accuracy for K=5: %.3f%%" % (results['test_accuracy'].mean()*100.0))
print("Precision for K=5: %.3f%%" % (results['test_precision'].mean()*100.0))
print("Recall for K=5: %.3f%%" % (results['test_recall'].mean()*100.0))

In [None]:
# 10 folds selected
# Problem 2: 10-fold cross-validation of logistic regression on (X, Y).
kfold = KFold(n_splits=10, random_state=42, shuffle=True)
metrics = ['accuracy', 'precision', 'recall']
# X is the raw feature matrix, so standardization is done inside a pipeline:
# each fold fits its scaler on that fold's training part only, which both
# applies the scaling this section asks for and avoids leaking test-fold
# statistics into training.
model = make_pipeline(StandardScaler(), LogisticRegression(solver='liblinear'))
results = cross_validate(model, X, Y,scoring=metrics, cv=kfold)
# Mean of each metric over the folds, as a percentage.
print('For Problem 2:')
print('')
print("Accuracy for K=10: %.3f%%" % (results['test_accuracy'].mean()*100.0))
print("Precision for K=10: %.3f%%" % (results['test_precision'].mean()*100.0))
print("Recall for K=10: %.3f%%" % (results['test_recall'].mean()*100.0))

In [None]:
# Problem 3
# Part 1
from sklearn.datasets import load_breast_cancer

# Load the bundled breast-cancer dataset and keep a handle on its feature matrix.
breast = load_breast_cancer()
breast_data = breast['data']

In [None]:
# Feature matrix wrapped in a DataFrame (default integer column names).
breast_input = pd.DataFrame(data=breast_data)

In [None]:
# Class codes, one per sample, as shipped with the dataset.
breast_labels = breast['target']

In [None]:
# Turn the label vector into a single column so it can be concatenated with
# the feature matrix; -1 lets NumPy infer the row count instead of relying on
# a hard-coded sample count.
labels = np.reshape(breast_labels, (-1, 1))

In [None]:
# Append the label column to the right of the feature columns.
final_breast_data = np.concatenate((breast_data, labels), axis=1)

In [None]:
# Feature names shipped with the dataset, and the combined features+label table.
features = breast.feature_names
breast_dataset = pd.DataFrame(data=final_breast_data)

In [None]:
# Column names: every feature name followed by 'label' for the last column.
features_labels = np.append(features, 'label')
breast_dataset.columns = features_labels

In [None]:
# Replace the numeric class codes with readable names.
# The names come from the dataset's own target_names, which scikit-learn
# documents as ['malignant', 'benign'] (code 0 = malignant, 1 = benign), so
# code 0 becomes 'Malignant' and code 1 becomes 'Benign'.
label_names = {code: str(name).capitalize() for code, name in enumerate(breast.target_names)}
# Assign the mapped column back instead of using a chained inplace replace,
# which does not reliably update the frame under pandas Copy-on-Write.
breast_dataset['label'] = breast_dataset['label'].map(label_names)

In [None]:
# First 30 columns are the features; column 30 is the label.
X = breast_dataset.iloc[:, :30].values
Y = breast_dataset.iloc[:, 30].values

In [None]:
# Splitting the data for problems 3 and 4
# (80 % train / 20 % test with a fixed seed).
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    train_size=0.8,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit on the training split (fit returns the estimator itself), then predict
# the held-out samples.
model = LogisticRegression(solver='liblinear').fit(X_train, Y_train)
predicted = model.predict(X_test)

In [None]:
# Confusion matrix of the hold-out predictions.
matrix = confusion_matrix(y_true=Y_test, y_pred=predicted)

In [None]:
# Evaluate the Problem 3.1 classifier on the hold-out split.
pos_label =['Benign', 'Malignant']  # both class names; kept for any later cell that reads it
# Single class treated as "positive" by BOTH precision and recall. Scoring
# precision for one class and recall for the other yields a pair of numbers
# that cannot be read together.
positive_class = 'Malignant'
matrix = confusion_matrix(Y_test, predicted)
report = classification_report(Y_test, predicted)

print('For Problem 3.1:')
print('')
print("Accuracy: %.3f%%" % (accuracy_score(Y_test, predicted)*100.0))
print("Precision: %.3f%%" % (precision_score(Y_test, predicted, pos_label=positive_class)*100.0))
print("Recall: %.3f%%" % (recall_score(Y_test, predicted, pos_label=positive_class)*100.0))

In [None]:
# Draw the confusion matrix held in `matrix` as an annotated heatmap.
# NOTE(review): class_names are 0/1 although the labels scored in this section
# are the strings 'Benign' / 'Malignant'; which row/column corresponds to which
# class presumably follows confusion_matrix's label ordering - confirm.
class_names=[0,1] #name of classes
fig, ax =plt.subplots()
# One tick position per class name.
# NOTE(review): these ticks are set before sns.heatmap draws on the axes;
# seaborn presumably applies its own tick labels afterwards - confirm these
# two calls still have a visible effect.
tick_marks = np.arange(len(class_names))
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)
#create heatmap
# annot=True writes the value into each cell; fmt='g' is the annotation format.
sns.heatmap(pd.DataFrame(matrix), annot=True, cmap="YlGnBu" ,fmt='g')
# Put the x-axis label above the plot.
ax.xaxis.set_label_position("top")
print('For Problem 3.1:')
# tight_layout runs before the title and axis labels are set; the statement
# order is kept as-is because it can affect the final layout.
plt.tight_layout()
plt.title('Confusion matrix', y=1.1)
plt.ylabel('Actual label')
# The trailing semicolon suppresses the text repr of the last expression.
plt.xlabel('Predicted label');

In [None]:
# Problem 3
# Part 2
from sklearn.datasets import load_breast_cancer

# Reload the breast-cancer dataset so this part starts from fresh data.
breast = load_breast_cancer()
breast_data = breast['data']

In [None]:
# Rebuild the labelled breast-cancer table: feature columns plus a 'label' column.
breast_input=pd.DataFrame(breast_data)

breast_labels = breast.target

# -1 lets NumPy infer the row count instead of relying on a hard-coded sample count.
labels = np.reshape(breast_labels,(-1,1))

final_breast_data = np.concatenate([breast_data,labels],axis=1)

breast_dataset=pd.DataFrame(final_breast_data)
features = breast.feature_names

features_labels=np.append(features,'label')
breast_dataset.columns=features_labels

# Replace the numeric class codes with readable names taken from the dataset's
# own target_names, which scikit-learn documents as ['malignant', 'benign']
# (code 0 = malignant, 1 = benign).
label_names = {code: str(name).capitalize() for code, name in enumerate(breast.target_names)}
# Assign the mapped column back instead of using a chained inplace replace,
# which does not reliably update the frame under pandas Copy-on-Write.
breast_dataset['label'] = breast_dataset['label'].map(label_names)

In [None]:
# First 30 columns are the features; column 30 is the label.
X = breast_dataset.iloc[:, :30].values
Y = breast_dataset.iloc[:, 30].values

In [None]:
# Splitting the data for problems 3 and 4
# (80 % train / 20 % test with a fixed seed).
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    train_size=0.8,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Problem 3.2: L1-penalised logistic regression for several values of C,
# each scored on the hold-out split.
print('For Problem 3.2:')
print('')
C = [10, 1, .1, .001, .005]
pos_label =['Benign', 'Malignant']  # both class names; loop-invariant, so set once
# Single class treated as "positive" by BOTH precision and recall. Scoring
# precision for one class and recall for the other yields a pair of numbers
# that cannot be read together.
positive_class = 'Malignant'
for c in C:
    model = LogisticRegression(penalty='l1',C=c,solver='liblinear',max_iter=1000000)
    model.fit(X_train, Y_train)
    predicted = model.predict(X_test)
    # Recomputed per C; after the loop these hold the values for the last C.
    matrix = confusion_matrix(Y_test, predicted)
    report = classification_report(Y_test, predicted)
    print('C:', c)
    print("Accuracy: %.3f%% " % (accuracy_score(Y_test, predicted)*100.0))
    print("Precision: %.3f%%" % (precision_score(Y_test, predicted, pos_label=positive_class)*100.0))
    print("Recall: %.3f%%" % (recall_score(Y_test, predicted, pos_label=positive_class)*100.0))
    print('')

In [None]:
# Problem 4
# Part 1
from sklearn.datasets import load_breast_cancer

# Reload the breast-cancer dataset so this part starts from fresh data.
breast = load_breast_cancer()
breast_data = breast['data']

In [None]:
# Rebuild the labelled breast-cancer table: feature columns plus a 'label' column.
breast_input=pd.DataFrame(breast_data)

breast_labels = breast.target

# -1 lets NumPy infer the row count instead of relying on a hard-coded sample count.
labels = np.reshape(breast_labels,(-1,1))

final_breast_data = np.concatenate([breast_data,labels],axis=1)

breast_dataset=pd.DataFrame(final_breast_data)
features = breast.feature_names

features_labels=np.append(features,'label')
breast_dataset.columns=features_labels

# Replace the numeric class codes with readable names taken from the dataset's
# own target_names, which scikit-learn documents as ['malignant', 'benign']
# (code 0 = malignant, 1 = benign).
label_names = {code: str(name).capitalize() for code, name in enumerate(breast.target_names)}
# Assign the mapped column back instead of using a chained inplace replace,
# which does not reliably update the frame under pandas Copy-on-Write.
breast_dataset['label'] = breast_dataset['label'].map(label_names)

In [None]:
# First 30 columns are the features; column 30 is the label.
X = breast_dataset.iloc[:, :30].values
Y = breast_dataset.iloc[:, 30].values

In [None]:
# Splitting the data for problems 3 and 4
# (80 % train / 20 % test with a fixed seed).
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    train_size=0.8,
    test_size=0.2,
    random_state=42,
)

# Fit on the training split (fit returns the estimator itself), then predict
# the held-out samples.
model = LogisticRegression(solver='liblinear').fit(X_train, Y_train)
predicted = model.predict(X_test)

In [None]:
# 5 folds selected
# Confusion matrix of the earlier hold-out predictions.
matrix = confusion_matrix(y_true=Y_test, y_pred=predicted)

# Shuffled 5-fold cross-validation of a fresh model on the full (X, Y).
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
model = LogisticRegression(solver='liblinear')
results = cross_val_score(estimator=model, X=X, y=Y, cv=kfold)

# Mean fold score, as a percentage.
print('For Problem 4.1:')
print('')
print("Accuracy for K=5: %.3f%%" % (100.0 * results.mean()))

In [None]:
# 10 folds selected
# Shuffled 10-fold cross-validation of a fresh model on the full (X, Y).
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
model = LogisticRegression(solver='liblinear')
results = cross_val_score(estimator=model, X=X, y=Y, cv=kfold)

# Mean fold score, as a percentage.
print('For Problem 4.1:')
print('')
print("Accuracy for K=10: %.3f%%" % (100.0 * results.mean()))

In [None]:
# Problem 4
# Part 2
from sklearn.datasets import load_breast_cancer

# Reload the breast-cancer dataset so this part starts from fresh data.
breast = load_breast_cancer()
breast_data = breast['data']

In [None]:
# Rebuild the labelled breast-cancer table: feature columns plus a 'label' column.
breast_input=pd.DataFrame(breast_data)

breast_labels = breast.target

# -1 lets NumPy infer the row count instead of relying on a hard-coded sample count.
labels = np.reshape(breast_labels,(-1,1))

final_breast_data = np.concatenate([breast_data,labels],axis=1)

breast_dataset=pd.DataFrame(final_breast_data)
features = breast.feature_names

features_labels=np.append(features,'label')
breast_dataset.columns=features_labels

# Replace the numeric class codes with readable names taken from the dataset's
# own target_names, which scikit-learn documents as ['malignant', 'benign']
# (code 0 = malignant, 1 = benign).
label_names = {code: str(name).capitalize() for code, name in enumerate(breast.target_names)}
# Assign the mapped column back instead of using a chained inplace replace,
# which does not reliably update the frame under pandas Copy-on-Write.
breast_dataset['label'] = breast_dataset['label'].map(label_names)

In [None]:
# First 30 columns are the features; column 30 is the label.
X = breast_dataset.iloc[:, :30].values
Y = breast_dataset.iloc[:, 30].values

In [None]:
# Splitting the data for problems 3 and 4
# (80 % train / 20 % test with a fixed seed).
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    train_size=0.8,
    test_size=0.2,
    random_state=42,
)

# Fit on the training split (fit returns the estimator itself), predict the
# held-out samples, and tabulate the outcome.
model = LogisticRegression(solver='liblinear').fit(X_train, Y_train)
predicted = model.predict(X_test)
matrix = confusion_matrix(y_true=Y_test, y_pred=predicted)

In [None]:
# 5 folds selected
# L1-penalised logistic regression, cross-validated for each value of C.
print('For Problem 4.2:')
print('')
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
C = [10, 1, .1, .001, .005]
for c in C:
    model = LogisticRegression(
        penalty='l1',
        C=c,
        solver='liblinear',
        max_iter=100000,
    )
    results = cross_val_score(estimator=model, X=X, y=Y, cv=kfold)
    # Mean fold score for this C, as a percentage.
    print('C:',c)
    print("Accuracy for K=5: %.3f%%" % (100.0 * results.mean()))
    print('')

In [None]:
# 10 folds selected
# Problem 4.2: L1-penalised logistic regression, 10-fold cross-validated for
# each value of C.
print('For Problem 4.2:')
print('')
kfold = KFold(n_splits=10, random_state=42, shuffle=True)
C=[10, 1, .1, .001, .005]
for c in C:
    model = LogisticRegression(penalty='l1',C=c, solver='liblinear',max_iter=100000)
    results = cross_val_score(model, X, Y, cv=kfold)
    print('C:', c)
    # The label names the fold count actually used by `kfold` (10); it
    # previously read K=5, copied from the 5-fold cell.
    print("Accuracy for K=10: %.3f%%" % (results.mean()*100.0))
    print('')