In [19]:
import csv
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np

In [20]:
data = np.loadtxt("./breast_cancer.csv", delimiter=",",skiprows=1,dtype=float)
header_row = np.loadtxt("./breast_cancer.csv", delimiter=",",max_rows=1, dtype=str)

# In the data the Class 2 represents Benign and Class 4 represents Malignant
for row,index in enumerate(header_row):
    print(f"{row}:{index}")



0:Clump Thickness
1:Uniformity of Cell Size
2:Uniformity of Cell Shape
3:Marginal Adhesion
4:Single Epithelial Cell Size
5:Bare Nuclei
6:Bland Chromatin
7:Normal Nucleoli
8:Mitoses
9:Class


In [21]:
m,n = len(data),len(header_row)-1

x_data = np.array([row[:8] for row in data])
y_data = np.array([0 if row[9] == 2.0 else 1 for row in data])

x_train,x_,y_train,y_ = train_test_split(x_data,y_data,train_size=0.6,random_state=1)
x_cv,x_test,y_cv,y_test = train_test_split(x_,y_,train_size=0.5,random_state=1)



print(f"Training Data: {x_train.shape[0]}")
print(f"Cross Validation Data: {x_cv.shape[0]}")
print(f"Test Data: {x_test.shape[0]}")


Training Data: 409
Cross Validation Data: 137
Test Data: 137


In [22]:
# Add Polynomial Features
poly = PolynomialFeatures(degree=2)
# Initialize the scaler
scaler = StandardScaler()

# Fit and transform the training data
x_train_normalized = scaler.fit_transform(x_train)
x_cv_normalized = scaler.transform(x_cv)
x_test_normalized = scaler.transform(x_test)

# Apply polynomial transformation
x_train_poly = poly.fit_transform(x_train_normalized)
x_cv_poly = poly.transform(x_cv_normalized)
x_test_poly = poly.transform(x_test_normalized)

# Train logistic regression models
lr_model1 = LogisticRegression()
lr_model1.fit(x_train_normalized, y_train)

lr_model2 = LogisticRegression()
lr_model2.fit(x_train_poly, y_train)

In [24]:
# Predictions
y_pred1 = lr_model1.predict(x_cv_normalized)
y_pred2 = lr_model2.predict(x_cv_poly)

# Precision and Recall for lr_model1
precision1 = precision_score(y_cv, y_pred1)
recall1 = recall_score(y_cv, y_pred1)

# Precision and Recall for lr_model2
precision2 = precision_score(y_cv, y_pred2)
recall2 = recall_score(y_cv, y_pred2)

print(f"Model 1 - Precision: {precision1}, Recall: {recall1}")
print(f"Model 2 - Precision: {precision2}, Recall: {recall2}")

# F1 Score for lr_model1
lr_model1_f1 = f1_score(y_cv, y_pred1)

# F1 Score for lr_model2
lr_model2_f1 = f1_score(y_cv, y_pred2)

print(f"Model 1 - F1 Score: {lr_model1_f1}")
print(f"Model 2 - F1 Score: {lr_model2_f1}")

# I turns out that the polynomial model is better than the linear model

Model 1 - Precision: 1.0, Recall: 0.94
Model 2 - Precision: 1.0, Recall: 0.96
Model 1 - F1 Score: 0.9690721649484536
Model 2 - F1 Score: 0.9795918367346939
