In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt

"""
Linear classifier is a basic and common Machine Learning Model
Restricted by Data Features and Classification Objects
Logistic regression takes a long time to calculate, and the performance of the model is slightly better.
The calculation time of Stochastic Gradient Descent classifier is short and the performance of the 
model is slightly low.
"""

'\nLinear classifier is a basic and common Machine Learning Model\nRestricted by Data Features and Classification Objects\nLogistic regression takes a long time to calculate, and the performance of the model is slightly better.\nThe calculation time of Stochastic Gradient Descent classifier is short and the performance of the \nmodel is slightly low.\n'

In [2]:
# Names assigned to the columns of the data file: an identifier column,
# nine feature columns, and the class label.
column_names = [
    'Sample code number',           # index 0: identifier
    'Clump Thickness',              # indices 1-9: features
    'Uniformity of Cell Size',
    'Uniformity of Cell Shape',
    'Marginal Adhesion',
    'Single Epithelial Cell size',
    'Bare Nuclei',
    'Bland Chromatin',
    'Normal Nucleoli',
    'Mitoses',
    'Class',                        # index 10: label
]

In [6]:
# Read the raw data file, labelling its columns with the names defined above.
data = pd.read_csv(
    filepath_or_buffer='./Data/breast/breast-cancer-wisconsin.data',
    names=column_names,
)

In [6]:
# Replace the '?' placeholder with the standard missing-value marker (np.nan).
data = data.replace(to_replace='?', value=np.nan)
# Drop every row that has a missing value in at least one column.
data = data.dropna(how='any')
# Any feature column that contained '?' was presumably read as strings and is
# still string-typed after the rows above are dropped. Cast the feature columns
# to a numeric dtype explicitly so later steps do not depend on implicit
# string-to-number coercion; this raises if a non-numeric token is left over.
data = data.astype({name: 'float64' for name in column_names[1:10]})
# Print the number of rows and columns of the cleaned data.
print(data.shape)

(683, 11)


In [7]:
# Randomly hold out 25% of the rows for testing; the remaining 75% are used
# for training. The fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    data.loc[:, column_names[1:10]],    # the nine feature columns
    data.loc[:, column_names[10]],      # the label column ('Class')
    random_state=33,
    test_size=0.25,
)

In [8]:
# Show how many samples of each class ended up in the training labels and in
# the test labels (training counts first, then test counts).
print(y_train.value_counts(), y_test.value_counts(), sep='\n')

2    344
4    168
Name: Class, dtype: int64
2    100
4     71
Name: Class, dtype: int64


In [9]:
# Standardize the features so that each column has mean 0 and variance 1;
# this keeps features with large numeric ranges from dominating the prediction.
ss = StandardScaler()
ss.fit(x_train)                     # learn per-column statistics from the training data only
x_train = ss.transform(x_train)     # standardize the training features
x_test = ss.transform(x_test)       # standardize the test features with the same training statistics

In [10]:
# Logistic regression classifier with default settings.
lr = LogisticRegression()
# Linear classifier trained with stochastic gradient descent. Its training is
# randomized, so fix the seed (same value as the train/test split) to make the
# reported scores reproducible when the notebook is re-run from a fresh kernel.
sgdc = SGDClassifier(random_state=33)

In [11]:
# Train the logistic regression model on the training set, then store its
# predictions for the test set in lr_y_predict.
lr_y_predict = lr.fit(x_train, y_train).predict(x_test)

# Train the SGD classifier on the training set, then store its predictions
# for the test set in sgdc_y_predict.
sgdc_y_predict = sgdc.fit(x_train, y_train).predict(x_test)



In [12]:
# Class codes used in the 'Class' column and the display name for each one.
# Passing the codes explicitly pins the code -> name mapping instead of relying
# on the sort order of whichever labels happen to appear in the test split.
class_labels = [2, 4]
class_names = ["Benign", "Malignant"]

# Accuracy of the logistic regression model on the test set, via its built-in score method.
print("Accuracy of Logical Regression model：", lr.score(x_test, y_test))
# Per-class precision / recall / F1 of the logistic regression model.
print("Other indicators of Logical Regression model：\n",
      classification_report(y_test, lr_y_predict,
                            labels=class_labels, target_names=class_names))

# Accuracy of the SGD classifier on the test set.
print("Accuracy of SGD Classifier model：", sgdc.score(x_test, y_test))
# Per-class precision / recall / F1 of the SGD classifier.
print("Other indicators of SGD Classifier model:\n",
      classification_report(y_test, sgdc_y_predict,
                            labels=class_labels, target_names=class_names))

Accuracy of Logical Regression model： 0.9883040935672515
Other indicators of Logical Regression model：
               precision    recall  f1-score   support

      Benign       0.99      0.99      0.99       100
   Malignant       0.99      0.99      0.99        71

    accuracy                           0.99       171
   macro avg       0.99      0.99      0.99       171
weighted avg       0.99      0.99      0.99       171

Accuracy of SGD Classifier model： 0.9883040935672515
Other indicators of SGD Classifier model:
               precision    recall  f1-score   support

      Benign       1.00      0.98      0.99       100
   Malignant       0.97      1.00      0.99        71

    accuracy                           0.99       171
   macro avg       0.99      0.99      0.99       171
weighted avg       0.99      0.99      0.99       171

