In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @File    : wine_logistic.py
"""
Classify wine varieties with logistic regression.
Dataset: 130 samples covering 2 wine classes.
"""
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn import linear_model


def get_data(filename):
    """Load the wine dataset and separate it into features and labels.

    The file is parsed by ``pandas.read_csv`` with its default options, so
    the first row of the file is consumed as the column header.

    Args:
        filename: Path (or file-like object) handed straight to
            ``pandas.read_csv``.

    Returns:
        A ``(x, y)`` tuple where ``x`` is a DataFrame holding every column
        except the first and ``y`` is a Series holding the first column.
    """
    frame = pd.read_csv(filename)

    # Position-based slicing: column 0 is the target, the rest are features.
    labels = frame.iloc[:, 0]
    features = frame.iloc[:, 1:]

    # NOTE: labels are returned exactly as stored in the file. An earlier
    # hand-written logistic regression only accepted 0/1 targets and needed
    # class 2 remapped to 0 first; that remapping is not applied here.
    return features, labels


def deal_data(x, y, test_size=0.2, random_state=None):
    """Split the data into train/test partitions and standardize the features.

    Gradient-based solvers behave best when all features share a similar
    scale, so every feature is standardized. The scaler is fitted on the
    training partition only; fitting it on the full dataset would let
    statistics of the held-out rows leak into preprocessing and make the
    test metrics optimistic.

    Args:
        x: Feature table (one row per sample).
        y: Labels aligned with the rows of ``x``.
        test_size: Share of the samples reserved for the test partition,
            forwarded to ``train_test_split``. Defaults to ``0.2``.
        random_state: Seed forwarded to ``train_test_split``. The default
            ``None`` gives a different shuffle on every call; pass an int
            for a reproducible split.

    Returns:
        A ``(x_train_std, y_train, x_test_std, y_test)`` tuple. The feature
        entries are the outputs of ``StandardScaler.transform``; the label
        entries are the matching subsets of ``y``.
    """
    # Split first so the test rows never influence the scaling statistics.
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, shuffle=True, random_state=random_state
    )

    # Standardization:        (x - mean) / standard deviation, per feature.
    # Min-max normalization:  (x - min) / (max - min)  -- alternative, unused.
    scaler = StandardScaler()
    x_train_std = scaler.fit_transform(x_train)
    # Reuse the training mean/std for the test partition.
    x_test_std = scaler.transform(x_test)

    return x_train_std, y_train, x_test_std, y_test


def train_test_evaluate(x_train, y_train, x_test, y_test):
    """Fit a logistic-regression classifier and print F1 and AUC per partition.

    F1 is computed from the hard class predictions. AUC is computed from the
    continuous decision scores: feeding hard predictions to ``roc_auc_score``
    collapses the ROC curve to a single operating point and does not measure
    how well the model ranks the samples.

    Args:
        x_train: Training features.
        y_train: Training labels. Assumed to hold exactly two classes;
            ``f1_score`` is called with its default ``pos_label=1``.
        x_test: Test features.
        y_test: Test labels.

    Returns:
        None. The four metrics are printed to stdout.
    """
    # scikit-learn's logistic regression with L2 regularization.
    clf = linear_model.LogisticRegression(penalty='l2', solver='saga', max_iter=100, C=1.0)
    clf.fit(x_train, y_train)

    # Hard labels for F1.
    y_train_pred = clf.predict(x_train)
    y_pred = clf.predict(x_test)

    # Continuous scores for AUC. In the binary case decision_function scores
    # the class with the greater label, which is the class roc_auc_score
    # treats as positive.
    y_train_score = clf.decision_function(x_train)
    y_score = clf.decision_function(x_test)

    # Metrics on the training partition.
    f1 = f1_score(y_train, y_train_pred)
    print("训练集的f1: " + str(f1))
    auc_score = roc_auc_score(y_train, y_train_score)
    print("训练集的AUC: " + str(auc_score))

    # Metrics on the test partition.
    f1 = f1_score(y_test, y_pred)
    print("测试集的f1: " + str(f1))
    auc_score = roc_auc_score(y_test, y_score)
    print("测试集的AUC: " + str(auc_score))

if __name__ == '__main__':
    # Script entry point: load the data, preprocess it, then train and
    # evaluate the model. The dataset path below is hard-coded.
    filename = '/Users/mac/Desktop/wine.data'
    # 1. Load the features and labels.
    x, y = get_data(filename)
    # 2. Standardize the features and split into train/test partitions.
    x_train, y_train, x_test, y_test = deal_data(x, y)
    # 3. Train the model, predict, and print the evaluation metrics.
    train_test_evaluate(x_train, y_train, x_test, y_test)



训练集的f1: 1.0
训练集的AUC: 1.0
测试集的f1: 1.0
测试集的AUC: 1.0
