In [1]:
# 加载相关包
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the training and test CSV files.
# `combine` keeps references to both frames so later cleaning loops can apply
# the same step to each of them.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("./data/cs-training.csv", "./data/cs-test.csv")
)
combine = [train_df, test_df]
train_df.head(10)

FileNotFoundError: [Errno 2] No such file or directory: './data/abalone.data'

In [None]:
# Inspect column dtypes and non-null counts of the training frame.
# Uses `train_df` explicitly: no variable named `data` exists yet at this
# point of a top-to-bottom run.
train_df.info()

In [None]:
# Look at the label distribution.
# The label is selected by name rather than by position, so the plot stays
# correct even if the leading index column has already been dropped.
sns.countplot(x=train_df["SeriousDlqin2yrs"])

In [None]:
# Summary statistics (describe) for the numeric columns; per the original
# note, the data has no non-numeric columns.
summary_stats = train_df.describe()
summary_stats

In [None]:
# Drop the leading "Unnamed: 0" column, which is only a row-number column.
# The drop is deliberately done in place: `combine` holds a reference to this
# same DataFrame object, so rebinding `train_df` to a new frame would leave
# `combine[0]` pointing at the old one.
# The name check makes the cell safe to re-run: a purely positional drop would
# remove a real feature column on every additional execution.
first_col = train_df.columns[0]
if str(first_col).startswith("Unnamed"):
    train_df.drop(columns=first_col, inplace=True)
train_df.info()

In [None]:
# Correlation of every feature with the label, strongest first.
corr_matrix = train_df.corr()
target_corr = corr_matrix["SeriousDlqin2yrs"].sort_values(ascending=False)
print(target_corr)

# Heat map of the pairwise correlations between all columns.
fig, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(
    corr_matrix,
    annot=True,
    vmax=1,
    square=True,
    cmap='Blues',
    ax=ax,
)
plt.show()

In [None]:
# Impute missing values in both the training and the test frame.
for data in combine:
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # the selected column: that form is chained assignment, which pandas warns
    # about and which does not update the frame under copy-on-write.
    data["MonthlyIncome"] = data["MonthlyIncome"].fillna(data["MonthlyIncome"].mean())
    # Fill with the mode of NumberOfDependents itself; using the mode of
    # MonthlyIncome here would inject income-sized values into this column.
    data["NumberOfDependents"] = data["NumberOfDependents"].fillna(
        data["NumberOfDependents"].mode()[0]
    )
train_df.info()

In [None]:
# Outlier check: frequency of each NumberOfDependents value.
train_df["NumberOfDependents"].value_counts()

In [None]:
# Before replacing outliers, check how the features (including the number of
# dependents) correlate with the label.
corr_matrix = train_df.corr()
corr_matrix.loc[:, "SeriousDlqin2yrs"].sort_values(ascending=False)

In [None]:
# Replace NumberOfDependents values above 30 with 0 in both frames, then look
# at the resulting correlation with the label.
for data in combine:
    # .loc performs a single indexed assignment on the frame; the chained form
    # data[col][mask] = 0 may write to a temporary copy (SettingWithCopyWarning)
    # and is a no-op under copy-on-write.
    data.loc[data["NumberOfDependents"] > 30, "NumberOfDependents"] = 0
train_df.corr()["SeriousDlqin2yrs"]["NumberOfDependents"]

In [None]:
# Separate the label from the features.
# `drop` without inplace returns a new frame, so `train_df` keeps its label
# column and this cell can be re-run without a KeyError (binding X to
# `train_df` and dropping in place would remove the label from `train_df`).
y = train_df["SeriousDlqin2yrs"]
X = train_df.drop(columns="SeriousDlqin2yrs")
# Show only the first rows instead of dumping the whole frame / series.
print(X.head())
print(y.head())

In [None]:
# Train an L1-regularised logistic regression and report test-set accuracy.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=6,
)
# The liblinear solver shuffles the data internally, so the estimator gets a
# fixed seed as well; seeding only the split would not make the fit
# reproducible.
lr = LogisticRegression(
    penalty='l1',
    solver='liblinear',
    random_state=6,
)
lr.fit(X_train, y_train)
y_test_pred = lr.predict(X_test)
# NOTE(review): if the label classes are imbalanced, accuracy alone can be
# misleading — compare with the label count plot and the confusion matrix.
print(accuracy_score(y_test, y_test_pred))


In [None]:
from sklearn.metrics import confusion_matrix
def plot_confusion_matrix(y, y_pred, cmap=plt.cm.Blues, title='混淆矩阵'):
    """
    Draw a confusion matrix as an annotated heat map on the current axes.

    Rows of the matrix (y axis, labelled 'fact') are the true labels and
    columns (x axis, labelled 'guess') are the predicted labels.

    :param y: true labels
    :param y_pred: predicted labels
    :param cmap: colormap used for the heat map
    :param title: figure title
    :return: None
    """
    cm = confusion_matrix(y, y_pred)
    # confusion_matrix orders its rows/columns by the sorted labels that occur
    # in either y or y_pred, so the tick labels are built from the same union;
    # using only set(y) can give the wrong number of ticks.
    classes = sorted(set(y) | set(y_pred))
    plt.imshow(cm, cmap=cmap)
    indices = range(len(cm))
    plt.xticks(indices, classes)
    plt.yticks(indices, classes)
    # Colour scale
    plt.colorbar()
    # Axis meaning
    plt.xlabel('guess')
    plt.ylabel('fact')
    plt.title(title)
    # Write the count into each cell. imshow draws cm[row, col] at x=col,
    # y=row, so the text must be placed at (col, row); passing (row, col)
    # would swap the off-diagonal counts.
    n_rows, n_cols = cm.shape
    for row in range(n_rows):
        for col in range(n_cols):
            plt.text(col, row, cm[row, col], ha='center', va='center')

In [None]:
# Plot the confusion matrix for the test split.
# `import matplotlib as mpl` binds the same module that `from pylab import mpl`
# did, without importing the discouraged pylab namespace.
import matplotlib as mpl
mpl.rcParams['font.sans-serif'] = ['FangSong']  # default font (needed for the Chinese titles)
mpl.rcParams['axes.unicode_minus'] = False  # keep the minus sign '-' from rendering as a box
plot_confusion_matrix(
    y_test,
    y_test_pred,
    title='测试集混淆矩阵',
)
plt.show()

In [None]:
from sklearn.model_selection import learning_curve
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=1,
                        train_sizes=np.linspace(.05, 1., 20), verbose=0, plot=True):
    """
    Compute (and optionally plot) the learning curve of an estimator.

    Parameters
    ----------
    estimator : the model passed to sklearn's ``learning_curve``.
    title : figure title.
    X : feature matrix, forwarded unchanged to ``learning_curve``.
    y : target vector.
    ylim : optional ``(ymin, ymax)`` tuple limiting the y axis of the plot.
    cv : cross-validation setting forwarded to ``learning_curve``; ``None``
        defers to scikit-learn's default (5-fold in current releases).
    n_jobs : number of parallel jobs (default 1).
    train_sizes : training-set sizes (fractions or counts) to evaluate.
    verbose : verbosity forwarded to ``learning_curve``.
    plot : when True, draw and show the figure.

    Returns
    -------
    (midpoint, diff) : both computed at the largest training size from
        ``train_mean + train_std`` and ``test_mean - test_std``; ``midpoint``
        is their average and ``diff`` their difference.
    """
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y,
        cv=cv,
        n_jobs=n_jobs,
        train_sizes=train_sizes,
        verbose=verbose)

    # Aggregate the per-fold scores for every training size.
    train_scores_mean = np.mean(train_scores, axis=1).astype(np.float32)
    train_scores_std = np.std(train_scores, axis=1).astype(np.float32)
    test_scores_mean = np.mean(test_scores, axis=1).astype(np.float32)
    test_scores_std = np.std(test_scores, axis=1).astype(np.float32)

    if plot:
        plt.figure()
        plt.title(title)
        if ylim is not None:
            plt.ylim(*ylim)
        plt.xlabel(u"训练样本数")
        plt.ylabel(u"得分")
        # The y axis keeps its natural orientation so that higher scores are
        # drawn higher; no axis inversion is applied before or after show().
        plt.grid()

        # Shaded bands: mean +/- one standard deviation across folds.
        plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std,
                         alpha=0.1, color="b")
        plt.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std,
                         alpha=0.1, color="r")
        plt.plot(train_sizes, train_scores_mean, 'o-', color="b", label=u"训练集上得分")
        plt.plot(train_sizes, test_scores_mean, 'o-', color="r", label=u"验证集上得分")

        plt.legend(loc="best")
        plt.show()

    # Summary of the gap between the training and validation curves at the
    # largest training size.
    train_upper = train_scores_mean[-1] + train_scores_std[-1]
    test_lower = test_scores_mean[-1] - test_scores_std[-1]
    midpoint = (train_upper + test_lower) / 2
    diff = train_upper - test_lower
    return midpoint, diff

In [None]:
# Learning curve of the fitted logistic regression on the training split.
plot_learning_curve(
    estimator=lr,
    title=u"学习曲线",
    X=X_train,
    y=y_train,
)