In [17]:
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch.nn as nn 
import torch.nn.functional as F # 内置的激活函数

from torch.utils.data import TensorDataset, DataLoader # 管理批次分割
from sklearn.model_selection import train_test_split # 训练集和测试集

In [None]:
# Load the HR dataset; the path is relative to the notebook's working directory.
data = pd.read_csv('./第5章/HR.csv')

# Exploratory look-ups. NOTE: these are bare expressions in the middle of a
# cell, so Jupyter does not display them; run one as the last line of its own
# cell (or wrap it in display()) to actually see the result.
data.part.unique()  # distinct department values
data.salary.unique()  # distinct salary levels
data.groupby(['salary','part']).size()  # row count per (salary, department) pair

# One-hot encode the two text columns and append the indicator columns.
data = data.join(pd.get_dummies(data.salary))
data = data.join(pd.get_dummies(data.part))
# Drop the original text columns now that they are encoded.
data = data.drop(columns=['part', 'salary'])

data.left.value_counts()  # class balance of the target (not displayed mid-cell)

# Target: the `left` column (whether the employee left), as an (N, 1) float tensor.
Y_data = data.left.values.reshape(-1,1)
Y = torch.from_numpy(Y_data).type(torch.FloatTensor)

# Features: every column except the target.
feature_columns = [column for column in data.columns if column != 'left']
# Cast to float32 once, here. The previous `.astype(int)` truncated every
# fractional feature value to an integer, and without any cast the array can be
# object-dtype (boolean indicator columns mixed with numeric ones), which
# torch.from_numpy cannot convert.
X_data = data[feature_columns].values.astype(np.float32)
X = torch.from_numpy(X_data).type(torch.FloatTensor)

# 创建模型
class Model(nn.Module):
    """Fully connected binary classifier with two hidden ReLU layers.

    The network maps ``in_features`` inputs through two hidden layers of
    ``hidden_features`` units each and ends in a single sigmoid unit, so the
    output is a value in (0, 1) per sample.

    Args:
        in_features: Number of input features per sample (default 20).
        hidden_features: Width of both hidden layers (default 64).
    """

    def __init__(self, in_features=20, hidden_features=64):
        super().__init__()
        self.linear1 = nn.Linear(in_features, hidden_features)
        self.linear2 = nn.Linear(hidden_features, hidden_features)
        self.linear3 = nn.Linear(hidden_features, 1)

    def forward(self, input):
        """Return the sigmoid output for ``input``.

        Args:
            input: Tensor whose last dimension equals ``in_features``.

        Returns:
            Tensor with the same leading dimensions as ``input`` and a last
            dimension of 1.
        """
        x = F.relu(self.linear1(input))
        x = F.relu(self.linear2(x))
        # torch.sigmoid is the canonical spelling; it computes the same
        # function as the functional alias used previously.
        return torch.sigmoid(self.linear3(x))
# Instantiate once as a quick check; `model` is rebound by the getModel() call further down.
model = Model()
    
# 定义获取模型的方法   
def getModel(lr=0.0001):
    """Create a freshly initialised model together with its optimizer.

    Args:
        lr: Learning rate passed to Adam. Defaults to 0.0001, the value that
            was previously hard-coded, so existing ``getModel()`` calls behave
            as before.

    Returns:
        A ``(model, opt)`` tuple: a new ``Model`` instance and a
        ``torch.optim.Adam`` optimizer bound to that model's parameters.
    """
    model = Model()
    opt = torch.optim.Adam(model.parameters(), lr=lr)
    return model, opt
# Fresh model/optimizer pair for the run on the full dataset.
model, opt = getModel()

loss_fn = nn.BCELoss()  # binary cross-entropy
batch_size = 64  # samples per mini-batch
epoches = 50  # number of passes over the dataset

# Pair each feature row with its label, then serve shuffled mini-batches.
HR_ds = TensorDataset(X, Y)
HR_dl = DataLoader(HR_ds, shuffle=True, batch_size=batch_size)

for epoch in range(epoches):
    for x, y in HR_dl:
        y_pred = model(x)
        loss = loss_fn(y_pred, y)

        # Parameter update: clear stale gradients, backpropagate, step.
        opt.zero_grad()
        loss.backward()
        opt.step()
    # The end-of-epoch report needs no gradient tracking.
    with torch.no_grad():
        # Loss over the whole dataset after this epoch's updates.
        print('epoch: ',epoch,'loss: ',loss_fn(model(X), Y).item())

#### 添加测试数据和训练数据

In [None]:
# Hold out 25% of the rows for evaluation.
# NOTE: no random_state is passed, so the split differs on every run.
# Each piece is cast to float32 before conversion: torch.from_numpy cannot
# convert an object-dtype array, which X_data may be when boolean indicator
# columns are mixed with numeric ones.
trainX, testX, trainY, testY = (
    torch.from_numpy(np.asarray(split, dtype=np.float32))
    for split in train_test_split(X_data, Y_data, test_size=0.25)
)

# Training batches are reshuffled every epoch.
train_ds = TensorDataset(trainX, trainY)
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)

# Test batches keep their original order (no shuffling).
test_ds = TensorDataset(testX, testY)
test_dl = DataLoader(test_ds, batch_size=batch_size)

# 验证正确率
def accuracy(y_pred,y_real):
    """Fraction of predictions that match the labels.

    A prediction counts as the positive class when it is strictly greater
    than 0.5; the thresholded predictions are compared element-wise with
    ``y_real`` and the matches are averaged.

    Args:
        y_pred: Tensor of predicted values.
        y_real: Tensor of labels, compared element-wise against the
            thresholded predictions.

    Returns:
        A scalar float tensor holding the mean of the element-wise matches.
    """
    predicted_positive = torch.gt(y_pred, 0.5)
    matches = torch.eq(predicted_positive, y_real)
    return matches.float().mean()

# Start again from freshly initialised weights so this run is independent of
# any earlier training.
model,opt = getModel()

for epoch in range(epoches):
    for x,y in train_dl:
        y_pred = model(x)
        loss = loss_fn(y_pred,y)

        # Parameter update: clear stale gradients, backpropagate, step.
        opt.zero_grad()
        loss.backward()
        opt.step()
    # End-of-epoch evaluation; no gradient tracking is needed here.
    with torch.no_grad():
        # Run one forward pass per split and reuse it for both metrics,
        # instead of evaluating the model twice on the same data.
        trainPred = model(trainX)
        testPred = model(testX)

        trainLoss = loss_fn(trainPred,trainY)
        trainAcc = accuracy(trainPred,trainY)

        testLoss = loss_fn(testPred,testY)
        testAcc = accuracy(testPred,testY)
        # Report this epoch's metrics, rounded to two decimals.
        print('epoch: ',epoch,'trainLoss: ',round(trainLoss.item(),2),
                               'trainAcc: ',round(trainAcc.item(),2),
                               'testLoss: ',round(testLoss.item(),2),
                               'testAcc: ',round(testAcc.item(),2))