# 这是一个baseline

author：lq

<del>把边信息加入节点特征，做lstm</del>

发现节点之间在每天都有不同的连接，不止一个，因此无法将边特征加到节点上

之前的lstm设置的时间步是1，相当于没有用到任何时序信息，网络退化为前馈网络

所以本baseline将时间步设置为大于1（下文取 seq_len = 7）

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader, Subset
import csv
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Use one seed for both the NumPy and the PyTorch random generators
seed = 9999
np.random.seed(seed)
torch.manual_seed(seed)

<torch._C.Generator at 0x7f756d69b270>

# 数据处理

<del>
1. 读入节点和边的数据
2. 把边的数据插入节点数据
</del>

边的信息仍然先不用

In [3]:
# Load the training and test node tables; geohash_id is read as str
# rather than letting pandas infer its dtype.
df_train = pd.read_csv("data/train_90.csv", dtype={"geohash_id": str})
df_test = pd.read_csv("data/A/node_test_4_A.csv", dtype={"geohash_id": str})

In [4]:
# Preview the training frame
df_train

Unnamed: 0,geohash_id,date_id,F_1,F_2,F_3,F_4,F_5,F_6,F_7,F_8,...,F_28,F_29,F_30,F_31,F_32,F_33,F_34,F_35,active_index,consume_index
0,4885e281g,20230104,-0.711,-0.696,-0.794,-0.727,-0.747,-0.792,1.539,2.433,...,0.073,0.344,0.006,-0.446,-0.502,-0.456,-0.457,-0.830,69.306,63.78
1,4885e281g,20230105,-0.909,-0.903,-0.947,-0.844,-0.856,-0.908,-0.371,0.990,...,0.055,0.298,0.007,-0.523,-0.558,-0.533,0.113,-0.887,68.881,61.62
2,4885e281g,20230106,-0.920,-0.925,-0.923,-0.852,-0.853,-0.915,-0.334,0.792,...,0.067,0.324,0.006,-0.535,-0.564,-0.540,0.367,-1.021,69.738,61.03
3,4885e281g,20230107,-0.926,-0.931,-0.943,-0.837,-0.850,-0.907,-0.993,-0.006,...,0.076,0.276,0.010,-0.534,-0.554,-0.521,0.550,-0.211,68.721,62.02
4,4885e281g,20230108,-0.750,-0.764,-0.818,-0.749,-0.764,-0.816,1.116,1.447,...,0.079,0.328,0.008,-0.468,-0.500,-0.419,-0.236,0.644,69.960,64.62
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102595,1d3640fad,20230330,2.049,1.751,2.777,2.968,3.008,2.544,-0.503,-1.525,...,0.070,0.295,0.019,2.920,2.533,3.210,-0.703,0.908,74.582,85.81
102596,1d3640fad,20230331,1.871,1.733,2.613,3.218,3.286,2.700,-1.191,-1.861,...,0.073,0.304,0.023,3.457,2.748,3.267,-0.908,-0.250,73.194,86.31
102597,1d3640fad,20230401,-0.099,-0.057,0.575,0.759,0.944,0.566,-2.486,-2.534,...,0.070,0.299,0.018,1.517,0.975,1.275,-0.974,-0.765,72.713,79.83
102598,1d3640fad,20230402,-0.233,-0.232,0.498,0.599,0.842,0.394,-2.585,-2.794,...,0.067,0.287,0.016,1.194,0.779,1.017,-0.755,-0.707,72.394,78.72


简单处理一下，做个标准化，去掉F_23和F_27

In [5]:
# Remove F_23 and F_27 from both frames, modifying them in place
for frame in (df_train, df_test):
    frame.drop(columns=["F_23", "F_27"], inplace=True)

In [6]:
# Standardization: split the columns by role, then z-score features and labels
id_and_date_columns = ["geohash_id", "date_id"]
label_columns = ["active_index", "consume_index"]
# Every column that is neither an id/date column nor a label is a feature
feature_columns = df_train.drop(
    columns=["geohash_id", "date_id", "active_index", "consume_index"]
).columns

# The feature scaler is fitted on the training rows and reused for the test rows
feature_scaler = StandardScaler()
scaled_train_features = feature_scaler.fit_transform(df_train[feature_columns])
df_train.loc[:, feature_columns] = scaled_train_features
scaled_test_features = feature_scaler.transform(df_test[feature_columns])
df_test.loc[:, feature_columns] = scaled_test_features

# Labels get a scaler of their own; only the training frame has labels to scale
label_scaler = StandardScaler()
scaled_train_labels = label_scaler.fit_transform(df_train[label_columns])
df_train.loc[:, label_columns] = scaled_train_labels

In [7]:
# Stack the train and test rows, then order them by geohash_id and date_id
df_all = pd.concat([df_train, df_test], axis=0, ignore_index=True).sort_values(
    ["geohash_id", "date_id"]
)

In [8]:
# Inspect the rows of a single geohash_id
df_all[df_all["geohash_id"] == "007e3e4ef"]

Unnamed: 0,geohash_id,date_id,F_1,F_2,F_3,F_4,F_5,F_6,F_7,F_8,...,F_28,F_29,F_30,F_31,F_32,F_33,F_34,F_35,active_index,consume_index
60840,007e3e4ef,20230104,-0.572895,-0.582853,-0.576396,-0.468212,-0.488368,-0.555685,0.089299,0.134324,...,0.738499,-0.065663,-0.457655,-0.579776,-0.640004,-0.559142,-0.518954,-0.999565,-0.061534,-0.203339
60841,007e3e4ef,20230105,-0.864671,-0.868830,-0.798404,-0.719249,-0.731408,-0.783150,-1.631071,-1.076339,...,-0.352381,-0.695002,-0.341666,-0.720750,-0.784080,-0.662337,-0.346178,-0.771125,-0.576133,-0.513200
60842,007e3e4ef,20230106,-0.911600,-0.917854,-0.805700,-0.774579,-0.781657,-0.823588,-1.989649,-1.315047,...,-0.263931,-0.675931,-0.457655,-0.772216,-0.823486,-0.711068,-0.016452,-0.949248,-0.464884,-0.683681
60843,007e3e4ef,20230107,-0.875894,-0.888235,-0.844264,-0.733594,-0.732433,-0.799325,-1.806331,-1.392602,...,-0.352381,-0.847569,-0.689633,-0.748721,-0.793932,-0.678581,0.180065,-0.598033,0.054083,-0.592681
60844,007e3e4ef,20230108,-0.668794,-0.710521,-0.553465,-0.554281,-0.592968,-0.657792,-0.351873,-0.683528,...,0.178317,-0.618718,-0.225677,-0.643550,-0.678178,-0.566786,-0.223520,-0.241787,-0.313155,-0.232136
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60929,007e3e4ef,20230403,-0.419866,-0.436801,-0.368980,-0.348329,-0.406330,-0.568828,-0.108120,-0.267552,...,0.001418,1.269297,1.514155,-0.209440,-0.354312,-0.392884,-1.188958,-1.683880,-0.831830,-0.093909
105304,007e3e4ef,20230404,-0.590239,-0.623707,-0.543043,-0.514320,-0.553999,-0.696208,0.240385,-0.006685,...,-0.263931,1.154872,0.122289,-0.485793,-0.621532,-0.579208,-1.278643,-1.711051,,
105305,007e3e4ef,20230405,-0.340290,-0.346923,-0.267878,-0.221273,-0.256609,-0.497050,0.348160,0.261232,...,-0.175481,1.364651,-0.109689,-0.210559,-0.366627,-0.395750,-1.237757,-1.674823,,
105306,007e3e4ef,20230406,-0.314786,-0.319347,-0.280385,-0.213076,-0.268915,-0.499072,0.499247,0.422385,...,-0.263931,1.688856,1.050200,-0.215034,-0.364164,-0.403394,-1.199509,-1.613436,,


## 制作LSTM需要的数据格式

注意这里用到了时序：`__getitem__` 返回连续 seq_len 个时间步的全部特征，对应的 label 是这个窗口最后一个时间步的 label

In [9]:
# A week has 7 days, so start with a 7-day window as one period
seq_len = 7


class LSTMPerGeoDataset(Dataset):
    """Sliding-window dataset over the rows of a single geohash_id.

    Sample ``i`` is the feature matrix of rows ``i .. i + seq_len - 1``
    paired with the label row of the last time step of that window.

    Args:
        df_per_geo: Rows of one geohash_id. Must contain the module-level
            ``feature_columns`` and ``label_columns``. The rows are used in
            the order given; this class does not sort them.
        seq_len: Number of consecutive rows (time steps) per sample.
    """

    def __init__(self, df_per_geo, seq_len):
        self.features = df_per_geo.loc[:, feature_columns].values
        self.labels = df_per_geo.loc[:, label_columns].values
        self.seq_len = seq_len

    def __getitem__(self, index):
        """Return ``(feature, label)`` for the window starting at ``index``.

        ``feature`` is a ``seq_len x feature_num`` matrix and ``label`` is the
        label row of the window's last time step (``None`` when ``labels`` has
        been set to ``None``). Negative indices count from the end.

        Raises:
            IndexError: If ``index`` is outside ``[-len(self), len(self))``.
        """
        size = len(self)
        # Without this check a negative or too-large index would slice a
        # truncated (possibly empty) window instead of failing loudly.
        if not -size <= index < size:
            raise IndexError(
                "window index {} out of range for {} windows".format(index, size)
            )
        if index < 0:
            index += size

        window_end = index + self.seq_len
        feature = self.features[index:window_end, :]
        if self.labels is None:
            return feature, None
        # The label belongs to the last time step of the window
        return feature, self.labels[window_end - 1, :]

    def __len__(self):
        """Return the number of full windows; 0 if the series is too short."""
        # Clamp at 0: a negative length would make len() raise ValueError
        return max(0, len(self.features) - self.seq_len + 1)

测一下维度对不对

In [10]:
# Build a dataset from the first geohash_id group only, as a quick check
for i, df_per_geo in df_all.groupby("geohash_id"):
    print("geohash_id", i, len(df_per_geo))
    dataset = LSTMPerGeoDataset(df_per_geo, seq_len)
    break  # stop after the first group

geohash_id 007e3e4ef 94


### 划分数据集

In [11]:
# Split the windows, in order, into training / validation / test subsets
val_len = seq_len
test_len = seq_len
num_windows = len(dataset)
val_start = num_windows - val_len - test_len
test_start = num_windows - test_len
train_dataset = Subset(dataset, range(val_start))
valid_dataset = Subset(dataset, range(val_start, test_start))
test_dataset = Subset(dataset, range(test_start, num_windows))

In [12]:
# Number of training windows
len(train_dataset)

74

In [13]:
# Shapes of the per-row feature and label arrays held by the dataset
dataset.features.shape, dataset.labels.shape

((94, 33), (94, 2))

In [14]:
# Shapes of the first sample: the feature window and its label
dataset[0][0].shape, dataset[0][1].shape

((7, 33), (2,))

In [15]:
# Batch size passed to every DataLoader
batch_size = 16

In [16]:
# All three loaders share the batch size and keep the sample order (no shuffling)
loader_options = {"batch_size": batch_size, "shuffle": False}
train_data_loader = DataLoader(train_dataset, **loader_options)
val_data_loader = DataLoader(valid_dataset, **loader_options)
test_data_loader = DataLoader(test_dataset, **loader_options)

# dataloader制作完毕，开始设计模型

In [17]:
class LSTMRegression(nn.Module):
    """LSTM encoder followed by a small fully connected regression head.

    The final hidden state of the last LSTM layer goes through
    ReLU -> Linear(hidden_size, 64) -> ReLU -> Linear(64, num_outputs).

    Args:
        num_outputs: Number of values predicted per sample.
        input_size: Number of features per time step.
        hidden_size: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        seq_length: Stored for bookkeeping only; the network does not use it.
    """

    def __init__(self, num_outputs, input_size, hidden_size, num_layers, seq_length):
        super().__init__()
        self.num_classes = num_outputs  # number of output values
        self.num_layers = num_layers
        self.input_size = input_size
        self.hidden_size = hidden_size
        # seq_length has no effect on the network; it is only recorded
        self.seq_length = seq_length

        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )

        self.relu = nn.ReLU()
        self.fc_1 = nn.Linear(hidden_size, 64)
        self.fc_2 = nn.Linear(64, num_outputs)

    def forward(self, x):
        """Predict ``num_outputs`` values for each sequence in the batch.

        Args:
            x: Tensor of shape ``(batch, seq_len, input_size)``; the LSTM is
                built with ``batch_first=True``.

        Returns:
            Tensor of shape ``(batch, num_outputs)``.
        """
        # When no initial states are passed, nn.LSTM starts from zeros that
        # match x's device and dtype, so no explicit h_0 / c_0 are needed.
        _, (hn, _) = self.lstm(x)
        # hn is (num_layers, batch, hidden_size). Keep only the last layer:
        # flattening every layer into the batch axis would give
        # num_layers * batch output rows as soon as num_layers > 1.
        last_hidden = hn[-1]
        out = self.relu(last_hidden)
        out = self.fc_1(out)
        out = self.relu(out)
        out = self.fc_2(out)
        return out

In [18]:
# LSTM parameters
# One input per feature column, i.e. every df_train column except the
# two id/date columns and the two label columns.
input_size = len(feature_columns)
# hidden_size is a hyperparameter to tune; this value is only a placeholder
hidden_size = 16
num_layers = 1

# Linear head parameters
# Number of values the model finally outputs
num_outputs = 2

In [19]:
# Shape check: push one random batch through a freshly built model
x = torch.randn(batch_size, seq_len, input_size)
model = LSTMRegression(num_outputs, input_size, hidden_size, num_layers, seq_len)
output = model(x)
output.shape

torch.Size([16, 2])

# 模型设计完成，做一些准备工作

In [20]:
# Training hyperparameters: number of epochs and the optimizer learning rate
num_epochs = 36
lr = 0.001

In [21]:
# Define the model, the loss function and the optimizer
model = LSTMRegression(num_outputs, input_size, hidden_size, num_layers, seq_len)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [22]:
# Train on one geohash_id and report the mean batch loss on the training
# and validation subsets after every epoch.
for i in range(num_epochs):
    model.train()
    total_train_loss = 0
    # step counts the batches
    for step, (b_x, b_y) in enumerate(train_data_loader):
        b_x = b_x.float()
        b_y = b_y.float()
        # Forward pass
        outputs = model(b_x)
        loss = criterion(outputs, b_y)
        total_train_loss += loss.item()

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    total_train_loss /= len(train_data_loader)

    # Validation set
    model.eval()
    total_val_loss = 0
    # No parameters are updated here, so skip autograd bookkeeping
    with torch.no_grad():
        for step, (b_x, b_y) in enumerate(val_data_loader):
            b_x = b_x.float()
            b_y = b_y.float()
            outputs = model(b_x)
            loss = criterion(outputs, b_y)
            total_val_loss += loss.item()
    total_val_loss /= len(val_data_loader)
    print(
        "Epoch: {}, train loss: {:.4f}, val loss: {:.4f}".format(
            i, total_train_loss, total_val_loss
        )
    )

Epoch: 0, train loss: 0.1612, val loss: 0.1071
Epoch: 1, train loss: 0.1491, val loss: 0.1224
Epoch: 2, train loss: 0.1410, val loss: 0.1387
Epoch: 3, train loss: 0.1349, val loss: 0.1553
Epoch: 4, train loss: 0.1299, val loss: 0.1716
Epoch: 5, train loss: 0.1259, val loss: 0.1864
Epoch: 6, train loss: 0.1226, val loss: 0.1978
Epoch: 7, train loss: 0.1198, val loss: 0.2052
Epoch: 8, train loss: 0.1174, val loss: 0.2074
Epoch: 9, train loss: 0.1151, val loss: 0.2046
Epoch: 10, train loss: 0.1128, val loss: 0.1976
Epoch: 11, train loss: 0.1106, val loss: 0.1886
Epoch: 12, train loss: 0.1084, val loss: 0.1791
Epoch: 13, train loss: 0.1062, val loss: 0.1692
Epoch: 14, train loss: 0.1038, val loss: 0.1589


Epoch: 15, train loss: 0.1014, val loss: 0.1488
Epoch: 16, train loss: 0.0989, val loss: 0.1394
Epoch: 17, train loss: 0.0962, val loss: 0.1303
Epoch: 18, train loss: 0.0933, val loss: 0.1205
Epoch: 19, train loss: 0.0902, val loss: 0.1096
Epoch: 20, train loss: 0.0872, val loss: 0.0982
Epoch: 21, train loss: 0.0843, val loss: 0.0883
Epoch: 22, train loss: 0.0815, val loss: 0.0802
Epoch: 23, train loss: 0.0788, val loss: 0.0737
Epoch: 24, train loss: 0.0762, val loss: 0.0672
Epoch: 25, train loss: 0.0737, val loss: 0.0611
Epoch: 26, train loss: 0.0713, val loss: 0.0563
Epoch: 27, train loss: 0.0689, val loss: 0.0527
Epoch: 28, train loss: 0.0666, val loss: 0.0495
Epoch: 29, train loss: 0.0644, val loss: 0.0467
Epoch: 30, train loss: 0.0622, val loss: 0.0446
Epoch: 31, train loss: 0.0601, val loss: 0.0434
Epoch: 32, train loss: 0.0580, val loss: 0.0426
Epoch: 33, train loss: 0.0560, val loss: 0.0408
Epoch: 34, train loss: 0.0541, val loss: 0.0400
Epoch: 35, train loss: 0.0521, val loss:

In [23]:
# Test set: print the predictions after mapping them back to the label scale
model.eval()
with torch.no_grad():
    for b_x, _ in test_data_loader:
        outputs = model(b_x.float())
        print(label_scaler.inverse_transform(outputs.numpy()))

[[69.695915 69.07723 ]
 [68.93668  67.43579 ]
 [68.65284  68.5836  ]
 [69.02105  69.02451 ]
 [69.36147  69.66171 ]
 [69.52764  70.77702 ]
 [69.85547  71.80455 ]]


# 一个geo的数据训练完毕，没啥问题，下面训练所有的

In [24]:
# Maps geohash_id -> predictions converted back to the original label scale
res_map = {}

In [25]:
# Train one model per geohash_id and store its predictions in res_map.
# The last test_len windows of every series are held out for prediction;
# all earlier windows are used for training.
test_len = 4
# MSELoss keeps no state, so a single instance serves every model
criterion = nn.MSELoss()

for geohash_id, df_per_geo in tqdm(df_all.groupby("geohash_id")):
    dataset = LSTMPerGeoDataset(df_per_geo, seq_len)

    train_dataset = Subset(dataset, range(len(dataset) - test_len))
    test_dataset = Subset(dataset, range(len(dataset) - test_len, len(dataset)))
    train_data_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
    # A single batch holds every test window, so res_map receives the whole
    # prediction array instead of whatever the last batch happened to be.
    test_data_loader = DataLoader(test_dataset, batch_size=test_len, shuffle=False)

    model = LSTMRegression(num_outputs, input_size, hidden_size, num_layers, seq_len)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    model.train()
    for _epoch in range(num_epochs):
        for b_x, b_y in train_data_loader:
            b_x = b_x.float()
            b_y = b_y.float()
            # Forward pass
            outputs = model(b_x)
            loss = criterion(outputs, b_y)
            # The loss is not accumulated here: nothing reports it, and the
            # accumulator was never initialised in this cell.

            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    # Test set
    model.eval()
    with torch.no_grad():
        for b_x, _ in test_data_loader:
            outputs = model(b_x.float())
            res_map[geohash_id] = label_scaler.inverse_transform(outputs.numpy())

  0%|          | 0/1140 [00:00<?, ?it/s]

  5%|▍         | 56/1140 [00:40<13:03,  1.38it/s]


KeyboardInterrupt: 

In [None]:
# Show how many geohash_ids have predictions, and the predictions themselves
len(res_map), res_map

In [None]:
# Write the results to a CSV file that uses a tab as the separator
date_id = [20230404, 20230405, 20230406, 20230407]
with open("base_lstm.csv", "w", newline='') as f:
    writer = csv.writer(f, delimiter="\t")  # tab-delimited output
    writer.writerow(["geohash_id", "consumption_level", "activity_level", "date_id"])
    for geohash_id, res in res_map.items():
        for i, row in enumerate(res):
            # Mind the order: column 1 is consume_index, column 0 is active_index
            writer.writerow([geohash_id, row[1], row[0], date_id[i]])