In [18]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [19]:
def get_train_data():
    file = "./data/train_90.csv"
    train_data = pd.read_csv(file)
    return train_data

def get_test_data():
    file = "./data/A榜/node_test_4_A.csv"
    test_data = pd.read_csv(file)
    return test_data

In [20]:
import math

def calScore(loss):
    return 1 / (1 + math.sqrt(2 * loss))

In [21]:
class DNN(nn.Module):
    def __init__(self):
        super(DNN, self).__init__()
        # Layer 1
        self.fc1 = nn.Linear(36, 128)
        self.bn1 = nn.BatchNorm1d(128)
        self.drop1 = nn.Dropout(p=0.3)

        # Layer 2
        self.fc2 = nn.Linear(128, 256)
        self.bn2 = nn.BatchNorm1d(256)
        self.drop2 = nn.Dropout(p=0.4)

        # Layer 3
        self.fc3 = nn.Linear(256, 512)
        self.bn3 = nn.BatchNorm1d(512)
        self.drop3 = nn.Dropout(p=0.5)

        # Layer 4
        self.fc4 = nn.Linear(512, 1024)
        self.bn4 = nn.BatchNorm1d(1024)
        self.drop4 = nn.Dropout(p=0.6)

        # Output Layer
        self.fc5 = nn.Linear(1024, 2)

    def forward(self, x):
        out = self.drop1(F.relu(self.bn1(self.fc1(x))))
        out = self.drop2(F.relu(self.bn2(self.fc2(out))))
        out = self.drop3(F.relu(self.bn3(self.fc3(out))))
        out = self.drop4(F.relu(self.bn4(self.fc4(out))))
        out = self.fc5(out)
        return out

In [22]:
class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        # Layer 1
        self.conv1 = nn.Conv1d(in_channels=36, out_channels=64, kernel_size=3, stride=1, padding=1)
        self.pool1 = nn.MaxPool1d(kernel_size=1, stride=1)
        self.drop1 = nn.Dropout(p=0.1)

        # Layer 2
        self.conv2 = nn.Conv1d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1)
        self.pool2 = nn.MaxPool1d(kernel_size=1, stride=1)
        self.drop2 = nn.Dropout(p=0.1)

        # Fully Connected Layers
        self.fc1 = nn.Linear(128, 256)
        self.drop3 = nn.Dropout(p=0.1)
        self.fc2 = nn.Linear(256, 2)

    def forward(self, x):
        x = x.view(x.size(0), x.size(1), -1)
        out = self.drop1(self.pool1(F.relu(self.conv1(x))))
        out = self.drop2(self.pool2(F.relu(self.conv2(out))))
        out = out.view(out.size(0), -1)
        out = self.drop3(F.relu(self.fc1(out)))
        out = self.fc2(out)
        return out

In [23]:
def train(model, train_loader, criterion, optimizer, scheduler, val_loader=None, epochs=50):
    model.train()
    for epoch in range(epochs):
        running_loss = 0.0
        val_loss = 0.0
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            
        if val_loader:
            model.eval()
            with torch.no_grad():
                for inputs, labels in val_loader:
                    output = model(inputs)
                    loss = criterion(output, labels)
                    val_loss += loss.item()
                val_loss /= len(val_loader)
            
        scheduler.step()
        if val_loader:
            print(f'Epoch {epoch+1}/{epochs}, train Loss: {running_loss/len(train_loader)}, val Loss: {val_loss}, score: {calScore(val_loss)}')
        else:
            print(f'Epoch {epoch+1}/{epochs}, train Loss: {running_loss/len(train_loader)}')

In [24]:
def predict(model, test_data):
    model.eval()
    with torch.no_grad():
        outputs = model(test_data)
    return outputs

In [25]:
from sklearn.preprocessing import RobustScaler
import numpy as np

d = get_train_data()
t = get_test_data()
ans = []
data = d.iloc[:, 1:]
test = t.iloc[:, 1:]
# X = data.iloc[:, 1:37]
# X=RobustScaler().fit_transform(X)
scaler = RobustScaler()
# data=RobustScaler().fit_transform(data)
data.iloc[:, :36] = scaler.fit_transform(data.iloc[:, :36])
data = pd.concat([data.iloc[:, :36], data.iloc[:, 36:]], axis=1).values
test=RobustScaler().fit_transform(test)
# y = data.iloc[:, 37:]
# y = np.array(y.values)
# scaler = StandardScaler()
# y = scaler.fit_transform(y)

print(data.shape)
print(test.shape)
print(type(data))

(102600, 38)
(4560, 36)
<class 'numpy.ndarray'>


In [26]:
# 划分训练集和验证集，使用sklearn的train_test_split函数
from sklearn.model_selection import train_test_split

train_data, val = train_test_split(data, test_size=0.2, random_state=42)
val, my_test = train_test_split(val, test_size=0.5, random_state=43)

In [27]:
train_x = train_data[:, :36]
train_y = train_data[:, 36:]
val_x = val[:, :36]
val_y = val[:, 36:]
my_test_x = my_test[:, :36]
my_test_y = my_test[:, 36:]

In [28]:
print(train_x.shape)
print(train_y.shape)
print(val_x.shape)
print(val_y.shape)
print(my_test_x.shape)
print(my_test_y.shape)

(82080, 36)
(82080, 2)
(10260, 36)
(10260, 2)
(10260, 36)
(10260, 2)


In [29]:
model = CNN()

# 定义损失函数和优化器
criterion = nn.MSELoss()
# optimizer = optim.SGD(model.parameters(), lr=0.001, weight_decay=1e-5)
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=0.0001)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

X_train = torch.Tensor(train_x)
X_test = torch.Tensor(test)
y_train = torch.Tensor(train_y)
X_val = torch.Tensor(val_x)
y_val = torch.Tensor(val_y)
X_my_test = torch.Tensor(my_test_x)
y_my_test = torch.Tensor(my_test_y)

batch_size = 256
epoch_num = 30

train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataset = TensorDataset(X_val, y_val)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)

train(model, train_loader, criterion, optimizer, scheduler, val_loader=val_loader, epochs=epoch_num)

Epoch 1/30, train Loss: 884.4234476416282, val Loss: 48.51936535718964, score: 0.0921589008292849
Epoch 2/30, train Loss: 13.242033998542857, val Loss: 7.294661969673343, score: 0.20748621238438864
Epoch 3/30, train Loss: 4.0067471425481305, val Loss: 3.8704810462346892, score: 0.2643922970231967
Epoch 4/30, train Loss: 3.023655709819259, val Loss: 2.883815364139836, score: 0.2939802364686252
Epoch 5/30, train Loss: 2.3779641478975244, val Loss: 2.3945151509308236, score: 0.3136383780918273
Epoch 6/30, train Loss: 2.1841910752551947, val Loss: 1.921187566547859, score: 0.33781527177071563
Epoch 7/30, train Loss: 1.837560369960987, val Loss: 1.9612675054771145, score: 0.3355097773412285
Epoch 8/30, train Loss: 1.6508639683233244, val Loss: 2.1919955015182495, score: 0.32322728490379976
Epoch 9/30, train Loss: 1.7849375629722144, val Loss: 1.8287253001841104, score: 0.34335397787195443
Epoch 10/30, train Loss: 1.5370462646365537, val Loss: 1.415054085778027, score: 0.37281541336247004
Ep

In [30]:
my_test_dataset = TensorDataset(X_my_test)
my_test_loader = DataLoader(my_test_dataset)
# 进行预测
predictions = []
for test_data in my_test_loader:
    pred = predict(model, test_data[0])
    predictions.append(pred)

In [31]:
p = torch.cat(predictions, dim=0).numpy()
# p = torch.tensor(RobustScaler().inverse_transform(p)).numpy()
print(p)

[[68.20496  66.55721 ]
 [67.41379  65.77437 ]
 [73.320854 82.62299 ]
 ...
 [67.67718  66.11994 ]
 [71.65931  75.46871 ]
 [67.7411   62.01935 ]]


In [32]:
mse = np.mean((my_test_y - p) ** 2)
score = calScore(mse)
print(score)

0.5179920198605967


In [33]:
test_dataset = TensorDataset(X_test)
test_loader = DataLoader(test_dataset)

# 进行预测
predictions = []
for test_data in test_loader:
    pred = predict(model, test_data[0])
    predictions.append(pred)

In [34]:
# 将包含 PyTorch Tensor 的列表连接成一个 4560x2 的矩阵
p = torch.cat(predictions, dim=0).numpy()
# 打印矩阵的形状和第一个元素
print(p.shape)
print(p[0])

(4560, 2)
[68.574844 60.86804 ]


In [35]:
ans = []
for i in range(len(p)):
        s = str(t['geohash_id'].iloc[i])
        s += '\t'
        s += str(p[i][1])
        s += '\t'
        s += str(p[i][0])
        s += '\t'
        s += str(t['date_id'].iloc[i])
        ans.append(s)

df = pd.DataFrame(data=ans, columns=["geohash_id\tconsumption_level\tactivity_level\tdate_id"])
df.to_csv("CNN.csv", index=False)
print("finished")

finished
