1.加载数据

In [1]:
import os
import sys
import numpy as np
import torch
import pandas as pd

# Root directory holding one sub-directory per class. The text after the last
# '_' in each sub-directory name is parsed as the integer class label.
data_path = 'train_data'
train_data = []
labels = []
# os.listdir() returns entries in arbitrary, platform-dependent order; sorting
# makes the sample order (and anything derived from it, such as a seeded
# train/test split) reproducible across runs and machines.
for file in sorted(os.listdir(data_path)):
    file_path = os.path.join(data_path, file)
    # Only directories contain samples; skip stray files at the top level.
    if not os.path.isdir(file_path):
        continue
    label = file.split('_')[-1]
    for data in sorted(os.listdir(file_path)):
        # Each entry is loaded with np.load and stored together with its label.
        train_data.append(np.load(os.path.join(file_path, data)))
        labels.append(int(label))


2.数据预处理

In [2]:
# Row count of every sample; the largest one becomes the common padded length.
length = [sample.shape[0] for sample in train_data]
max_length = max(length)
# Largest second dimension over all samples, used as the width of the padding.
width = max(sample.shape[1] for sample in train_data)

# Append zero rows to every sample that is shorter than max_length.
padded_samples = []
for sample in train_data:
    missing_rows = max_length - sample.shape[0]
    if missing_rows > 0:
        zero_rows = np.zeros((missing_rows, width))
        sample = np.vstack((sample, zero_rows))
    padded_samples.append(sample)

train_data = np.array(padded_samples)
labels = np.array(labels)

# Convert to tensors: insert a dimension at position 1 and cast the data to
# float; cast the labels to long.
train_data_tensor = torch.tensor(train_data)
train_data_tensor = train_data_tensor.unsqueeze(1).float()
print(train_data_tensor.shape)
labels_tensor = torch.tensor(labels)
labels_tensor = labels_tensor.long()

torch.Size([4000, 1, 734, 80])


上述代码统计了所有样本的最大长度（行数）和最大宽度（列数），并在长度不足的样本末尾补零行，使所有样本的长度都等于最大长度。

3.建立模型

In [3]:
# 构建神经网络
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class Model(nn.Module):
    """Three-stage CNN classifier for single-channel 2-D inputs.

    Each stage is a 3x3 convolution (no padding) followed by ReLU and 2x2
    max-pooling. The pooled feature map is flattened, passed through dropout
    and a 128-unit hidden layer, and projected to ``num_classes`` raw scores
    (no softmax is applied here).

    Args:
        input_shape: ``(height, width)`` of the input the network is built
            for. The default ``(734, 80)`` yields ``90 * 8 * 128`` flattened
            features, which was the previously hard-coded size.
        num_classes: Number of output scores per sample (default 2).

    Raises:
        ValueError: If ``input_shape`` is too small to survive the three
            convolution/pooling stages.
    """

    def __init__(self, input_shape=(734, 80), num_classes=2):
        super(Model, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, (3, 3))
        self.conv2 = nn.Conv2d(32, 64, (3, 3))
        self.conv3 = nn.Conv2d(64, 128, (3, 3))

        # Every stage shrinks each spatial dimension by 2 (3x3 conv without
        # padding) and then halves it, rounding down (2x2 max-pooling).
        height, width = input_shape
        for _ in range(3):
            height = (height - 2) // 2
            width = (width - 2) // 2
        if height < 1 or width < 1:
            raise ValueError(
                'input_shape %r is too small for three conv/pool stages' % (tuple(input_shape),)
            )
        # Expected (channels, height, width) of the final pooled feature map.
        self._feature_shape = (128, height, width)
        self.flat_features = 128 * height * width

        self.fc1 = nn.Linear(self.flat_features, 128)
        self.fc2 = nn.Linear(128, num_classes)
        self.pool = nn.MaxPool2d((2, 2))
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """Return the raw class scores for ``x``.

        Args:
            x: Input tensor whose trailing dimensions are ``(1, H, W)``.

        Raises:
            ValueError: If the pooled feature map does not have the shape this
                model was constructed for. Without this check ``view(-1, ...)``
                could silently regroup elements across the batch dimension.
        """
        x = self.pool(self.relu(self.conv1(x)))
        x = self.pool(self.relu(self.conv2(x)))
        x = self.pool(self.relu(self.conv3(x)))
        if tuple(x.shape[-3:]) != self._feature_shape:
            raise ValueError(
                'unexpected feature map shape %r, expected %r; check the input size'
                % (tuple(x.shape[-3:]), self._feature_shape)
            )
        x = x.view(-1, self.flat_features)
        x = self.dropout(x)
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        return x

4.训练数据并输出准确率

In [19]:
# 利用torch.utils.data构建数据集
import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset

print("Data shape:", train_data_tensor.shape)
print("Labels shape:", labels_tensor.shape)
# Pair every sample with its label.
dataset = TensorDataset(train_data_tensor, labels_tensor)

# 80/20 train/test split. The generator is seeded so the same samples land in
# the held-out set on every run; otherwise the reported accuracy is not
# comparable between runs.
train_size = int(len(dataset) * 0.8)
train_dataset, test_dataset = torch.utils.data.random_split(
    dataset,
    [train_size, len(dataset) - train_size],
    generator=torch.Generator().manual_seed(42),
)

# Shuffle only the training batches; evaluation order does not matter.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Model, loss and optimizer.
net = Model()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=0.001)

# Train for 10 epochs, printing the mean loss of every 10 batches.
net.train()  # make sure dropout is active during training
for epoch in range(10):
    running_loss = 0.0
    # The batch variables are deliberately not called `labels`: that name is
    # the notebook-level label array, and overwriting it here would corrupt the
    # preprocessing cell if it were re-run afterwards.
    for i, (inputs, targets) in enumerate(train_loader):
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        if i % 10 == 9:
            print('[%d, %5d] loss: %.3f' % (epoch + 1, i + 1, running_loss / 10))
            running_loss = 0.0

# Persist the trained weights.
torch.save(net.state_dict(), 'model.pth')

Data shape: torch.Size([4000, 1, 734, 80])
Labels shape: torch.Size([4000])
[1,    10] loss: 1.740
[1,    20] loss: 0.677
[1,    30] loss: 0.672
[1,    40] loss: 0.667
[1,    50] loss: 0.655
[1,    60] loss: 0.619
[1,    70] loss: 0.659
[1,    80] loss: 0.634
[1,    90] loss: 0.627
[1,   100] loss: 0.624
[2,    10] loss: 0.623
[2,    20] loss: 0.598
[2,    30] loss: 0.616
[2,    40] loss: 0.594
[2,    50] loss: 0.564
[2,    60] loss: 0.578
[2,    70] loss: 0.543
[2,    80] loss: 0.521
[2,    90] loss: 0.529


In [5]:
# Rebuild the network and restore the weights saved in 'model.pth'.
net = Model()
net.load_state_dict(torch.load('model.pth'))
# A freshly constructed module is in training mode, so dropout would stay
# active and make every later prediction random. Switch to evaluation mode.
net.eval()

验证网络的精度

In [None]:
# Measure classification accuracy on the held-out split.
net.eval()  # disable dropout; torch.no_grad() alone does not do this
correct = 0
total = 0
with torch.no_grad():
    # The batch variables are not called `labels`, so the notebook-level label
    # array is not overwritten.
    for inputs, targets in test_loader:
        outputs = net(inputs)
        # Index of the largest score along dim 1 is the predicted class.
        _, predicted = torch.max(outputs, dim=1)
        total += targets.size(0)
        correct += (predicted == targets).sum().item()
print('Accuracy: %d %%' % (100 * correct / total))

输出结果

In [14]:
# Build the unlabeled inference set from every file in 'test_data'.
result_data_path = 'test_data'
# os.listdir() order is arbitrary; sort it so the sample order is deterministic
# and keep the names so predictions can be traced back to their files.
# NOTE(review): predictions are later written positionally into test.csv;
# confirm that the row order of test.csv matches this sorted file order.
result_files = sorted(os.listdir(result_data_path))
result_data = [np.load(os.path.join(result_data_path, file)) for file in result_files]
# Placeholder labels (-1) so the samples fit the (data, label) dataset layout.
result_labels = [-1] * len(result_data)

# Bring every sample to exactly max_length rows: append zero rows to shorter
# samples and cut longer ones off at max_length.
for i in range(len(result_data)):
    rows = result_data[i].shape[0]
    if rows < max_length:
        result_data[i] = np.vstack((result_data[i], np.zeros((max_length - rows, width))))
    elif rows > max_length:
        result_data[i] = result_data[i][:max_length, :]

result_data = np.array(result_data)
result_labels = np.array(result_labels)

# Convert to tensors: insert a dimension at position 1 and cast the data to
# float; cast the placeholder labels to long.
result_data_tensor = torch.tensor(result_data).unsqueeze(1).float()
print(result_data_tensor.shape)
result_labels_tensor = torch.tensor(result_labels).long()

torch.Size([2000, 1, 734, 80])


In [18]:
# Wrap the inference tensors in a loader. shuffle must be False: predictions
# are collected in iteration order and later assigned to rows by position, so
# shuffling would attach each prediction to a random sample.
result_dataset = TensorDataset(result_data_tensor, result_labels_tensor)
result_data_loader = DataLoader(result_dataset, batch_size=32, shuffle=False)

# Predict the class of every sample.
net.eval()  # disable dropout so predictions are deterministic
predictions = []
with torch.no_grad():
    for inputs, _ in result_data_loader:
        outputs = net(inputs)
        # Index of the largest score along dim 1 is the predicted class.
        _, predicted = torch.max(outputs, 1)
        predictions.extend(predicted.tolist())

In [17]:
import pandas as pd
# Read the table from 'test.csv', store the predictions in its 'label' column
# and write the table out without the index column.
template = pd.read_csv('test.csv')
result = template.assign(label=predictions)
result.to_csv('23210980049.csv', index=False)
