In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import matplotlib.pyplot as plt

# Load the raw Kannada-MNIST CSV files
train = pd.read_csv('/kaggle/input/Kannada-MNIST/train.csv')
test = pd.read_csv('/kaggle/input/Kannada-MNIST/test.csv')

# Hold out 20% of the training rows for validation; the fixed random_state
# keeps the split identical between runs.
train_df, val_df = train_test_split(train, random_state=42, test_size=0.2)

# Report the (rows, columns) shape of every split.
for caption, frame in (("训练数据集:", train_df), ("验证数据集:", val_df), ("测试数据集:", test)):
    print(caption, frame.shape)


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Re-read the training CSV so this cell can be run on its own.
train = pd.read_csv('/kaggle/input/Kannada-MNIST/train.csv')

# Column 0 is used as the label; every remaining column is pixel data that is
# folded from a flat row into a 28x28 matrix.
labels = train.iloc[:, 0].values
images = train.iloc[:, 1:].values.reshape(-1, 28, 28)

# Show the first 16 samples on a 4x4 grid, each titled with its label.
fig = plt.figure(figsize=(8, 8))
for i in range(16):
    ax = fig.add_subplot(4, 4, i + 1)  # grid position is 1-based
    ax.imshow(images[i], cmap='gray')  # render as a grayscale image
    ax.set_title(f'Label: {labels[i]}')
    ax.axis('off')
fig.tight_layout()
plt.show()


In [None]:
class KannadaMNISTDataset(Dataset):
    """Map-style dataset that serves 28x28 single-channel images from a DataFrame.

    Every row is interpreted as one sample: column 0 is returned as the label
    (and simply skipped when ``is_test`` is set), the remaining columns are the
    flattened pixel values, cast to ``uint8`` and reshaped to ``(28, 28, 1)``.

    The pixel and label columns are converted to NumPy arrays once at
    construction time. Indexing the DataFrame with ``iloc`` for every single
    sample builds a new Series per call and is far slower than slicing an
    array. Consequently, changes made to ``dataframe`` after the dataset has
    been created are not reflected in the samples.

    Args:
        dataframe: Table with one sample per row; first column label
            (presumably an id column for test data -- confirm against the CSV),
            followed by 784 pixel columns.
        transform: Optional callable applied to the ``(28, 28, 1)`` uint8 array.
        is_test: When true, ``__getitem__`` returns the image only.
    """

    def __init__(self, dataframe, transform=None, is_test=False):
        self.data = dataframe  # kept for callers that inspect the source table
        self.transform = transform
        self.is_test = is_test
        # One-off conversion; np.array copies, so the cache is writable and
        # independent of the DataFrame's internal buffers.
        self._pixels = np.array(dataframe.iloc[:, 1:], dtype=np.uint8)
        self._labels = dataframe.iloc[:, 0].to_numpy()

    def __len__(self):
        """Return the number of rows (samples) in the underlying DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``image`` (test mode) or ``(image, label)`` for row ``idx``.

        Raises:
            IndexError: If ``idx`` is outside the table.
            ValueError: If a row does not hold exactly 784 pixel values.
        """
        # Copy so that callers mutating the sample cannot corrupt the cache.
        image = self._pixels[idx].copy().reshape((28, 28, 1))
        if self.transform:
            image = self.transform(image)
        if self.is_test:
            return image
        label = self._labels[idx]
        return image, label

# Data loaders
def get_data_loaders(train_df, val_df, test_df, batch_size):
    """Wrap the three DataFrames in datasets and return their DataLoaders.

    All three datasets share one preprocessing pipeline: ``ToTensor`` followed
    by ``Normalize((0.5,), (0.5,))``. Only the training loader shuffles; the
    test dataset is created with ``is_test=True`` so it yields images only.

    Args:
        train_df: Training rows.
        val_df: Validation rows.
        test_df: Test rows.
        batch_size: Batch size used by all three loaders.

    Returns:
        Tuple ``(train_loader, val_loader, test_loader)``.
    """
    preprocess = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.5,), (0.5,)),
    ])

    def _make_loader(frame, shuffle, is_test=False):
        # Every loader differs only in its source frame, shuffling and test flag.
        dataset = KannadaMNISTDataset(frame, transform=preprocess, is_test=is_test)
        return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

    return (
        _make_loader(train_df, shuffle=True),
        _make_loader(val_df, shuffle=False),
        _make_loader(test_df, shuffle=False, is_test=True),
    )

# Batch size shared by the training, validation and test loaders.
batch_size = 64
train_loader, val_loader, test_loader = get_data_loaders(
    train_df=train_df,
    val_df=val_df,
    test_df=test,
    batch_size=batch_size,
)


In [None]:
# LeNet model definition
class LeNet(nn.Module):
    """LeNet-style CNN: two conv + max-pool stages followed by three linear layers.

    The first linear layer expects 16 * 4 * 4 features, which is what a
    1x28x28 input produces (28 -> 24 -> 12 -> 8 -> 4 along each spatial axis).
    The network returns 10 raw logits per sample; no softmax is applied.
    """

    def __init__(self):
        super().__init__()
        # Feature extractor
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)  # reused after both convolutions
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5)
        # Classifier head
        self.fc1 = nn.Linear(in_features=16 * 4 * 4, out_features=120)
        self.fc2 = nn.Linear(in_features=120, out_features=84)
        self.fc3 = nn.Linear(in_features=84, out_features=10)

    def forward(self, x):
        """Map a batch of images to a ``(batch, 10)`` tensor of logits."""
        features = self.pool(self.conv1(x).relu())
        features = self.pool(self.conv2(features).relu())
        # Flatten every sample to the 256 features fc1 expects.
        flat = features.view(-1, 16 * 4 * 4)
        hidden = self.fc1(flat).relu()
        hidden = self.fc2(hidden).relu()
        return self.fc3(hidden)

# Seed PyTorch's global RNG before the model is built so that the weight
# initialisation is the same on every Restart & Run All. DataLoader shuffling
# without an explicit generator derives its seed from this global RNG as well,
# so the batch order becomes repeatable too.
torch.manual_seed(42)
model = LeNet()

In [None]:
# Training function
def Ktrain(model, criterion, optimizer, train_loader, val_loader, num_epochs=10):
    """Train ``model`` and record the mean per-batch loss of every epoch.

    Each epoch runs one optimisation pass over ``train_loader`` followed by a
    gradient-free pass over ``val_loader``. The model is switched to eval mode
    for validation and back to train mode afterwards, so it is left in train
    mode when the function returns. One summary line is printed per epoch.

    Args:
        model: Network to optimise in place.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        optimizer: Optimiser that updates the model's parameters.
        train_loader: Sized iterable of ``(images, labels)`` training batches.
        val_loader: Sized iterable of ``(images, labels)`` validation batches.
        num_epochs: Number of passes over the training data.

    Returns:
        Tuple ``(train_losses, val_losses)``; each list holds one float per epoch.
    """
    train_losses, val_losses = [], []
    model.train()

    for epoch_idx in range(num_epochs):
        # Training pass: one optimiser step per batch.
        train_total = 0.0
        for batch_images, batch_labels in train_loader:
            optimizer.zero_grad()  # drop gradients left over from the previous step
            batch_loss = criterion(model(batch_images), batch_labels)
            batch_loss.backward()
            optimizer.step()
            train_total += batch_loss.item()

        # Validation pass: eval mode, no gradient tracking.
        val_total = 0.0
        model.eval()
        with torch.no_grad():
            for batch_images, batch_labels in val_loader:
                val_total += criterion(model(batch_images), batch_labels).item()
        model.train()  # restore training mode for the next epoch

        # Average over the number of batches, not the number of samples.
        train_mean = train_total / len(train_loader)
        val_mean = val_total / len(val_loader)
        train_losses.append(train_mean)
        val_losses.append(val_mean)
        print(f"Epoch {epoch_idx + 1}, Train Loss: {train_mean}, Val Loss: {val_mean}")

    return train_losses, val_losses

# Loss function: cross-entropy, suited to multi-class classification
criterion = nn.CrossEntropyLoss()
# Optimizer: stochastic gradient descent with learning rate 0.001 and momentum 0.9
optimizer = optim.SGD(params=model.parameters(), lr=0.001, momentum=0.9)
# Train for 50 epochs
num_epochs = 50

train_losses, val_losses = Ktrain(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    train_loader=train_loader,
    val_loader=val_loader,
    num_epochs=num_epochs,
)

In [None]:
# Visualise the training and validation loss
def plot_losses(train_losses, val_losses):
    """Draw both loss curves on one labelled chart and show it.

    Args:
        train_losses: Sequence of per-epoch training losses.
        val_losses: Sequence of per-epoch validation losses.
    """
    curves = (
        (train_losses, 'Train Loss'),
        (val_losses, 'Validation Loss'),
    )
    for series, legend_label in curves:
        plt.plot(series, label=legend_label)
    plt.title('Training and Validation Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()

# Plot the loss curves recorded during training.
plot_losses(train_losses=train_losses, val_losses=val_losses)

# Model evaluation
def evaluate_model(model, val_loader):
    """Score ``model`` on ``val_loader`` with standard classification metrics.

    The model is switched to eval mode (and left there). The predicted class of
    a sample is the index of its largest output. Precision, recall and F1 use
    ``average='weighted'``.

    Args:
        model: Classifier returning one score per class for every sample.
        val_loader: Iterable of ``(images, labels)`` batches.

    Returns:
        Tuple ``(accuracy, precision, recall, f1, confusion_matrix)``.
    """
    model.eval()
    y_true, y_pred = [], []
    with torch.no_grad():
        for batch_images, batch_labels in val_loader:
            batch_pred = model(batch_images).max(dim=1).indices
            y_true.extend(batch_labels.numpy())
            y_pred.extend(batch_pred.numpy())

    # The three per-class metrics share the same averaging mode.
    averaging = {"average": 'weighted'}
    return (
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred, **averaging),
        recall_score(y_true, y_pred, **averaging),
        f1_score(y_true, y_pred, **averaging),
        confusion_matrix(y_true, y_pred),
    )

# Evaluate on the validation split and print the metrics and confusion matrix.
accuracy, precision, recall, f1, cm = evaluate_model(model=model, val_loader=val_loader)
print(f"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1 Score: {f1}")
print("Confusion Matrix:", cm, sep="\n")


In [None]:
# Generate the submission file
def generate_submission_file(model, test_loader, filename="submission.csv"):
    """Predict a class for every test sample and write the result as CSV.

    The model is switched to eval mode (and left there). The written file has
    the columns ``id`` (0-based running index in loader order) and ``label``
    (index of the largest model output), without the DataFrame index.

    Args:
        model: Classifier returning one score per class for every sample.
        test_loader: Iterable of image batches (no labels).
        filename: Path of the CSV file to write.
    """
    model.eval()
    predictions = []
    with torch.no_grad():
        for batch in test_loader:
            predictions.extend(model(batch).max(dim=1).indices.numpy())

    row_ids = np.arange(len(predictions))
    submission_df = pd.DataFrame({"id": row_ids, "label": predictions})
    submission_df.to_csv(filename, index=False)

# Write predictions for the test set to the default "submission.csv".
generate_submission_file(model=model, test_loader=test_loader)

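# Optional sanity check of the generated submission file (disabled).
# The path matches the default filename written by generate_submission_file.
#submission = pd.read_csv("submission.csv")
#print(submission.head())
#print("提交文件行数:", len(submission))
#print("测试数据集行数:", len(test))

# Confirm that the submission has exactly one row per test sample.
#assert len(submission) == len(test), "提交文件的行数应与测试数据集的行数相同"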
