In [1]:
import torch
import torch.nn as nn

# Define a simple fully connected (linear) layer.
fc = nn.Linear(10, 5)  # input feature dimension is 10, output dimension is 5

# Input data.
input_data = torch.randn(3, 10)  # 3 samples, 10 features each

# Forward pass: the layer maps the last dimension from 10 to 5.
output = fc(input_data)
print("Output shape:", output.shape)
print("Output data:", output)

Output shape: torch.Size([3, 5])
Output data: tensor([[ 0.8077,  0.7671, -1.3354, -0.8978,  1.0544],
        [ 0.3077,  0.3016, -0.4576, -0.8128,  0.9606],
        [ 0.0640,  0.3347,  0.4794,  0.2482,  0.0634]],
       grad_fn=<AddmmBackward0>)


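nn.Linear 的默认初始化基于 Kaiming 均匀分布（而不是 Kaiming 正态分布）：权重通过 `kaiming_uniform_(a=√5)` 随机初始化，等价于从均匀分布 U(-1/√fan_in, 1/√fan_in) 中采样；偏置同样从 U(-1/√fan_in, 1/√fan_in) 中采样。例如上面的 `nn.Linear(10, 5)`，权重和偏置都落在 ±1/√10 ≈ ±0.316 之内。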

In [2]:
# Inspect the weight matrix of `fc` (stored as [out_features, in_features]).
print("Weights shape:", fc.weight.shape)
print("Weights:", fc.weight)

# Inspect the bias vector of `fc` (one entry per output feature).
print("Bias shape:", fc.bias.shape)
print("Bias:", fc.bias)

Weights shape: torch.Size([5, 10])
Weights: Parameter containing:
tensor([[ 0.2481,  0.2493,  0.0071, -0.1585, -0.2911,  0.2051, -0.2996,  0.1113,
         -0.0541,  0.1248],
        [-0.2266, -0.2820,  0.1351,  0.0176, -0.2856, -0.2698,  0.2831, -0.1588,
          0.1559,  0.0903],
        [-0.2083, -0.0895, -0.1894, -0.0167,  0.2028,  0.0179, -0.3083,  0.0767,
         -0.1286, -0.3130],
        [-0.0173, -0.2401,  0.1969, -0.2898,  0.2351, -0.2757, -0.0765, -0.0886,
         -0.1438, -0.3039],
        [-0.0906,  0.1289, -0.1114,  0.2279, -0.1368,  0.2503,  0.1781, -0.1895,
          0.1165,  0.2085]], requires_grad=True)
Bias shape: torch.Size([5])
Bias: Parameter containing:
tensor([ 0.2559, -0.1903, -0.2190, -0.1814,  0.2980], requires_grad=True)


In [3]:
import torch.nn.init as init

# Create an (uninitialized) weight tensor.
weight_tensor = torch.empty(3, 5)  # assume output dimension 3, input dimension 5

# mode: either 'fan_in' or 'fan_out'. 'fan_in' preserves the magnitude of the
#   variance of the weights in the forward pass; 'fan_out' preserves it in the
#   backward pass.
# nonlinearity: name of the activation function, typically 'relu' or 'leaky_relu'.

# Kaiming Normal initialization. The call fills `weight_tensor` in place and
# returns it, so as the last expression of the cell the tensor is displayed.
init.kaiming_normal_(weight_tensor, mode='fan_in', nonlinearity='relu')

tensor([[ 0.6059,  0.1285,  0.4780, -0.4500, -0.0201],
        [-0.8102,  0.7428, -0.3083,  0.4793, -0.0139],
        [-0.9811,  0.0890, -1.1209, -0.1263, -1.2466]])

In [4]:
# Kaiming Uniform initialization. This re-initializes the same `weight_tensor`
# in place, overwriting the values written by the Kaiming Normal cell.
init.kaiming_uniform_(weight_tensor, mode='fan_in', nonlinearity='relu')

tensor([[-0.2740, -0.3445, -0.3596,  0.3580,  0.4891],
        [ 0.6418,  0.8994, -0.2666,  0.1239, -0.7271],
        [-0.5327, -1.0143,  1.0899, -0.5319, -0.3720]])

In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt

def basic_linear_layer():
    """Demonstrate the basics of ``nn.Linear``.

    Builds a 3 -> 5 linear layer, prints its metadata and parameters, runs a
    single sample through it, and checks the result against a hand-written
    ``x @ W^T + b``. Everything is reported on stdout; nothing is returned.
    """
    print("🎯 nn.Linear 基础用法")
    print("=" * 50)

    # Linear layer mapping 3 input features to 5 output features.
    layer = nn.Linear(in_features=3, out_features=5)

    # Layer metadata.
    print(
        f"📊 线性层信息:",
        f"   输入维度: {layer.in_features}",
        f"   输出维度: {layer.out_features}",
        f"   权重形状: {layer.weight.shape}",
        f"   偏置形状: {layer.bias.shape}",
        sep="\n",
    )

    # Raw parameter values.
    print(
        f"\n🔍 参数详情:",
        f"   权重矩阵 W:\n{layer.weight.data}",
        f"   偏置向量 b:\n{layer.bias.data}",
        sep="\n",
    )

    # Forward pass on one sample of shape [1, 3].
    x = torch.tensor([[1.0, 2.0, 3.0]])
    y = layer(x)
    print(
        f"\n⚡ 前向传播:",
        f"   输入: {x}",
        f"   输入形状: {x.shape}",
        f"   输出: {y}",
        f"   输出形状: {y.shape}",
        sep="\n",
    )

    # Reproduce the layer by hand: weight is stored as [out, in], hence the
    # transpose before the matrix product.
    y_manual = x @ layer.weight.t() + layer.bias
    matches = torch.allclose(y, y_manual)
    print(
        f"\n✅ 手动计算验证:",
        f"   y = xW^T + b",
        f"   手动计算结果: {y_manual}",
        f"   是否相等: {matches}",
        sep="\n",
    )

def batch_processing():
    """Show that one ``nn.Linear`` handles unbatched, batched and 3-D inputs.

    A single 4 -> 2 layer is applied to inputs of shape [4], [3, 4] and
    [2, 5, 4]; for each case the input and output shapes are printed.
    """
    print("\n📦 批处理示例")
    print("=" * 50)

    # One layer shared by every case below.
    projection = nn.Linear(4, 2)

    # (heading printed before the case, shape of the random input)
    cases = (
        ("单个样本:", (4,)),              # a single sample
        ("\n批处理:", (3, 4)),            # 3 samples, 4 features each
        ("\n高维批处理:", (2, 5, 4)),     # [batch, seq, features]
    )
    for heading, shape in cases:
        sample = torch.randn(*shape)
        result = projection(sample)
        print(heading)
        print(f"   输入形状: {sample.shape}")
        print(f"   输出形状: {result.shape}")

    print(f"   📝 注意: Linear只对最后一维进行变换")

def parameter_initialization():
    """Compare three ways of initializing an ``nn.Linear(3, 2)`` layer.

    Prints the weight/bias value ranges for the default initialization, the
    result of Xavier-uniform weights with zero bias, and a hand-filled layer
    (weights 0.1, bias 0.0).
    """

    def _span(values):
        # Format the min/max of a tensor as "[min, max]" with 3 decimals.
        return f"[{values.min():.3f}, {values.max():.3f}]"

    print("\n🎲 参数初始化")
    print("=" * 50)

    # 1) Whatever nn.Linear does on construction.
    default_layer = nn.Linear(3, 2)
    print(f"默认初始化:")
    print(f"   权重范围: {_span(default_layer.weight)}")
    print(f"   偏置范围: {_span(default_layer.bias)}")

    # 2) Xavier-uniform weights, zeroed bias.
    xavier_layer = nn.Linear(3, 2)
    nn.init.xavier_uniform_(xavier_layer.weight)
    nn.init.zeros_(xavier_layer.bias)
    print(f"\nXavier初始化:")
    print(f"   权重范围: {_span(xavier_layer.weight)}")
    print(f"   偏置: {xavier_layer.bias}")

    # 3) Constant fill; done under no_grad because the parameters require grad
    #    and are modified in place.
    custom_layer = nn.Linear(3, 2)
    with torch.no_grad():
        custom_layer.weight.fill_(0.1)
        custom_layer.bias.fill_(0.0)
    print(f"\n自定义初始化:")
    print(f"   权重:\n{custom_layer.weight}")
    print(f"   偏置: {custom_layer.bias}")

def no_bias_example():
    """Contrast a bias-free ``nn.Linear`` with an otherwise identical biased one.

    Creates a 3 -> 2 layer with ``bias=False`` and a second 3 -> 2 layer with a
    bias, feeds the same random input through both and prints the results.

    The biased layer reuses the weights of the bias-free layer. Without that,
    the two layers would have independent random weights and the printed
    outputs would differ for reasons unrelated to the bias; with shared
    weights the difference between the two outputs is exactly the bias vector,
    which is printed as well.
    """
    print("\n🚫 无偏置层")
    print("="*50)

    # Linear layer without a bias term: y = xW^T.
    linear_no_bias = nn.Linear(3, 2, bias=False)

    print(f"无偏置线性层:")
    print(f"   权重形状: {linear_no_bias.weight.shape}")
    print(f"   是否有偏置: {linear_no_bias.bias is not None}")

    # Biased counterpart: y = xW^T + b, sharing W with the layer above so the
    # bias is the only difference between the two.
    linear_with_bias = nn.Linear(3, 2, bias=True)
    with torch.no_grad():
        linear_with_bias.weight.copy_(linear_no_bias.weight)

    input_data = torch.randn(1, 3)
    output_no_bias = linear_no_bias(input_data)
    output_with_bias = linear_with_bias(input_data)

    print(f"\n📊 对比:")
    print(f"   输入: {input_data}")
    print(f"   无偏置输出: {output_no_bias}")
    print(f"   有偏置输出: {output_with_bias}")
    print(f"   偏置 b (两个输出之差): {linear_with_bias.bias.data}")

def mlp_example():
    """Build a three-layer perceptron from ``nn.Linear`` and inspect it.

    Prints the module structure, a per-parameter size table with the total
    parameter count, and the input/output shapes of one forward pass on a
    batch of 5 samples.
    """
    print("\n🧠 多层感知机 (MLP)")
    print("=" * 50)

    class SimpleMLP(nn.Module):
        """Linear -> ReLU -> Linear -> ReLU -> Linear."""

        def __init__(self, input_size, hidden_size, output_size):
            super().__init__()
            self.fc1 = nn.Linear(input_size, hidden_size)
            self.fc2 = nn.Linear(hidden_size, hidden_size)
            self.fc3 = nn.Linear(hidden_size, output_size)

        def forward(self, x):
            hidden = F.relu(self.fc1(x))
            hidden = F.relu(self.fc2(hidden))
            # No activation on the last layer.
            return self.fc3(hidden)

    # Instantiate the network: 10 inputs, 20 hidden units, 3 outputs.
    mlp = SimpleMLP(input_size=10, hidden_size=20, output_size=3)

    print(f"MLP结构:")
    print(mlp)

    # Per-parameter sizes, accumulating the total along the way.
    print(f"\n📊 参数统计:")
    total_params = 0
    for name, param in mlp.named_parameters():
        count = param.numel()
        total_params += count
        print(f"   {name}: {param.shape} -> {count} 个参数")
    print(f"   总参数量: {total_params}")

    # Forward pass on a batch of 5 samples.
    batch = torch.randn(5, 10)
    logits = mlp(batch)
    print(f"\n⚡ 前向传播:")
    print(f"   输入形状: {batch.shape}")
    print(f"   输出形状: {logits.shape}")

def transformer_linear_usage():
    """Show where ``nn.Linear`` layers appear in a Transformer block.

    Creates the Q/K/V/output projections (d_model -> d_model) and the two
    feed-forward layers (d_model -> d_ff -> d_model), prints their sizes, and
    pushes a random [batch, seq, d_model] tensor through the Q/K/V projections
    and through the feed-forward pair, printing the resulting shapes.
    """
    print("\n🤖 Transformer中的Linear层")
    print("=" * 50)

    d_model, heads = 512, 8
    d_k = d_model // heads

    print(f"Transformer参数:")
    for label, value in (("d_model", d_model), ("heads", heads), ("d_k", d_k)):
        print(f"   {label}: {value}")

    # Attention projections, constructed in the order Q, K, V, output.
    # out_linear is only constructed and described here; it is not applied below.
    q_linear, k_linear, v_linear, out_linear = (
        nn.Linear(d_model, d_model) for _ in range(4)
    )

    print(f"\n🔍 注意力线性层:")
    for label in ("Q线性层", "K线性层", "V线性层", "输出层"):
        print(f"   {label}: {d_model} -> {d_model}")

    # Position-wise feed-forward layers.
    d_ff = 2048
    ff_linear1 = nn.Linear(d_model, d_ff)
    ff_linear2 = nn.Linear(d_ff, d_model)

    print(f"\n🍽️ 前馈网络:")
    print(f"   第一层: {d_model} -> {d_ff}")
    print(f"   第二层: {d_ff} -> {d_model}")

    # Simulated data flow.
    batch_size, seq_len = 2, 10
    tokens = torch.randn(batch_size, seq_len, d_model)

    print(f"\n📊 数据流:")
    print(f"   输入: {tokens.shape}")

    # Q/K/V projections of the same input; only the shape of Q is reported.
    q, k, v = (proj(tokens) for proj in (q_linear, k_linear, v_linear))
    print(f"   Q,K,V: {q.shape}")

    # Feed-forward: expand to d_ff, ReLU, project back to d_model.
    expanded = F.relu(ff_linear1(tokens))
    ff_output = ff_linear2(expanded)
    print(f"   前馈输出: {ff_output.shape}")

def practical_tips():
    """Print a checklist of practical tips for working with ``nn.Linear``.

    Four sections are printed: choosing dimensions, parameter initialization,
    performance, and common mistakes. The function only writes to stdout.
    """
    print("\n💡 实用技巧")
    print("=" * 50)

    # Each entry is (section heading, bullet lines); headings after the first
    # carry a leading newline so sections are separated by a blank line.
    sections = (
        (
            "🎯 选择输入输出维度:",
            (
                "   - 输入维度必须与数据最后一维匹配",
                "   - 输出维度根据任务需求确定",
                "   - 常用维度: 64, 128, 256, 512, 768, 1024",
            ),
        ),
        (
            "\n🔧 参数初始化:",
            (
                "   - 默认: Kaiming初始化 (适合ReLU)",
                "   - Xavier: 适合Sigmoid/Tanh",
                "   - 自定义: 根据具体需求",
            ),
        ),
        (
            "\n⚡ 性能优化:",
            (
                "   - 批处理提高效率",
                "   - GPU加速计算",
                "   - 混合精度训练",
            ),
        ),
        (
            "\n🐛 常见错误:",
            (
                "   - 维度不匹配: 检查in_features",
                "   - 梯度消失: 注意初始化和激活函数",
                "   - 过拟合: 添加dropout或正则化",
            ),
        ),
    )
    for heading, bullets in sections:
        print(heading, *bullets, sep="\n")

def linear_vs_conv():
    """Compare the parameter count of a fully connected layer and a conv layer.

    Builds ``nn.Linear(784, 10)`` and ``nn.Conv2d(1, 10, kernel_size=3)`` and
    prints how many parameters each one holds, followed by typical use cases.

    The counts are read from the constructed modules rather than written out
    as separate arithmetic, so the printed numbers cannot drift from the
    layers being described.
    """
    print("\n🆚 Linear vs Conv 对比")
    print("="*50)

    # Linear layer - fully connected (e.g. MNIST classification: 784 -> 10).
    linear = nn.Linear(784, 10)
    linear_params = sum(p.numel() for p in linear.parameters())  # weight + bias
    print(f"Linear层 (全连接):")
    print(f"   参数量: {linear_params:,}")
    print(f"   特点: 每个输入都连接到每个输出")

    # Conv layer - local connectivity: 1 input channel, 10 output channels, 3x3 kernel.
    conv = nn.Conv2d(1, 10, kernel_size=3)
    conv_params = sum(p.numel() for p in conv.parameters())  # kernels + bias
    print(f"\nConv层 (卷积):")
    print(f"   参数量: {conv_params}")
    print(f"   特点: 局部连接，权重共享")

    print(f"\n📋 使用场景:")
    print("   Linear: 分类层、全连接网络、Transformer")
    print("   Conv: 图像处理、特征提取、CNN")

def debug_linear():
    """调试Linear层"""
    print("\n🔧 调试Linear层")
    print("="*50)
    
    linear = nn.Linear(5, 3)
    
    # 检查参数
    print("🔍 参数检查:")
    print(f"   权重是否需要梯度: {linear.weight.requires_grad}")
    print(f"   偏置是否需要梯度: {linear.bias.requires_grad}")
    print(f"   权重梯度: {linear.weight.grad}")
    
    # 前向传播
    input_data = torch.randn(2, 5, requires_grad=True)
    output = linear(input_data)
    
    # 反向传播
    loss = output.sum()
    loss.backward()
    
    print(f"\n⚡ 梯度信息:")
    print(f"   输入梯度形状: {input_data.grad.shape}")
    print(f"   权重梯度形状: {linear.weight.grad.shape}")
    print(f"   偏置梯度形状: {linear.bias.grad.shape}")
    
    # 梯度检查
    print(f"\n✅ 梯度检查:")
    print(f"   权重梯度范围: [{linear.weight.grad.min():.4f}, {linear.weight.grad.max():.4f}]")
    print(f"   是否有NaN: {torch.isnan(linear.weight.grad).any()}")

if __name__ == "__main__":
    print("🎉 nn.Linear 完全使用指南")
    print("=" * 70)

    # Run every demo, in tutorial order.
    demos = (
        basic_linear_layer,
        batch_processing,
        parameter_initialization,
        no_bias_example,
        mlp_example,
        transformer_linear_usage,
        practical_tips,
        linear_vs_conv,
        debug_linear,
    )
    for demo in demos:
        demo()

    print("\n🎊 nn.Linear 教程完成!")
    print("记住: Linear层就是 y = xW^T + b 的矩阵运算！")
    print("=" * 70)

🎉 nn.Linear 完全使用指南
🎯 nn.Linear 基础用法
📊 线性层信息:
   输入维度: 3
   输出维度: 5
   权重形状: torch.Size([5, 3])
   偏置形状: torch.Size([5])

🔍 参数详情:
   权重矩阵 W:
tensor([[-0.4768,  0.3516,  0.0821],
        [-0.0012, -0.2304, -0.2823],
        [-0.1321, -0.1995,  0.0219],
        [ 0.2142, -0.3229,  0.3560],
        [ 0.0937,  0.1607, -0.2104]])
   偏置向量 b:
tensor([-0.0685, -0.4356,  0.2827,  0.3736, -0.5558])

⚡ 前向传播:
   输入: tensor([[1., 2., 3.]])
   输入形状: torch.Size([1, 3])
   输出: tensor([[ 0.4041, -1.7445, -0.1827,  1.0100, -0.7719]],
       grad_fn=<AddmmBackward0>)
   输出形状: torch.Size([1, 5])

✅ 手动计算验证:
   y = xW^T + b
   手动计算结果: tensor([[ 0.4041, -1.7445, -0.1827,  1.0100, -0.7719]], grad_fn=<AddBackward0>)
   是否相等: True

📦 批处理示例
单个样本:
   输入形状: torch.Size([4])
   输出形状: torch.Size([2])

批处理:
   输入形状: torch.Size([3, 4])
   输出形状: torch.Size([3, 2])

高维批处理:
   输入形状: torch.Size([2, 5, 4])
   输出形状: torch.Size([2, 5, 2])
   📝 注意: Linear只对最后一维进行变换

🎲 参数初始化
默认初始化:
   权重范围: [-0.441, 0.501]
   偏置范围: [-0.530, 0.16