In [1]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [9]:
# Competition data directory, relative to the notebook. Forward slashes resolve
# on Windows as well as POSIX, and avoid the invalid "\.", "\d", "\p" and "\s"
# escape sequences that the previous non-raw backslash literal contained.
DATA_DIR = "../../../datasets/ps4e8"

df_sub = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")
df_train = pd.read_csv(f"{DATA_DIR}/train.csv")
df_test = pd.read_csv(f"{DATA_DIR}/test.csv")

In [10]:
# Preview the first rows of the training frame (includes the 'class' target).
df_train.head()

Unnamed: 0,id,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,0,e,8.8,f,s,u,f,a,c,w,...,,,w,,,f,f,,d,a
1,1,p,4.51,x,h,o,f,a,c,n,...,,y,o,,,t,z,,d,w
2,2,e,6.94,f,s,b,f,x,c,w,...,,s,n,,,f,f,,l,w
3,3,e,3.88,f,y,g,f,s,,g,...,,,w,,,f,f,,d,u
4,4,e,5.85,x,l,w,f,d,,w,...,,,w,,,f,f,,g,a


In [11]:
# Preview the first rows of the test frame.
df_test.head()

Unnamed: 0,id,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,3116945,8.64,x,,n,t,,,w,11.13,...,b,,w,u,w,t,g,,d,a
1,3116946,6.9,o,t,o,f,,c,y,1.27,...,,,n,,,f,f,,d,a
2,3116947,2.0,b,g,n,f,,c,n,6.18,...,,,n,,,f,f,,d,s
3,3116948,3.47,x,t,n,f,s,c,n,4.98,...,,,w,,n,t,z,,d,u
4,3116949,6.17,x,h,y,f,p,,y,6.73,...,,,y,,y,t,,,d,u


In [12]:
# (rows, columns) of the test frame followed by the training frame.
df_test.shape,df_train.shape

((2077964, 21), (3116945, 22))

In [13]:
# List every column name of the training frame.
print(df_train.columns)

Index(['id', 'class', 'cap-diameter', 'cap-shape', 'cap-surface', 'cap-color',
       'does-bruise-or-bleed', 'gill-attachment', 'gill-spacing', 'gill-color',
       'stem-height', 'stem-width', 'stem-root', 'stem-surface', 'stem-color',
       'veil-type', 'veil-color', 'has-ring', 'ring-type', 'spore-print-color',
       'habitat', 'season'],
      dtype='object')


In [16]:
# 'id' is a running row identifier (test ids continue where the training ids
# stop), so it is excluded from the feature matrices together with the 'class'
# target instead of being fed to the model as a feature.
X_train = df_train.drop(columns=['id', 'class'])
y_train = df_train['class']
# drop() returns a new frame, so X_test no longer aliases df_test; the ids
# remain available as df_test['id'].
X_test = df_test.drop(columns=['id'])

In [17]:
# Hold out 20% of the labelled rows for validation; the fixed random_state
# makes the partition repeatable.
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

In [18]:
def softmax(Z):
    """Return the softmax of ``Z`` taken over its last axis.

    The per-row maximum is subtracted before exponentiating so that large
    scores do not overflow; this does not change the result.
    """
    exps = np.exp(Z - Z.max(axis=-1, keepdims=True))
    return exps / exps.sum(axis=-1, keepdims=True)


def self_attention(X, mask, W_KQV, W_out):
    """Single-head scaled dot-product self-attention in NumPy.

    Args:
        X: Input of shape ``(T, d_in)``; leading batch axes are also accepted.
        mask: Additive mask broadcastable to ``(T, T)`` (``0`` keeps a score,
            ``-inf`` removes it). A row that is entirely ``-inf`` yields NaN.
        W_KQV: Packed projection of shape ``(d_in, 3 * d_k)``. The three
            equal-width column groups are used as query, key, value in that
            order, which is the order the transposed
            ``nn.MultiheadAttention.in_proj_weight`` provides.
        W_out: Output projection of shape ``(d_k, d_out)``.

    Returns:
        ``(output, attn)`` where ``output`` has shape ``(..., T, d_out)`` and
        ``attn`` has shape ``(..., T, T)`` with rows summing to 1.
    """
    # Split on the last axis so batched inputs work as well as a single (T, d).
    Q, K, V = np.split(X @ W_KQV, 3, axis=-1)
    # Scale by the query/key width, not the input width: the two only coincide
    # when the projection keeps the dimensionality unchanged.
    scores = Q @ np.swapaxes(K, -1, -2) / np.sqrt(Q.shape[-1]) + mask
    attn = softmax(scores)
    return attn @ V @ W_out, attn

In [19]:
# Reference run: a single-head, bias-free PyTorch attention layer applied to one
# random sequence under a causal mask.
T, d = 100, 64
attn = nn.MultiheadAttention(embed_dim=d, num_heads=1, bias=False, batch_first=True)

# Additive causal mask: -inf strictly above the diagonal, 0 on and below it.
M = torch.full((T, T), float("-inf")).triu(diagonal=1)

X = torch.randn(1, T, d)
Y_, A_ = attn(X, X, X, attn_mask=M)

In [20]:
# Shape of the packed input-projection weight of the attention layer.
attn.in_proj_weight.shape

torch.Size([192, 64])

In [21]:
# Show the output-projection submodule of the attention layer.
attn.out_proj

NonDynamicallyQuantizableLinear(in_features=64, out_features=64, bias=False)

In [22]:
# Shape of the output-projection weight.
attn.out_proj.weight.shape

torch.Size([64, 64])

In [23]:
# The layer weights are transposed before being handed to the NumPy routine,
# which right-multiplies its input by them (X @ W).
in_proj_T = attn.in_proj_weight.detach().numpy().T
out_proj_T = attn.out_proj.weight.detach().numpy().T

# Run the NumPy implementation on the same sequence and mask as the PyTorch layer.
Y, A = self_attention(X[0].numpy(), M.numpy(), in_proj_T, out_proj_T)

In [24]:
# Print the NumPy attention output and the attention-weight matrix.
print("Calculated attention values:", Y)
print("Attention Matrix:", A)

Calculated attention values: [[-1.0312602e-01 -1.0556796e-01  5.1911581e-02 ... -6.5739357e-01
  -1.3639674e-01 -3.1646889e-01]
 [-6.5050326e-02 -1.4822330e-01 -1.3400508e-01 ... -3.7282974e-01
   7.5513534e-02  6.9892681e-03]
 [-2.5370991e-02 -9.4005503e-02 -3.8268209e-01 ... -1.6891960e-02
   2.2221136e-01  4.0362960e-01]
 ...
 [ 1.1137899e-02 -4.2848799e-02  3.4418553e-02 ... -8.4043398e-02
  -1.6340315e-02  1.3400581e-02]
 [-3.5166431e-02 -5.5076215e-02  2.4853507e-02 ... -8.0920979e-02
  -1.4554206e-02 -3.3958629e-04]
 [ 2.8600185e-03 -3.7217088e-02  7.9458803e-02 ... -7.1697466e-02
  -3.0567693e-02  5.9191268e-02]]
Attention Matrix: [[1.         0.         0.         ... 0.         0.         0.        ]
 [0.6512558  0.34874427 0.         ... 0.         0.         0.        ]
 [0.20920448 0.60293174 0.18786381 ... 0.         0.         0.        ]
 ...
 [0.00637854 0.00391593 0.00964773 ... 0.00966536 0.         0.        ]
 [0.01378596 0.00392865 0.01682338 ... 0.00528185 0.0088

In [25]:
# Norm of the difference between the NumPy output and the PyTorch output for
# the same sequence; a value close to zero means the two implementations agree.
np.linalg.norm(Y - Y_[0].detach().numpy())

1.2068743e-06

In [26]:
import torch
import torch.nn as nn

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class AttentionModel(nn.Module):
    """Self-attention layer followed by a linear head on the first position.

    Args:
        input_dim: Size of the last dimension of the input tensor.
        attn_dim: Embedding size used by the attention layer. It must be
            divisible by ``num_heads`` (checked by ``nn.MultiheadAttention``).
        num_heads: Number of attention heads.
        output_dim: Number of values the head produces per example.
    """

    def __init__(self, input_dim, attn_dim, num_heads, output_dim):
        super().__init__()
        self.attn = nn.MultiheadAttention(attn_dim, num_heads, batch_first=True)
        self.fc = nn.Linear(attn_dim, output_dim)
        # input_dim used to be ignored, so forward() failed whenever the feature
        # width differed from attn_dim. Project to attn_dim in that case; when
        # the widths already match, an identity keeps the previous behaviour.
        # Created last so attn/fc are initialised exactly as before for a seed.
        if input_dim == attn_dim:
            self.input_proj = nn.Identity()
        else:
            self.input_proj = nn.Linear(input_dim, attn_dim)

    def forward(self, x):
        """Apply self-attention to ``x`` and classify from the first position.

        Args:
            x: Tensor of shape ``(batch, seq_len, input_dim)``.

        Returns:
            ``(output, attn_weights)``: ``output`` has shape
            ``(batch, output_dim)``; ``attn_weights`` is whatever the attention
            layer returns as its second value.
        """
        h = self.input_proj(x)
        attn_output, attn_weights = self.attn(h, h, h)
        # Only the first sequence position feeds the linear head.
        output = self.fc(attn_output[:, 0, :])
        return output, attn_weights

# Model hyperparameters.
input_dim = X_train.shape[1]  # number of columns in X_train
attn_dim = 64
num_heads = 8
output_dim = 2  # presumably one output per class value ('e' / 'p') -- confirm

# Build the model, then move it to the selected device.
model = AttentionModel(
    input_dim=input_dim,
    attn_dim=attn_dim,
    num_heads=num_heads,
    output_dim=output_dim,
)
model = model.to(device)
