# Word2Vec

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Read the training split from the local ./train directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer='./train/train.csv')

### Clean Data

In [3]:
import re

# Punctuation marks that are swapped for a single space before lowercasing.
to_remove = ['.', ',', '?', '!', ':', ';']
replacement = " "

# Alternation of the escaped marks, so that '.' and '?' are matched literally.
pattern = "|".join(re.escape(mark) for mark in to_remove)
df["text"] = (
    df["text"]
    .str.replace(pattern, replacement, regex=True)
    .str.lower()
)

print(df.head())

        id                                               text author
0  id26305  this process  however  afforded me no means of...    EAP
1  id17569  it never once occurred to me that the fumbling...    HPL
2  id11008  in his left hand was a gold snuff box  from wh...    EAP
3  id27763  how lovely is spring as we looked from windsor...    MWS
4  id12958  finding nothing else  not even gold  the super...    HPL


In [4]:
# Drop every sample whose whitespace-split word count exceeds max_words,
# printing the frame shape before and after the filter.
max_words = 100
print(df.shape)
word_counts = df['text'].str.split().apply(len)
df = df[word_counts <= max_words]
print(df.shape)

(19579, 3)
(19491, 3)


### Word2Vec

In [7]:
import gensim
from gensim.models import Word2Vec

# Tokenize the text data: one list of whitespace-separated words per row.
# Each row is split exactly once (the previous loop re-ran .split() for every
# single word of every row, which made tokenization quadratic in row length).
data = [text.split() for text in df['text']]

print(data[:2])

[['this', 'process', 'however', 'afforded', 'me', 'no', 'means', 'of', 'ascertaining', 'the', 'dimensions', 'of', 'my', 'dungeon', 'as', 'i', 'might', 'make', 'its', 'circuit', 'and', 'return', 'to', 'the', 'point', 'whence', 'i', 'set', 'out', 'without', 'being', 'aware', 'of', 'the', 'fact', 'so', 'perfectly', 'uniform', 'seemed', 'the', 'wall'], ['it', 'never', 'once', 'occurred', 'to', 'me', 'that', 'the', 'fumbling', 'might', 'be', 'a', 'mere', 'mistake']]


In [8]:
# Train a Word2Vec model on the tokenized sentences built above.
wvmodel = Word2Vec(
    sentences=data,
    vector_size=100,
    window=5,
    min_count=1,
)

In [9]:
word1 = 'chicken'
word2 = 'butter'

# Similarity score of the two learned word vectors, scaled to a percentage.
# (The former `word_sim = word_sim` self-assignment was a no-op and is removed.)
word_sim = wvmodel.wv.similarity(word1, word2) * 100

print(f"Similarity between {word1} and {word2}: {word_sim : .2f}%")

Similarity between chicken and butter:  87.62%


In [10]:
# Five nearest neighbours of 'penetrate' according to the model trained above.
neighbours = wvmodel.wv.most_similar('penetrate', topn=5)
print(neighbours)

[('art', 0.9899576306343079), ('luxury', 0.9893335700035095), ('display', 0.9892018437385559), ('similar', 0.9891251921653748), ('purposes', 0.9890239834785461)]


In [11]:
import gensim.downloader

# Load pretrained embeddings through gensim's downloader.
# NOTE: despite the variable name, the identifier requested here is
# 'glove-wiki-gigaword-100' (GloVe vectors), not the Word2Vec model trained
# earlier in this notebook (`wvmodel`).
# Presumably the first call downloads the vectors and later calls reuse a local
# cache -- confirm against the gensim downloader documentation.
word2vec_model = gensim.downloader.load('glove-wiki-gigaword-100')

In [26]:
from gensim.utils import simple_preprocess
import torch
from torch.nn.utils.rnn import pad_sequence
# Turn every quote into a (num_tokens, embedding_dim) tensor of pretrained
# vectors, then zero-pad all quotes to a common length.
MAX_VECTORS_PER_SAMPLE = 5000  # quotes with more in-vocabulary tokens than this are dropped
embedding_dim = word2vec_model.vector_size

tokens_list = []         # tokens of every quote, whether it is kept or not
vectorized_samples = []  # one tensor per kept quote
kept_samples = []        # boolean mask aligned with the rows of df; True where the quote is kept
for quote in df['text']:
    tokens = simple_preprocess(quote)
    tokens_list.append(tokens)
    # Tokens missing from the pretrained vocabulary are skipped.
    word_vectors = [word2vec_model[token] for token in tokens if token in word2vec_model]
    if len(word_vectors) > MAX_VECTORS_PER_SAMPLE:
        kept_samples.append(False)
        continue
    kept_samples.append(True)
    if word_vectors:
        # Stack into a single ndarray first: torch.tensor() on a list of
        # ndarrays converts element by element, which is slow and warns.
        sample = torch.from_numpy(np.stack(word_vectors))
    else:
        # No token is in the vocabulary. Keep the row (so kept_samples stays
        # aligned with vectorized_samples) as an empty sequence that still has
        # the trailing embedding dimension pad_sequence needs.
        sample = torch.zeros((0, embedding_dim), dtype=torch.float32)
    vectorized_samples.append(sample)

# Result shape: (num_kept_samples, longest_sequence, embedding_dim); shorter
# sequences are padded with zero vectors at the end.
padded_sequence = pad_sequence(vectorized_samples, batch_first=True)
print(padded_sequence[0])

tensor([[-0.5706,  0.4418,  0.7010,  ..., -0.6610,  0.4720,  0.3725],
        [-0.5547,  0.0997, -0.1659,  ..., -0.2018,  0.6674,  0.8620],
        [-0.2306, -0.0931,  0.2010,  ..., -0.1355,  0.1209, -0.1590],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])


In [14]:
import torch.nn as nn
import torch
import torch.nn.functional as F
import math
from PositionalEncoding import PositionalEncoding

class TransformerModel(nn.Transformer):
    """Transformer-encoder classifier over sequences of pre-computed embeddings.

    The model consumes batch-first input of shape ``(batch, ntoken, ninp)``,
    runs it through one stand-alone encoder layer (``input_emb``), a positional
    encoding and the ``nn.Transformer`` encoder stack, flattens every sample to
    ``ntoken * ninp`` features and maps them to ``noutput`` class scores.

    Args:
        ntoken: Number of positions per sample (the padded sequence length);
            it fixes the input width of the final linear layer.
        ninp: Size of each input embedding vector (the transformer d_model).
        nhead: Number of attention heads.
        nhid: Width of the feed-forward sub-layers.
        nlayers: Number of layers in the encoder stack.
        noutput: Number of output classes.
        dropout: Dropout used by ``pos_encoder`` and ``input_emb``. It is not
            forwarded to ``nn.Transformer.__init__``, so the encoder stack
            keeps that constructor's default dropout.
    """

    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, noutput, dropout=0.5):
        super(TransformerModel, self).__init__(d_model=ninp, nhead=nhead, dim_feedforward=nhid, num_encoder_layers=nlayers)
        # Cached attention mask; rebuilt in forward() when the sequence length changes.
        self.src_mask = None
        self.pos_encoder = PositionalEncoding(ninp, dropout)

        # A single encoder layer plays the role of the input "embedding" stage.
        self.input_emb = nn.TransformerEncoderLayer(ninp, nhead, nhid, dropout)
        self.ninp = ninp
        self.flatten = nn.Flatten()
        # Overwrites the decoder stack created by nn.Transformer.__init__ with a
        # linear classification head over the flattened encoder output.
        self.decoder = nn.Linear(ninp * ntoken, noutput)
        # Not used by forward(); kept so existing attribute access keeps working.
        self.softmax = nn.Softmax(dim=1)

    def _generate_square_subsequent_mask(self, sz):
        """Return an (sz, sz) additive causal mask.

        Entries on and below the diagonal are 0 and entries above it are -inf
        (log(1) and log(0) respectively), so position i can only attend to
        positions <= i.
        """
        return torch.log(torch.tril(torch.ones(sz,sz)))

    def forward(self, src, has_mask=True):
        """Classify a batch of embedded sequences.

        Args:
            src: Tensor of shape ``(batch, ntoken, ninp)``.
            has_mask: When True, apply a causal mask in the encoder stack;
                when False, attend over all positions.

        Returns:
            Tensor of shape ``(batch, noutput)`` holding log-probabilities.
        """
        # The encoder layers are built with the default batch_first=False and
        # therefore expect (seq, batch, feature). Feeding the batch-first input
        # directly made attention (and the mask) run across the samples of a
        # batch instead of across the tokens of each sample.
        # NOTE(review): PositionalEncoding is imported from a project module not
        # shown here; it is assumed to use the same (seq, batch, feature)
        # convention -- TODO confirm.
        src = src.transpose(0, 1)
        seq_len = src.size(0)

        if has_mask:
            stale = (
                self.src_mask is None
                or self.src_mask.size(0) != seq_len
                or self.src_mask.device != src.device
            )
            if stale:
                self.src_mask = self._generate_square_subsequent_mask(seq_len).to(src.device)
        else:
            self.src_mask = None

        src = self.input_emb(src) * math.sqrt(self.ninp)
        src = self.pos_encoder(src)
        output = self.encoder(src, mask=self.src_mask)
        # Back to batch-first so that each sample is flattened on its own.
        output = output.transpose(0, 1)
        output = self.flatten(output)
        output = self.decoder(output)
        return F.log_softmax(output, dim=-1)

In [38]:
%matplotlib inline
from torch.utils.data import DataLoader, TensorDataset, random_split
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import torch

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

X = padded_sequence.to(device)
encoder = LabelEncoder()
# LabelEncoder wants a 1-D label array; the former reshape(-1, 1) only
# triggered a column_or_1d conversion warning.
y = encoder.fit_transform(df['author'][kept_samples].values)
y = torch.tensor(y, dtype=torch.long).to(device)
print(y.shape)
print(X.shape[1])
dataset = TensorDataset(X, y)
# Positional arguments: ntoken (sequence length), ninp, nhead, nhid, nlayers, noutput.
model = TransformerModel(X.shape[1], 100, 5, 6, 1, 3).to(device)
# The `test_datset` spelling is kept because other cells may refer to it.
train_dataset, val_dataset, test_datset = random_split(dataset, [0.7, 0.2, 0.1])

train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
criterion = torch.nn.NLLLoss()  # the model already returns log-probabilities
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
num_epochs = 30
training_loss = []  # mean loss per batch, one entry per epoch
for epoch in tqdm(range(num_epochs)):
    cumulative_loss = 0.0
    # Training mode enables dropout; eval() here silently disabled it.
    model.train()
    for X_batch, y_batch in train_dataloader:
        # Clear gradients from the previous step; otherwise they accumulate
        # across batches and every update uses a stale sum.
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        cumulative_loss += loss.item()
    # Average over the number of batches (not the size of the last batch) and
    # record it so that the plot below has data.
    training_loss.append(cumulative_loss / len(train_dataloader))

# Leave the model in evaluation mode for the cells that follow.
model.eval()

plt.plot(training_loss)
plt.xlabel("Epoch")
plt.ylabel("Mean training loss")

torch.save(model.state_dict(), "transformer_model.pt")

  y = column_or_1d(y, warn=True)


torch.Size([19491])
98


  3%|▎         | 1/30 [01:32<44:32, 92.14s/it]


KeyboardInterrupt: 

In [None]:
# Load the model weights saved by the training cell.
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written during a CUDA run can also be restored on a CPU-only machine.
model.load_state_dict(torch.load("transformer_model.pt", map_location=device))
# Switch to evaluation mode (disables dropout) before using the restored weights.
model.eval()