In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt
import csv

torch.manual_seed(1)

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#### Loading Dataset - Preprocessing on the Tweets

In [None]:
def remove_writing_marks(string):
  for char in ['?','!','#','@',',','.','$','%','^','&','*',')','(','/','~','`','&','+','+','_',';',':',"'"]:
    if char == '#':
      replaced = "Hashtag "
    elif  char == '@':
      replaced = "Atsign "
    else:
      replaced = ''
    string = string.replace(char, replaced)
  return string

def change_label(label):
  if '0' in label:
    label = 0
  elif '2' in label:
    label = 1
  elif '4' in label:
    label = 2
  return label

In [None]:
train_data_dir = r'/content/drive/My Drive/Courses/DeepLearning/HW03/Q01/Datas/train.csv'
test_data_dir = r'/content/drive/My Drive/Courses/DeepLearning/HW03/Q01/Datas/test.csv'
data_train = []
data_test = []
words = []

with open(train_data_dir, 'r') as csvfile:
  csvreader = csv.reader(csvfile)
  try:
    for row in csvreader:
      if 'http' in row[5]:
        continue
      else:
        text = remove_writing_marks(row[5])
        label = change_label(row[0])
        data_train.append([label, text])

        for word in text.split():
          if word in words:
            pass
          else:
            words.append(word)
  except:
      pass

with open(test_data_dir, 'r') as csvfile:
  csvreader = csv.reader(csvfile,)
  try:
    for row in csvreader:
      text = remove_writing_marks(row[5])
      label = change_label(row[0])
      data_test.append([label, text])

      for word in text.split():
          if word in words:
            pass
          else:
            words.append(word)
  except:
    pass


word_to_ix = {word: i for i, word in enumerate(words)}
word_to_ix[' '] = len(words)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#### Text Embedding

In [None]:
import pickle 
with open('/content/drive/My Drive/Courses/DeepLearning/HW03/Q01/words_dict.pkl', 'wb') as f:
    pickle.dump(word_to_ix, f)

with open('/content/drive/My Drive/Courses/DeepLearning/HW03/Q01/data_test.pkl', 'wb') as f:
    pickle.dump(data_test, f)

with open('/content/drive/My Drive/Courses/DeepLearning/HW03/Q01/data_train.pkl', 'wb') as f:
    pickle.dump(data_train, f)

In [2]:
import pickle 
with open('/content/drive/My Drive/Courses/DeepLearning/HW03/Q01/words_dict.pkl', 'rb') as f:
    word_to_ix = pickle.load(f)

with open('/content/drive/My Drive/Courses/DeepLearning/HW03/Q01/data_test.pkl', 'rb') as f:
    data_test = pickle.load(f)

with open('/content/drive/My Drive/Courses/DeepLearning/HW03/Q01/data_train.pkl', 'rb') as f:
    data_train = pickle.load(f)

In [3]:
# embedding_dim = 203379
vocab_size = 38

for data in data_train:
  Size = len(data[1])
  if Size < vocab_size:
    for i in range(vocab_size-Size):
      data[1].append(' ')

for data in data_test:
  data[1] = data[1].split()
  Size = len(data[1])
  if Size < vocab_size:
    for i in range(vocab_size-Size):
      data[1].append(' ')

In [7]:
embeddings = nn.Embedding(embedding_dim, vocab_size)

In [16]:
context = data_test[19][1]
context = [word_to_ix[w] for w in context]
context = torch.tensor(context, dtype=torch.long)
embeddings(context)

tensor([[ 2.2178e-02,  9.8350e-01,  1.3378e+00,  ..., -6.1217e-04,
          1.5526e+00, -9.2554e-01],
        [-1.8072e+00,  1.9737e-01,  2.3170e-02,  ...,  9.1976e-01,
         -1.9582e+00, -9.0427e-01],
        [-4.0210e-01,  5.7361e-01,  7.9091e-01,  ..., -4.0950e-01,
         -6.2774e-02,  1.0174e-01],
        ...,
        [ 5.6288e-01,  5.5102e-01, -4.8822e-01,  ..., -6.3118e-01,
          3.0253e-01,  4.2783e-01],
        [ 5.6288e-01,  5.5102e-01, -4.8822e-01,  ..., -6.3118e-01,
          3.0253e-01,  4.2783e-01],
        [ 5.6288e-01,  5.5102e-01, -4.8822e-01,  ..., -6.3118e-01,
          3.0253e-01,  4.2783e-01]], grad_fn=<EmbeddingBackward0>)

In [4]:
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

In [72]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        packed_input = pack_padded_sequence(embeds, self.vocab_size, batch_first=True, enforce_sorted=False)
        packed_output, _ = self.lstm(packed_input)
        output, _ = pad_packed_sequence(packed_output, batch_first=True)
        tag_space = self.hidden2tag(output)
        tag_scores = F.softmax(tag_space, dim=1)
        return tag_scores

In [73]:
embedding_dim = 300
model = LSTMTagger(embedding_dim, 150, len(word_to_ix), 3)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.1)

In [33]:
inputs = prepare_sequence(data_train[10000][1], word_to_ix)
emb = nn.Embedding(len(word_to_ix), embedding_dim)

torch.Size([1, 38])


In [74]:
with torch.no_grad():
    inputs = prepare_sequence(data_train[0][1], word_to_ix)
    tag_scores = model(inputs[None, :].to)
    print(tag_scores.shape)

RuntimeError: ignored