In [5]:
import torch
import torch.nn as nn
import random
import tqdm
import torch.optim as optim
import pandas as pd
from nltk.tokenize import TweetTokenizer, word_tokenize
import re
from collections import defaultdict
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import random_split
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy

In [6]:
train_data = pd.read_csv('../train.csv')
Y = list((train_data['Type'] == 'Quality').astype(int))
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [7]:
def findUrl(string):
  
    # findall() has been used 
    # with valid conditions for urls in string
    regex = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
    found = re.search(regex, string)
    return found

In [8]:
no_attag = []
for data in train_data['Tweet']:
  words = data.split(' ')
  output = []
  for word in words:
    if word and word[0] == '@':
      continue
    else:
      output.append(word)
  
  no_attag.append(" ".join(output))
  

In [9]:
kwargs = {
            'ngram_range': (1,1),  # Use 1-grams + 2-grams.
            'analyzer': 'word',  # Split text into word tokens.
            'min_df': 1,
            'stop_words': "english",
         }
vectorizer = TfidfVectorizer(**kwargs)
vectorized_data = vectorizer.fit_transform(no_attag)
X = torch.tensor(scipy.sparse.csr_matrix.todense(vectorized_data)).float()

In [12]:
example = 1
[X[example] != 0]
train_data['Tweet'][example]


'Eren sent a glare towards Mikasa then nodded and stood up to go help his lovely girlfriend @SincerePyrrhic. Once he arrived in the kitchen⎯'

In [1]:
class biLSTM(nn.Module):

    def __init__ (self, h_dim = 10, input_dim = 100, lstm_layers = 1, dropout_rate = 0.2):
        super(biLSTM, self).__init__()
        
        self.h_dim = h_dim
        self.pool = torch.nn.AdaptiveAvgPool1d(output_size=1)

        self.lstm = nn.LSTM(input_size = input_dim, 
                            hidden_size = self.h_dim, 
                            num_layers = lstm_layers,
                            batch_first = True,
                            bidirectional = True)
        
        self.drop = nn.Dropout(p = dropout_rate)
        self.linear = nn.Linear(2 * self.h_dim, 2)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, X):
        X, _ = self.lstm(X)
        X = self.linear(X)       
        X = self.sigmoid(X)

        return X



NameError: name 'nn' is not defined

In [None]:
def shuffle_data(X, Y):
    indices = list(range(len(X)))
    random.shuffle(indices)
    shuffled_X = X[indices]
    shuffled_Y = Y[indices]
    return (shuffled_X, shuffled_Y)

def train(X, Y, lstm, print_results = False, n_epochs = 50):
    optimizer = optim.Adam(lstm.parameters(), lr = 0.001)
    # optimizer = optim.Adagrad(lstm.parameters(), lr = 0.001)
    # optimizer = optim.SGD(lstm.parameters(), lr = 0.001)
    loss_f = nn.BCELoss()

    train_size = len(X) // 10 * 8
    test_size = len(X) - train_size
    train_ind, test_ind = random_split(range(len(X)), [train_size, test_size], generator=torch.Generator().manual_seed(42))

    batchSize = 10

    train_X = X.clone()[train_ind].to(device = device)
    train_Y = torch.tensor(Y, device = device, dtype = float)[train_ind]
    test_X = X.clone()[test_ind].to(device = device)
    test_Y = torch.tensor(Y, device = device, dtype = float)[test_ind]


    max_f1 = 0
    max_acc = 0
    max_epoch = 0

    for epoch in range(n_epochs):
        lstm.train()
        totalLoss = 0.0

        train_X_shuffle, train_Y_shuffle = shuffle_data(train_X, train_Y)

        for batch in tqdm.notebook.tqdm(range(0, len(train_X_shuffle), batchSize), leave=False):
          lstm.zero_grad()
          x = train_X_shuffle[batch: batch + batchSize]
          y = train_Y_shuffle[batch: batch + batchSize]
          output = lstm(x)
          loss = loss_f(output.squeeze().to(dtype = float), y)
          totalLoss += loss.item()
          loss.backward()
          optimizer.step()

        
        lstm.eval()
        output = lstm(test_X).squeeze()
        output = (output > 0.5).int()
        y_pred = output.tolist()

        results = classification_report(test_Y.cpu(), y_pred, labels = [1, 0], digits = 4, output_dict=True)
        if print_results:
          print("============================================")
          print(f"epoch {epoch + 1}")
          print(f"loss: {totalLoss:.4f}")
          print(f"accuracy: {results['accuracy']:.4f}")
          print(f"f1-score: {results['macro avg']['f1-score']:.4f}")
        if results['macro avg']['f1-score'] > max_f1:
          max_f1 = results['macro avg']['f1-score']
          max_acc = results['accuracy']
          max_epoch = epoch + 1
    print("============================================")
    print(f"Best result at epoch {max_epoch} f1-score: {max_f1:.4f} accuracy: {max_acc:.4f}")
        
        


In [None]:
lstm = biLSTM(h_dim = 100, lstm_layers = 2, input_dim=X.shape[1]).to(device)
train(X, Y, lstm, print_results=True, n_epochs=10)

torch.save(lstm, 'model.pt')

RuntimeError: CUDA out of memory. Tried to allocate 1.14 GiB (GPU 0; 5.94 GiB total capacity; 1.53 GiB already allocated; 490.88 MiB free; 1.53 GiB reserved in total by PyTorch)