In [10]:
import torch
import torch.nn as nn
import random
import tqdm
import torch.optim as optim
import pandas as pd
from nltk.tokenize import TweetTokenizer, word_tokenize
import re
from collections import defaultdict
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from torch.utils.data import random_split
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy

In [11]:
train_data = pd.read_csv('train.csv')
Y = list((train_data['Type'] == 'Quality').astype(int))
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [12]:
def findUrl(string):
  
    # findall() has been used 
    # with valid conditions for urls in string
    regex = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
    found = re.search(regex, string)
    return found

In [13]:
no_attag = []
for data in train_data['Tweet']:
  words = data.split(' ')
  output = []
  for word in words:
    if word and word[0] == '@':
      continue
    else:
      output.append(word)
  
  no_attag.append(" ".join(output))
  

In [14]:
kwargs = {
            'ngram_range': (1,1),  # Use 1-grams + 2-grams.
            'analyzer': 'word',  # Split text into word tokens.
            'min_df': 1,
            'stop_words': "english",
         }
vectorizer = TfidfVectorizer(**kwargs)
vectorized_data = vectorizer.fit_transform(no_attag)
X = torch.tensor(scipy.sparse.csr_matrix.todense(vectorized_data)).float()

In [15]:
example = 1
print(vectorizer.get_feature_names_out()[X[example] != 0])
train_data['Tweet'][example]

['arrived' 'eren' 'girlfriend' 'glare' 'help' 'kitchen' 'lovely' 'mikasa'
 'nodded' 'sent' 'stood']


'Eren sent a glare towards Mikasa then nodded and stood up to go help his lovely girlfriend @SincerePyrrhic. Once he arrived in the kitchen⎯'

In [34]:
class CNN(nn.Module):
    def __init__(self, NUM_CLASSES=2, input_dim=32067, DIM_EMB=200):
        super(CNN, self).__init__()

        self.Embedding = nn.Embedding(input_dim , DIM_EMB)
        self.conv1d_list = nn.ModuleList([
            nn.Conv1d(in_channels=DIM_EMB,
                      out_channels=2,
                      kernel_size=ks)
            for ks in range(2, 5)
        ])
        self.ReLU = nn.ReLU()
        self.MaxPool = nn.MaxPool1d
        self.Dropout = nn.Dropout()
        self.Linear = nn.Linear(6, NUM_CLASSES)
        self.LogSoftmax = nn.LogSoftmax(dim=1)
        
    def forward(self, X):
        # X = X.to(dtype=int)
        # print(X.shape)
        X = X.to(dtype=int)
        E = self.Embedding(X).permute(0, 2, 1)
        print("E", E.shape)
        R = [self.ReLU(conv1d(X)) for conv1d in self.conv1d_list]
        # print("R")
        M = [self.MaxPool(kernel_size=r.shape[2])(r) for r in R]
        # print("M")
        C = torch.cat([m.squeeze(dim=2) for m in M], dim = 1)
        # print("C")
        L = self.Linear(C)
        # print("L")
        X = self.LogSoftmax(L)
        # print("X")
        return X

In [23]:
def shuffle_data(X, Y):
    indices = list(range(len(X)))
    random.shuffle(indices)
    shuffled_X = X[indices]
    shuffled_Y = Y[indices]
    return (shuffled_X, shuffled_Y)

def train(X, Y, cnn, print_results = False, n_epochs = 50):
    optimizer = optim.Adam(cnn.parameters(), lr = 0.001)
    # optimizer = optim.Adagrad(cnn.parameters(), lr = 0.001)
    # optimizer = optim.SGD(cnn.parameters(), lr = 0.001)
    loss_f = nn.NLLLoss()

    train_size = len(X) // 10 * 8
    test_size = len(X) - train_size
    train_ind, test_ind = random_split(range(len(X)), [train_size, test_size], generator=torch.Generator().manual_seed(42))

    batchSize = 10

    train_X = X.clone()[train_ind].to(device = device)
    train_Y = torch.tensor(Y, device = device, dtype = float)[train_ind]
    test_X = X.clone()[test_ind].to(device = device)
    test_Y = torch.tensor(Y, device = device, dtype = float)[test_ind]


    max_f1 = 0
    max_acc = 0
    max_epoch = 0

    for epoch in range(n_epochs):
        cnn.train()
        totalLoss = 0.0

        train_X_shuffle, train_Y_shuffle = shuffle_data(train_X, train_Y)

        for batch in tqdm.notebook.tqdm(range(0, len(train_X_shuffle), batchSize), leave=False):
          cnn.zero_grad()
          x = train_X_shuffle[batch: batch + batchSize]
          y = train_Y_shuffle[batch: batch + batchSize]
          output = cnn(x)
          loss = loss_f(output.squeeze().to(dtype = float), y)
          totalLoss += loss.item()
          loss.backward()
          optimizer.step()
        
        cnn.eval()
        y_pred = cnn(input).squeeze().argmax(dim=1).tolist()
        y_true = test_Y
        results = classification_report(y_true, y_pred, labels=[1, 0], digits = 4)

        if print_results:
          print("============================================")
          print(f"epoch {epoch + 1}")
          print(f"loss: {totalLoss:.4f}")
          print(f"accuracy: {results['accuracy']:.4f}")
          print(f"f1-score: {results['macro avg']['f1-score']:.4f}")
        if results['macro avg']['f1-score'] > max_f1:
          max_f1 = results['macro avg']['f1-score']
          max_acc = results['accuracy']
          max_epoch = epoch + 1
    print("============================================")
    print(f"Best result at epoch {max_epoch} f1-score: {max_f1:.4f} accuracy: {max_acc:.4f}")


In [35]:
cnn = CNN().to(device)
train(X, Y, cnn, print_results=True, n_epochs=10)

RuntimeError: ignored