In [354]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [355]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [356]:
import json
import gensim.downloader as api
import numpy as np
import torch
import string
import transformers
from transformers import DistilBertModel, DistilBertTokenizer, logging

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

import matplotlib.pyplot as plt

nltk.download('stopwords')

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [357]:
with open("drive/MyDrive/ITcup/ranking_train.jsonl", "r") as f:
    data = [json.loads(line) for line in f]

In [358]:
data = data[:1000]

In [359]:
stop_words = stopwords.words('english')
for i in range(len(stop_words)):
  stop_words[i] = "".join([char for char in stop_words[i] if char not in "'"])

porter = PorterStemmer()

In [360]:
nltk.download('punkt')
logging.set_verbosity_error()

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', use_fast=True)
model     = DistilBertModel.from_pretrained('distilbert-base-uncased')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [361]:
scores   = []
for i in range(len(data)):
  for j in range(len(data[i]['comments'])):
    scores.append(data[i]['comments'][j]['score'])

exclamations = []
questions = []
links = []
lengths = []
for i in range(len(data)):
  avg_com_len=0
  for j in range(len(data[i]['comments'])):
    avg_com_len += len(data[i]['comments'][j]['text'])
  avg_com_len/=5
  tmp_link = 0
  tmp_length = 0
  for j in range(len(data[i]['comments'])):
    tmp_length = len(data[i]['comments'][j]['text'])/avg_com_len
    tmp_exclamation = data[i]['comments'][j]['text'].count("!")#/tmp_length
    tmp_question = data[i]['comments'][j]['text'].count("?")#/tmp_length
    if ("http" or "www") in data[i]['comments'][j]['text']:
      tmp_link += 1

  exclamations.append(tmp_exclamation)
  questions.append(tmp_question)
  links.append(tmp_link)
  lengths.append(tmp_length)

In [362]:
import pandas as pd
df = pd.DataFrame(list(zip(scores[:1000], exclamations, questions, links, lengths)), columns =['Score', '!', '?', 'Has Links', 'Length'])

In [363]:
corr = df.corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,Score,!,?,Has Links,Length
Score,1.0,-0.042213,0.035056,-0.031487,-0.009955
!,-0.042213,1.0,0.003894,-0.025979,0.07031
?,0.035056,0.003894,1.0,0.005149,0.149402
Has Links,-0.031487,-0.025979,0.005149,1.0,-0.002232
Length,-0.009955,0.07031,0.149402,-0.002232,1.0


In [364]:
for datum in data:
  datum['text'] = datum['text'].lower()
  datum['text'] = "".join([char for char in datum['text'] if char not in string.punctuation])
  datum['text'] = word_tokenize(datum['text'])
  datum['text'] = [word for word in datum['text'] if word not in stop_words]
  #datum['text'] = ' '.join(datum['text'])
  for comment in datum['comments']:
    comment['text'] = comment['text'].lower()
    comment['text'] = "".join([char for char in comment['text'] if char not in string.punctuation])
    comment['text'] = word_tokenize(comment['text'])
    comment['text'] = [word for word in comment['text'] if word not in stop_words]
    #comment['text'] = ' '.join(comment['text'])

In [365]:
texts = []
comments = []

for i in range(len(data)):
  texts.append(data[i]['text'])
  for j in range(len(data[i]['comments'])):
    comments.append(data[i]['comments'][j]['text'])

In [366]:
from gensim.models import Word2Vec
#import gensim.downloader as api
#model = api.load("glove-wiki-gigaword-100")
model = Word2Vec(
    data,
    size=32,
    min_count=5,
    window=5,
    batch_words = 24
).wv

In [367]:
for text in texts:
  texts_tokenized = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512)
  for comment in comments:
    comments_tokenized = tokenizer(comments, return_tensors="pt", padding=True, truncation=True, max_length=512)

ValueError: ignored

In [None]:
plt.pcolormesh(comments_tokenized["attention_mask"])
plt.axis("off")
plt.colorbar()
plt.show()

In [None]:
import numpy as np

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)

batch_size=32
features=[]
with torch.no_grad():
  for i in range(0, len(comments), batch_size):
    comments_batch = comments_tokenized["input_ids"][i : i + batch_size].to(device)
    masks_batch = comments_tokenized["attention_mask"][i : i + batch_size].to(device)
    output = model(comments_batch, masks_batch)
    batch_features = output.last_hidden_state[:, 0, :].cpu().numpy()
    features.append(batch_features)

features = np.concatenate(features, axis=0)

In [None]:
from sklearn.model_selection import train_test_split
train_features, test_features, train_labels, test_labels = train_test_split(features, scores)

In [None]:
import os
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
class NeuralNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(512, 256),
            nn.LeakyReLU(0.01),
            nn.Dropout(0.1),
            nn.Linear(256, 128),
            nn.LeakyReLU(0.01),
            nn.Dropout(0.1),
            nn.Linear(128, 64),
            nn.LeakyReLU(0.01),
            nn.Dropout(0.1),
            nn.Linear(64,32),
            nn.LeakyReLU(0.01),
            nn.Dropout(0.1),
            nn.Linear(32,16),
            nn.LeakyReLU(0.01),
            nn.Dropout(0.1),
            nn.Linear(16,5)
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits

In [None]:
print(NeuralNetwork().forward(posts[0].tokenized))