LIBRARIES

In [36]:
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

CLEAN DATA

In [37]:
df = pd.read_csv('/content/reddit-comments-2015-08.csv')

In [38]:
def preprocess_comments(comments):
    comments = comments.lower()
    comments = re.sub(r'\brt\b', '', comments)
    comments = re.sub(r'@\S+|https?://\S+|www\.\S+|#\S+', '', comments)
    comments = re.sub(r'[^a-zA-Z\s]', '', comments)
    return comments

df['Cleaned_comments'] = df['body'].apply(preprocess_comments)

In [39]:
stop_words = set(stopwords.words('english'))
def tokenize_and_remove_stopwords(comments):
    tokens = word_tokenize(comments)
    filtered_tokens = [word for word in tokens if word not in stop_words]
    return filtered_tokens

df['Tokenized_comments'] = df['Cleaned_comments'].apply(tokenize_and_remove_stopwords)

In [40]:
df.head()

Unnamed: 0,body,Cleaned_comments,Tokenized_comments
0,I joined a new league this year and they have ...,i joined a new league this year and they have ...,"[joined, new, league, year, different, scoring..."
1,"In your scenario, a person could just not run ...",in your scenario a person could just not run t...,"[scenario, person, could, run, mandatory, back..."
2,They don't get paid for how much time you spen...,they dont get paid for how much time you spend...,"[dont, get, paid, much, time, spend, building,..."
3,"I dunno, back before the August update in an A...",i dunno back before the august update in an a ...,"[dunno, back, august, update, lobby, tower, co..."
4,"No, but Toriyama sometimes would draw himself ...",no but toriyama sometimes would draw himself a...,"[toriyama, sometimes, would, draw, little, rob..."


IMPLEMENTING RNN

In [41]:
word_to_index_unified = {}
index_unified = 0

for tweet_tokens in df['Tokenized_comments']:
    for token in tweet_tokens:
        if token not in word_to_index_unified:
            word_to_index_unified[token] = index_unified
            index_unified += 1

word_dim_unified = len(word_to_index_unified)
hidden_dim = 10
output_dim = 1

U_unified = np.random.uniform(-np.sqrt(1. / word_dim_unified), np.sqrt(1. / word_dim_unified), (hidden_dim, word_dim_unified))
V_unified = np.random.uniform(-np.sqrt(1. / hidden_dim), np.sqrt(1. / hidden_dim), (output_dim, hidden_dim))
W_unified = np.random.uniform(-np.sqrt(1. / hidden_dim), np.sqrt(1. / hidden_dim), (hidden_dim, hidden_dim))

SIGMOID

In [42]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

FORWARD PROPAGATION

In [43]:
def forward_propagation_unified(sentence):
    h = np.zeros((len(sentence), hidden_dim))
    for t in range(len(sentence)):
        if sentence[t] in word_to_index_unified:
            word_index = word_to_index_unified[sentence[t]]
            h[t] = np.tanh(U_unified[:, word_index] + W_unified.dot(h[t - 1]))

    output = sigmoid(V_unified.dot(h[-1]))
    return output

In [44]:
dummy_target_values = np.array(np.random.randint(2, size=15000))
four_tweet_tokens = df['Tokenized_comments']
predictions = []
losses = []

for tweet_tokens, target in zip(four_tweet_tokens, dummy_target_values):
    if len(tweet_tokens) > 0:
        predictions_tweet = forward_propagation_unified(tweet_tokens)
        loss_tweet = np.mean((predictions_tweet - target) ** 2)
        predictions.append(predictions_tweet)
        losses.append(loss_tweet)
for i, (prediction, loss) in enumerate(zip(predictions, losses)):
    print(f"comment {i + 1}: Prediction - {prediction}, Loss - {loss}")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
comment 9985: Prediction - [0.50096625], Loss - 0.25096718676946606
comment 9986: Prediction - [0.50005032], Loss - 0.2499496857772533
comment 9987: Prediction - [0.4993731], Loss - 0.24937348853947222
comment 9988: Prediction - [0.49984486], Loss - 0.2498448811587442
comment 9989: Prediction - [0.50041893], Loss - 0.24958124641141632
comment 9990: Prediction - [0.5000955], Loss - 0.24990450599752703
comment 9991: Prediction - [0.50002121], Loss - 0.2500212057553679
comment 9992: Prediction - [0.50054592], Loss - 0.24945438171508386
comment 9993: Prediction - [0.50021737], Loss - 0.24978267407859875
comment 9994: Prediction - [0.49985555], Loss - 0.25014446878009566
comment 9995: Prediction - [0.49915746], Loss - 0.25084324501191463
comment 9996: Prediction - [0.49969211], Loss - 0.24969220666864886
comment 9997: Prediction - [0.49964079], Loss - 0.25035933495584545
comment 9998: Prediction - [0.50037324], Loss - 0.250373

In [45]:
max_index = np.argmax(predictions)
max_prediction = predictions[max_index]
max_loss = losses[max_index]

print(f"Comment {max_index + 1} has the highest prediction - {max_prediction}, Loss - {max_loss}")

Comment 5438 has the highest prediction - [0.50162149], Loss - 0.2516241197530289
