This is a fun project I recently did. I used a DistilBERT model to do sentiment analysis on a Reddit post about a soccer match between my favorite club, Chelsea FC, and Crystal Palace. Because the club has not played well recently, I wanted to see the polarity score of the comments.

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')

In [None]:
# SST-2 training split: tab-separated with no header row, so pandas names the
# columns 0 (sentence text) and 1 (0/1 sentiment label).
SST2_TRAIN_URL = 'https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv'
df = pd.read_csv(SST2_TRAIN_URL, sep='\t', header=None)
df.head()

Unnamed: 0,0,1
0,"a stirring , funny and finally transporting re...",1
1,apparently reassembled from the cutting room f...,0
2,they presume their audience wo n't sit still f...,0
3,this is a visually stunning rumination on love...,1
4,jonathan parker 's bartleby should have been t...,1


In [None]:
# Work on the first 2000 rows only, to keep the embedding step manageable.
batch_1 = df.iloc[:2000]

In [None]:
# Class balance of the subsample: column 1 holds the 0/1 labels.
batch_1[1].value_counts()

1    1041
0     959
Name: 1, dtype: int64

In [None]:
# Choose the pre-trained architecture. DistilBERT is used by default.
model_class = ppb.DistilBertModel
tokenizer_class = ppb.DistilBertTokenizer
pretrained_weights = 'distilbert-base-uncased'

# To use full BERT instead, replace the three assignments above with:
#   model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

# Instantiate the tokenizer and the model from the named checkpoint.
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

In [None]:
# Prepare dataset
# Tokenization

In [None]:
# Encode every sentence (column 0) into a list of token ids, including the
# model's special tokens.
tokenized = batch_1[0].apply(lambda sentence: tokenizer.encode(sentence, add_special_tokens=True))

In [None]:
# Raw sentences (column 0), for comparison with their token ids.
batch_1[0]

0       a stirring , funny and finally transporting re...
1       apparently reassembled from the cutting room f...
2       they presume their audience wo n't sit still f...
3       this is a visually stunning rumination on love...
4       jonathan parker 's bartleby should have been t...
                              ...                        
1995    too bland and fustily tasteful to be truly pru...
1996                           it does n't work as either
1997    this one aims for the toilet and scores a dire...
1998    in the name of an allegedly inspiring and easi...
1999    the movie is undone by a filmmaking methodolog...
Name: 0, Length: 2000, dtype: object

In [None]:
# One list of token ids per sentence; the lists have different lengths.
tokenized

0       [101, 1037, 18385, 1010, 6057, 1998, 2633, 182...
1       [101, 4593, 2128, 27241, 23931, 2013, 1996, 62...
2       [101, 2027, 3653, 23545, 2037, 4378, 24185, 10...
3       [101, 2023, 2003, 1037, 17453, 14726, 19379, 1...
4       [101, 5655, 6262, 1005, 1055, 12075, 2571, 376...
                              ...                        
1995    [101, 2205, 20857, 1998, 11865, 16643, 2135, 5...
1996    [101, 2009, 2515, 1050, 1005, 1056, 2147, 2004...
1997    [101, 2023, 2028, 8704, 2005, 1996, 11848, 199...
1998    [101, 1999, 1996, 2171, 1997, 2019, 9382, 1898...
1999    [101, 1996, 3185, 2003, 25757, 2011, 1037, 244...
Name: 0, Length: 2000, dtype: object

In [None]:
# Token counts of the first two sentences, showing that lengths vary per row.
for row in (0, 1):
    print(len(tokenized[row]))

20
16


In [None]:
# Padding: right-pad every token-id list with 0 up to the length of the longest
# one, so the whole batch fits in a single rectangular array.
max_len = max((len(ids) for ids in tokenized.values), default=0)

padded = np.array([ids + [0] * (max_len - len(ids)) for ids in tokenized.values])

In [None]:
# (number of sentences, padded sequence length)
padded.shape

(2000, 59)

In [None]:
# Masking: look at the padded id matrix first. The trailing zeros are the
# padding positions the attention mask has to mark.
padded

array([[  101,  1037, 18385, ...,     0,     0,     0],
       [  101,  4593,  2128, ...,     0,     0,     0],
       [  101,  2027,  3653, ...,     0,     0,     0],
       ...,
       [  101,  2023,  2028, ...,     0,     0,     0],
       [  101,  1999,  1996, ...,     0,     0,     0],
       [  101,  1996,  3185, ...,     0,     0,     0]])

In [None]:
# Attention mask: 1 wherever `padded` holds a non-zero id, 0 at the zero padding.
attention_mask = (padded != 0).astype(int)
attention_mask.shape

(2000, 59)

In [None]:
# Inspect the mask: ones for token positions, zeros for padding.
attention_mask

tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])

In [None]:
# Convert the padded ids and the mask to tensors. The mask tensor gets its own
# name so the NumPy `attention_mask` built earlier is not rebound to a
# different type, which made earlier cells show different output depending on
# execution order.
input_ids = torch.tensor(padded)
attention_mask_tensor = torch.tensor(attention_mask)

# Inference only: no_grad() skips gradient tracking for the forward pass.
with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask_tensor)

In [None]:
# Element 0 of the model output is the per-token hidden-state tensor; keep only
# sequence position 0 of every sentence as its feature vector.
token_states = last_hidden_states[0]
features = token_states[:, 0, :].numpy()

In [None]:
# Show both dimensions at once: (number of sentences, feature size). With two
# bare `len(...)` lines only the last value was displayed.
features.shape

768

In [None]:
# Targets for the classifier: the 0/1 sentiment labels from column 1.
labels = batch_1[1]
labels

0       1
1       0
2       0
3       1
4       1
       ..
1995    0
1996    0
1997    0
1998    0
1999    0
Name: 1, Length: 2000, dtype: int64

In [None]:
# Train model
# Split features/labels into train and test parts. The seed is fixed so the
# split, and every number derived from it, is reproducible across runs.
train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, random_state=42
)

In [None]:
# Fit a logistic-regression classifier on the extracted sentence features.
lr_clf = LogisticRegression()
lr_clf.fit(train_features, train_labels)

# Accuracy on the held-out split, which was previously never evaluated. This
# gives a baseline before the model is applied to Reddit comments.
lr_clf.score(test_features, test_labels)

In [None]:
pip install praw

Collecting praw
  Using cached praw-7.7.1-py3-none-any.whl (191 kB)
Collecting prawcore<3,>=2.1 (from praw)
  Using cached prawcore-2.4.0-py3-none-any.whl (17 kB)
Collecting update-checker>=0.18 (from praw)
  Using cached update_checker-0.18.0-py3-none-any.whl (7.0 kB)
Installing collected packages: update-checker, prawcore, praw
Successfully installed praw-7.7.1 prawcore-2.4.0 update-checker-0.18.0


In [None]:
import os

import praw

# SECURITY: credentials must never be hardcoded in a notebook. Read them from
# environment variables instead (a missing variable raises KeyError here).
# The client secret and account password that were previously committed in this
# cell must be treated as leaked and revoked/changed.
reddit = praw.Reddit(
    client_id=os.environ["REDDIT_CLIENT_ID"],
    client_secret=os.environ["REDDIT_CLIENT_SECRET"],
    password=os.environ["REDDIT_PASSWORD"],
    user_agent=os.environ.get("REDDIT_USER_AGENT", "danhdanny"),
    username=os.environ["REDDIT_USERNAME"],
)

In [None]:
# Match thread to analyse: r/chelseafc, Chelsea vs Crystal Palace.
url = "https://www.reddit.com/r/chelseafc/comments/18s8q23/match_thread_chelsea_vs_crystal_palace_english/?newUser=true&showOnboarding=true"
submission = reddit.submission(url=url)

In [None]:
# The comment forest can contain MoreComments placeholders, which have no
# `.body` and made the original loop raise AttributeError.
# replace_more(limit=0) removes those placeholders without fetching anything
# extra, so only real top-level comments are iterated.
submission.comments.replace_more(limit=0)
post_comments = [str(top_level_comment.body) for top_level_comment in submission.comments]

AttributeError: 'MoreComments' object has no attribute 'body'

In [None]:
# Preview only the first few comments instead of dumping the whole list.
post_comments[:10]


['We have a Discord server for our live chat [here!](https://discord.gg/chelseafc)\n\nAlso, a friendly reminder to report any comments that break our community guidelines and contribute to a toxic atmosphere - everyone deserves to be treated with respect.\n\n*I am a bot, and this action was performed automatically. Please [contact the moderators of this subreddit](/message/compose/?to=/r/chelseafc) if you have any questions or concerns.*',
 'Why the fuck are Palace booing Connor after he played a blinder for them on loan?',
 'Mudryk looking fkn good tonight holy shit.',
 'The debate on Gallagher is over, he is utter quality. On and off the ball.',
 'On a serious note gusto is only 20....mad',
 'Well played Mudryk',
 "Gusto is a baller. I don't want that to go under the radar",
 'Misha is really starting to get in form!! 4 PL goals, 4 pens drawn and 1 assist.',
 'Special shoutout to the fan that tossed the ball away on that last corner kick lol',
 'Holy fuck Nkunku Mudryk connection so 

In [None]:
# Tokenize all Reddit comments in one call: pad to the longest comment in the
# batch, truncate over-long ones, and return NumPy arrays.
inputs = tokenizer(
    post_comments,
    padding=True,
    truncation=True,
    return_tensors="np",
    add_special_tokens=True,
)

In [None]:
# The encoding holds 'input_ids' and 'attention_mask' for the comment batch.
inputs


{'input_ids': <tf.Tensor: shape=(490, 117), dtype=int32, numpy=
array([[  101,  2057,  2031, ...,     0,     0,     0],
       [  101,  2339,  1996, ...,     0,     0,     0],
       [  101,  8494,  2854, ...,     0,     0,     0],
       ...,
       [  101,  2204,  2147, ...,     0,     0,     0],
       [  101,  2339,  2052, ...,     0,     0,     0],
       [  101,  3835, 11147, ...,     0,     0,     0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(490, 117), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]], dtype=int32)>}

In [None]:
# Padded token-id matrix, one row per comment.
inputs['input_ids']

array([[  101,  2057,  2031, ...,     0,     0,     0],
       [  101,  2339,  1996, ...,     0,     0,     0],
       [  101,  8494,  2854, ...,     0,     0,     0],
       ...,
       [  101,  2204,  2147, ...,     0,     0,     0],
       [  101,  2339,  2052, ...,     0,     0,     0],
       [  101,  3835, 11147, ...,     0,     0,     0]])

In [None]:
# Confirm the container type before converting to torch tensors.
type(inputs['attention_mask'])

numpy.ndarray

In [None]:
# Turn the NumPy encodings into tensors and run the comments through the model.
# no_grad() is used because this is inference only.
input_ids, attention_mask = (
    torch.tensor(inputs[key]) for key in ('input_ids', 'attention_mask')
)

with torch.no_grad():
    last_hidden_states = model(input_ids=input_ids, attention_mask=attention_mask)

In [None]:
# Same feature extraction as for the training sentences: the hidden state at
# sequence position 0 of every comment, as a NumPy array.
first_position_states = last_hidden_states[0][:, 0, :]
features = first_position_states.numpy()

In [None]:
# Classify every Reddit comment with the logistic-regression model fitted above.
# NOTE(review): the classifier was trained on the SST-2 sentences loaded at the
# top, not on Reddit text; treat the predictions as a rough signal.
y_pred = lr_clf.predict(features)


In [None]:
# Predicted 0/1 label for each comment.
y_pred

array([0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0,
       0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0,
       0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1,
       0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0,
       1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1,

In [None]:
# Number of comments that were scored.
len(y_pred)

490

In [None]:
# The mean of the 0/1 predictions is the share of comments predicted as label 1.
y_pred.mean()

0.45714285714285713