In [2]:
import numpy as np
import pandas as pd
import spacy
from tqdm import tqdm
from transformers import pipeline
from transformers import AutoTokenizer
from datetime import datetime as dt
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import torch

In [3]:
# Ask spaCy to use the GPU when one is available; the returned flag is echoed as the cell output.
spacy.prefer_gpu()

True

In [4]:
# Load the spaCy pipeline stored in ../model (the model trained with the NEL component).
nlp = spacy.load('../model')

In [5]:
# Release cached, currently unused GPU memory held by PyTorch before the heavy cells run.
torch.cuda.empty_cache()

In [6]:
# Print the memory PyTorch has reserved on device 0, then display that device's total memory.
print(torch.cuda.memory_reserved(0))
torch.cuda.get_device_properties(0).total_memory

555745280


8589606912

In [7]:
# NOTE(review): unpickling can execute arbitrary code; only load this file from a trusted source.
df = pd.read_pickle('../data/comments_clean.pk1')
# The full file isn't available on GitHub; a smaller sample file is provided instead.

In [8]:
# Renumber the rows, drop rows without a comment, then keep the first occurrence of each comment text.
df = (
    df.reset_index(drop=True)
      .dropna(axis=0, subset=['comment'])
      .drop_duplicates(subset=['comment'], keep='first')
)

In [9]:
# Column, null-count and dtype summary after the initial cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1490073 entries, 0 to 1882366
Data columns (total 5 columns):
 #   Column          Non-Null Count    Dtype  
---  ------          --------------    -----  
 0   comment         1490073 non-null  object 
 1   user            1427993 non-null  object 
 2   date_time       1490069 non-null  float64
 3   sub_title       1490069 non-null  object 
 4   clean_comments  1490073 non-null  object 
dtypes: float64(1), object(4)
memory usage: 68.2+ MB


In [10]:
# Copy of the frame restricted to rows that have a submission title.
df2 = df[df.sub_title.notna()]

In [11]:
# Distinct submission titles, in order of first appearance.
unique_titles = list(df2.sub_title.unique())

In [12]:
# Discard the placeholder text that stands in for deleted comments.
df = df[df.comment.ne('[deleted]')]

In [13]:
# Summary after removing the '[deleted]' placeholder row(s).
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1490072 entries, 0 to 1882366
Data columns (total 5 columns):
 #   Column          Non-Null Count    Dtype  
---  ------          --------------    -----  
 0   comment         1490072 non-null  object 
 1   user            1427993 non-null  object 
 2   date_time       1490068 non-null  float64
 3   sub_title       1490068 non-null  object 
 4   clean_comments  1490072 non-null  object 
dtypes: float64(1), object(4)
memory usage: 68.2+ MB


In [14]:
# Comment length in characters.
# NOTE(review): the following cell writes a word count to this same column, replacing these values.
df['comment_len'] = df.comment.map(len)


In [15]:
# Comment length in whitespace-separated words.
df['comment_len'] = df.comment.map(lambda text: len(text.split()))


In [16]:
# Remove everything posted by the 'sneakpeek_bot' account (rows with a missing user are kept).
df = df[df.user.ne('sneakpeek_bot')]

In [17]:
# Display only: longest comments first. The sorted frame is not stored.
df.sort_values('comment_len', ascending=False)

Unnamed: 0,comment,user,date_time,sub_title,clean_comments,comment_len
1014060,Yeah. It was about 4 years ago. I stepped on 7...,Public_Halir,1.628423e+09,For those who accidentally stepped on 7th Mont...,Yeah. It was about 4 years ago. I stepped on 7...,1869
1085916,I’d like to talk about bullying too. Back in 2...,Tsuikyit_The_VIP,1.626886e+09,Personal story regarding mental health and bul...,I’d like to talk about bullying too. Back in 2...,1866
1394723,> Goh Chok Tong story: Lessons for 4G leaders...,Varantain,1.620521e+09,Goh Chok Tong story: Lessons for 4G leaders,> Goh Chok Tong story: Lessons for 4G leaders...,1842
709896,Summary of points by Sylvia in the linked vide...,Human-Feed,1.635086e+09,Trailer: Xiaxue's Exclusive interview with Sylvia,Summary of points by Sylvia in the linked vide...,1827
834395,They scrubbed their but I managed to recover ...,Eurito1,1.632624e+09,Statement from BooksActually staff (without Ke...,They scrubbed their but I managed to recover ...,1800
...,...,...,...,...,...,...
1468221,,WhimsyQuodlibet,1.618670e+09,/r/singapore random discussion and small quest...,,0
1658411,,dontdownvotemebruh,1.610699e+09,"British man, Singaporean fiancee charged after...",,0
39933,,random_avocado,1.646030e+09,/r/singapore random discussion and small quest...,,0
160154,,dawnfire999,1.642508e+09,How is the criminal justice system (accused ri...,,0


In [18]:
# Run the entity-linking pipeline over every unique title and collect:
#   title_list - the title text, one entry per title
#   title_ents - per title, the KB ids of its linked (non-'NIL') entities, parallel to title_list
#   title_dict - str(position) -> {'comment': title text, 'entities': {kb_id: [sections]}}
# A "section" is the list of tokens in the dependency subtree reached by walking up from the
# entity's first token until a subject/object-like token has been passed or the ROOT is reached.
#
# Changes from the previous version of this cell:
#   * sections of repeated mentions of the same KB id accumulate instead of the list being reset
#     (and the earlier sections lost) each time the id is seen again;
#   * every title gets an 'entities' key (an empty dict when nothing was recognised);
#   * the unused `clause` dict and the hand-maintained counter are gone.
title_ents = []
title_list = []
title_dict = {}

# Dependency labels that stop the upward walk once the token carrying one has been passed.
title_stop_deps = ('nsubj', 'iobj', 'dobj', 'pobj')

for count, doc in enumerate(nlp.pipe(tqdm(unique_titles))):
    title_ents.append([ent.kb_id_ for ent in doc.ents if ent.kb_id_ != 'NIL'])
    title_list.append(doc.text)

    relevant_sections = {}
    for ents in doc.ents:
        # Start at the first token of the entity (the one the original selected via ent_iob == 3).
        current_token = ents[0]
        while current_token.dep_ != "ROOT":
            passed_argument = current_token.dep_ in title_stop_deps
            current_token = current_token.head
            if passed_argument:
                break
        section = list(current_token.subtree)
        # 'NIL' entities are kept here, as before; only title_ents filters them out.
        relevant_sections.setdefault(f'{ents.kb_id_}', []).append(section)

    title_dict[f'{count}'] = {'comment': doc.text, 'entities': relevant_sections}


100%|██████████| 26277/26277 [06:36<00:00, 66.30it/s]


In [19]:
# Release cached GPU memory after the title pass.
torch.cuda.empty_cache()

In [20]:
# One row per title with the list of KB ids linked in it.
title_df = pd.DataFrame(dict(title=title_list, title_ents=title_ents))

In [21]:
# Keep only titles in which at least one entity was linked.
has_title_ents = title_df.title_ents.map(len) > 0
title_df = title_df[has_title_ents]

In [22]:
# Plain list of the titles that survived the entity filter.
titles_with_ents = list(title_df['title'])

In [23]:

# Accumulators filled by the comment entity-extraction loop:
# comment_ents holds entity-id lists, comment holds comment text.
comment_ents, comment = [], []

In [24]:
%%time
# Disabled earlier version of the comment entity loop, kept as a bare string literal: the cell only
# evaluates (and echoes) the string and runs none of the code inside it.
# NOTE(review): inside the string, `comment` is appended once per linked entity and once per
# document, so it would not stay aligned with comment_ents if this code were re-enabled.
'''
for doc in nlp.pipe(tqdm(df.clean_comments)):
    doc_ents = []
    for ent in doc.ents:
        if ent.kb_id_ != 'NIL':
            ent_details = ent.kb_id_
            doc_ents.append(ent_details)
            comment.append(doc.text)
    comment_ents.append(doc_ents)
    comment.append(doc.text)

'''

Wall time: 0 ns


"\nfor doc in nlp.pipe(tqdm(df.clean_comments)):\n    doc_ents = []\n    for ent in doc.ents:\n        if ent.kb_id_ != 'NIL':\n            ent_details = ent.kb_id_\n            doc_ents.append(ent_details)\n            comment.append(doc.text)\n    comment_ents.append(doc_ents)\n    comment.append(doc.text)\n\n"

In [29]:
%%time
comment_dict = {}
count = 0
for doc in nlp.pipe(tqdm(df.clean_comments)):
    doc_ents_id = []
    comment_dict[f'{count}'] = {'comment': doc.text}
    comment_dict[f'{count}']['entities'] = []
    for ent in doc.ents:
        relevant_sections = {}
        if ent.kb_id_ != 'NIL':
            ent_details = ent.kb_id_
            doc_ents_id.append(ent_details)
            comment.append(doc.text)
            relevant_sections[f'{ent.kb_id_}'] = []
            current_token = ent[0]
            subject_count = 1
            while current_token.dep_ != "ROOT" and subject_count <= 1:
                if current_token.dep_ ==  'nsubj' or current_token.dep_== 'iobj' or current_token.dep_== 'dobj' or current_token.dep_ == 'pobj':
                    subject_count += 1
                    current_token = current_token.head
                else:
                    current_token = current_token.head
                section = [t for t in current_token.subtree]
            relevant_sections[f'{ent.kb_id_}'].append(section)
    comment_dict[f'{count}']['entities'].append(relevant_sections)
    count +=1
    comment_ents.append(doc_ents)
    comment.append(doc.text)

  5%|▌         | 81169/1483093 [10:52:24<187:48:04,  2.07it/s]


KeyboardInterrupt: 

  0%|          | 4072/1483093 [01:43<39:33:34, 10.39it/s]

KeyboardInterrupt: 

In [26]:
# Pair each entity-id list with its comment text.
# NOTE: zip() returns a one-shot iterator - it is empty after the first DataFrame is built from it.
zip_data = zip(comment_ents, comment)

In [27]:
# Two-column frame (entity ids, comment text) built from the zipped pairs; this consumes zip_data.
comment_df = pd.DataFrame(data = zip_data, columns = ['comment_entities', 'comment'])

In [None]:
# Persist the entity-only frame so the extraction does not have to be repeated.
comment_df.to_pickle('../data/comments_entities_only.pk1')

In [None]:
# Display only. NOTE(review): this echoes the whole dictionary (one entry per processed comment);
# consider showing a small sample instead.
comment_dict

In [None]:
# Frame with one row per processed comment holding its linked entity ids in 'entities'.
# Built straight from comment_ents: zip_data is a one-shot iterator that an earlier cell already
# consumed, and its 2-tuples could never fill the three columns previously requested here.
# NOTE(review): 'polarity' and 'subjectivity' are not computed anywhere in the visible notebook, so
# they are no longer created as empty columns; add them back once their source is restored.
comment_df = pd.DataFrame({'entities': comment_ents})

In [None]:
# Attach the raw comment text positionally (row i of comment_df gets the i-th comment of df).
comment_df['comment'] = list(df.comment)

In [None]:
# Column, null-count and dtype summary of the entity frame.
comment_df.info()

In [None]:
# Left-join the entity columns onto the cleaned comments, matching on the comment text.
df_with_scores = df.merge(comment_df, how='left', on='comment')

In [None]:
# Rebuild the title frame from the full lists, so titles without linked entities are included again.
title_df = pd.DataFrame(dict(title=title_list, title_ents=title_ents))

In [None]:
# Preview the first rows of the merged frame.
df_with_scores.head()

In [None]:
# Same preview as the cell above (display only).
df_with_scores.head()

In [None]:
# Remove the stray index column that appears when the data comes from a CSV round trip.
# errors='ignore' keeps the cell runnable when the column is absent, as it is for the pickled
# input summarised by df.info() earlier in this notebook.
df_with_scores = df_with_scores.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Column, null-count and dtype summary of the merged frame.
df_with_scores.info()

In [None]:
# Re-assign date_time from its own values; the values themselves are not transformed here.
df_with_scores['date_time'] = list(df_with_scores.date_time)

In [None]:
# Bring in the entities linked in each comment's submission title.
df_with_scores = pd.merge(
    df_with_scores,
    title_df,
    how='left',
    left_on='sub_title',
    right_on='title',
)

In [None]:
# Use the comment's own entities; fall back to the title's entities when the comment has none.
comment_has_no_entities = df_with_scores.entities.map(len) == 0
df_with_scores['entities_both'] = np.where(
    comment_has_no_entities,
    df_with_scores.title_ents,
    df_with_scores.entities,
)

In [None]:
# Keep only rows with no missing value in any column, then summarise the result.
df_clean = df_with_scores.dropna()
df_clean.info()

In [None]:
# Same summary as at the end of the previous cell (display only).
df_clean.info()

In [None]:
# Number of entities available for each row (comment entities or title fallback).
df_clean['len_entities_both'] = df_clean.entities_both.map(len)

In [None]:
# Rows that have at least one entity. Take an explicit copy: later cells add and overwrite columns
# on df_clean2, and doing that on a filtered slice raises SettingWithCopyWarning and is unreliable.
df_clean2 = df_clean[df_clean.len_entities_both >= 1].copy()

In [None]:
# Summary of the rows that carry at least one entity.
df_clean2.info()

In [None]:
# Plain Python list of comment texts, in frame order, for the sentiment pipeline.
comments = list(df_clean2.comment)

In [None]:
# Sentiment-analysis pipeline on device 0, using the same checkpoint for tokenizer and model.
tokenizer = AutoTokenizer.from_pretrained('cardiffnlp/twitter-roberta-base-sentiment')
sentiment = pipeline(
    task='sentiment-analysis',
    model='cardiffnlp/twitter-roberta-base-sentiment',
    tokenizer=tokenizer,
    device=0,
)

In [None]:
%%time
sents = sentiment(comments, max_length = 512, truncation = True, padding = 'max_length', batch_size = 64)

In [None]:
# Display only. NOTE(review): this echoes one result per comment; consider previewing sents[:5].
sents

In [None]:
# Raw label string returned by the transformer pipeline for each comment.
df_clean2['label_h'] = [result['label'] for result in sents]

In [None]:
# Recode the pipeline labels as -1 / 0 / 1
# (presumably negative / neutral / positive - confirm against the model card).
label_to_sign = {'LABEL_0': -1, 'LABEL_1': 0, 'LABEL_2': 1}
df_clean2['label_h'] = df_clean2['label_h'].map(label_to_sign)

In [None]:
# Score the pipeline attached to its predicted label.
df_clean2['score_h'] = [result['score'] for result in sents]

In [None]:
# Signed score: the pipeline score multiplied by the -1 / 0 / 1 label.
# Reads 'score_h', the column created from the pipeline output; no column named 'score' is
# created anywhere in this notebook, so the previous `df_clean2.score` lookup could not resolve.
df_clean2['sent_score_h'] = df_clean2['score_h'] * df_clean2['label_h']

In [None]:
# Preview the first 30 scored rows.
df_clean2.head(30)

In [None]:
# Summary after adding the transformer sentiment columns.
df_clean2.info()

In [None]:
# date_time holds epoch seconds. Convert the whole column at once to naive UTC timestamps
# (what datetime.utcfromtimestamp produced row by row; that function is deprecated since
# Python 3.12), then derive the 'YYYY-MM' month key from it.
df_clean2['date'] = pd.to_datetime(df_clean2['date_time'], unit='s')
df_clean2['year_month'] = df_clean2['date'].dt.strftime('%Y-%m')

In [None]:
# Summary after adding the date columns.
df_clean2.info()

In [None]:
# Renumber the rows 0..n-1, discarding the old index.
df_clean2 = df_clean2.reset_index(drop=True)

In [None]:
# Collapse repeated entity ids within each row (element order follows set iteration order).
unique_ids = lambda ids: list(set(ids))
df_clean2.entities = df_clean2.entities.map(unique_ids)
df_clean2.entities_both = df_clean2.entities_both.map(unique_ids)

In [None]:
# Entity count per row after de-duplication (based on entities_both).
df_clean2['len_entities'] = df_clean2.entities_both.map(len)

In [None]:
# Display only: the frame before filtering on len_entities.
df_clean2

In [None]:
# Keep rows that still have at least one entity.
df_clean2 = df_clean2[df_clean2.len_entities.ge(1)]

In [None]:
# Display only: the frame after filtering on len_entities.
df_clean2

In [None]:
# Flatten the per-row entity lists into one long list, keeping repeats across rows.
entities = [entity_id for row_entities in df_clean2.entities for entity_id in row_entities]

In [None]:
# Sorted list of the distinct entity ids.
unique_entities = list(set(entities))
unique_entities.sort()

In [None]:
# Write the entity ids to a one-column CSV.
entity_frame = pd.DataFrame({'entities': unique_entities})
entity_frame.to_csv('../data/entity_list.csv')

In [None]:
# Distinct month keys, written to CSV in ascending order (sorted_dates itself stays unsorted).
sorted_dates = list(df_clean2.year_month.unique())
month_frame = pd.DataFrame(sorted_dates, columns=['month'])
month_frame.sort_values(by='month', ascending=True).to_csv('../data/date_list.csv')

In [None]:
# Three-way string label from polarity: below -0.05 -> '-1', above 0.05 -> '1', otherwise '0'.
# NOTE(review): no visible cell computes a 'polarity' column - confirm where it comes from.
df_clean2['label_t'] = np.select(
    [df_clean2.polarity < -0.05, df_clean2.polarity > 0.05],
    ['-1', '1'],
    default='0',
)

In [None]:
# VADER analyzer instance used for the rule-based sentiment scores below.
analyzer = SentimentIntensityAnalyzer()

In [None]:
# Full VADER score dictionary for each comment.
df_clean2['output_v'] = [analyzer.polarity_scores(text) for text in tqdm(df_clean2.comment)]

In [None]:
# Pull the 'compound' value out of each VADER score dictionary.
df_clean2['score_v'] = df_clean2.output_v.map(lambda scores: scores['compound'])

In [None]:
# Three-way VADER label from the compound score: below -0.05 -> '-1', above 0.05 -> '1', else '0'.
# The thresholds are applied to score_v (the VADER compound score computed in the previous cell);
# the earlier version thresholded `polarity`, which made label_v an exact copy of label_t.
df_clean2['label_v'] = np.where(df_clean2.score_v < -0.05, '-1',
                               np.where(df_clean2.score_v > 0.05, '1', '0'))

In [None]:
# Renumber the rows 0..n-1 before the per-entity export.
df_clean2 = df_clean2.reset_index(drop=True)

In [None]:
# Write one pickle per entity containing every row whose `entities` list mentions that entity,
# with the `entities` column replaced by that single entity id.
# The frame is exploded once (one row per row/entity pair) and grouped by entity, instead of
# scanning every row with iterrows() again for each entity. Rows with an empty entity list
# explode to NaN and are dropped, matching the old membership test.
# Side benefit: columns keep their original dtypes instead of being rebuilt from object rows.
rows_per_entity = (
    df_clean2.explode('entities')
             .dropna(subset=['entities'])
             .groupby('entities', sort=True)
)
for item, item_df in tqdm(rows_per_entity):
    item_df.reset_index(drop=True).to_pickle(f'../data/indiv_data/{item}.pk1')

In [None]:
# Persist the fully labelled frame.
df_clean2.to_pickle('../data/label_data.pk1')