## Reading in Data & Importing Libraries

### Set Up Working Directory

In [None]:
import os
import sys

# Drive mounting only applies when the notebook runs inside Google Colab;
# in any other environment this cell does nothing.
if 'google.colab' in sys.modules:
    from google.colab import drive

    # Project folder inside the mounted Google Drive
    path_to_file = '/content/gdrive/My Drive/School stuff/Y4 S1/DSA4264'

    # Mount Drive, then make the project folder the working directory so the
    # relative paths used later in the notebook resolve against it
    drive.mount('/content/gdrive')
    os.chdir(path_to_file)

Mounted at /content/gdrive


### Install Packages

In [None]:
%pip install keybert
%pip install yake
%pip install rake_nltk

Collecting keybert
  Downloading keybert-0.8.5-py3-none-any.whl.metadata (15 kB)
Downloading keybert-0.8.5-py3-none-any.whl (37 kB)
Installing collected packages: keybert
Successfully installed keybert-0.8.5
Collecting yake
  Downloading yake-0.4.8-py2.py3-none-any.whl.metadata (4.0 kB)
Collecting segtok (from yake)
  Downloading segtok-1.5.11-py3-none-any.whl.metadata (9.0 kB)
Downloading yake-0.4.8-py2.py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.2/60.2 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading segtok-1.5.11-py3-none-any.whl (24 kB)
Installing collected packages: segtok, yake
Successfully installed segtok-1.5.11 yake-0.4.8
Collecting rake_nltk
  Downloading rake_nltk-1.0.6-py3-none-any.whl.metadata (6.4 kB)
Downloading rake_nltk-1.0.6-py3-none-any.whl (9.1 kB)
Installing collected packages: rake_nltk
Successfully installed rake_nltk-1.0.6


### Import Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import re,nltk
import string
import gensim
import yake

from transformers import AutoTokenizer
from transformers import pipeline
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
from keybert import KeyBERT
from rake_nltk import Rake
from google.colab import files

### Downloads

In [None]:
# Fetch the NLTK resources into the runtime's nltk_data directory.
# NOTE(review): presumably 'stopwords' and 'punkt' are what rake_nltk needs and
# 'wordnet' backs the imported WordNetLemmatizer - confirm before pruning.
nltk.download('stopwords')
nltk.download('wordnet')
# Last bare expression: its return value is what the cell displays
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

### Read in Data

In [None]:
# Load the five labelled shards; each keeps its own name because the merge
# step concatenates Data1..Data5 explicitly.
Data1, Data2, Data3, Data4, Data5 = [
    pd.read_csv(shard_path, engine='python')
    for shard_path in (
        '../data/deberta_v3_labelled_3_1.csv',
        '../data/deberta_v3_labelled_3_2.csv',
        '../data/deberta_v3_labelled_3_3.csv',
        '../data/deberta_v3_labelled_3_4.csv',
        '../data/deberta_v3_labelled_3_5.csv',
    )
]

In [None]:
# Post-level metadata; joined onto the comments by post id further down
post_title = pd.read_csv(filepath_or_buffer='../data/reddit_posts_data.csv', engine='python')

### Merge Data

In [None]:
# Stack the five shards row-wise into one frame with a fresh 0..n-1 index
Data = pd.concat((Data1, Data2, Data3, Data4, Data5), axis=0, ignore_index=True)

# Parse the timestamp strings into datetimes
Data['timestamp'] = pd.to_datetime(Data['timestamp'])

# Keep only what follows the last underscore of link_id (e.g. 't3_gjjem5' -> 'gjjem5')
link_id_text = Data['link_id'].astype(str)
Data['link_id2'] = link_id_text.str.rsplit('_', n=1).str[-1]

## Add Subreddit Names

In [None]:
# Lookup table: subreddit id -> human-readable subreddit name
subreddit_name = pd.DataFrame(
    [
        ('t5_2qh8c', 'r/Singapore'),
        ('t5_xnx04', 'r/SingaporeRaw'),
        ('t5_70s6ew', 'r/SingaporeHappenings'),
    ],
    columns=['id', 'subreddit'],
)

# Left-join the names onto the comments, then discard the lookup table's own
# key column (suffixed to 'id_y' by the merge)
Data = (
    Data
    .merge(subreddit_name, how='left', left_on='subreddit_id', right_on='id')
    .drop(columns=['id_y'])
)

## Add Post Titles

In [None]:
# Columns that are no longer needed once the post metadata has been joined
columns_to_drop = ['link_id2', 'post_type', 'subreddit_id_y', 'subreddit_name', 'post_content']

# Clearer names for the columns that survive the join
column_renames = {
    'id_x': 'text_id',
    'subreddit_id_x': 'subreddit_id',
    'created_timestamp': 'post_timestamp',
    'title': 'post_title',
}

# Left-join post metadata on the post id, then tidy up the columns
Data = (
    Data
    .merge(post_title, how='left', left_on='link_id2', right_on='post_id')
    .drop(columns=columns_to_drop)
    .rename(columns=column_renames)
)

### Clean Post Titles

In [None]:
# Preview the merged frame before cleaning the post titles
Data

Unnamed: 0,text,timestamp,username,link,link_id,parent_id,text_id,subreddit_id,moderation,BERT_2_hate,subreddit,post_id,post_timestamp,post_title,author,author_id,comment_count,vote_score
0,STI chiong ah,2020-05-14 12:35:30,iamabear1,/r/singapore/comments/gjjem5/covid19_8663_busi...,t3_gjjem5,t3_gjjem5,fqljinp,t5_2qh8c,"{'removal_reason': None, 'collapsed': False, '...",True,r/Singapore,gjjem5,2020-05-14 09:59:13,"Covid-19: 8,663 businesses in S'pore closed do...",Not Found,Not Found,4.0,30.0
1,Look on the bright side - you'll never make th...,2020-02-09 17:23:24,lkc159,/r/singapore/comments/f15aks/did_i_just_get_sc...,t3_f15aks,t3_f15aks,fh3hl0g,t5_2qh8c,"{'removal_reason': None, 'collapsed': False, '...",False,r/Singapore,f15aks,2020-02-09 07:14:11,Did I just get scammed,Not Found,Not Found,42.0,50.0
2,"For posts flaired as such (by OP), we will be ...",2021-04-06 18:08:59,AutoModerator,/r/singapore/comments/maajuo/a_compilation_of_...,t3_maajuo,t3_maajuo,gtlh5uf,t5_2qh8c,"{'collapsed_reason': None, 'author_is_blocked'...",True,r/Singapore,maajuo,2021-03-22 00:39:38,A compilation of mental health/wellness resour...,Not Found,Not Found,45.0,301.0
3,sounds q fucked up if no concern for each othe...,2021-01-22 14:22:42,[deleted],/r/singapore/comments/l28wfr/rsingapore_random...,t3_l28wfr,t1_gk6fcys,gk6gc0y,t5_2qh8c,"{'removal_reason': None, 'collapsed': False, '...",True,r/Singapore,l28wfr,2021-01-21 22:00:14,/r/singapore random discussion and small quest...,AutoModerator,t2_6l4z3,2270.0,13.0
4,Chinese media reported a while ago: https://ww...,2020-03-26 04:51:22,localinfluenza,/r/singapore/comments/fp5hgu/pcf_cluster_anoth...,t3_fp5hgu,t3_fp5hgu,flj42mf,t5_2qh8c,"{'removal_reason': None, 'collapsed': False, '...",False,r/Singapore,fp5hgu,2020-03-26 04:50:16,PCF Cluster: Another teacher diagnosed with Co...,Not Found,Not Found,21.0,9.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4509967,Its fuckin hot rn,2022-05-25 04:22:50,Mahsunon,/r/singapore/comments/ux28gq/rsingapore_random...,t3_ux28gq,t1_i9waspv,i9wb51x,t5_2qh8c,"{'controversiality': 0, 'collapsed_reason_code...",True,r/Singapore,ux28gq,2022-05-24 22:00:11,/r/singapore random discussion and small quest...,AutoModerator,t2_6l4z3,969.0,8.0
4509968,"hey, sending hugs your way &lt;3",2022-09-29 14:05:32,_0_o,/r/singapore/comments/xqprla/rsingapore_random...,t3_xqprla,t1_iqcvjsd,iqd9twj,t5_2qh8c,"{'controversiality': 0, 'collapsed_reason_code...",False,r/Singapore,xqprla,2022-09-28 22:00:10,/r/singapore random discussion and small quest...,AutoModerator,t2_6l4z3,437.0,9.0
4509969,Depends in the speed they got caught doing. In...,2023-03-27 01:00:34,dodgethis_sg,/r/singapore/comments/1235s44/eli5_why_do_the_...,t3_1235s44,t3_1235s44,jdtcn3x,t5_2qh8c,"{'controversiality': 0, 'collapsed_reason_code...",False,r/Singapore,1235s44,2023-03-27 00:49:59,[ELI5] Why do the traffic police let offenders...,Not Found,Not Found,26.0,0.0
4509970,Maybe I should have typed ‘doesn’t change too ...,2023-04-29 04:43:31,ShadeX8,/r/singapore/comments/130dj90/latest_property_...,t3_130dj90,t1_ji5arfa,ji5d2nc,t5_2qh8c,"{'banned_at_utc': None, 'mod_reason_by': None,...",False,r/Singapore,130dj90,2023-04-27 07:48:59,Latest property cooling measures unlikely to e...,Syumie,t2_16l1p4,92.0,133.0


In [None]:
# Translation table that deletes every ASCII punctuation character
punctuation_table = str.maketrans('', '', string.punctuation)

# Normalise titles in one pass: lowercase -> drop punctuation -> drop anything
# that is not a letter or whitespace -> trim leading/trailing whitespace
Data['post_title_cleaned'] = (
    Data['post_title']
    .str.lower()
    .str.translate(punctuation_table)
    .str.replace(r'[^a-zA-Z\s]', '', regex=True)
    .str.strip()
)

### Remove Irrelevant Keywords

In [None]:
# Location tokens stripped from Data['post_title_cleaned'] before keyword
# extraction. The titles have already been lowercased and had punctuation
# removed, so "s'pore" appears as 'spore', "/r/singapore" as 'rsingapore' and
# "singaporeans'" as 'singaporeans'.
keywords_to_remove = ['rsingapore', 'singaporeans', 'singapore', 'spore', 'sg']

# Whole-word matcher: plain substring replacement would mangle unrelated words
# (e.g. the 'sg' inside 'misguided') and leave fragments behind (the 'r' of
# 'rsingapore' once 'singapore' has been removed).
_KEYWORD_PATTERN = re.compile(
    r'\b(?:'
    + '|'.join(re.escape(keyword) for keyword in sorted(keywords_to_remove, key=len, reverse=True))
    + r')\b'
)

def remove_keywords(text):
    """Remove the location keywords from a cleaned post title.

    Parameters
    ----------
    text : str or any
        Cleaned title. Non-string values (e.g. NaN for comments without a
        matching post) are returned unchanged.

    Returns
    -------
    str or any
        The title with every whole-word occurrence of a keyword removed and
        runs of whitespace collapsed to single spaces.
    """
    if not isinstance(text, str):  # leave missing values untouched
        return text
    without_keywords = _KEYWORD_PATTERN.sub(' ', text)
    # Removing a word leaves doubled spaces behind; normalise them
    return ' '.join(without_keywords.split())

# Run remove_keywords on every cleaned title, overwriting the column in place
Data['post_title_cleaned'] = Data['post_title_cleaned'].apply(remove_keywords)

## Keywords Extraction


### RAKE Keyword Algorithm

In [None]:
# Single shared extractor with rake_nltk's default settings
rake = Rake()

def extract_top_keywords(text, top_n=5):
    """Return the highest-ranked RAKE phrases for one piece of text.

    Parameters
    ----------
    text : str
        Text to extract keywords from. Missing (non-string) or blank input
        yields an empty list instead of raising.
    top_n : int, default 5
        Maximum number of phrases to return.

    Returns
    -------
    list of str
        At most ``top_n`` phrases, in the order returned by
        ``rake.get_ranked_phrases()``.
    """
    if not isinstance(text, str) or not text.strip():
        return []
    rake.extract_keywords_from_text(text)
    return rake.get_ranked_phrases()[:top_n]

# Extract keywords from each cleaned post title. Missing titles get an empty
# list; casting them with astype(str) would turn NaN into the literal text
# 'nan', which RAKE would then report as a keyword.
Data['rake_keywords'] = Data['post_title_cleaned'].apply(
    lambda title: extract_top_keywords(title) if isinstance(title, str) else []
)

In [None]:
# Preview the frame with the new post_title_cleaned and rake_keywords columns
Data

Unnamed: 0,text,timestamp,username,link,link_id,parent_id,text_id,subreddit_id,moderation,BERT_2_hate,subreddit,post_id,post_timestamp,post_title,author,author_id,comment_count,vote_score,post_title_cleaned,rake_keywords
0,STI chiong ah,2020-05-14 12:35:30,iamabear1,/r/singapore/comments/gjjem5/covid19_8663_busi...,t3_gjjem5,t3_gjjem5,fqljinp,t5_2qh8c,"{'removal_reason': None, 'collapsed': False, '...",True,r/Singapore,gjjem5,2020-05-14 09:59:13,"Covid-19: 8,663 businesses in S'pore closed do...",Not Found,Not Found,4.0,30.0,covid businesses in closed down in april hi...,"[covid businesses, april highest, years, closed]"
1,Look on the bright side - you'll never make th...,2020-02-09 17:23:24,lkc159,/r/singapore/comments/f15aks/did_i_just_get_sc...,t3_f15aks,t3_f15aks,fh3hl0g,t5_2qh8c,"{'removal_reason': None, 'collapsed': False, '...",False,r/Singapore,f15aks,2020-02-09 07:14:11,Did I just get scammed,Not Found,Not Found,42.0,50.0,did i just get scammed,[get scammed]
2,"For posts flaired as such (by OP), we will be ...",2021-04-06 18:08:59,AutoModerator,/r/singapore/comments/maajuo/a_compilation_of_...,t3_maajuo,t3_maajuo,gtlh5uf,t5_2qh8c,"{'collapsed_reason': None, 'author_is_blocked'...",True,r/Singapore,maajuo,2021-03-22 00:39:38,A compilation of mental health/wellness resour...,Not Found,Not Found,45.0,301.0,a compilation of mental healthwellness resourc...,"[mental healthwellness resources, please add, ..."
3,sounds q fucked up if no concern for each othe...,2021-01-22 14:22:42,[deleted],/r/singapore/comments/l28wfr/rsingapore_random...,t3_l28wfr,t1_gk6fcys,gk6gc0y,t5_2qh8c,"{'removal_reason': None, 'collapsed': False, '...",True,r/Singapore,l28wfr,2021-01-21 22:00:14,/r/singapore random discussion and small quest...,AutoModerator,t2_6l4z3,2270.0,13.0,r random discussion and small questions thread...,"[small questions thread, r random discussion, ..."
4,Chinese media reported a while ago: https://ww...,2020-03-26 04:51:22,localinfluenza,/r/singapore/comments/fp5hgu/pcf_cluster_anoth...,t3_fp5hgu,t3_fp5hgu,flj42mf,t5_2qh8c,"{'removal_reason': None, 'collapsed': False, '...",False,r/Singapore,fp5hgu,2020-03-26 04:50:16,PCF Cluster: Another teacher diagnosed with Co...,Not Found,Not Found,21.0,9.0,pcf cluster another teacher diagnosed with cov...,"[pcf cluster another teacher diagnosed, feelin..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4509967,Its fuckin hot rn,2022-05-25 04:22:50,Mahsunon,/r/singapore/comments/ux28gq/rsingapore_random...,t3_ux28gq,t1_i9waspv,i9wb51x,t5_2qh8c,"{'controversiality': 0, 'collapsed_reason_code...",True,r/Singapore,ux28gq,2022-05-24 22:00:11,/r/singapore random discussion and small quest...,AutoModerator,t2_6l4z3,969.0,8.0,r random discussion and small questions thread...,"[small questions thread, r random discussion, ..."
4509968,"hey, sending hugs your way &lt;3",2022-09-29 14:05:32,_0_o,/r/singapore/comments/xqprla/rsingapore_random...,t3_xqprla,t1_iqcvjsd,iqd9twj,t5_2qh8c,"{'controversiality': 0, 'collapsed_reason_code...",False,r/Singapore,xqprla,2022-09-28 22:00:10,/r/singapore random discussion and small quest...,AutoModerator,t2_6l4z3,437.0,9.0,r random discussion and small questions thread...,"[small questions thread, r random discussion, ..."
4509969,Depends in the speed they got caught doing. In...,2023-03-27 01:00:34,dodgethis_sg,/r/singapore/comments/1235s44/eli5_why_do_the_...,t3_1235s44,t3_1235s44,jdtcn3x,t5_2qh8c,"{'controversiality': 0, 'collapsed_reason_code...",False,r/Singapore,1235s44,2023-03-27 00:49:59,[ELI5] Why do the traffic police let offenders...,Not Found,Not Found,26.0,0.0,eli why do the traffic police let offenders go...,"[traffic police let offenders go without, eli,..."
4509970,Maybe I should have typed ‘doesn’t change too ...,2023-04-29 04:43:31,ShadeX8,/r/singapore/comments/130dj90/latest_property_...,t3_130dj90,t1_ji5arfa,ji5d2nc,t5_2qh8c,"{'banned_at_utc': None, 'mod_reason_by': None,...",False,r/Singapore,130dj90,2023-04-27 07:48:59,Latest property cooling measures unlikely to e...,Syumie,t2_16l1p4,92.0,133.0,latest property cooling measures unlikely to e...,"[latest property cooling measures unlikely, mu..."


Error: Runtime no longer has a reference to this dataframe, please re-run this cell and try again.
Error: Runtime no longer has a reference to this dataframe, please re-run this cell and try again.


## Split Data for downloading

In [None]:
# Number of roughly equal row-wise chunks to export
NUM_PARTS = 7

# Split the frame into NUM_PARTS consecutive chunks
df = np.array_split(Data, NUM_PARTS)

# Write each chunk to ../data/Data_<n>.csv (1-based) and hand it to the Colab
# download helper
for part_number, part in enumerate(df, start=1):
    part_path = f'../data/Data_{part_number}.csv'
    part.to_csv(part_path, encoding='utf-8-sig')
    files.download(part_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>