
1. **Extract Reddit Data**



In [None]:
!pip install praw

In [None]:
!pip install pandas

In [None]:
# Add Google Drive as an accessible path (Optional if you are running from Jupyter Notebook)
from google.colab import drive
drive.mount('/content/drive')

# change path to the designated google drive folder
# otherwise, data will be saved in /content folder which you may have issue locating
%cd /content/drive/My Drive/Colab Notebooks/

In [None]:
#! python3
import praw
import pandas as pd
import datetime as dt

In [None]:
# Build the PRAW client that every later cell uses through the name `reddit`.
# The credential strings below are placeholders and must be filled in before
# this cell is run.
# NOTE(review): avoid saving real credentials in the notebook; consider
# reading them from environment variables or getpass instead.
reddit = praw.Reddit(client_id='', \
                     client_secret='_', \
                     user_agent='', \
                     username='', \
                     password='')

In [None]:
url = ""
submission = reddit.submission(url=url)

print(submission.title)
# Output: the submission's title
print(submission.score)
# Output: the submission's score
print(submission.id)
# Output: the submission's ID
print(submission.url)


`reddit.submission()` accepts either the submission ID (printed by the code above) or the full URL of the post.

In [None]:
import pprint

# assume you have a Reddit instance bound to variable `reddit`
submission = reddit.submission(url=url)
print(submission.title)  # to make it non-lazy
pprint.pprint(vars(submission))

This prints every comment in breadth-first order: all top-level comments first, followed by all second-level replies, then all third-level replies, and so on.


In [None]:
from collections import deque

# Breadth-first walk over the comment tree: print each comment body, then
# queue its replies so that deeper levels are printed after shallower ones.
# presumably replace_more(limit=None) expands every "load more comments"
# placeholder first — confirm against the PRAW documentation.
submission.comments.replace_more(limit=None)
# A deque pops from the left in constant time; list.pop(0) shifts the whole
# list on every iteration, which is slow for threads with many comments.
comment_queue = deque(submission.comments[:])  # Seed with top-level
while comment_queue:
    comment = comment_queue.popleft()
    print(comment.body)
    comment_queue.extend(comment.replies)

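Generate the raw comment data: for every comment, collect the post title, the commenting user, the comment body, the comment score and the comment creation time into one dataframe, then save it as a single CSV file. To scrape several posts, change the URL and re-run the scraping cell for each post before building the dataframe.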

In [None]:
comms_dict = { "title": [], "user": [], "body":[], "score":[], "created" :[]}

In [None]:
# Fetch one thread and append one row per comment to the shared comms_dict.
# Every run of this cell appends again, so re-running it with the same URL
# adds duplicate rows unless comms_dict is re-initialised first.
submission = reddit.submission(url='https://www.reddit.com/r/singapore/comments/hj6tcn/better_jobs_for_singaporeans/')
# presumably expands every "load more comments" placeholder so the full
# thread is available — confirm against the PRAW documentation.
submission.comments.replace_more(limit=None)
for comment in submission.comments.list():
    # The post title is repeated on every row so rows from different posts
    # can be told apart once they share one dataframe.
    comms_dict["title"].append(submission.title)
    # NOTE(review): this stores whatever object PRAW exposes as `author`
    # rather than an explicit name string; it may be empty for deleted
    # accounts — verify before relying on this column.
    comms_dict["user"].append(comment.author)
    comms_dict["body"].append(comment.body)
    comms_dict["score"].append(comment.score)
    # Later passed to get_date(), which treats it as a timestamp in seconds.
    comms_dict["created"].append(comment.created)


In [None]:
comms_dict

In [None]:
comms_data = pd.DataFrame(comms_dict)

In [None]:
comms_data

In [None]:
def get_date(created):
    """Convert a POSIX timestamp (seconds since the epoch) to a ``datetime``.

    No time zone is passed, so the result is a naive ``datetime`` expressed
    in the local time zone of the machine running the notebook.
    """
    to_local_datetime = dt.datetime.fromtimestamp
    return to_local_datetime(created)

In [None]:
_timestamp = comms_data["created"].apply(get_date)

In [None]:
comms_data = comms_data.assign(timestamp = _timestamp)

In [None]:
comms_data

In [None]:
mypath= "."
comms_data.to_csv(f'{mypath}/foreigners_2020.csv', index=False)

2. **Cleaning and EDA**

1.   Check CSV file
2.   Remove rows with an empty 'body'
3.   Fill empty usernames with 'null_username'

In [None]:
import pandas as pd
import numpy as np
mypath= "."
# Must match the file written by the extraction step above
# ('foreigners_2020.csv'); otherwise read_csv cannot find the file.
filename = "foreigners_2020"
df = pd.read_csv(f'{mypath}/{filename}.csv')
df.info()

In [None]:
# Drop comments that have no text at all.
df.dropna(axis = 0, subset = ['body'], inplace = True)
# Label missing authors. Only the 'user' column is filled: a frame-wide
# fillna would also write the string 'null_username' into any missing
# title, score or created value.
df['user'] = df['user'].fillna('null_username')
df.info()

First round of cleaning

1.   Change text to string
2.   Lowercase for all
3.   Expand contractions
4.   Remove punctuations
5.   Remove digits in text


In [None]:
# Apply a first round of text cleaning techniques
import re 
import string

#https://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python

def clean_text_round1(text):
    '''First cleaning pass: lowercase, strip URLs, expand contractions, remove punctuation and digit-bearing words.

    Steps, in order:
      1. Coerce the input to ``str`` and lowercase it.
      2. Remove URLs (``http://`` / ``https://`` links and ``www.`` hosts).
      3. Expand common English contractions (e.g. "can't" -> "can not").
      4. Delete ASCII punctuation (so "same-sex" becomes "samesex").
      5. Delete every word that contains a digit.
      6. Replace each remaining non-word character (newlines, tabs,
         non-ASCII punctuation, ...) with one space.

    Parameters
    ----------
    text : any
        Value to clean; it is converted with ``str()`` first, so ``None``
        becomes the string "none".

    Returns
    -------
    str
        The cleaned text. Runs of whitespace are not collapsed, so removed
        tokens can leave consecutive spaces behind.
    '''
    text = str(text).lower()

    # URLs are removed while "://" and "." are still present, so the patterns
    # can be anchored on them. Running an unanchored 'www\S+' after
    # punctuation removal would also eat ordinary words such as "awwww".
    text = re.sub(r'https?://\S+', '', text)
    text = re.sub(r'\bwww\.\S+', '', text)

    # Specific contractions first: the general "n't" rule below would
    # otherwise turn "won't" into "wo not" and "can't" into "ca not".
    text = re.sub(r"won\'t", "will not", text)
    text = re.sub(r"can\'t", "can not", text)
    text = re.sub(r"let\'s", "let us", text)
    # general
    text = re.sub(r"n\'t", " not", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'s", " is", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'t", " not", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'m", " am", text)

    # Remove punctuation; this also joins hyphenated words (same-sex -> samesex).
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Remove words that contain digits (raw string: \w and \d are regex
    # escapes, not valid Python string escapes).
    text = re.sub(r'\w*\d\w*', '', text)
    # Replace anything that is not a word character (e.g. "\n") with a space.
    text = re.sub(r'[^\w]', ' ', text)

    return text

In [None]:
df['clean_text_1'] = df['body'].apply(clean_text_round1)
df.head()

Second round of cleaning: Remove stopwords

In [None]:
import nltk
nltk.download('stopwords')

In [None]:
from nltk.corpus import stopwords
stop = stopwords.words('english')

# Exclude stopwords with Python's list comprehension and pandas.DataFrame.apply.
def clean_text_round2(text):
    """Remove stopwords from ``text``.

    The text is split on whitespace, every token found in the module-level
    ``stop`` list is dropped, and the remaining tokens are re-joined with
    single spaces.
    """
    kept_words = []
    for token in text.split():
        if token in (stop):
            continue
        kept_words.append(token)
    return ' '.join(kept_words)
#split the words by space and join the words if the word is not in stop

In [None]:
df['clean_text_2'] = df['clean_text_1'].apply(clean_text_round2)
df.info()

In [None]:
mypath= "."
df.to_csv(f'{mypath}/cleaned_{filename}.csv', index=False)

Open new csv file and drop null clean_text_2

In [None]:
df = pd.read_csv(f'{mypath}/cleaned_{filename}.csv')
df.info()

In [None]:
df.dropna(axis = 0, subset = ['clean_text_2'], inplace = True) 
mypath= "."
df.to_csv(f'{mypath}/cleaned_{filename}.csv', index=False)

In [None]:
df.info()

**Text Analysis**

In [None]:
### Let's download ```smt203util.py``` Below code should download the file in the same folder where your jupyter notebook is. 
!wget https://raw.githubusercontent.com/anjisun221/css_codes/main/ay21t1/Lab03_text_analysis/smt203util.py
#wget download file from internet (download smt203util.py)

In [None]:
### Let's import all function from smt203util! 
from smt203util import *

In [None]:
### Import Pandas to analyze the data
import pandas as pd

In [None]:
mypath= "."
filename = "" #use the cleaned one
df = pd.read_csv(f'{mypath}/{filename}.csv')
df.info()


Third round of cleaning to remove keywords separately

In [None]:

#xing yu: suggest to add extra cleaning round to remove keywords seperately for  lgbt and ceca
def clean_text_keywords(text):
    '''Remove the search keywords used to collect the data (LGBT and CECA topics).

    The keywords dominate the word counts simply because they were the
    search terms, so they are stripped before counting.

    Parameters
    ----------
    text : any
        Value to clean; it is converted with ``str()`` and lowercased first.

    Returns
    -------
    str
        The text with every keyword occurrence deleted. Surrounding spaces
        are left in place, so removed keywords can leave double spaces.
    '''
    text = str(text).lower()

    # Patterns are applied one after another, in this order. The optional
    # trailing "s" removes singular and plural in one step; deleting the
    # singular first would leave a stray "s" behind from the plural form.
    keyword_patterns = (
        # LGBT topic
        'lgbt',
        '377a',
        'pinkdot',
        'pink dot',
        # CECA topic
        'ceca',
        'foreign talents?',
        'foreign workers?',
    )
    for pattern in keyword_patterns:
        text = re.sub(pattern, '', text)

    text = re.sub(r'http\S+', '', text) #remove url
    text = re.sub(r'www\S+', '', text) #remove url
    return text

In [None]:
# Strip the topic keywords from the stopword-free text.
df['clean_text_3'] = df['clean_text_2'].apply(clean_text_keywords)
# Written back to the same path the dataframe was read from, so the cleaned
# file now also carries the clean_text_3 column.
df.to_csv(f'{mypath}/{filename}.csv', index=False)
# Only the last expression of a cell is displayed, so the preview goes last.
df.head()

Unigram analysis - Counting words (and save it to a file)

In [None]:
## Create ```word_counts``` folder 
import os
os.makedirs('word_counts', exist_ok=True)

In [None]:
# this function create a dictionary with word counts from dataframe 
def count_words_from_dataframe(df, column=None):
    """Count how often each whitespace-separated word occurs in a text column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the cleaned text.
    column : str, optional
        Name of the text column. When omitted, 'clean_text_3' (the column
        created by the keyword-cleaning step in this notebook) is used if it
        exists; otherwise the older name 'CleanText3' is used.

    Returns
    -------
    dict
        Maps each word to its total count over all rows, with keys in order
        of first appearance. Rows with a missing value are skipped rather
        than being counted as the word "nan".
    """
    from collections import Counter

    if column is None:
        column = 'clean_text_3' if 'clean_text_3' in df.columns else 'CleanText3'

    word_counts = Counter()
    for text in df[column].dropna():
        # split() with no argument splits on any run of whitespace
        word_counts.update(str(text).split())

    return dict(word_counts)

In [None]:
import csv

count_type = 'unigram'
## this function will return a dictionary of words and frequency
result = count_words_from_dataframe(df)
## sort the words by frequency, most frequent first
sorted_dic = ((k, result[k]) for k in sorted(result, key=result.get, reverse=True))
## newline='' lets the csv module control line endings itself; without it
## some platforms write an extra blank line after every row
with open(f"./word_counts/{filename}_{count_type}_counts.csv", 'w', newline='') as fp:
    writer = csv.writer(fp, delimiter=',')
    writer.writerows(sorted_dic)

Draw Wordcloud using unigrams

In [None]:
## This it OPTIONAL if you are running the current notebook using Google Colab
!conda install --yes -c conda-forge wordcloud

In [None]:
### Import relevant libraries
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
import numpy as np
import re
from PIL import Image
from os import path
import csv

### this function will creat wordcloud based on word frequencies and save them into files under plot. 
def makeImage(termDict, outputfile):
    """Draw a word cloud from term frequencies, save it and show it.

    Parameters
    ----------
    termDict :
        Passed unchanged to ``WordCloud.generate_from_frequencies``.
    outputfile : str
        Output path without extension; ".png" and ".pdf" are appended and
        the cloud is written once under each name.
    """
    cloud = WordCloud(max_font_size=60, width=1280, height=720, background_color="white")

    # size the words by the supplied frequencies
    cloud.generate_from_frequencies(termDict)

    for extension in (".png", ".pdf"):
        cloud.to_file(outputfile + extension)

    # display the figure inline, without axes
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()
    

In [None]:
### Create ```plot``` folder 
os.makedirs('plot', exist_ok=True)

In [None]:

### We're using one of the utility function! "read_word_count_file"
fullTermsDict = read_word_count_file(f'{mypath}/word_counts/{filename}_unigram_counts.csv')
outputfile = f"./plot/wordcloud_count_unigram_{filename}"
makeImage(fullTermsDict, outputfile)


Count Bigram

In [None]:
def count_bigrams_from_dataframe(df, column=None):
    """Count how often each pair of adjacent words occurs in a text column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the cleaned text.
    column : str, optional
        Name of the text column. When omitted, 'clean_text_2' (the column
        created by the stopword-removal step in this notebook) is used if it
        exists; otherwise the older name 'CleanText2' is used.

    Returns
    -------
    dict
        Maps each bigram, written as "first second", to its total count over
        all rows, with keys in order of first appearance. Bigrams never span
        two rows; rows with fewer than two words contribute nothing.
    """
    from collections import Counter

    if column is None:
        column = 'clean_text_2' if 'clean_text_2' in df.columns else 'CleanText2'

    bigram_counts = Counter()
    for text in df[column].dropna():
        tokens = str(text).split()
        # pair every token with its right-hand neighbour
        bigram_counts.update(
            first + " " + second for first, second in zip(tokens, tokens[1:])
        )

    return dict(bigram_counts)


import csv
count_type = 'bigram'

result = count_bigrams_from_dataframe(df)


## sorting the bigrams by frequency, most frequent first
sorted_dic = ((k, result[k]) for k in sorted(result, key=result.get, reverse=True))

## write the dictionary in a file
## newline='' lets the csv module control line endings itself; without it
## some platforms write an extra blank line after every row
with open(f"./word_counts/{filename}_{count_type}_counts.csv", 'w', newline='') as fp:
    writer = csv.writer(fp, delimiter=',')
    writer.writerows(sorted_dic)

In [None]:
#word cloud with bigram
### We're using one of the utility function! "read_word_count_file"
fullTermsDict = read_word_count_file(f'{mypath}/word_counts/{filename}_bigram_counts.csv')
outputfile = f"./plot/wordcloud_count_bigram_{filename}"
makeImage(fullTermsDict, outputfile)

Let's load word frequency file and create dictionary of word counts for the two platforms (NEED DATA FROM HARDWAREZONE AND REDDIT)

In [None]:
# Alternative corpora; uncomment exactly one counts_i_name assignment.
#counts_i_name ='lgbt_reddit_cleaned_2021'
#counts_i_name ='ceca_reddit_cleaned_2021'
#counts_i_name ='REDDIT2020LGBTFINAL'
counts_i_name ='REDDIT2020CECAFINAL'

# Load the unigram counts that were saved earlier for this corpus.
counts_i = read_word_count_file(f"{mypath}/word_counts/{counts_i_name}_unigram_counts.csv")

### keep only the words that also appear in the background corpus
### NOTE(review): global_counts is not defined in this notebook; presumably it
### comes from the smt203util star import — confirm.
counts_i_dict = {k: v for k, v in counts_i.items() if k in global_counts}
# a word is kept only when it is a key of global_counts
#check whether that word is in global_count

In [None]:
# Alternative corpora; uncomment exactly one counts_j_name assignment.
#counts_j_name = 'HWZ2021LGBT'
#counts_j_name = 'HWZ2021CECA'
#counts_j_name = 'HWZEDMW2020LGBTFINAL'
counts_j_name = 'HWZEDMW2020CECAFINAL'

# Load the unigram counts that were saved earlier for this corpus.
counts_j = read_word_count_file(f"{mypath}/word_counts/{counts_j_name}_unigram_counts.csv")

### keep only the words that also appear in the background corpus
### (words missing from global_counts are dropped)
### NOTE(review): global_counts is not defined in this notebook; presumably it
### comes from the smt203util star import — confirm.
counts_j_dict = {k: v for k, v in counts_j.items() if k in global_counts}

In [None]:
### this function will return log-odds values. 
top_words_df = calculate_log_odds_idp(global_counts, counts_i_name, counts_i_dict, counts_j_name, counts_j_dict)

Interpreting the log-odds z-score (log_odds_z_score)

In [None]:

#top_words_df[top_words_df[counts_i_name] >= 2].iloc[::-1].head(20)

top_words_df.sort_values(by = [counts_i_name], ascending=False).head(20)

In [None]:

#top_words_df[top_words_df[counts_j_name] >= 2].head(20)
top_words_df.sort_values(by = [counts_j_name], ascending=False).head(20)

In [None]:
### Below function will simply help you to print the above table into file, so that we can use it for drawing word cloud. 
### The output of this function is a csv file where each row contains (word, log_odds_z_score) for the two corpora. 
### threshold_i and threshold_j are the threshold of word frequency. 
### num_i and num_j are the number of representative words it will write in file. 
find_discriminative_words(top_words_df, threshold_i=10, threshold_j=10, num_i=20, num_j=20, mypath='.')
#larger dataset > set higher threshold


Draw WordClouds based on log odds values

Draw a WordCloud for Reddit

In [None]:

from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
import numpy as np
import re
from PIL import Image
from os import path
import csv
 
### this function will creat wordcloud based on word frequencies and save them into files under plot. 
def makeImage(termDict, outputfile):
    """Draw a red-on-black word cloud from term weights, save it and show it.

    This definition replaces any ``makeImage`` defined earlier in the
    notebook.

    Parameters
    ----------
    termDict :
        Passed unchanged to ``WordCloud.generate_from_frequencies``.
    outputfile : str
        Output path without extension; the cloud is written to
        ``outputfile + ".png"`` and ``outputfile + ".pdf"``.
    """
    cloud_options = {
        "width": 500,
        "height": 300,
        "background_color": "black",
        "colormap": "Reds",
    }
    word_cloud = WordCloud(**cloud_options)

    # size the words by the supplied weights
    word_cloud.generate_from_frequencies(termDict)

    png_path = outputfile + ".png"
    pdf_path = outputfile + ".pdf"
    word_cloud.to_file(png_path)
    word_cloud.to_file(pdf_path)

    # display the figure inline, without axes
    plt.imshow(word_cloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()

In [None]:
### Create ```plot``` folder 
os.makedirs('plot', exist_ok=True)
 
filename = counts_i_name

### We're using one of the utility function! "read_word_count_file"
fullTermsDict = read_word_count_file(f'{mypath}/{filename}_zscore.csv')
outputfile = f"./plot/wordcloud_count_zscore_{filename}"
makeImage(fullTermsDict, outputfile)

Draw a WordCloud for hardwarezone

In [None]:


from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
import numpy as np
import re
from PIL import Image
from os import path
import csv
 
### this function will creat wordcloud based on word frequencies and save them into files under plot. 
def makeImage(termDict, outputfile):
    """Draw a green-on-black word cloud from term weights, save it and show it.

    This definition replaces any ``makeImage`` defined earlier in the
    notebook.

    Parameters
    ----------
    termDict :
        Passed unchanged to ``WordCloud.generate_from_frequencies``.
    outputfile : str
        Output path without extension; one file is written per suffix
        (".png", then ".pdf").
    """
    green_cloud = WordCloud(
        width=500,
        height=300,
        background_color="black",
        colormap="Greens",
    )
    green_cloud.generate_from_frequencies(termDict)

    # write the PNG first, then the PDF
    target_paths = [outputfile + suffix for suffix in (".png", ".pdf")]
    for target in target_paths:
        green_cloud.to_file(target)

    # display the figure inline, without axes
    plt.imshow(green_cloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()
    

In [None]:
### Create ```plot``` folder 
os.makedirs('plot', exist_ok=True)
 
filename = counts_j_name

### We're using one of the utility function! "read_word_count_file"
fullTermsDict = read_word_count_file(f'{mypath}/{filename}_zscore.csv')
outputfile = f"./plot/wordcloud_count_zscore_{filename}"
makeImage(fullTermsDict, outputfile)

3. **Classification**

In [None]:
import numpy as np
import pandas as pd

# Packages for train/test dataset split
from sklearn.model_selection import train_test_split

In [None]:
!pip install datasets transformers[sentencepiece]

In [None]:
import torch
device = "cuda:0" if torch.cuda.is_available() else "cpu"
device

In [None]:
from transformers import pipeline

In [None]:
mypath = '.'
df = pd.read_csv(f'{mypath}/.csv',encoding= 'unicode_escape')
df.info()


In [None]:
df['sentiment'].value_counts()

In [None]:
sentences = list(df['clean_text_1'].iloc[0:200].values)
y_str = list(df['sentiment'].iloc[0:200].values)

In [None]:
len(y_str)
len(sentences)

In [None]:
pred_sentences = list(df['clean_text_1'].values)
len(pred_sentences)

In [None]:
# Encode the string sentiment labels as the integer class ids used for training.
label_to_id = {"positive": 0, "negative": 1, "neutral": 2}

# Every sentence needs exactly one label. Silently skipping an unknown or
# missing label would leave y shorter than `sentences` and misalign the two
# lists, so unexpected values are reported here instead.
unknown_labels = sorted({str(label) for label in y_str if label not in label_to_id})
if unknown_labels:
    raise ValueError(f"Unexpected sentiment label(s): {unknown_labels}")

y = [label_to_id[label] for label in y_str]

Training dataset

In [None]:
sentences_train, sentences_test, y_train, y_test = train_test_split(sentences, y, test_size=0.20, random_state=999)

In [None]:
len(sentences_test)

In [None]:
sentences_train, sentences_val, y_train, y_val = train_test_split(sentences_train, y_train, test_size=.2, random_state=999)

In [None]:
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [None]:
train_encodings = tokenizer(sentences_train, truncation=True, padding=True)
val_encodings = tokenizer(sentences_val, truncation=True, padding=True)
test_encodings = tokenizer(sentences_test, truncation=True, padding=True)

In [None]:
import torch

class myDataset(torch.utils.data.Dataset):
    """Dataset that pairs tokenizer encodings with their labels.

    ``encodings`` is a mapping from field name to a per-example sequence;
    ``labels`` is a sequence with one entry per example.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # the number of examples is the number of labels
        return len(self.labels)

    def __getitem__(self, idx):
        # one tensor per encoding field, plus the label under 'labels'
        item = {}
        for key, val in self.encodings.items():
            item[key] = torch.tensor(val[idx])
        item['labels'] = torch.tensor(self.labels[idx])
        return item




In [None]:
train_dataset = myDataset(train_encodings, y_train)
val_dataset = myDataset(val_encodings, y_val)
test_dataset = myDataset(test_encodings, y_test)

In [None]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

# Hyper-parameters shared by every Trainer created in this notebook.
# NOTE(review): the cells above label 200 rows and split them 80/20 twice,
# which leaves roughly 128 training examples. Assuming one device and no
# gradient accumulation, that is about 8 steps per epoch at batch size 16,
# i.e. around 320 steps over 40 epochs. This is fewer than warmup_steps=500,
# so the learning rate would still be warming up when training ends —
# confirm this is intended.
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=40,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)



In [None]:
# if it's not a binary classification, num_labels should be given! 
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

trainer.train()

In [None]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

def compute_metrics(pred):
    """Compute accuracy and macro-averaged precision, recall and F1.

    ``pred`` must expose ``label_ids`` (the true labels) and ``predictions``
    (scores whose last axis is reduced with argmax to get the predicted
    class). Returns a dict with the keys 'accuracy', 'f1', 'precision' and
    'recall'.
    """
    true_labels = pred.label_ids
    predicted_labels = pred.predictions.argmax(-1)

    # the fourth element of the result (support) is not used
    scores = precision_recall_fscore_support(true_labels, predicted_labels, average='macro')

    metrics = {'accuracy': accuracy_score(true_labels, predicted_labels)}
    metrics['f1'] = scores[2]
    metrics['precision'] = scores[0]
    metrics['recall'] = scores[1]
    return metrics

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

trainer.evaluate()

In [None]:
trainer.save_model()

In [None]:
new_model = DistilBertForSequenceClassification.from_pretrained("./results", num_labels=3)

new_trainer = Trainer(
    model=new_model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

Creating prediction column

In [None]:
# create dataset for prediction
new_encodings = tokenizer(pred_sentences, truncation=True, padding=True)
# create dummy labels with the number of sentences to predict. 
y_new = np.full(len(pred_sentences), 1)
new_dataset = myDataset(new_encodings, y_new)

In [None]:
new_predictions = new_trainer.predict(new_dataset)

In [None]:
# Pick the highest-scoring class for every sentence.
new_preds = np.argmax(new_predictions.predictions, axis=-1)

# Class ids follow the label encoding used for training:
# 0 = positive, 1 = negative, anything else = neutral.
sump = sum(1 for predicted_class in new_preds if predicted_class == 0)
sumneg = sum(1 for predicted_class in new_preds if predicted_class == 1)
sumn = len(new_preds) - sump - sumneg
print(f'There are {sump} positive comments, {sumneg} negative comments and {sumn} neutral comments')

In [None]:
df['Prediction'] = new_preds.tolist()

In [None]:
def add_predictions(pred):
    """Translate a predicted class id into its text label.

    0 maps to 'pro-lgbt', 1 maps to 'anti-lgbt', and every other value maps
    to 'neutral'.
    """
    if pred == 0:
        return 'pro-lgbt'
    if pred == 1:
        return 'anti-lgbt'
    return 'neutral'

In [None]:
df['Prediction'] = df['Prediction'].apply(add_predictions)
df.head()

In [None]:
df['Prediction'].value_counts()

In [None]:
df.to_csv(f'{mypath}/.csv', index=False)