In [1]:
import pandas as pd
import numpy as np
import os

# text preprocessing
from nltk import word_tokenize, TweetTokenizer, sent_tokenize, RegexpTokenizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import re
import gensim
from gensim.models import CoherenceModel
import pyLDAvis.gensim
from sklearn.decomposition import LatentDirichletAllocation
from textblob import TextBlob
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# plots and metrics
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

# feature extraction / vectorization
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# classifiers
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline

# save and load a file
import pickle

In [2]:
# Root folder of the project checkout. Set the COVID19_SG_ANALYSIS_ROOT environment variable
# to run the notebook on another machine; otherwise the original local path is used.
parent_path = os.environ.get(
    'COVID19_SG_ANALYSIS_ROOT',
    '/Users/chenjianyu/Library/Mobile Documents/com~apple~CloudDocs/SMU/SMU Module Materials/Y2S2/SMT203 Computational Social Sci/Covid-19-Singapore-Analysis',
)

  and should_run_async(code)


In [3]:
# Cleaned-data folder of each social media platform, relative to the project root.
_cleaned_data_subfolders = (
    'Data/Hardwarezone Data/Cleaned Data',
    'Data/Twitter Data/Cleaned Data/Policies/Combined',
    'Data/Facebook Data/Cleaned Data/Policies/Combined',
    'Data/Instagram Data/Cleaned Data/Policies/Combined',
    'Data/Reddit Data/Cleaned Data/Policies/Combined',
)
social_media_data_folder_paths = [
    f'{parent_path}/{subfolder}' for subfolder in _cleaned_data_subfolders
]

  and should_run_async(code)


In [4]:
def get_opinions(df, threshold=0.5):
    """Keep only the rows subjective enough to count as opinions.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Textblob_subjectivity_score' column
        (the column written by get_textblob_sentiment).
    threshold : float, default 0.5
        Inclusive lower bound on the subjectivity score.

    Returns
    -------
    pandas.DataFrame
        The rows with a score of at least ``threshold``; index labels are preserved.
    """
    return df[df['Textblob_subjectivity_score'] >= threshold]

def classify_emotions(df, clf=None):
    """Label every sentence in ``df['Sentences']`` with a predicted emotion.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Sentences' column of text.
    clf : object with a ``predict(list_of_texts)`` method, optional
        Classifier to use. When omitted, the notebook-level ``emotions_clf`` is used,
        so that variable must already exist in the kernel at call time.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with an 'Emotions' column added in place.
    """
    model = emotions_clf if clf is None else clf
    sentences = list(df['Sentences'])
    # One batched call instead of one predict() per row; an empty batch is skipped
    # because estimators may reject zero-sample input.
    predictions = model.predict(sentences) if sentences else []
    df['Emotions'] = list(predictions)
    return df

def get_policy_data(policy, folders):
    """Collect the labelled comments for one policy across several data folders.

    For each folder, the first CSV file (in sorted name order) whose name contains
    ``policy`` (case-insensitive) is read; folders without a match are skipped.

    Parameters
    ----------
    policy : str
        Keyword looked up in the CSV file names.
    folders : iterable of str
        Directories to search.

    Returns
    -------
    pandas.DataFrame
        The 'Comments', 'Comment Datetime', 'actionable' and 'valuable' columns of all
        matched files, concatenated with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If no folder contains a matching CSV file.
    """
    wanted_columns = ['Comments', 'Comment Datetime', 'actionable', 'valuable']
    keyword = policy.lower()
    frames = []
    for folder in folders:
        # os.listdir order is arbitrary; sorting makes the "first match" reproducible.
        csv_names = sorted(name for name in os.listdir(folder) if name.endswith('.csv'))
        match = next((name for name in csv_names if keyword in name.lower()), None)
        if match is None:
            continue
        frames.append(pd.read_csv(os.path.join(folder, match))[wanted_columns])
        # Progress marker: one line per folder that contributed a file.
        print(True)
    if not frames:
        raise ValueError(
            f'No CSV file matching policy {policy!r} was found in: {list(folders)}'
        )
    return pd.concat(frames, ignore_index=True)

def get_vader_sentiment(df, comment_header):
    """Score every text in ``df[comment_header]`` with VADER.

    The negative, neutral, positive and compound scores are written onto ``df``
    itself as four new columns, and the same frame is returned.
    """
    analyser = SentimentIntensityAnalyzer()
    all_scores = [analyser.polarity_scores(text) for text in df[comment_header]]

    # VADER result key -> name of the column it is stored in (insertion order is kept).
    column_for_key = {
        'neg': 'Vader_neg_score',
        'neu': 'Vader_neu_score',
        'pos': 'Vader_pos_score',
        'compound': 'Vader_compound_score',
    }
    for key, column in column_for_key.items():
        df[column] = [scores[key] for scores in all_scores]

    return df

def get_textblob_sentiment(df, comment_header):
    """Score every text in ``df[comment_header]`` with TextBlob.

    Polarity and subjectivity are written onto ``df`` itself as the
    'Textblob_polarity_score' and 'Textblob_subjectivity_score' columns,
    and the same frame is returned.
    """
    sentiments = [TextBlob(text).sentiment for text in df[comment_header]]

    df['Textblob_polarity_score'] = [sentiment.polarity for sentiment in sentiments]
    df['Textblob_subjectivity_score'] = [sentiment.subjectivity for sentiment in sentiments]

    return df

def get_actionable_comments(df, label = 1):
    """Return the rows of ``df`` whose 'actionable' value equals ``label``."""
    is_actionable = df['actionable'].eq(label)
    return df.loc[is_actionable]

def get_valuable_comments(df, label = 1.0):
    """Return the rows of ``df`` whose 'valuable' value equals ``label``."""
    is_valuable = df['valuable'].eq(label)
    return df.loc[is_valuable]

def sent_tokenize_then_to_df(df):
    """Split every comment into sentences, one output row per unique sentence.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Comments' (text) and 'Comment Datetime' columns.
        Any index is accepted.

    Returns
    -------
    pandas.DataFrame
        Columns 'index', 'Sentences' and 'Comment Datetime'. Duplicate sentences are
        dropped (first occurrence kept); 'index' holds each sentence's position
        before de-duplication.
    """
    sentences_dict = {'Sentences': [], 'Comment Datetime': []}

    # Iterate over the values themselves: using the iterrows() label as a positional
    # .iloc index only works when the index happens to be exactly 0..n-1.
    for comment, comment_datetime in zip(df['Comments'], df['Comment Datetime']):
        for sent in sent_tokenize(comment):
            sentences_dict['Sentences'].append(sent)
            sentences_dict['Comment Datetime'].append(comment_datetime)

    return (
        pd.DataFrame(sentences_dict)
        .drop_duplicates(subset=['Sentences'])
        .reset_index()
    )

  and should_run_async(code)


# Sentiment and emotion analysis per policy (Circuit Breaker first)

In [24]:
# Policy keywords; each is matched against the CSV file names in the data folders.
policies = [
    'circuit breaker',
    'tracetogether',
    'foreign worker',
    'social distancing',
    'economic measures',
    'vaccination',
    'mask',
]

# Output file suffixes, in the order the per-policy frames are exported.
names = [
    'total_comments',
    'valuable_sentiments',
    'actionable_sentiments',
    'valuable_opinions',
    'actionable_opinions',
    'valuable_pos',
    'valuable_neg',
    'valuable_anger',
    'valuable_joy',
    'valuable_sad',
    'valuable_fear',
    'valuable_neu',
]

  and should_run_async(code)


In [33]:
# For every policy: load its comments, score the valuable / actionable sentences and
# export one CSV per entry in `names` (file name: '<policy>_<name>.csv').
# NOTE: classify_emotions needs `emotions_clf`, which is loaded in a cell further down;
# on a fresh kernel run that cell before this one.
for policy in policies:
    cb_df = get_policy_data(policy, social_media_data_folder_paths)

    # reset_index() gives the filtered rows a fresh 0..n-1 index (old labels are kept
    # in an 'index' column) without mutating a slice in place.
    cb_valuable_comments = get_valuable_comments(cb_df).reset_index()
    cb_actionable_comments = get_actionable_comments(cb_df).reset_index()

    cb_valuable_sent = sent_tokenize_then_to_df(cb_valuable_comments)
    cb_actionable_sent = sent_tokenize_then_to_df(cb_actionable_comments)

    cb_valuable_polarity = get_textblob_sentiment(cb_valuable_sent, 'Sentences')
    cb_actionable_polarity = get_textblob_sentiment(cb_actionable_sent, 'Sentences')

    cb_valuable_sentiment = get_vader_sentiment(cb_valuable_polarity, 'Sentences')
    cb_actionable_sentiment = get_vader_sentiment(cb_actionable_polarity, 'Sentences')

    cb_valuable_emotions = classify_emotions(cb_valuable_sentiment)
    cb_actionable_emotions = classify_emotions(cb_actionable_sentiment)

    cb_valuable_opinions = get_opinions(cb_valuable_emotions)
    cb_actionable_opinions = get_opinions(cb_actionable_emotions)

    # Polarity split on the VADER compound score.
    cb_val_pos_opinions = cb_valuable_opinions[cb_valuable_opinions['Vader_compound_score'] >= 0.1]
    cb_val_neg_opinions = cb_valuable_opinions[cb_valuable_opinions['Vader_compound_score'] <= -0.1]

    # Emotion split: exact label match. A `<=` comparison on strings is lexicographic
    # and would mix several emotion labels into one subset.
    cb_val_anger_opinions = cb_valuable_opinions[cb_valuable_opinions['Emotions'] == 'anger']
    cb_val_joy_opinions = cb_valuable_opinions[cb_valuable_opinions['Emotions'] == 'joy']
    cb_val_sadness_opinions = cb_valuable_opinions[cb_valuable_opinions['Emotions'] == 'sadness']
    cb_val_fear_opinions = cb_valuable_opinions[cb_valuable_opinions['Emotions'] == 'fear']
    cb_val_neutral_opinions = cb_valuable_opinions[cb_valuable_opinions['Emotions'] == 'neutral']

    # Frames listed in the same order as `names`, so each is paired with its suffix.
    outputs = [
        cb_df,
        cb_valuable_emotions,
        cb_actionable_emotions,
        cb_valuable_opinions,
        cb_actionable_opinions,
        cb_val_pos_opinions,
        cb_val_neg_opinions,
        cb_val_anger_opinions,
        cb_val_joy_opinions,
        cb_val_sadness_opinions,
        cb_val_fear_opinions,
        cb_val_neutral_opinions,
    ]
    if len(outputs) != len(names):
        raise ValueError(f'expected {len(outputs)} output names, got {len(names)}')
    for name, frame in zip(names, outputs):
        frame.to_csv(f'{policy}_{name}.csv')

  and should_run_async(code)
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True


In [None]:
# Reload the circuit-breaker comments explicitly: after the per-policy loop `cb_df`
# holds the LAST policy processed, not the circuit-breaker data.
get_policy_data('circuit breaker', social_media_data_folder_paths).to_csv('circuit_breaker_total_comments.csv')

In [9]:
# Labelled emotion corpus: one CSV per split, with 'Text' and 'Emotion' columns.
df_train = pd.read_csv('data/data_train.csv')
df_test = pd.read_csv('data/data_test.csv')

# Features (raw text) and targets (emotion label) of each split.
X_train, y_train = df_train['Text'], df_train['Emotion']
X_test, y_test = df_test['Text'], df_test['Emotion']

class_names = ['joy', 'sadness', 'anger', 'neutral', 'fear']

# Both splits stacked; row labels of the two files are kept as they are.
data = pd.concat([df_train, df_test])

# print('size of training set: %s' % (len(df_train['Text'])))
# print('size of validation set: %s' % (len(df_test['Text'])))
# print(data.Emotion.value_counts())

# data.head()

  and should_run_async(code)


In [10]:
def preprocess_and_tokenize(data):
    """Clean one raw text and return its list of stemmed tokens.

    HTML tags, URLs, hashtags and @mentions are removed, every remaining non-word
    character or digit becomes a space, and the result is tokenized with nltk's
    ``word_tokenize`` and stemmed with ``PorterStemmer``.
    """
    # (pattern, replacement) pairs, applied strictly in this order.
    substitutions = (
        ("(<.*?>)", ""),            # html markup
        (r'http\S+', ''),           # urls
        (r"(#[\d\w\.]+)", ''),      # hashtags
        (r"(@[\d\w\.]+)", ''),      # @names
        ("(\\W|\\d)", " "),         # non-word characters and digits
    )
    cleaned = data
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    tokens = word_tokenize(cleaned.strip())

    stemmer = PorterStemmer()
    return [stemmer.stem(token) for token in tokens]

  and should_run_async(code)


In [11]:
# TFIDF, unigrams and bigrams
# TF-IDF over unigrams and bigrams, tokenized with preprocess_and_tokenize.
vect = TfidfVectorizer(tokenizer=preprocess_and_tokenize, sublinear_tf=True, norm='l2', ngram_range=(1, 2))

# Learn vocabulary and IDF weights on the complete corpus (train + test). Only the
# fitted state is needed, so fit() is used and no throw-away matrix is built.
# NOTE(review): fitting on the test split too lets test-set statistics into the
# features, so scores computed on X_test_vect may be optimistic.
vect.fit(data.Text)

# Transform the training and testing texts to vectors.
X_train_vect = vect.transform(X_train)
X_test_vect = vect.transform(X_test)

  and should_run_async(code)


In [12]:
# Linear SVM on the TF-IDF features. random_state is fixed so that re-running the
# notebook trains the same model.
svc = LinearSVC(tol=1e-05, random_state=42)
svc.fit(X_train_vect, y_train)

# Predictions on the held-out split, kept for evaluation.
svm_pred = svc.predict(X_test_vect)

  and should_run_async(code)


In [13]:
# Bundle the fitted vectorizer and classifier so that raw text can be passed to predict().
pipeline_steps = [
    ('tfidf', vect),
    ('clf', svc),
]
svm_model = Pipeline(steps=pipeline_steps)

  and should_run_async(code)


In [14]:
# Persist the trained pipeline; the context manager closes the file even on error.
emotions_clf_filename = 'tfidf_svm.sav'
with open(emotions_clf_filename, 'wb') as model_file:
    pickle.dump(svm_model, model_file)

  and should_run_async(code)


In [15]:
# Load the saved pipeline. pickle.load can execute arbitrary code, so only load a
# file produced by this notebook. Unpickling also requires preprocess_and_tokenize
# to be defined in the kernel, since the vectorizer refers to it.
with open('tfidf_svm.sav', 'rb') as model_file:
    emotions_clf = pickle.load(model_file)

# Smoke test on a single hand-written message.
message = 'delivery was hour late and my pizza is cold!' 
emotions_clf.predict([message])

  and should_run_async(code)


array(['anger'], dtype=object)

# Generating sentence-tokenized versions of all comments for each policy

In [5]:
# Policy keywords to export (defined in this cell so it can run on its own).
policies = [
    'circuit breaker',
    'tracetogether',
    'foreign worker',
    'social distancing',
    'economic measures',
    'vaccination',
    'mask',
]

# One '<policy>_all_comments.csv' per policy, holding every unique sentence.
for policy in policies:
    cb_df = get_policy_data(policy=policy, folders=social_media_data_folder_paths)
    cb_sent = sent_tokenize_then_to_df(df=cb_df)
    cb_sent.to_csv(f'{policy}_all_comments.csv')

  and should_run_async(code)
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
