In [9]:
import pandas as pd
import numpy as np
import os
from utils import *

# text preprocessing
from nltk import word_tokenize, TweetTokenizer, sent_tokenize, RegexpTokenizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import nltk
import re
import gensim
from gensim.models import CoherenceModel
import pyLDAvis.gensim
from sklearn.decomposition import LatentDirichletAllocation

# plots and metrics
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

# feature extraction / vectorization
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# classifiers
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline

# save and load a file
import pickle

  and should_run_async(code)


In [11]:
# Read the train and test splits of the emotion dataset from disk.
df_train = pd.read_csv('data/data_train.csv')
df_test = pd.read_csv('data/data_test.csv')

# For each split: the `Text` column is the model input, `Emotion` the label.
X_train, y_train = df_train['Text'], df_train['Emotion']
X_test, y_test = df_test['Text'], df_test['Emotion']

class_names = ['joy', 'sadness', 'anger', 'neutral', 'fear']

# Train rows followed by test rows; each frame keeps its own row index.
data = pd.concat(objs=[df_train, df_test])

  and should_run_async(code)


In [12]:
def preprocess_and_tokenize(data):
    """Clean one raw text string and return its stemmed tokens.

    The clean-up substitutions are applied in order, then the text is
    stripped, tokenised with nltk's ``word_tokenize`` and every token is
    stemmed with a ``PorterStemmer``.

    Args:
        data: A single document as a string.

    Returns:
        A list of stemmed token strings.
    """
    # (pattern, replacement) pairs; order matters because later patterns
    # operate on the output of earlier ones.
    substitutions = (
        ("(<.*?>)", ""),            # html markup
        (r'http\S+', ''),           # urls
        (r"(#[\d\w\.]+)", ''),      # hashtags
        (r"(@[\d\w\.]+)", ''),      # @names
        ("(\\W|\\d)", " "),         # every non-word character and every digit -> space
    )

    text = data
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    # Trim surrounding whitespace before handing the text to the tokenizer.
    tokens = word_tokenize(text.strip())

    stemmer = PorterStemmer()
    return [stemmer.stem(token) for token in tokens]

  and should_run_async(code)


In [13]:
# TF-IDF over unigrams and bigrams; documents are cleaned, tokenised and
# stemmed by preprocess_and_tokenize.
vect = TfidfVectorizer(tokenizer=preprocess_and_tokenize, sublinear_tf=True, norm='l2', ngram_range=(1, 2))

# Learn the vocabulary and IDF weights on the complete corpus (train + test).
# Only the fitted state is needed here, so call fit() instead of
# fit_transform(), whose returned matrix was being thrown away.
# NOTE(review): the corpus includes the test split, so test-set vocabulary and
# document frequencies influence the features - confirm this is intended.
vect.fit(data.Text)

# Transform the training and testing texts into TF-IDF vectors.
X_train_vect = vect.transform(X_train)
X_test_vect = vect.transform(X_test)

  and should_run_async(code)


In [14]:
# Train a linear SVM on the TF-IDF training vectors (fit() returns the
# estimator itself), then label the held-out test vectors.
svc = LinearSVC(tol=1e-05).fit(X_train_vect, y_train)
svm_pred = svc.predict(X_test_vect)

  and should_run_async(code)


In [15]:
# Chain the already-fitted vectorizer and classifier so that raw text can be
# passed straight to predict().
svm_model = Pipeline(steps=[
    ('tfidf', vect),
    ('clf', svc),
])

  and should_run_async(code)


In [16]:
emotions_clf_filename = 'tfidf_svm.sav'

# Write through a context manager so the file handle is flushed and closed
# even if pickling fails; a bare open() left the handle dangling.
with open(emotions_clf_filename, 'wb') as model_file:
    pickle.dump(svm_model, model_file)

  and should_run_async(code)


In [17]:
# Reload the persisted text -> emotion pipeline.
# NOTE: pickle.load can execute arbitrary code contained in the file, so only
# load model files that were produced by this notebook / a trusted source.
# The pipeline's vectorizer was built with tokenizer=preprocess_and_tokenize,
# so that function must be defined before unpickling.
with open('tfidf_svm.sav', 'rb') as model_file:
    emotions_clf = pickle.load(model_file)

# Quick sanity check on a single hand-written message.
message = 'delivery was hour late and my pizza is cold!'
emotions_clf.predict([message])

  and should_run_async(code)


array(['anger'], dtype=object)

In [4]:
def get_policy_data(policy, folders):
    """Collect the comment data for one policy across several data folders.

    In each folder, the first CSV file (in sorted file-name order) whose name
    contains ``policy`` (case-insensitive) is read and reduced to the columns
    ``Comments``, ``Comment Datetime``, ``actionable`` and ``valuable``.
    Folders without a matching file are skipped. The per-folder frames are
    stacked into one frame.

    Args:
        policy: Substring to look for in the CSV file names.
        folders: Iterable of directory paths to search (not recursive).

    Returns:
        A DataFrame holding the four columns above with a fresh 0..n-1 index.

    Raises:
        ValueError: If no folder contains a matching CSV file.
        KeyError: If a matching file lacks one of the required columns.
    """
    columns = ['Comments', 'Comment Datetime', 'actionable', 'valuable']
    needle = policy.lower()

    frames = []
    for folder in folders:
        # os.listdir returns entries in arbitrary order; sorting makes the
        # "first matching file" choice reproducible across machines and runs.
        csv_files = sorted(name for name in os.listdir(folder) if name.endswith('.csv'))
        match = next((name for name in csv_files if needle in name.lower()), None)
        if match is None:
            continue
        frames.append(pd.read_csv(os.path.join(folder, match))[columns])

    if not frames:
        # pd.concat([]) would raise an opaque "No objects to concatenate".
        raise ValueError(
            f"No CSV file matching policy {policy!r} found in any of: {list(folders)}"
        )
    return pd.concat(frames, ignore_index=True)

# pos_tags = ['JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS']
# 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'
def corpus2docs_1(df):
    """Turn the ``Sentences`` column of ``df`` into cleaned token lists.

    Each sentence is tokenised with nltk's ``TweetTokenizer``, lower-cased,
    reduced to purely alphabetic tokens (``^[a-z]+$``) and stripped of the
    words in the module-level ``stop_list``.

    Args:
        df: DataFrame with a ``Sentences`` column of strings.

    Returns:
        A list with one list of token strings per row of ``df``.
    """
    tokenizer = TweetTokenizer()      # build once instead of once per sentence
    stop_words = set(stop_list)       # set membership is O(1) per token

    docs = []
    for comment in df['Sentences']:
        tokens = tokenizer.tokenize(comment)
        # Disabled experiment, kept for reference (needs the pos_tags list):
        #     tags = nltk.pos_tag(tokens)
        #     tokens = [tag[0].lower() for tag in tags if tag[1] in pos_tags]
        lowered = [w.lower() for w in tokens]
        # Filter the *lower-cased* tokens. The previous version filtered the
        # raw tokens, so any word containing a capital letter failed the
        # [a-z] test and was silently dropped.
        docs.append([
            w for w in lowered
            if re.search('^[a-z]+$', w) and w not in stop_words
        ])
    return docs

def docs2vecs(docs, dic):
    """Convert each tokenised document to its bag-of-words vector.

    Args:
        docs: Iterable of documents, each passed as-is to ``dic.doc2bow``.
        dic: Object exposing a ``doc2bow(doc)`` method.

    Returns:
        A list with ``dic.doc2bow(doc)`` for every document, in input order.
    """
    return list(map(dic.doc2bow, docs))

# English stop words from nltk, read by corpus2docs_1 when it filters tokens.
stop_list = stopwords.words('english')

def get_pyLDAvis(model, vecs, dic, topic, num_topics=-1, top_words=10):
    """Save a pyLDAvis visualisation of ``model`` and return its topics.

    The visualisation is written to ``"{topic}_viz.html"`` in the current
    working directory.

    Args:
        model: Topic model passed to ``pyLDAvis.gensim.prepare``; must also
            provide ``show_topics``.
        vecs: Bag-of-words corpus the model is visualised on.
        dic: Dictionary matching ``vecs``.
        topic: Label used as the prefix of the output HTML file name.
        num_topics: Number of topics to return. Passed to
            ``model.show_topics``; for a gensim ``LdaModel`` a negative value
            returns every topic.
        top_words: Number of words listed per topic.

    Returns:
        Whatever ``model.show_topics(num_topics, top_words)`` returns.
    """
    pyLDAvis.enable_notebook()
    # Use the model that was passed in. The previous body read a global
    # `lda_model` (and globals `num_topics` / `top_words`), so the `model`
    # argument was ignored and the call failed on a fresh kernel.
    visual = pyLDAvis.gensim.prepare(model, vecs, dic)
    pyLDAvis.save_html(visual, f"{topic}_viz.html")
    return model.show_topics(num_topics, top_words)

def lda_model_show_topics(num_topics):
    """Print a ruled header for each topic index in ``range(num_topics)``.

    For every index the output is: a line of 100 dashes, the index, another
    line of 100 dashes, and an empty line.
    """
    rule = '-' * 100
    for topic_idx in range(num_topics):
        # One print call; the trailing '' yields the blank line after the block.
        print(rule, topic_idx, rule, '', sep='\n')

def format_topics_sentences(ldamodel, corpus, data, opinions):
    """Tabulate the dominant topic of every document next to its text.

    Args:
        ldamodel: Model supporting ``ldamodel[corpus]`` (yielding, per
            document, a list of ``(topic_num, proportion)`` pairs) and
            ``show_topic(topic_num)`` (yielding ``(word, weight)`` pairs).
        corpus: Bag-of-words corpus to score.
        data: Per-document contents; wrapped in an unnamed ``pd.Series`` and
            therefore appearing as column ``0`` in the result.
        opinions: DataFrame whose ``Sentences`` column is appended as-is.

    Returns:
        DataFrame with columns ``Dominant_Topic``, ``Perc_Contribution``,
        ``Topic_Keywords``, ``0`` and ``Sentences``.

    Note:
        The three parts are joined with ``pd.concat(axis=1)``, which aligns on
        the index. NOTE(review): ``opinions`` presumably needs a 0..n-1 index
        for rows to line up - verify against the caller.
    """
    topic_columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Collect plain tuples and build the frame once. DataFrame.append was
    # removed in pandas 2.0 and re-copied the whole frame on every call.
    records = []
    for doc_topics in ldamodel[corpus]:
        if not doc_topics:
            # No topic reported for this document: nothing is recorded.
            continue
        # Dominant topic = highest proportion (first one wins on ties).
        topic_num, prop_topic = max(doc_topics, key=lambda pair: pair[1])
        topic_keywords = ", ".join(word for word, _ in ldamodel.show_topic(topic_num))
        records.append((int(topic_num), round(prop_topic, 4), topic_keywords))

    # Passing `columns` keeps the three headers even for an empty corpus,
    # which previously crashed on the column assignment.
    sent_topics_df = pd.DataFrame(records, columns=topic_columns)

    # Add original text to the end of the output
    contents = pd.Series(data)
    ori_sentences = opinions['Sentences']
    return pd.concat([sent_topics_df, contents, ori_sentences], axis=1)

def sent_tokenize_then_to_df(df):
    """Explode comments into one row per unique sentence.

    Every value in ``df['Comments']`` is split with nltk's ``sent_tokenize``;
    each sentence is paired with the ``Comment Datetime`` of the comment it
    came from. Duplicate sentences are dropped (first occurrence kept).

    Args:
        df: DataFrame with ``Comments`` and ``Comment Datetime`` columns.

    Returns:
        DataFrame with columns ``index``, ``Sentences`` and
        ``Comment Datetime``. ``index`` holds each sentence's position before
        duplicates were removed (it comes from ``reset_index``).
    """
    # Walk the two columns in parallel. The previous version took the index
    # *label* from iterrows() and used it as an .iloc *position*, which only
    # worked when df carried a default 0..n-1 index.
    records = [
        (sentence, commented_at)
        for comment, commented_at in zip(df['Comments'], df['Comment Datetime'])
        for sentence in sent_tokenize(comment)
    ]

    sentences_df = pd.DataFrame(records, columns=['Sentences', 'Comment Datetime'])
    # reset_index() without drop=True keeps the pre-dedup position as an
    # `index` column, matching the original output.
    return sentences_df.drop_duplicates(subset=['Sentences']).reset_index()

def get_opinions(df, threshold=0.5):
    """Keep the rows whose TextBlob subjectivity score reaches ``threshold``.

    Args:
        df: DataFrame with a ``Textblob_subjectivity_score`` column.
        threshold: Minimum (inclusive) subjectivity score. Defaults to 0.5,
            the value that used to be hard-coded.

    Returns:
        The matching rows of ``df``, with their original index labels.
    """
    return df[df['Textblob_subjectivity_score'] >= threshold]

def classify_emotions(df):
    """Label every sentence in ``df`` with an emotion.

    Predictions come from the module-level ``emotions_clf``, which is called
    with a list of raw sentence strings.

    Args:
        df: DataFrame with a ``Sentences`` column. It is modified in place:
            an ``Emotions`` column is added (or overwritten).

    Returns:
        The same ``df`` object, now carrying the ``Emotions`` column.
    """
    sentences = df['Sentences'].tolist()
    # One batched predict instead of one pipeline call per row; the classifier
    # is not invoked at all when there is nothing to classify.
    emotions = list(emotions_clf.predict(sentences)) if sentences else []
    df['Emotions'] = emotions
    return df

def coherence_plot(coherence_values, start=2, step=1):
    """Plot coherence scores against the number of topics.

    Args:
        coherence_values: Sequence of coherence scores, one per topic count.
        start: Topic count that the first score belongs to. Defaults to 2.
        step: Positive increment in topic count between consecutive scores.
            Defaults to 1.

    With the defaults and 23 scores the x axis is ``range(2, 25)``, i.e. the
    range that used to be hard-coded; other lengths no longer raise a shape
    mismatch.
    """
    x = range(start, start + step * len(coherence_values), step)
    plt.plot(x, coherence_values, label="coherence_values")
    plt.xlabel("Num Topics")
    plt.ylabel("Coherence score")
    # Take the label from the plotted line. The old call passed the bare
    # string ("coherence_values") - not a tuple - which matplotlib iterates
    # character by character, so the legend entry read just "c".
    plt.legend(loc='best')
    plt.show()

  and should_run_async(code)


In [5]:
# Root of the local project checkout. Set the COVID_ANALYSIS_ROOT environment
# variable to run the notebook on another machine; otherwise the original
# author's location is used.
parent_path = os.environ.get(
    'COVID_ANALYSIS_ROOT',
    '/Users/joshuawong/Documents/GitHub/Covid-19-Singapore-Analysis',
)

  and should_run_async(code)


In [6]:
# One cleaned-data folder per social media platform, all under
# `{parent_path}/Data`.
social_media_data_folder_paths = [
    f'{parent_path}/Data/{platform_folder}'
    for platform_folder in (
        'Hardwarezone Data/Cleaned Data',
        'Twitter Data/Cleaned Data/Policies/Combined',
        'Facebook Data/Cleaned Data/Policies/Combined',
        'Instagram Data/Cleaned Data/Policies/Combined',
        'Reddit Data/Cleaned Data/Policies/Combined',
    )
]

  and should_run_async(code)


In [7]:
# Gather the 'circuit breaker' comments from every platform folder.
cb_df = get_policy_data(
    policy='circuit breaker',
    folders=social_media_data_folder_paths,
)

True
True
True
True
True
  and should_run_async(code)


In [18]:
# Runs the same six-step chain twice: once for "valuable" comments and once
# for "actionable" comments.
# NOTE(review): get_valuable_comments, get_actionable_comments,
# get_textblob_sentiment and get_vader_sentiment are not defined in this
# notebook - presumably they come from `from utils import *`; confirm.

# 1. Select the comment subset; reset_index(inplace=True) moves the old index
#    into a column and gives the frame a fresh default index.
cb_valuable_comments = get_valuable_comments(cb_df)
cb_valuable_comments.reset_index(inplace=True)

cb_actionable_comments = get_actionable_comments(cb_df)
cb_actionable_comments.reset_index(inplace=True)

# 2. Split each comment into unique sentences (one row per sentence).
cb_valuable_sent = sent_tokenize_then_to_df(cb_valuable_comments)
cb_actionable_sent = sent_tokenize_then_to_df(cb_actionable_comments)

# 3. TextBlob scoring of the 'Sentences' column.
cb_valuable_polarity = get_textblob_sentiment(cb_valuable_sent, 'Sentences')
cb_actionable_polarity = get_textblob_sentiment(cb_actionable_sent, 'Sentences')

# 4. VADER scoring of the 'Sentences' column.
cb_valuable_sentiment = get_vader_sentiment(cb_valuable_polarity, 'Sentences')
cb_actionable_sentiment = get_vader_sentiment(cb_actionable_polarity, 'Sentences')

# 5. Emotion label per sentence. classify_emotions adds the 'Emotions' column
#    to the frame it receives and returns that same frame, so the *_sentiment
#    and *_emotions names refer to the same object.
cb_valuable_emotions = classify_emotions(cb_valuable_sentiment)
cb_actionable_emotions = classify_emotions(cb_actionable_sentiment)

# 6. Keep only rows with Textblob_subjectivity_score >= 0.5.
cb_valuable_opinions = get_opinions(cb_valuable_emotions)
cb_actionable_opinions = get_opinions(cb_actionable_emotions)

  and should_run_async(code)


In [19]:
# Sentiment subsets: VADER compound score at or beyond +/-0.1.
cb_val_pos_opinions =     cb_valuable_opinions[cb_valuable_opinions['Vader_compound_score'] >= 0.1]
cb_val_neg_opinions =     cb_valuable_opinions[cb_valuable_opinions['Vader_compound_score'] <= -0.1]

# Emotion subsets: exact label match. The previous `<=` compared strings
# lexicographically, so e.g. the "joy" subset also contained every 'anger'
# and 'fear' row, and the "sadness" subset contained all rows.
cb_val_anger_opinions =   cb_valuable_opinions[cb_valuable_opinions['Emotions'] == 'anger']
cb_val_joy_opinions =     cb_valuable_opinions[cb_valuable_opinions['Emotions'] == 'joy']
cb_val_sadness_opinions = cb_valuable_opinions[cb_valuable_opinions['Emotions'] == 'sadness']
cb_val_fear_opinions =    cb_valuable_opinions[cb_valuable_opinions['Emotions'] == 'fear']
cb_val_neutral_opinions = cb_valuable_opinions[cb_valuable_opinions['Emotions'] == 'neutral']

  and should_run_async(code)


In [20]:
# Display the anger subset; as the cell's last expression it is rendered by
# the notebook (shown truncated in the saved output).
cb_val_anger_opinions

  and should_run_async(code)


Unnamed: 0,index,Sentences,Comment Datetime,Textblob_polarity_score,Textblob_subjectivity_score,Vader_neg_score,Vader_neu_score,Vader_pos_score,Vader_compound_score,Emotions
17,17,"For electricity, use lesser aircon should be o...",3/2/21,0.250000,0.500000,0.000,0.879,0.121,0.2960,anger
18,18,"For me, I'm lucky the company already are usin...",3/2/21,0.333333,0.833333,0.000,0.909,0.091,0.4215,anger
27,27,"WFH can cook cheap, simple and healthy meals M...",3/2/21,0.350000,0.514286,0.000,0.895,0.105,0.4019,anger
47,47,tankgunner wrote: don't think it will happen b...,28/2/21,0.050000,1.000000,0.000,1.000,0.000,0.0000,anger
84,85,He voluntarily refunded the amount he had used...,2/4/21,-0.274773,0.569331,0.000,0.827,0.173,0.8720,anger
...,...,...,...,...,...,...,...,...,...,...
12452,13061,What a cruel twist in fate for the sub whom lo...,20/5/20,0.092000,0.718000,0.222,0.512,0.266,0.3182,anger
12453,13062,The people asked for nanny state strict regula...,20/5/20,0.600000,1.000000,0.137,0.766,0.096,-0.2550,anger
12456,13065,You don't have the 1st fucking clue about me o...,20/5/20,-0.600000,0.800000,0.000,1.000,0.000,0.0000,anger
12462,13072,Law: can meet up only in group of 2.,19/5/20,0.000000,1.000000,0.000,1.000,0.000,0.0000,anger


In [22]:
# Persist the emotion-labelled sentences to the working directory, leaving
# the row index out of the file.
cb_valuable_emotions.to_csv(
    path_or_buf="valuable_emotions.csv",
    index=False,
)

  and should_run_async(code)


In [23]:
# Preview the first five labelled sentences.
cb_valuable_emotions.head(n=5)

  and should_run_async(code)


Unnamed: 0,index,Sentences,Comment Datetime,Textblob_polarity_score,Textblob_subjectivity_score,Vader_neg_score,Vader_neu_score,Vader_pos_score,Vader_compound_score,Emotions
0,0,kuma-mon wrote: Because need to handle issues ...,3/2/21,0.061806,0.607176,0.046,0.845,0.108,0.7272,fear
1,1,You are not the only one.,3/2/21,0.0,1.0,0.0,1.0,0.0,0.0,neutral
2,2,Some can even ki siao.,3/2/21,0.0,0.0,0.0,1.0,0.0,0.0,neutral
3,3,"If can, ask to go office to work.",3/2/21,0.0,0.0,0.0,1.0,0.0,0.0,joy
4,4,"If not, then have to make adjustment.",3/2/21,0.0,0.0,0.0,1.0,0.0,0.0,neutral


In [None]:
# NOTE(review): this cell has never been executed (no execution count) and
# looks unfinished:
#   - `px` is not imported in the import cell above (unless it arrives via
#     `from utils import *` - confirm);
#   - `policy` is not defined in any cell shown here;
#   - the frame passed in has only the 'Emotions' and 'Comment Datetime'
#     columns, so y=f"{policy}" only resolves if `policy` names one of them;
#   - the title and axis labels talk about Google Search popularity, not
#     emotions - presumably copied from another notebook.
# TODO: decide what should be plotted (e.g. emotion counts per date) before
# running this cell.
fig = px.line(cb_valuable_emotions[["Emotions", "Comment Datetime"]], 
    x="Comment Datetime", 
    y=f"{policy}", 
    title=f'Popularity of \'{policy}\' in Google Search Singapore',
    labels={"date":"Date", f"{policy}":"Search popularity"})
fig.show()
