### Imports

In [1]:
import dash
import dash_core_components as dcc
import dash_html_components as html
import pandas as pd
import numpy as np
import datetime
from dash.dependencies import Output, Input, State
import plotly.express as px
import pandas as pd 
import re
import nltk

# classifier
import string
import json
import sklearn

from cleantext import clean as cleantext
from preprocessor.api import clean
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Fetch the NLTK resources this notebook relies on: the stop-word corpus, the
# 'punkt' models behind word_tokenize, and the WordNet corpus that
# WordNetLemmatizer reads when lemmatize() is called.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Load the custom English and Tagalog stop-word lists. The context managers
# close the file handles as soon as the JSON has been parsed; the files are
# read as UTF-8, the encoding JSON documents are expected to use.
with open('assets/stopwords_en.json', encoding='utf-8') as sw_en_json:
    stopwords_en = json.loads(sw_en_json.read())
with open('assets/stopwords-tl.json', encoding='utf-8') as sw_tg_json:
    stopwords_tg = json.loads(sw_tg_json.read())

# Encoder and Vectors
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# Model Classifiers
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
# Performance Metric
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,  plot_confusion_matrix, classification_report, roc_auc_score
import matplotlib.pyplot as plt  
# Cross Validation
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_predict

# Stemming 
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
# Shared stemmer and lemmatizer instances; stem_sentence() applies both to
# every token.
porter = PorterStemmer()
wl = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\21923\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\21923\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Functions
#### Dashboard

In [2]:
def getDataByDate(month, year):
    """Return the rows of the global ``data`` frame matching a month/year filter.

    Either argument may be the string 'all', which disables that filter.
    ``month`` is compared with the 'month' column as given; ``year`` is cast
    to int before being compared with the 'year' column.
    """
    any_month = month == 'all'
    any_year = year == 'all'

    if any_month and any_year:
        return data                                   # no filtering at all
    if any_month:
        return data[data['year'] == int(year)]        # specific year only
    if any_year:
        return data[data['month'] == month]           # specific month only
    # specific month within a specific year
    return data[(data['month'] == month) & (data['year'] == int(year))]

def getOptions(listData):
    """Turn a list of display values into dropdown option dictionaries.

    Each element becomes ``{"label": element, "value": <lower-cased str>}``,
    so the label keeps its original type and casing while the value is always
    a lower-case string.
    """
    return [{"label": item, "value": str(item).lower()} for item in listData]

def data_count(df):
    """Count the non-null 'message' entries for every value of 'class'.

    Returns a two-column frame ('class', 'message') where 'message' holds the
    per-class count.
    """
    per_class = df.groupby('class').count()
    flattened = per_class.reset_index()
    return flattened[['class', 'message']]

def getBarMonths (year):
    """Return the month names (taken from ``MONTHS``) plotted for ``year``.

    'all' -> March..December followed by January..April
    2020  -> March..December
    2021  -> January..April
    Any other value (including the strings '2020' / '2021') yields None.
    """
    from_march = slice(2, None)
    through_april = slice(None, 4)

    if year == 'all':
        return MONTHS[from_march] + MONTHS[through_april]
    elif year == 2020:
        return MONTHS[from_march]
    elif year == 2021:
        return MONTHS[through_april]
    return None
    
def getDataByPlatform(platform, df):
    """Count messages per class, optionally restricted to one platform.

    ``platform`` is the lower-case dropdown value: 'facebook' or 'twitter'
    keep only rows whose 'platform' column is 'Facebook' / 'Twitter'; any
    other value leaves ``df`` unfiltered. Returns a frame with the columns
    'class' and 'message' (the per-class count).
    """
    for dropdown_value, column_value in (('facebook', 'Facebook'),
                                         ('twitter', 'Twitter')):
        if platform == dropdown_value:
            df = df[df['platform'] == column_value]
            break

    counts = df.groupby('class', as_index=False).count()
    return counts[['class', 'message']]

In [3]:
def getDateTitle(month, year):
    """Build the parenthesised date suffix used in chart titles.

    Examples: ('january', 2021) -> '(JAN 2021)', ('january', 'all') -> '(JAN)',
    ('all', 2021) -> '(2021)', ('all', 'all') -> '(Overall)'.
    """
    has_month = month != 'all'
    has_year = year != 'all'

    if has_month and has_year:
        return "({} {})".format(month.upper()[:3], str(year))
    if has_month:
        return "({})".format(month.upper()[:3])
    if has_year:
        return "({})".format(str(year))
    return '(Overall)'

def getTitle(initialTitle, platform, month, year):
    """Fill a two-placeholder title template with platform and date text.

    ``initialTitle`` is formatted with the platform label followed by the
    date suffix produced by ``getDateTitle``. ``platform`` must be 'all',
    'facebook' or 'twitter'; any other value yields None.
    """
    titleDate = getDateTitle(month, year)
    platform_labels = (
        ('all', 'Facebook and Twitter'),
        ('facebook', 'Facebook'),
        ('twitter', 'Twitter'),
    )
    for key, label in platform_labels:
        if platform == key:
            return initialTitle.format(label, titleDate)
    return None

def getLineData(month, year):
    """Per-class message counts of the global ``data`` for one month of one year.

    Returns a frame with 'class', 'message' (the count) and 'month_yr', a
    label such as 'jan 2020' built from the first three letters of the
    lower-cased month and the year.
    """
    in_month = data['month'] == month
    in_year = data['year'] == int(year)

    counts = data[in_month & in_year].groupby('class', as_index=False).count()
    counts['month_yr'] = "{} {}".format(month.lower()[:3], str(year))  # jan 2020
    return counts[['class', 'message', 'month_yr']]

def getLineChart(year):
    """Build the line chart of per-class message counts over time.

    Args:
        year: 'all' to plot every month returned by ``getBarMonths`` for each
            entry of the global ``years`` list, or a single year (anything
            ``int()`` accepts) to plot only that year's months.

    Returns:
        A Plotly Express line figure with 'month_yr' on the x axis, the
        message count on the y axis and one line per class.
    """
    finalTitle = 'Mental Health in Facebook and Twitter '
    if year == 'all':
        finalTitle = finalTitle + '({}-{})'.format(years[0], years[-1])  # (2020-2021)
        # A separate name is used for the loop so the ``year`` parameter is
        # not overwritten while iterating.
        periods = [(month, each_year)
                   for each_year in years
                   for month in getBarMonths(each_year)]
    else:
        finalTitle = finalTitle + '({})'.format(str(year))
        periods = [(month, year) for month in getBarMonths(int(year))]

    frames = [getLineData(month.lower(), period_year)
              for month, period_year in periods]
    line_data = pd.concat(frames, axis=0)
    return px.line(line_data, x='month_yr', y='message', title=finalTitle, color='class')


def getChart(chart, platform, month, year):
    """Build the pie or bar chart of message counts per class.

    Args:
        chart: 'pie' or 'bar'; any other value yields None.
        platform, month, year: dropdown values forwarded to
            ``getDataByDate`` / ``getDataByPlatform`` to select the rows and
            to ``getTitle`` to build the figure title.

    Returns:
        A Plotly Express figure, or None for an unrecognised ``chart``.
    """
    df = getDataByPlatform(platform, getDataByDate(month, year))
    finalTitle = getTitle('No. of statements in {} {} ', platform, month, year)
    if chart == 'pie':
        return px.pie(df, values='message', names='class', title=finalTitle)
    if chart == 'bar':
        return px.bar(df, x='class', y='message', title=finalTitle)
    return None

### Variables and Constants

In [4]:
# Calendar months in order; getBarMonths() slices this list.
MONTHS = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]
# Class labels shown in reports and returned by the predictor.
LABELS = ['Anxiety', 'Depression', 'Normal', 'PTSD']
PLATFORMS = ['All', 'Facebook', 'Twitter']
years = [2020, 2021]

# Every filter offers an 'All' entry ahead of the concrete choices.
CLASS = ['All', *LABELS]
monthsOptions = ['All', *MONTHS]
yearOptions = ['All', *years]

# Dropdown option dictionaries (label / lower-case value) for each filter.
MONTHSOPTIONS, CLASSOPTIONS, yearsOPTIONS, PLATOPTIONS = (
    getOptions(choices)
    for choices in (monthsOptions, CLASS, yearOptions, PLATFORMS)
)

In [5]:
# English stop words consumed by remove_stopwords(). The custom JSON list
# loaded into stopwords_en is active; NLTK's built-in English list is kept on
# the next line as a disabled alternative.
# STOPWORDS =  stopwords.words("english")
STOPWORDS = stopwords_en

#### Classifier functions

In [6]:
def text_lowercase(text):
    """Return ``text`` converted to lower case via its ``lower()`` method."""
    lowered = text.lower()
    return lowered

# remove Filipino stop words
def remove_stopwords_tg(text):
    """Drop every token of ``text`` that appears in ``stopwords_tg``.

    The text is tokenised with ``word_tokenize`` and the surviving tokens are
    re-joined with single spaces. The stop words are placed in a set first so
    each membership test is a hash lookup, matching ``remove_stopwords``.
    """
    stop_words = set(stopwords_tg)
    word_tokens = word_tokenize(text)
    return ' '.join(word for word in word_tokens if word not in stop_words)


# remove English stop words
def remove_stopwords(text):
    """Drop every token of ``text`` that appears in ``STOPWORDS``.

    Tokens come from ``word_tokenize``; the ones that survive are re-joined
    with single spaces.
    """
    excluded = set(STOPWORDS)
    kept = []
    for token in word_tokenize(text):
        if token in excluded:
            continue
        kept.append(token)
    return ' '.join(kept)

# Punctuation patterns applied by clean_text(). All three are raw strings so
# the backslashes reach the regex engine untouched and Python does not warn
# about invalid string escapes such as "\;" or "\s".

# Characters that are deleted outright.
REPLACE_NO_SPACE = re.compile(r"(\;)|(\:)|(\!)|(\')|(\,)|(\")|(\|)|(\()|(\))|(\[)|(\])|(\%)|(\$)|(\>)|(\<)|(\{)|(\})")
# Characters (and a doubled <br> fragment) that are replaced by a space.
REPLACE_WITH_SPACE = re.compile(r"(<br\s/><br\s/?)|(-)|(\+)|(/)|(:)|(\.)|(\&)|(\*)|(\?)|(\.)")
# NOTE(review): anchored with ^...$, so this only matches when the WHOLE
# string is a number (or empty); digits embedded in a longer text are left in
# place. Confirm that this is the intended behaviour.
REPLACE_NUM = re.compile(r'^[0-9]*[.,]{0,1}[0-9]*$')


def stem_sentence(sentence):
    """Stem and then verb-lemmatise every token of ``sentence``.

    Each token from ``word_tokenize`` is passed through the Porter stemmer
    and the result through the WordNet lemmatizer with ``pos='v'``. The
    processed tokens are joined with exactly one space; the earlier version
    appended an extra ' ' after every token before joining, which produced
    runs of three spaces and trailing blanks.
    """
    token_words = word_tokenize(sentence)
    return ' '.join(
        wl.lemmatize(porter.stem(word), pos='v') for word in token_words
    )


# custom function to clean the dataset (combining tweet-preprocessor and regular expressions)
def clean_text(df):
    """Clean every text in the iterable ``df`` and return the results as a list.

    Each text is passed through the tweet-preprocessor ``clean`` function,
    lower-cased, stripped of the REPLACE_NO_SPACE characters, has the
    REPLACE_WITH_SPACE matches turned into spaces and finally has any
    REPLACE_NUM match replaced by a space.
    """
    def _scrub(line):
        lowered = clean(line).lower()                     # tweet-preprocessor pass
        stripped = REPLACE_NO_SPACE.sub("", lowered)      # remove punctuation
        spaced = REPLACE_WITH_SPACE.sub(" ", stripped)
        return REPLACE_NUM.sub(' ', spaced)

    return [_scrub(line) for line in df]

def clean_final (data):
    """Run the full text-cleaning pipeline over an iterable of texts.

    The texts are first cleaned with ``clean_text``; each cleaned text then
    has Filipino stop words removed, English stop words removed, and is
    finally stemmed/lemmatised. Returns the processed texts as a list.
    """
    return [
        stem_sentence(remove_stopwords(remove_stopwords_tg(cleaned)))
        for cleaned in clean_text(data)
    ]

def get_vector (vector):
    """Create the text vectorizer selected by ``vector``.

    'count' -> a default ``CountVectorizer``
    'tf'    -> a ``TfidfVectorizer`` limited to 5000 features
    Any other value yields None.
    """
    if vector == 'tf':
        return TfidfVectorizer(max_features=5000)
    if vector == 'count':
        return CountVectorizer()
    return None


def generate_model(model):
    """Create the unfitted classifier selected by ``model``.

    'svm' -> an RBF-kernel ``svm.SVC`` (C=2, one-vs-rest decision function,
             probability estimates enabled)
    'nb'  -> a default ``MultinomialNB``
    Any other value yields None.
    """
    if model == 'nb':
        return MultinomialNB()
    if model == 'svm':
        svc_options = dict(kernel='rbf', C=2,
                           decision_function_shape='ovr',
                           probability=True)
        return svm.SVC(**svc_options)
    return None

def show_CM (name, model, x_test, y_test):
    """Print a heading and display the confusion matrix of a fitted model.

    The matrix is drawn with ``plot_confusion_matrix`` using ``LABELS`` as the
    display labels and then shown with matplotlib.
    """
    heading = "%s confusion matrix: " % name
    print(heading)
    plot_confusion_matrix(model, x_test, y_test, display_labels=LABELS)
    plt.show()
    
    
def show_classification_report(name, model, x_test, y_test, y_pred):
    """Print the classification report and summary scores for a fitted model.

    Prints, in order: the per-class report (labelled with ``LABELS``), then
    accuracy, weighted precision, weighted recall, weighted F1 and the
    one-vs-rest ROC AUC computed from ``model.predict_proba(x_test)``, all as
    percentages. Each score is computed immediately before it is printed.
    """
    report = classification_report(y_test, y_pred, zero_division=1,
                                   target_names=LABELS)
    print('{} classification report:\n{}'.format(name, report))
    print('=======================================================')

    accuracy = accuracy_score(y_test, y_pred) * 100
    print('Accuracy score\t->\t{:.2f} %'.format(accuracy))

    precision = precision_score(y_test, y_pred, zero_division=1,
                                average='weighted') * 100
    print('Precision score\t->\t{:.2f} %'.format(precision))

    recall = recall_score(y_test, y_pred, average='weighted') * 100
    print('Recall score\t->\t{:.2f} %'.format(recall))

    f1 = f1_score(y_test, y_pred, average='weighted') * 100
    print('F1 score\t->\t{:.2f} %'.format(f1))

    probabilities = model.predict_proba(x_test)
    roc_auc = roc_auc_score(y_test, probabilities, multi_class='ovr') * 100
    print('ROC AUC score\t->\t{:.2f} %'.format(roc_auc))
    
    
# 10-fold, shuffled, seeded splitter shared by every cross-validation run.
cv = KFold(n_splits=10, random_state=42, shuffle=True)
def generate_cross_validation(classifier, model_name, X, y):
    """Cross-validate ``classifier`` and tabulate its scores.

    Out-of-fold predictions are obtained with ``cross_val_predict`` over the
    module-level ``cv`` splitter and compared against ``y``.

    Args:
        classifier: the estimator to evaluate.
        model_name: name printed in the report heading.
        X: feature matrix.
        y: encoded labels; cast to int before scoring.

    Returns:
        A DataFrame indexed by 'Accuracy', 'Precision', 'Recall' and
        'f1_score' with a single 'Scores (%)' column (values in percent).
    """
    y_true = y.astype(int)
    y_prediction = cross_val_predict(classifier, X, y, cv=cv).astype(int)

    # scikit-learn metrics take (y_true, y_pred) in that order. Passing them
    # the other way round swaps precision with recall and weights the
    # averages by the predicted rather than the true class frequencies.
    weighted = dict(zero_division=1, average='weighted')
    accuracy = accuracy_score(y_true, y_prediction)
    precision = precision_score(y_true, y_prediction, **weighted)
    recall = recall_score(y_true, y_prediction, **weighted)
    f1 = f1_score(y_true, y_prediction, **weighted)

    print('--------------{} PERFORMANCE METRIC SCORES----------------\n'.format(model_name))
    model_df = pd.DataFrame(data={'Scores (%)': [accuracy, precision, recall, f1]},
                            index=['Accuracy', 'Precision', 'Recall', 'f1_score'])
    model_df['Scores (%)'] = model_df['Scores (%)'] * 100
    return model_df

def preprocess(dataframe, lang):
    """Prepare a labelled dataset for vectorisation.

    Steps:
      1. drop rows whose 'class' is missing or a single blank,
      2. expand the one-letter class codes (a/d/p/n) to full names,
      3. lower-case the 'month' column (values are converted to str first),
      4. run the 'message' column through ``clean_final``.

    Args:
        dataframe: frame with at least 'class', 'month' and 'message'.
        lang: currently unused. Language-specific stop-word/stemming
            selection and the class-balancing experiments (truncating the
            classes to a fixed number of rows) are disabled.

    Returns:
        A new DataFrame; ``dataframe`` itself is not modified.
    """
    class_names = {
        'a': 'anxiety',
        'd': 'depression',
        'p': 'ptsd',
        'n': 'normal',
    }

    df = dataframe.dropna(subset=['class'])
    # .copy() gives an independent frame, so the assignments below do not
    # write into a filtered slice (pandas' SettingWithCopyWarning).
    df = df[df['class'] != ' '].copy()

    # prepare classification
    for code, full_name in class_names.items():
        df.loc[df['class'] == code, 'class'] = full_name

    # convert month to lower case; the vectorised form also copes with an
    # empty frame, where a row-wise apply would not return a column
    df['month'] = df['month'].astype(str).str.lower()

    df['message'] = clean_final(df['message'])
    return df


def analyze(model_name, vector, data, lang):
    """Preprocess ``data``, train the selected classifier and report on it.

    Args:
        model_name: classifier key understood by ``generate_model``.
        vector: vectorizer key understood by ``get_vector``.
        data: raw labelled DataFrame handed to ``preprocess``.
        lang: forwarded to ``preprocess``.

    Returns:
        dict with the fitted model ('cls'), its name ('name'), the full
        feature matrix ('X'), the encoded labels ('y') and the fitted
        vectorizer ('vectorizer').
    """
    # preprocess data
    df_pre = preprocess(data, lang)
    msg = df_pre['message']

    # Label encoding and vectorisation. fit_transform() already fits, so the
    # separate fit() calls that used to precede it were redundant.
    # NOTE(review): the vectorizer is fitted on the whole corpus before the
    # train/test split, so test-set vocabulary influences the features.
    le = LabelEncoder()
    vectorizer = get_vector(vector)
    X = vectorizer.fit_transform(msg)
    y = le.fit_transform(df_pre['class'])

    x_train, x_test, y_train, y_test = train_test_split(X, y, stratify=y,
                                                        random_state=42,
                                                        test_size=0.3)
    # create and train the model; the probability estimates that used to be
    # computed here were never used (the report computes its own)
    model = generate_model(model_name)
    model.fit(x_train, y_train)
    # predict
    prediction = model.predict(x_test)
    # show confusion matrix
#     show_CM(model_name, model, x_test, y_test)
    # show classification report
    show_classification_report(model_name, model, x_test, y_test, prediction)
    # return model, model name and the fitted preprocessing artefacts
    return {'cls': model, 'name': model_name, 'X': X, 'y': y, 'vectorizer': vectorizer}

### Read file

In [7]:
# Load the classified posts that feed the dashboard charts.
data = pd.read_csv("data_classified.csv")

"""Another approach of reading all files"""
# file_list = ['a', 'd', 'p', 'n']
# files = []
# for name in file_list:
#     file = pd.read_csv('data_{}.csv'.format(name))
#     files.append(file)

# data = pd.concat(files, axis = 0)


# Expand the one-letter class codes to their full names.
for short_code, full_label in (('a', 'anxiety'), ('p', 'ptsd'),
                               ('d', 'depression'), ('n', 'normal')):
    data.loc[data['class'] == short_code, 'class'] = full_label

data['month'] = data['month'].str.lower()
# Drop the columns the dashboard does not use, then discard rows that cannot
# be placed on the timeline.
data = (
    data
    .drop(columns=['username', 'created_time'])
    .dropna(subset=['month', 'year'])
)

def getMonthYr (row):
    """Build a label such as 'Jan 2021' from a row's 'month' and 'year'.

    Only the first three characters of the month are kept and only the first
    of them is upper-cased; the year is converted to int.
    """
    abbreviation = row['month'][:3]
    label_month = abbreviation[0].upper() + abbreviation[1:]
    return "{} {}".format(label_month, int(row['year']))
    
# Add a display label such as "Jan 2021" to every row (see getMonthYr).
data['month_yr'] = data.apply(getMonthYr, axis=1) 
# Last expression of the cell: renders the prepared frame as the cell output.
data

Unnamed: 0.1,Unnamed: 0,lang,class,message,month,year,platform,month_yr
0,0,taglish,anxiety,Panic disorder is scary 🥺 I experienced it mys...,january,2021.0,Twitter,Jan 2021
1,1,filipino,anxiety,pandemic anxiety talaga parang iniisip ko pa l...,may,2020.0,Twitter,May 2020
2,2,filipino,normal,buti satin wala current sa baguio meron nanama...,april,2020.0,Facebook,Apr 2020
3,3,taglish,normal,Much that I want to say wow... But Wala pa tay...,april,2020.0,Facebook,Apr 2020
4,4,filipino,anxiety,Kung mag extend naman po pwede naman po pauwii...,april,2020.0,Facebook,Apr 2020
...,...,...,...,...,...,...,...,...
3270,2012,filipino,normal,gusto ko lang naman matapos na tong pandemic k...,march,2021.0,Twitter,Mar 2021
3271,2013,taglish,normal,the fact that sinira ng pandemic na to mental ...,march,2021.0,Twitter,Mar 2021
3272,2014,filipino,normal,"sobrang miss ko na mga kaibigan ko, grabe iba ...",march,2021.0,Twitter,Mar 2021
3273,2015,filipino,normal,"Naalala ko lang, holy week sa Pinas ngayon at ...",april,2020.0,Twitter,Apr 2020


### Layout

In [8]:
# Google Fonts stylesheet loaded alongside the local assets.
external_stylesheets = [
    {"href": "https://fonts.googleapis.com/css2?family=Montserrat&display=swap",
     "rel": "stylesheet"},
]

app = dash.Dash(__name__, external_stylesheets=external_stylesheets)
app.title = "Philippine Mental Health Amid COVID-19 Pandemic"


def _filter_dropdown(title, dropdown_id, options):
    """Return one titled, non-clearable dropdown of the filter menu, preset to 'all'."""
    return html.Div(
        children=[
            html.Div(children=title, className="menu-title"),
            dcc.Dropdown(
                id=dropdown_id,
                options=options,
                value="all",
                clearable=False,
                className="dropdown",
            ),
        ],
        className="filters",
    )


def _graph_card(graph_id):
    """Return a card holding an initially empty graph with the given id."""
    return html.Div(children=dcc.Graph(id=graph_id, figure={}), className="card")


# Page header: logo, title and tagline.
_header_section = html.Div(
    children=[
        html.Div(html.Img(src=app.get_asset_url('applogo.png'), className="logo")),
        html.H1(children="Philippine Mental Health Amid COVID-19 Pandemic",
                className="header-title"),
        html.P(children="A textual analysis approach on Facebook and Twitter data",
               className="header-description"),
    ],
    className="header",
)

# Month / year / platform filters driving the charts.
_filter_section = html.Div(
    children=[
        html.H3("Filter by:"),
        _filter_dropdown("Month", "month_dd", MONTHSOPTIONS),
        _filter_dropdown("Year", "year_dd", yearsOPTIONS),
        _filter_dropdown("Platform", "platform_dd", PLATOPTIONS),
    ],
    className="menu",
)

# The three charts filled in by the callback.
_chart_section = html.Div(
    children=[_graph_card(graph_id)
              for graph_id in ("bar_chart", "pie_chart", "line_chart")],
    className="wrapper",
)

# Free-text prediction card: text area, submit button and result line.
_prediction_section = html.Div(
    children=[
        html.Div(
            className='flex-con',
            children=[
                html.H2("Predict Text", style={'fontSize': 18}),
                html.Button('Upload File', id='upload-file', n_clicks=0),
            ],
        ),
        html.Div(
            children=dcc.Textarea(
                id='textarea',
                value='',
                style={'width': 900, 'height': 100},
            ),
        ),
        html.Br(),
        html.Div(
            className='row',
            children=html.Button('Submit',
                                 id='textarea-state-example-button',
                                 n_clicks=0),
        ),
        html.Br(),
        html.Div(id='textarea-state-example-output',
                 style={'whiteSpace': 'pre-line'}),
    ],
    className="textCard",
)

_footer_section = html.Div(
    children=[
        html.Div(
            children=html.H4(children="© Data Diggers 2021", className="footer"),
        ),
    ],
)

app.layout = html.Div(
    children=[
        _header_section,
        _filter_section,
        _chart_section,
        _prediction_section,
        _footer_section,
    ])

### Predictive Model

In [9]:
# Train the classifier used by the dashboard's text predictor: an SVM
# ('svm') on TF-IDF features ('tf'), fitted on the 98-dataset.csv file.
# analyze() prints the classification report shown below; the empty string is
# passed for its `lang` argument.
data_98 = pd.read_csv('98-dataset.csv')
svmTF = analyze('svm', 'tf', data_98, '')
# Optional 10-fold cross-validation of the same model (disabled).
# generate_cross_validation(svmTF['cls'], svmTF['name'], svmTF['X'], svmTF['y'])

svm classification report:
              precision    recall  f1-score   support

     Anxiety       0.50      0.30      0.37        30
  Depression       0.65      0.76      0.70        29
      Normal       0.62      0.67      0.65        30
        PTSD       0.68      0.79      0.73        29

    accuracy                           0.63       118
   macro avg       0.61      0.63      0.61       118
weighted avg       0.61      0.63      0.61       118

Accuracy score	->	62.71 %
Precision score	->	61.13 %
Recall score	->	62.71 %
F1 score	->	61.05 %
ROC AUC score	->	84.24 %


In [10]:
# Fitted model and fitted vectorizer returned by analyze(); predictText()
# uses this pair to classify new text.
CLASSIFIER = svmTF['cls']
VECTORIZER = svmTF['vectorizer']

def getMentalHealthCondition(result):
    """Translate an encoded class (0-3) into its display name.

    0 -> 'Anxiety', 1 -> 'Depression', 2 -> 'Normal', 3 -> 'PTSD';
    any other value yields None.
    """
    condition_names = ('Anxiety', 'Depression', 'Normal', 'PTSD')
    for code, condition in enumerate(condition_names):
        if result == code:
            return condition
    return None
    
def predictText(text):
    """Classify a piece of text with the trained model.

    Args:
        text: either a list of strings (as passed by the dashboard callback)
            or a single string. A bare string is wrapped in a list first,
            because ``clean_final`` iterates over its argument and would
            otherwise process the string one character at a time.

    Returns:
        The display name of the class predicted for the first text.
    """
    if isinstance(text, str):
        text = [text]
    pre_text = clean_final(text)
    test = VECTORIZER.transform(pre_text)
    result = CLASSIFIER.predict(test)[0]
    return getMentalHealthCondition(result)

### Application

In [None]:
@app.callback(
    [Output("line_chart", "figure"),
     Output("pie_chart", "figure"),
     Output("bar_chart", "figure"),
     Output("textarea-state-example-output", "children")],
    [Input("platform_dd", "value"),
     Input("month_dd", "value"),
     Input("year_dd", "value"),
     Input("textarea-state-example-button", "n_clicks")],
    [State('textarea', 'value')]
)
def update_graph(platform, month, year, n_clicks, value):
    """Refresh the three charts and the classification line.

    Runs whenever a filter dropdown changes or the Submit button is clicked.
    ``value`` is the current content of the text area; once the button has
    been clicked at least once, the stripped text is classified (or replaced
    by a prompt when it is empty). Returns the line, pie and bar figures
    followed by the classification message.
    """
    palette = ['teal', 'mediumturquoise', 'aquamarine', 'lightseagreen']
    slice_outline = dict(color='#000000', width=2)

    # pie
    pie_fig = getChart('pie', platform, month, year)
    pie_fig.update_traces(textfont_size=20,
                          marker=dict(colors=palette, line=slice_outline))
    # bar
    bar_fig = getChart('bar', platform, month, year)
    bar_fig.update_traces(marker_color=palette)
    # line
    line_fig = getLineChart(year)
    line_fig.update_traces(mode='markers+lines')

    # text
    if n_clicks > 0:
        submitted = value.strip()
        value = predictText([submitted]) if len(submitted) != 0 else 'Provide text here.'

    message = 'Classification: {}'.format(value)
    return line_fig, pie_fig, bar_fig, message

# Start the Dash server (debug mode off) when this module is run as the main
# program, which is the case for a notebook kernel.
if __name__ == "__main__":
    app.run_server(debug=False)

Dash is running on http://127.0.0.1:8050/

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


 * Running on http://127.0.0.1:8050/ (Press CTRL+C to quit)
127.0.0.1 - - [25/May/2021 17:33:22] "[37mGET / HTTP/1.1[0m" 200 -
127.0.0.1 - - [25/May/2021 17:33:23] "[37mGET /_dash-layout HTTP/1.1[0m" 200 -
127.0.0.1 - - [25/May/2021 17:33:23] "[37mGET /_dash-dependencies HTTP/1.1[0m" 200 -
127.0.0.1 - - [25/May/2021 17:33:24] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [25/May/2021 17:34:05] "[37mGET / HTTP/1.1[0m" 200 -
127.0.0.1 - - [25/May/2021 17:34:05] "[37mGET /assets/style.css?m=1621935243.7434294 HTTP/1.1[0m" 200 -
127.0.0.1 - - [25/May/2021 17:34:05] "[37mGET /_dash-layout HTTP/1.1[0m" 200 -
127.0.0.1 - - [25/May/2021 17:34:05] "[37mGET /_dash-dependencies HTTP/1.1[0m" 200 -
127.0.0.1 - - [25/May/2021 17:34:05] "[37mGET /assets/favicon.ico?m=1621672541.3951375 HTTP/1.1[0m" 200 -
127.0.0.1 - - [25/May/2021 17:34:06] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
127.0.0.1 - - [25/May/2021 17:34:16] "[37mGET / HTTP/1.1[0m" 200 -