## Import

In [None]:
import os
import re
import json
import sys
import liwc
import pandas as pd
import numpy as np
from tqdm import tqdm
import multiprocessing
import matplotlib.pyplot as plt
from nltk import word_tokenize
from collections import defaultdict, Counter

In [None]:
from nltk import word_tokenize

In [None]:
from collections import Counter
from collections import OrderedDict

In [None]:
sys.path.append("..")
from utils import preprocess
from utils.tool_simple import get_keywords, list_to_txt, txt_to_list, list_drop_duplicate, many_list_count_sum, list_clean_blank, json_to_dict, dict_to_json
from data.dataset import *

## Data loading

In [None]:
# Directory holding the per-user JSON dictionaries loaded below; with the
# empty string, os.path.join resolves the file names relative to the
# current working directory.
path_dir_data = ""
# NOTE(review): not referenced by any visible cell — presumably a maximum
# tweet length used elsewhere; confirm before relying on it.
max_length_tweet = 28

In [None]:
# Load the raw per-user dictionaries (negative first, then positive) from
# the JSON files stored under ``path_dir_data``.
path_file_negative = os.path.join(path_dir_data, "dict_user_negative.json")
path_file_positive = os.path.join(path_dir_data, "dict_user_positive.json")
dict_negative_all = json_to_dict(path_file_negative)
dict_positive_all = json_to_dict(path_file_positive)

In [None]:
# One row per key of dict_positive_all (orient='index'); the inner dict keys
# become the columns. The bare expression displays the frame in the notebook.
df_data_positive= pd.DataFrame.from_dict(dict_positive_all, orient='index')
df_data_positive

In [None]:
# Convert the frame back to {index: {column: value}} and show how many
# entries it holds.
dict_positive = df_data_positive.to_dict('index')
len(dict_positive)

In [None]:
# Display pandas' summary statistics for the positive-user frame.
df_data_positive.describe()

In [None]:
# One row per key of dict_negative_all (orient='index'); display the
# resulting (rows, columns) shape.
df_data_negative = pd.DataFrame.from_dict(dict_negative_all, orient='index')
df_data_negative.shape

In [None]:
# Convert the frame back to {index: {column: value}} and show how many
# entries it holds.
dict_negative = df_data_negative.to_dict('index')
len(dict_negative)

In [None]:
# Display pandas' summary statistics for the negative-user frame.
df_data_negative.describe()

In [None]:
# List the columns available for the positive users.
df_data_positive.columns

In [None]:
# List the columns available for the negative users.
df_data_negative.columns

## Data preparation

In [None]:
# Build (tweets, label) pairs for every positive user: label 0 for the
# 'tweet_before_covid' column and label 1 for 'tweet_covid_depression'.
# NOTE(review): both columns are parsed with eval(), which executes whatever
# the stored strings contain. If the JSON files can come from an untrusted
# source, switch to ast.literal_eval.
tweets_before = df_data_positive['tweet_before_covid'].apply(eval)
tweets_after = df_data_positive['tweet_covid_depression'].apply(eval)

label_pos_before = [0] * len(tweets_before)
label_pos_after = [1] * len(tweets_after)

list_pos_before = list(zip(tweets_before, label_pos_before))
list_pos_after = list(zip(tweets_after, label_pos_after))

## LIWC feature

In [None]:
# Load the LIWC dictionary: LIWC_parse is called per token below and yields
# the categories of that token; category_names is the second value returned
# by the loader.
LIWC_parse, category_names = liwc.load_token_parser('../resources/LIWC2015_English.dic')

In [None]:
def get_liwc_count(data_label):
    """Count LIWC category hits over all texts of one user.

    Args:
        data_label: ``(texts, label)`` pair; ``texts`` is an iterable of
            strings that are joined with single spaces before tokenisation.

    Returns:
        dict mapping each LIWC category that occurred to its raw count,
        plus two extra keys: ``'word_length'`` and ``'label'``.
    """
    texts, label = data_label
    tokens = word_tokenize(" ".join(texts))

    category_hits = Counter()
    for token in tokens:
        # A token may belong to several categories; each one is counted.
        category_hits.update(LIWC_parse(token))

    features = dict(category_hits)
    # NOTE(review): despite its name, 'word_length' is the total number of
    # category hits, not the number of tokens (tokens without a category add
    # nothing, tokens with several categories add several). Counts are left
    # un-normalised.
    features['word_length'] = sum(features.values())
    features['label'] = label
    return features

In [None]:
# Compute the LIWC counts of the "before" samples in parallel.
num_threads = 32
# Batch the tasks (same spirit as Pool.map's automatic chunking) so that
# inter-process overhead stays low.
chunk_size = max(1, len(list_pos_before) // (num_threads * 4))
# The context manager releases the worker processes even if a task raises.
with multiprocessing.Pool(num_threads) as pool:
    # tqdm wraps the *results* of imap, so the bar advances as work is
    # completed rather than as tasks are handed to the pool. imap yields
    # results in input order, so the list matches list_pos_before.
    pbar_data = tqdm(
        pool.imap(get_liwc_count, list_pos_before, chunksize=chunk_size),
        total=len(list_pos_before),
    )
    list_dict_pos_before = list(pbar_data)

In [None]:
# Compute the LIWC counts of the "after" samples in parallel.
num_threads = 32
# Batch the tasks (same spirit as Pool.map's automatic chunking) so that
# inter-process overhead stays low.
chunk_size = max(1, len(list_pos_after) // (num_threads * 4))
# The context manager releases the worker processes even if a task raises.
with multiprocessing.Pool(num_threads) as pool:
    # tqdm wraps the *results* of imap, so the bar advances as work is
    # completed rather than as tasks are handed to the pool. imap yields
    # results in input order, so the list matches list_pos_after.
    pbar_data = tqdm(
        pool.imap(get_liwc_count, list_pos_after, chunksize=chunk_size),
        total=len(list_pos_after),
    )
    list_dict_pos_after = list(pbar_data)

In [None]:
# Sanity check: number of feature dicts produced for each period.
len(list_dict_pos_before), len(list_dict_pos_after)

In [None]:
# One row per user and one column per LIWC category; a category a user never
# hit is missing from that user's dict and is therefore filled with 0.
df_liwc_pos_before = pd.DataFrame(list_dict_pos_before).fillna(0)
df_liwc_pos_after = pd.DataFrame(list_dict_pos_after).fillna(0)

In [None]:
# Stack the "before" and "after" rows (original row indices are kept).
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
# A category present in only one of the two frames is NaN for the other
# frame's rows after stacking, so fill again to keep every count numeric.
df_liwc_pos = pd.concat([df_liwc_pos_before, df_liwc_pos_after]).fillna(0)
df_liwc_pos.shape

In [None]:
# Show the distinct label values present in the stacked frame.
df_liwc_pos.label.unique()

## Chi2

In [None]:
# label == 0
df_liwc_pos_0 = df_liwc_pos[df_liwc_pos['label']==0]
sum_word_length_0 = sum(df_liwc_pos_0['word_length'])
count_0 = pd.Series(df_liwc_pos_0.drop(columns=['word_length', 'label'], axis=1).apply(sum), name='count_0')
p_0 = pd.Series(df_liwc_pos_0.drop(columns=['word_length', 'label'], axis=1).apply(sum)/sum_word_length_0, name='p_0')
# label == 1
df_liwc_pos_1 = df_liwc_pos[df_liwc_pos['label']==1]
sum_word_length_1 = sum(df_liwc_pos_1['word_length'])
count_1 = pd.Series(df_liwc_pos_1.drop(columns=['word_length', 'label'], axis=1).apply(sum), name='count_1')
p_1 = pd.Series(df_liwc_pos_1.drop(columns=['word_length', 'label'], axis=1).apply(sum)/sum_word_length_1, name='p_1')

# merge
df_count = pd.DataFrame({count_0.name:count_0, p_0.name:p_0, count_1.name:count_1, p_1.name:p_1})
df_count = df_count.sort_values(by='count_1', ascending=False)
df_count

In [None]:
# Display the two denominators used for the proportions above.
sum_word_length_0, sum_word_length_1

In [None]:
# {category: {'count_0': ..., 'p_0': ..., 'count_1': ..., 'p_1': ...}}
dict_category_c_p = df_count.to_dict(orient='index')

### Significance testing

In [None]:
from scipy.stats import chi2_contingency

In [None]:
def significant_occurrence(name, occurrence_before, occurrence_after, count_before, count_after,  correction=False):
    """Chi-square test and odds ratio for one category, "after" vs "before".

    Args:
        name: Category name. Unused; kept so existing callers keep working.
        occurrence_before: Occurrences of the category in the "before" group.
        occurrence_after: Occurrences of the category in the "after" group.
        count_before: Total count of the "before" group.
        count_after: Total count of the "after" group.
        correction: Forwarded to ``scipy.stats.chi2_contingency``.

    Returns:
        Tuple ``(df_chi2, chi2, P, OR, interval_Mie)``:
        the 2x2 table with an extra 'Sum' column, the chi-square statistic,
        its p-value, the odds ratio (after relative to before), and the
        interval ``OR ** (1 -/+ 1.96 / sqrt(chi2))`` rounded to 2 decimals.
        The odds ratio is ``inf`` or ``nan`` when its denominator is zero,
        and the interval degenerates accordingly.

    Raises:
        ValueError: Propagated from ``chi2_contingency`` (e.g. negative
            counts or a zero expected frequency).
    """
    not_after = count_after - occurrence_after
    not_before = count_before - occurrence_before

    # 2x2 contingency table plus the row totals, for inspection by the caller.
    df_chi2 = pd.DataFrame(
        [
            [occurrence_after, not_after, count_after],
            [occurrence_before, not_before, count_before],
        ],
        index=['after', 'before'],
        columns=['occurrence', 'Not', 'Sum'],
    )

    # Hand scipy an explicit float array rather than an object-dtype one.
    observed = df_chi2[['occurrence', 'Not']].to_numpy(dtype=float)
    chi2, P, dof, ex = chi2_contingency(observed, correction=correction)

    # Do the arithmetic in numpy floats so that a zero cell gives inf/nan for
    # every input type instead of ZeroDivisionError for plain Python numbers.
    with np.errstate(divide='ignore', invalid='ignore'):
        numerator = np.float64(occurrence_after) * np.float64(not_before)
        denominator = np.float64(occurrence_before) * np.float64(not_after)
        OR = numerator / denominator
        Mie = 1.96 / np.sqrt(chi2)
        interval_Mie = np.around(
            [np.power(OR, 1 - Mie), np.power(OR, 1 + Mie)], 2
        )

    return df_chi2, chi2, P, OR, interval_Mie

In [None]:
# Run the chi-square test for every LIWC category and collect one result row
# per category.
df_category_chi2 = pd.DataFrame(columns=['category','OR','P','95%CI','Chi2','Count_0','Count_1'])
for category, stats in dict_category_c_p.items():
    occurrence_0, occurrence_1 = stats['count_0'], stats['count_1']
    # Loop-local names: reusing p_0 / p_1 here would overwrite the
    # module-level Series of the same name with plain floats.
    ratio_0, ratio_1 = stats['p_0'], stats['p_1']
    df_chi2, chi2, P, OR, interval_Mie = significant_occurrence(
        category,
        occurrence_before=occurrence_0,
        occurrence_after=occurrence_1,
        count_before=sum_word_length_0,
        count_after=sum_word_length_1,
    )
    # "count (share%)" with one decimal for both groups; the "before" string
    # previously used ':1f' (width 1, six decimals) instead of ':.1f'.
    s_before = f"{int(occurrence_0)} ({ratio_0*100:.1f}%)"
    s_after = f"{int(occurrence_1)} ({ratio_1*100:.1f}%)"
    df_category_chi2.loc[len(df_category_chi2)] = [category, OR, P, interval_Mie, chi2, s_before, s_after]

In [None]:
# Order by p-value (smallest first), then keep only rows with P < 0.0001.
df_category_chi2 = df_category_chi2.sort_values(by='P')
df_category_chi2 = df_category_chi2.loc[df_category_chi2['P'] < 0.0001]
df_category_chi2

In [None]:
# Raw counts and proportions of the categories that passed the filter.
df_count.loc[df_category_chi2['category']]

In [None]:
# Split the significant categories by the direction of the odds ratio:
# OR > 1 sorted from largest down, OR < 1 sorted from smallest up.
or_values = df_category_chi2['OR']
df_category_chi2_more = df_category_chi2.loc[or_values > 1].sort_values(by='OR', ascending=False)
df_category_chi2_less = df_category_chi2.loc[or_values < 1].sort_values(by='OR')

In [None]:
# First 20 rows of the OR > 1 table.
df_category_chi2_more.head(20)

In [None]:
# Export the OR > 1 table without the index column.
df_category_chi2_more.to_excel("df_category_chi2_more.xlsx", index=False)

In [None]:
# First 20 rows of the OR < 1 table.
df_category_chi2_less.head(20)

In [None]:
# Export the OR < 1 table without the index column.
df_category_chi2_less.to_excel("df_category_chi2_less.xlsx", index=False)

### Plot

In [None]:
import plotly.graph_objs as go

In [None]:
# Names of the first 20 categories in df_category_chi2's current row order.
list_category_top = df_category_chi2['category'].iloc[:20].tolist()
list_category_top

In [None]:
# fig = go.Figure(data=[
#     # go.Bar(name='General', x=list_SNOMED_top, y=df_SNOMED_body_count_percent[:22].percent),
#     go.Bar(name='0', x=list_category_top, y=[ dict_category_c_p[category]['p_0'] for category in list_category_top ]),
#     go.Bar(name='1', x=list_category_top, y=[ dict_category_c_p[category]['p_1'] for category in list_category_top ])
# ])
# # Change the bar mode
# fig.update_layout(barmode='group')
# fig.show()

In [None]:
# Grouped bar chart comparing, for each top category, its share in the
# label-0 group against its share in the label-1 group.
color_1 = 'indianred'
color_2 = 'lightsalmon'
fig = go.Figure()
fig.add_trace(go.Bar(
    x=list_category_top,
    # p_0 / p_1 are stored as fractions; the axis is labelled in percent
    # (title and '%' tick suffix), so scale the values by 100.
    y=[ dict_category_c_p[category]['p_0'] * 100 for category in list_category_top ],
    name='0',
    marker_color=color_1,
))
fig.add_trace(go.Bar(
    x=list_category_top,
    y=[ dict_category_c_p[category]['p_1'] * 100 for category in list_category_top ],
    name='1',
    marker_color=color_2,
))

# Here we modify the tickangle of the xaxis, resulting in rotated labels.
fig.update_layout(
    # title='Symptoms Prevalence of Different variants',
    xaxis_tickfont_size=15,
    xaxis_tickangle=-45,
    yaxis=dict(
        # 'titlefont' is deprecated; the font is set through title.font.
        title=dict(text='Prevalence(%)', font=dict(size=16)),
        tickfont_size=14,
        ticksuffix='%',
    ),
    legend=dict(
        x=0.95,
        y=1.0,
        bgcolor='rgba(255, 255, 255, 0)',
        bordercolor='rgba(255, 255, 255, 0)',
        font_size=15
    ),
    barmode='group',
    bargap=0.1, # gap between bars of adjacent location coordinates.
    bargroupgap=0.0, # gap between bars of the same location coordinate.
    height=500,
    width=1000,
    template='simple_white'
)
# fig.write_image(path_dir_figure3+"symptoms_different_variant.svg")
# fig.write_image(path_dir_figure3+"symptoms_different_variant.pdf")
fig.show()