## Import

In [None]:
import os
import re
import json
import liwc
import numpy as np
from tqdm import tqdm
import multiprocessing
import matplotlib.pyplot as plt
from nltk import word_tokenize
from collections import defaultdict, Counter

In [None]:
from nltk import word_tokenize

In [None]:
from collections import Counter
from collections import OrderedDict

In [None]:
import pandas as pd

In [None]:
# import modin.pandas as pd
# import ray
# ray.init(num_cpus=16)

In [None]:
# `sys` is needed for the path tweak below and is not imported anywhere else.
import sys

# Put the parent directory on the import path so the project-local `utils`
# and `data` packages can be imported from this notebook's location.
sys.path.append("..")
from utils import preprocess
from utils.tool_simple import get_keywords, list_to_txt, txt_to_list, list_drop_duplicate, many_list_count_sum, list_clean_blank, json_to_dict, dict_to_json
from data.dataset import *

## Data loading

In [None]:
# Run configuration.
# num_case: a string value makes the loading cell use every positive user;
#           a number caps the amount of positive users instead.
# max_length_tweet: forwarded unchanged to process_data_merge.
num_case = 'max'
max_length_tweet = 'max'
path_dir_data = ""
path_dir_record = ""

# makedirs(exist_ok=True) also creates missing parent directories and does not
# fail when the directory already exists. An empty path means "current
# directory", which needs no creation (and would make mkdir/makedirs raise).
if path_dir_record:
    os.makedirs(path_dir_record, exist_ok=True)

# os.path.join inserts the separator when path_dir_record lacks a trailing
# slash, so the log file always lands inside the record directory.
logger = init_logger(os.path.join(path_dir_record, 'liwc.log'))

In [None]:
# Load the per-user dictionaries of both classes from path_dir_data.
path_json_negative = os.path.join(path_dir_data, "dict_user_negative.json")
dict_negative_all = json_to_dict(path_json_negative)
path_json_positive = os.path.join(path_dir_data, "dict_user_positive.json")
dict_positive_all = json_to_dict(path_json_positive)
logger.info(f"Loading negative: {len(dict_negative_all)}")
logger.info(f"Loading positive: {len(dict_positive_all)}")

# A string setting (e.g. 'max') selects every positive user; anything else is
# taken as an explicit number of positive users.
if isinstance(num_case, str):
    num_case = len(dict_positive_all)
else:
    num_case = int(num_case)

# Scale the negative sample so the negative/positive ratio of the full data is
# kept. NOTE(review): true division makes this a float - confirm that
# get_dict_part accepts a non-integer size.
size_negative = num_case * len(dict_negative_all) / len(dict_positive_all)
dict_negative = get_dict_part(dict_negative_all, size_negative, shuffle=False)
dict_positive = get_dict_part(dict_positive_all, num_case, shuffle=False)

# Merge each user's tweets into one sample per user (as done by
# process_data_merge) and get the per-class sample counts back.
(
    num_negative,
    num_positive,
    list_data_negative,
    list_data_positive,
) = process_data_merge(dict_negative, dict_positive, max_length_tweet=max_length_tweet)

In [None]:
# Imported explicitly so this cell does not depend on the wildcard import of
# data.dataset happening to expose train_test_split.
from sklearn.model_selection import train_test_split

# Negative samples come first, so the labels are num_negative zeros followed by
# num_positive ones.
list_data = list_data_negative + list_data_positive
list_label = [0]*num_negative + [1]*num_positive
# 70/30 split with a fixed seed for reproducibility.
data_train, data_test, label_train, label_test = train_test_split(list_data, list_label, test_size=0.3, random_state=42)

In [None]:
# Labels are 0/1, so their sum is the number of positive samples.
num_positive_train = sum(label_train)
num_negative_train = len(label_train) - num_positive_train
logger.info(
    f"Training Size: {len(data_train)}, {num_positive_train} positive and {num_negative_train} negative"
)

num_positive_test = sum(label_test)
num_negative_test = len(label_test) - num_positive_test
logger.info(
    f"Testing Size: {len(data_test)}, {num_positive_test} positive and {num_negative_test} negative"
)

## LIWC test

In [None]:
# Load the LIWC 2015 English dictionary. LIWC_parse is called with a single
# token and iterated to obtain that token's categories (see the cells below);
# category_names is the second value returned by the loader.
LIWC_parse, category_names = liwc.load_token_parser('../resources/LIWC2015_English.dic')

In [None]:
list(LIWC_parse("accept"))

In [None]:
# def tokenize(text):
#     # you may want to use a smarter tokenizer
#     for match in re.finditer(r'\w+', text, re.UNICODE):
#         yield match.group(0)

In [None]:
# Lower-cased sample text used to sanity-check tokenization and LIWC parsing.
gettysburg = '''Four score and seven years ago our fathers brought forth on
  this continent a new nation, conceived in liberty, and dedicated to the
  proposition that all men are created equal. Now we are engaged in a great
  civil war, testing whether that nation, or any nation so conceived and so
  dedicated, can long endure. We are met on a great battlefield of that war.
  We have come to dedicate a portion of that field, as a final resting place
  for those who here gave their lives that that nation might live. It is
  altogether fitting and proper that we should do this.'''.lower()
  
# Tokenize with NLTK and preview the first five tokens.
gettysburg_tokens = word_tokenize(gettysburg)
gettysburg_tokens[:5]

In [None]:
# Tally how often each LIWC category is hit across the sample tokens; a token
# contributes once to every category LIWC_parse yields for it.
gettysburg_counts = Counter()
for sample_token in gettysburg_tokens:
    for sample_category in LIWC_parse(sample_token):
        gettysburg_counts[sample_category] += 1
# logger.info(gettysburg_counts)
#=> Counter({'funct': 58, 'pronoun': 18, 'cogmech': 17, ...})
dict(gettysburg_counts)

## LIWC feature

In [None]:
def get_liwc_count(data_label):
    """Count LIWC category hits for one sample.

    Args:
        data_label: A ``(text, label)`` pair. The text is tokenized with
            ``word_tokenize`` and every token is looked up with ``LIWC_parse``.

    Returns:
        dict: One entry per LIWC category that occurred (raw, un-normalized
        hit counts), plus two extra keys:

        * ``'word_length'`` - the sum of all category counts. A token adds
          one for every category it matches and nothing when it matches
          none, so this is the number of category hits, not the number of
          tokens.
        * ``'label'`` - the label passed in, unchanged.
    """
    text, label = data_label

    category_hits = Counter()
    for token in word_tokenize(text):
        for category in LIWC_parse(token):
            category_hits[category] += 1

    features = dict(category_hits)
    # Computed before 'label' is added so the label never enters the total.
    features['word_length'] = sum(category_hits.values())
    features['label'] = label
    return features

In [None]:
# Pair every sample with its label so a single argument can be handed to
# get_liwc_count by the worker pool.
list_data_label_train = list(zip(data_train, label_train))
list_data_label_test = list(zip(data_test, label_test))

In [None]:
# Number of worker *processes* (multiprocessing.Pool does not use threads; the
# variable name is kept as is).
num_threads = 32
# Batch tasks the way pool.map() would by default; imap() otherwise sends one
# item per task, which is slow for many small samples.
chunksize_train = max(1, len(list_data_label_train) // (num_threads * 4))
# The context manager tears the pool down even if a worker raises.
with multiprocessing.Pool(num_threads) as pool:
    # Wrap the *results* of imap() in tqdm: a bar wrapped around the input of
    # pool.map() advances as items are dispatched, not as they are finished.
    # imap() yields results in input order, so rows stay aligned with labels.
    pbar_data = tqdm(
        pool.imap(get_liwc_count, list_data_label_train, chunksize=chunksize_train),
        total=len(list_data_label_train),
    )
    list_dict_liwc_counts_train = list(pbar_data)

In [None]:
# Number of worker *processes* (multiprocessing.Pool does not use threads; the
# variable name is kept as is).
num_threads = 32
# Batch tasks the way pool.map() would by default; imap() otherwise sends one
# item per task, which is slow for many small samples.
chunksize_test = max(1, len(list_data_label_test) // (num_threads * 4))
# The context manager tears the pool down even if a worker raises.
with multiprocessing.Pool(num_threads) as pool:
    # Wrap the *results* of imap() in tqdm: a bar wrapped around the input of
    # pool.map() advances as items are dispatched, not as they are finished.
    # imap() yields results in input order, so rows stay aligned with labels.
    pbar_data = tqdm(
        pool.imap(get_liwc_count, list_data_label_test, chunksize=chunksize_test),
        total=len(list_data_label_test),
    )
    list_dict_liwc_counts_test = list(pbar_data)

In [None]:
len(list_dict_liwc_counts_train), len(list_dict_liwc_counts_test)

In [None]:
# One row per sample, one column per LIWC category (plus 'word_length' and
# 'label'). A category a sample never hit is missing from its dict, which
# pandas turns into NaN - replace those with a count of 0.
df_liwc_train = pd.DataFrame(list_dict_liwc_counts_train).fillna(0)
df_liwc_test = pd.DataFrame(list_dict_liwc_counts_test).fillna(0)

In [None]:
# Stack the test rows below the training rows for the corpus-wide chi-square
# analysis. DataFrame.append() was removed in pandas 2.0; pd.concat() is the
# replacement. Columns that exist in only one of the two frames come out as
# NaN after stacking, so they are zero-filled again.
# NOTE: from here on df_liwc_train holds train + test rows (training rows
# first); the name is kept because the following cells refer to it.
df_liwc_train = pd.concat([df_liwc_train, df_liwc_test], ignore_index=True).fillna(0)

## Chi2

In [None]:
# Per-class totals for every LIWC category.
#   count_k : summed hits of the category over all samples with label k
#   p_k     : count_k divided by the summed 'word_length' of label k
# Each class is aggregated once with the vectorized DataFrame.sum() instead of
# dropping the helper columns and applying the builtin sum twice.

# label == 0
df_liwc_train_0 = df_liwc_train[df_liwc_train['label'] == 0]
sum_word_length_0 = df_liwc_train_0['word_length'].sum()
count_0 = df_liwc_train_0.drop(columns=['word_length', 'label']).sum().rename('count_0')
p_0 = (count_0 / sum_word_length_0).rename('p_0')
# label == 1
df_liwc_train_1 = df_liwc_train[df_liwc_train['label'] == 1]
sum_word_length_1 = df_liwc_train_1['word_length'].sum()
count_1 = df_liwc_train_1.drop(columns=['word_length', 'label']).sum().rename('count_1')
p_1 = (count_1 / sum_word_length_1).rename('p_1')

# merge: one row per category, most frequent in class 1 first
df_count = pd.DataFrame({count_0.name: count_0, p_0.name: p_0, count_1.name: count_1, p_1.name: p_1})
df_count = df_count.sort_values(by='count_1', ascending=False)
df_count

In [None]:
sum_word_length_0, sum_word_length_1

In [None]:
dict_category_c_p = df_count.to_dict(orient='index')

### Significant

In [None]:
from scipy.stats import chi2_contingency

In [None]:
def significant_occurrence(name, occurrence_before, occurrence_after, count_before, count_after, correction=False):
    """Chi-square test and odds ratio for one item in two groups.

    Builds the 2x2 table ``[[occurrence_after, count_after - occurrence_after],
    [occurrence_before, count_before - occurrence_before]]`` and tests it with
    ``scipy.stats.chi2_contingency``.

    Args:
        name: Identifier of the tested item. Not used in the computation.
        occurrence_before: Occurrences in the "before" group.
        occurrence_after: Occurrences in the "after" group.
        count_before: Total size of the "before" group.
        count_after: Total size of the "after" group.
        correction: Passed to ``chi2_contingency`` (Yates' correction).

    Returns:
        tuple: ``(df_chi2, chi2, P, OR, interval_Mie)`` where

        * ``df_chi2`` is the 2x2 table with an extra ``'Sum'`` column,
        * ``chi2`` / ``P`` are the test statistic and its p-value,
        * ``OR`` is the odds ratio of "after" versus "before" (``inf`` when
          only the denominator is zero, ``nan`` when it is 0/0),
        * ``interval_Mie`` is the test-based interval
          ``OR ** (1 -/+ 1.96 / sqrt(chi2))`` as ``[lower, upper]`` rounded to
          two decimals, or ``[nan, nan]`` when it cannot be computed
          (``chi2 == 0`` or a zero / non-finite odds ratio).

    Raises:
        ValueError: Propagated from ``chi2_contingency``, e.g. when an
            expected frequency is zero.
    """
    not_after = count_after - occurrence_after
    not_before = count_before - occurrence_before

    # build 2*2 table (plus the row totals for display)
    table = [
        [occurrence_after, not_after, count_after],
        [occurrence_before, not_before, count_before],
    ]
    df_chi2 = pd.DataFrame(table, columns=['occurrence', 'Not', 'Sum'], index=['after', 'before'])

    # Hand scipy a plain float array rather than an object-dtype frame.
    observed = np.asarray([row[:2] for row in table], dtype=float)
    chi2, P, _, _ = chi2_contingency(observed, correction=correction)

    # Odds ratio, guarding the division against empty cells.
    numerator = occurrence_after * not_before
    denominator = occurrence_before * not_after
    if denominator == 0:
        OR = np.inf if numerator > 0 else np.nan
    else:
        OR = numerator / denominator

    if chi2 > 0 and np.isfinite(OR) and OR > 0:
        Mie = 1.96 / np.sqrt(chi2)
        # For OR < 1 the exponent 1 + Mie gives the *smaller* bound, so sort
        # to always report [lower, upper].
        interval_Mie = np.sort([np.power(OR, 1 - Mie), np.power(OR, 1 + Mie)])
    else:
        interval_Mie = np.array([np.nan, np.nan])
    interval_Mie = np.around(interval_Mie, 2)

    return df_chi2, chi2, P, OR, interval_Mie

In [None]:
# Run the chi-square / odds-ratio test for every LIWC category, comparing
# label 1 ("after") against label 0 ("before"). Rows are collected in a list
# and turned into a DataFrame once, instead of growing the frame row by row.
rows_category_chi2 = []
for category, stats_category in dict_category_c_p.items():
    occurrence_0, occurrence_1 = stats_category['count_0'], stats_category['count_1']
    # Separate names so the p_0 / p_1 Series built earlier are not overwritten
    # by per-category scalars.
    prop_0, prop_1 = stats_category['p_0'], stats_category['p_1']
    df_chi2, chi2, P, OR, interval_Mie = significant_occurrence(category, occurrence_before=occurrence_0, occurrence_after=occurrence_1, count_before=sum_word_length_0, count_after=sum_word_length_1)
    # Both classes use one decimal place (the label-0 string previously used
    # the spec ':1f', i.e. width 1 with six decimals).
    s_before = f"{int(occurrence_0)} ({prop_0*100:.1f}%)"
    s_after = f"{int(occurrence_1)} ({prop_1*100:.1f}%)"
    rows_category_chi2.append({
        'category': category,
        'OR': OR,
        'P': P,
        '95%CI': interval_Mie,
        'Chi2': chi2,
        'Count_0': s_before,
        'Count_1': s_after,
    })
df_category_chi2 = pd.DataFrame(rows_category_chi2, columns=['category','OR','P','95%CI','Chi2','Count_0','Count_1'])

In [None]:
# Order by p-value (smallest first), then keep only categories whose
# difference between the two classes is significant at P < 0.0001.
df_category_chi2 = (
    df_category_chi2
    .sort_values(by='P', ascending=True)
    .loc[lambda frame: frame['P'] < 0.0001]
)
df_category_chi2

In [None]:
# OR > 1: relatively more frequent for label 1; largest odds ratio first.
df_category_chi2_more = (
    df_category_chi2
    .loc[lambda frame: frame['OR'] > 1]
    .sort_values(by='OR', ascending=False)
)
# OR < 1: relatively less frequent for label 1; smallest odds ratio first.
df_category_chi2_less = (
    df_category_chi2
    .loc[lambda frame: frame['OR'] < 1]
    .sort_values(by='OR', ascending=True)
)

In [None]:
df_category_chi2_more[:20]

In [None]:
# Export without the row index. `index` is a boolean flag, so pass False
# explicitly instead of relying on None being falsy.
df_category_chi2_more.to_excel("p2n_df_category_chi2_more.xlsx", index=False)

In [None]:
df_category_chi2_less[:20]

In [None]:
# Export without the row index. `index` is a boolean flag, so pass False
# explicitly instead of relying on None being falsy.
df_category_chi2_less.to_excel("p2n_df_category_chi2_less.xlsx", index=False)

In [None]:
# from statsmodels.stats.proportion import proportions_ztest

### Paint

In [None]:
import plotly.graph_objs as go

In [None]:
# First 20 categories of df_category_chi2, which was sorted by ascending P and
# filtered to P < 0.0001 above; these are the categories shown in the plot.
list_category_top = df_category_chi2.category.tolist()[:20]
list_category_top

In [None]:
# fig = go.Figure(data=[
#     # go.Bar(name='General', x=list_SNOMED_top, y=df_SNOMED_body_count_percent[:22].percent),
#     go.Bar(name='0', x=list_category_top, y=[ dict_category_c_p[category]['p_0'] for category in list_category_top ]),
#     go.Bar(name='1', x=list_category_top, y=[ dict_category_c_p[category]['p_1'] for category in list_category_top ])
# ])
# # Change the bar mode
# fig.update_layout(barmode='group')
# fig.show()

In [None]:
# Grouped bar chart: share of each top category within label 0 and label 1.
color_1 = 'indianred'
color_2 = 'lightsalmon'
fig = go.Figure()
for trace_name, key_proportion, trace_color in (('0', 'p_0', color_1), ('1', 'p_1', color_2)):
    fig.add_trace(go.Bar(
        x=list_category_top,
        # p_0 / p_1 are fractions (count / total); scale to percent so the
        # values agree with the 'Prevalence(%)' title and the '%' tick suffix.
        y=[dict_category_c_p[category][key_proportion] * 100 for category in list_category_top],
        name=trace_name,
        marker_color=trace_color,
    ))

# Here we modify the tickangle of the xaxis, resulting in rotated labels.
fig.update_layout(
    # title='Symptoms Prevalence of Different variants',
    xaxis_tickfont_size=15,
    xaxis_tickangle=-45,
    yaxis=dict(
        # `titlefont` is deprecated in plotly; the font is set through
        # title.font instead.
        title=dict(text='Prevalence(%)', font=dict(size=16)),
        tickfont_size=14,
        ticksuffix='%',
    ),
    legend=dict(
        x=0.95,
        y=1.0,
        bgcolor='rgba(255, 255, 255, 0)',
        bordercolor='rgba(255, 255, 255, 0)',
        font_size=15
    ),
    barmode='group',
    bargap=0.1, # gap between bars of adjacent location coordinates.
    bargroupgap=0.0, # gap between bars of the same location coordinate.
    height=500,
    width=1000,
    template='simple_white'
)
# fig.write_image(path_dir_figure3+"symptoms_different_variant.svg")
# fig.write_image(path_dir_figure3+"symptoms_different_variant.pdf")
fig.show()

## Classifier

In [None]:
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

In [None]:
# Classification report
from sklearn.metrics import classification_report
# Confusion matrix
from sklearn.metrics import confusion_matrix
from mlxtend.plotting import plot_confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

In [None]:
# ROC curve and AUC
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.metrics import RocCurveDisplay
from sklearn.metrics import PrecisionRecallDisplay
# Precision-recall (PR) curve
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

In [None]:
# Features: the 10 LIWC categories with the smallest chi-square p-value.
# NOTE(review): the chi-square ranking was computed on train + test rows, so
# the feature selection itself still sees the test data - consider ranking on
# the training rows only.
cols_significant = df_category_chi2.category.tolist()[:10]

# df_liwc_train had the test rows appended for the chi-square analysis, with
# the training rows first. Keep only those leading training rows so the
# classifiers are not fitted on the samples they are evaluated on.
num_rows_train = len(list_dict_liwc_counts_train)
df_liwc_train_only = df_liwc_train.iloc[:num_rows_train]

# data_train_vec = df_liwc_train.drop(columns=['word_length','label'])
# Categories that only occurred in test samples are NaN for the training rows
# after the append; treat them as a count of 0.
data_train_vec = df_liwc_train_only[cols_significant].fillna(0)
label_train = df_liwc_train_only['label']

# data_test_vec = df_liwc_test.drop(columns=['word_length','label'])
# reindex() supplies an all-zero column for a selected category that never
# occurred in the test samples instead of raising KeyError.
data_test_vec = df_liwc_test.reindex(columns=cols_significant, fill_value=0)
label_test = df_liwc_test['label']

In [None]:
# model_disorder = LogisticRegression().fit(data_train_vec, label_train)
# Fit an XGBoost classifier with default hyper-parameters on the selected
# LIWC features.
# NOTE(review): model_disorder is not used again in the cells visible here.
model_disorder = XGBClassifier().fit(data_train_vec, label_train)

### lr

In [None]:
model_lr = LogisticRegression(class_weight='balanced').fit(data_train_vec, label_train)

In [None]:
label_pred_lr = model_lr.predict(data_test_vec)
prob_pred_lr = model_lr.predict_proba(data_test_vec)

In [None]:
logger.info(classification_report(label_test, label_pred_lr))

In [None]:
# Threshold metrics use the hard predictions; ranking metrics use the
# predicted probability of class 1 (second column of predict_proba).
acc_lr = accuracy_score(label_test, label_pred_lr)
logger.info(f" acc : {acc_lr:.4f}")

f1_lr = f1_score(label_test, label_pred_lr)
logger.info(f"  f1 : {f1_lr:.4f}")

auroc_lr = roc_auc_score(label_test, prob_pred_lr[:,1])
logger.info(f"auroc: {auroc_lr:.4f}")

auprc_lr = average_precision_score(label_test, prob_pred_lr[:,1])
logger.info(f"auprc: {auprc_lr:.4f}")

In [None]:
ConfusionMatrixDisplay.from_predictions(label_test, label_pred_lr, display_labels = ['Normal','Mental'], cmap=plt.cm.Blues)

In [None]:
# Pair each logistic-regression coefficient with the feature it belongs to.
# The model was fitted on data_train_vec, so its columns (not all columns of
# df_liwc_train) are the names that line up with model_lr.coef_[0].
list_word_importance = list(zip(data_train_vec.columns, model_lr.coef_[0]))
# Largest positive coefficient first.
sorted(list_word_importance, key=lambda x: x[1], reverse=True)

In [None]:
logger.info("ok")