## Import

In [None]:
import re
import os
import time
import json
import copy
import emoji
import random
import numpy as np
from pathlib import Path
from tqdm import tqdm

In [None]:
import matplotlib.pyplot as plt

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier

In [None]:
# Classification report
from sklearn.metrics import classification_report
# Confusion matrix
from sklearn.metrics import confusion_matrix
from mlxtend.plotting import plot_confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
# ROC curve and AUC
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.metrics import RocCurveDisplay
from sklearn.metrics import PrecisionRecallDisplay
# PR curve
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score, recall_score
from sklearn.metrics import accuracy_score
# split
from sklearn.model_selection import train_test_split

In [None]:
# `sys` was used here without ever being imported, which raised NameError.
import sys

# Put the parent directory on the import path; presumably that is where the
# project-local `utils` and `data` packages live -- confirm the layout.
sys.path.append("..")
from utils import preprocess
from utils.tool_simple import get_keywords, list_to_txt, txt_to_list, list_drop_duplicate, many_list_count_sum, list_clean_blank, json_to_dict, dict_to_json, init_logger
from data.dataset import *

## Data loading

In [None]:
# Experiment configuration.
# num_case: a string (e.g. 'max') is later resolved to the number of positive
#   entries; any other value is coerced to int.
# max_length_tweet: forwarded unchanged to process_data_merge.
# max_word: vocabulary cap handed to TfidfVectorizer(max_features=...).
num_case = 'max'
max_length_tweet = 'max'
max_word = 100000
path_dir_data = ""
path_dir_record = ""
# An empty record path means "current working directory", which already
# exists; os.mkdir("") would raise FileNotFoundError. makedirs(exist_ok=True)
# also creates missing parent directories and avoids the exists/mkdir race.
if path_dir_record:
    os.makedirs(path_dir_record, exist_ok=True)

In [None]:
# The log path is built by plain string concatenation, so path_dir_record
# must be empty or already end with a path separator.
_path_log = path_dir_record + 'tfidf.log'
logger = init_logger(_path_log)

# Record the run configuration at the top of the log.
logger.info(f"num_case: {num_case}")
logger.info(f"max_length_tweet: {max_length_tweet}")

In [None]:
# Load the negative and positive dictionaries from their JSON files.
_path_negative = os.path.join(path_dir_data, "dict_user_negative.json")
_path_positive = os.path.join(path_dir_data, "dict_user_positive.json")
dict_negative_all = json_to_dict(_path_negative)
dict_positive_all = json_to_dict(_path_positive)
logger.info(f"Loading negative: {len(dict_negative_all)}")
logger.info(f"Loading positive: {len(dict_positive_all)}")

# A string setting (e.g. 'max') selects every positive entry; anything else
# is coerced to int.
if isinstance(num_case, str):
    num_case = len(dict_positive_all)
else:
    num_case = int(num_case)

# Scale the negative count so the negative:positive ratio of the full data is
# kept in the subset.
# NOTE(review): true division makes this a float -- confirm that
# get_dict_part accepts a non-integer count.
_num_case_negative = num_case * len(dict_negative_all) / len(dict_positive_all)
dict_negative = get_dict_part(dict_negative_all, _num_case_negative, shuffle=False)
dict_positive = get_dict_part(dict_positive_all, num_case, shuffle=False)

(
    num_negative,
    num_positive,
    list_data_negative,
    list_data_positive,
) = process_data_merge(dict_negative, dict_positive, max_length_tweet=max_length_tweet)

In [None]:
# Negatives come first, then positives.
list_data = list_data_negative + list_data_positive

# Labels follow the same order: 0 for negative, 1 for positive.
# Presumably num_negative / num_positive equal the lengths of the two data
# lists -- verify against process_data_merge.
list_label = [0] * num_negative
list_label += [1] * num_positive

In [None]:
# Hold out 30% of the samples for testing; the fixed seed keeps the split
# reproducible between runs.
data_train, data_test, label_train, label_test = train_test_split(
    list_data,
    list_label,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Labels are 0/1, so summing them counts the positives in each split.
_num_positive_train = sum(label_train)
_num_negative_train = len(label_train) - _num_positive_train
logger.info(
    f"Training Size: {len(data_train)}, {_num_positive_train} positive and {_num_negative_train} negative"
)

_num_positive_test = sum(label_test)
_num_negative_test = len(label_test) - _num_positive_test
logger.info(
    f"Testing Size: {len(data_test)}, {_num_positive_test} positive and {_num_negative_test} negative"
)

## Model

### tf-idf

In [None]:
# data_train_vec = np.load(path_dir_modeling_data + 'tfidf_train_t123.npy', allow_pickle=True)
# data_test_vec = np.load(path_dir_modeling_data + 'tfidf_test_t123.npy', allow_pickle=True)

In [None]:
# TF-IDF vectorizer capped at `max_word` vocabulary entries, with smoothed
# IDF weighting and scikit-learn's built-in English stop-word list.
vec_tf = TfidfVectorizer(
    max_features=max_word,
    use_idf=True,
    smooth_idf=True,
    stop_words='english',
)

In [None]:
# Quick sanity check on a one-document toy corpus. This fits the vectorizer
# on the toy text; it is refit on the real training data in a later cell.
# Kept as a bare expression so the notebook displays the result.
vec_tf.fit_transform(['hello world'])

In [None]:
# Fit the vectorizer on the training texts only, then reuse the fitted
# vectorizer to transform the test texts.
data_train_vec = vec_tf.fit_transform(data_train)
data_test_vec = vec_tf.transform(data_test)

In [None]:
# Display the dimensions of the training matrix (rows are training samples,
# columns are vocabulary features).
data_train_vec.shape

In [None]:
# Display the first row of the training matrix.
data_train_vec[0]

In [None]:
# Display the shape of the first row's `indices` array; presumably this is
# the number of stored (non-zero) features for that sample -- assumes a
# CSR-style sparse row.
data_train_vec[0].indices.shape

In [None]:
# Persist the TF-IDF matrices.
# TfidfVectorizer returns scipy sparse matrices. np.save() wraps a sparse
# matrix in an object array, and with allow_pickle=False that raises
# "ValueError: Object arrays cannot be saved". scipy.sparse.save_npz is the
# pickle-free format for sparse matrices; read the files back with
# scipy.sparse.load_npz.
import scipy.sparse

scipy.sparse.save_npz(path_dir_record + 'tfidf_train.npz', data_train_vec)
scipy.sparse.save_npz(path_dir_record + 'tfidf_test.npz', data_test_vec)

In [None]:
# NOTE(review): the path is blank, so this call fails as written -- fill in
# the file to inspect. allow_pickle=True unpickles whatever the file
# contains, so only point it at files you produced yourself.
vec_test = np.load("", allow_pickle=True)

In [None]:
# Display the shape of the array that was just loaded.
vec_test.shape

##### xgb

In [None]:
# Train an XGBoost classifier with default hyper-parameters on the TF-IDF
# training matrix. fit() returns the estimator itself, so re-binding keeps
# the cell free of notebook output.
model_xgb = XGBClassifier()
model_xgb = model_xgb.fit(data_train_vec, label_train)

In [None]:
# Class probabilities per test sample; column 1 is read as the positive-class
# score by the AUPRC/AUROC metrics.
prob_pred_xgb = model_xgb.predict_proba(data_test_vec)
# Hard class predictions for the same test samples.
label_pred_xgb = model_xgb.predict(data_test_vec)

In [None]:
# Per-class precision / recall / F1 summary for the test split.
_report_xgb = classification_report(label_test, label_pred_xgb)
logger.info(_report_xgb)

In [None]:
# Threshold-based metrics use the hard predictions.
_acc_xgb = accuracy_score(label_test, label_pred_xgb)
logger.info(f" acc : {_acc_xgb:.4f}")
_rec_xgb = recall_score(label_test, label_pred_xgb)
logger.info(f" rec : {_rec_xgb:.4f}")
_f1_xgb = f1_score(label_test, label_pred_xgb)
logger.info(f"  f1 : {_f1_xgb:.4f}")

# Ranking metrics use the second probability column (class 1).
_auprc_xgb = average_precision_score(label_test, prob_pred_xgb[:,1])
logger.info(f"auprc: {_auprc_xgb:.4f}")
_auroc_xgb = roc_auc_score(label_test, prob_pred_xgb[:,1])
logger.info(f"auroc: {_auroc_xgb:.4f}")

In [None]:
# Plot the test-set confusion matrix; label 0 is shown as 'Normal' and
# label 1 as 'Mental'.
ConfusionMatrixDisplay.from_predictions(
    label_test,
    label_pred_xgb,
    display_labels=['Normal', 'Mental'],
    cmap=plt.cm.Blues,
)

In [None]:
# Rank vocabulary terms by XGBoost feature importance and log the top 100.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. This notebook already needs
# scikit-learn >= 1.0 for ConfusionMatrixDisplay.from_predictions.
# .tolist() keeps the terms as plain Python strings, as the old API returned.
list_word_importance = sorted(
    zip(vec_tf.get_feature_names_out().tolist(), model_xgb.feature_importances_),
    key=lambda x: x[1],
    reverse=True,
)
# Slice rather than break on the index: the previous loop logged entry 100
# before breaking, so it emitted 101 entries (idx 0..100).
for word_importance in list_word_importance[:100]:
    logger.info(word_importance)