# Install

In [None]:
# Libraries you might not have
# !python3 -m pip install --upgrade nbconvert 
# !python3 -m pip install --upgrade nbstripout 
# !python3 -m pip install tomotopy
# !python3 -m pip install scikit-learn
# !python3 -m pip install imbalanced-learn

# Imports

In [None]:
import nltk
# Fetch the NLTK resources named 'punkt' and 'stopwords'.
# NOTE(review): presumably the project modules need these for tokenization and
# stop-word filtering — confirm against dataloader.
nltk.download('punkt')
nltk.download('stopwords')

In [None]:
%load_ext autoreload
%autoreload 2

import tomotopy as tp
from itertools import chain
import tqdm
import pandas as pd
import numpy as np
from collections import defaultdict
import sklearn.preprocessing
import imblearn.over_sampling

import dataloader
import bow
import slda
import post_classifier
import aggregate
import user_classifier

# Process Train Data

In [None]:
# Run this cell to process the raw training data if you have not already;
# otherwise skip it and load the saved output from the processed folder.

# Raw input CSVs for the crowd training split.
POSTPATH = './Data/crowd/train/shared_task_posts.csv'
LABELPATH = './Data/crowd/train/crowd_train.csv'
USERPATH = './Data/crowd/train/task_C_train.posts.csv'

# Restrict processing to a subset of users (subset = 1000).
users = dataloader.load_user_subset_from_train(USERPATH, subset = 1000)
    
# Load posts and labels for that user subset.
# NOTE(review): append_title = True presumably joins each post's title to its
# body — confirm in dataloader.load_posts.
user_to_post, post_to_words, post_to_metadata = dataloader.load_posts(POSTPATH, user_subset = users, append_title = True)
post_to_label = dataloader.load_classification(LABELPATH, user_to_post, post_to_words, post_to_metadata, user_subset = users)
# The two prints show the post count before and after filter_near_SW.
filtered_data, sw_posts, sw_timestamps = dataloader.filter_posts(post_to_label, post_to_metadata, filter_images=True)
print(len(filtered_data))
filtered_data = dataloader.filter_near_SW(filtered_data,post_to_metadata, sw_timestamps)
print(len(filtered_data))

# Apply stop-word filtering to both post collections.
filtered_data = dataloader.filter_stopwords(filtered_data)
sw_posts = dataloader.filter_stopwords(sw_posts)

In [None]:
# Persist the processed training structures so later sessions can reload them
# from FOLDERPATH instead of re-running the processing cell.
FOLDERPATH = './Processing/crowd_processed/'
dataloader.save_to_folder(FOLDERPATH, user_to_post, post_to_metadata, filtered_data, sw_posts, sw_timestamps)

# Load Processed Train Data

In [None]:
# Reload the processed training structures from the folder written by
# dataloader.save_to_folder. FOLDERPATH is redefined here so this cell can run
# without the processing/saving cells.
FOLDERPATH = './Processing/crowd_processed/'
user_to_post, post_to_metadata, filtered_data, sw_posts, sw_timestamps = dataloader.load_from_folder(FOLDERPATH)

In [None]:
# Report how many filtered training posts carry each label ('a' through 'd'),
# followed by the total number of filtered posts.
print(sum(1 for post_id in filtered_data if filtered_data[post_id][2] == 'a'))
print(sum(1 for post_id in filtered_data if filtered_data[post_id][2] == 'b'))
print(sum(1 for post_id in filtered_data if filtered_data[post_id][2] == 'c'))
print(sum(1 for post_id in filtered_data if filtered_data[post_id][2] == 'd'))
print(len(filtered_data))

# SLDA Model: Train

## Train Topic Model

In [None]:
# Train the supervised LDA model on the filtered training posts with 40 topics.
# `model` is reused below for the topic table and for train/test features.
model = slda.train_slda_model_from_data(filtered_data, topics=40)

## Topic Model Outputs

In [None]:
# Build a table pairing each topic's top words with its sLDA regression
# coefficient, sorted by ascending coefficient.
slda_coefficients = model.get_regression_coef(0)
data = []
for k in range(model.k):
    top_words = model.get_topic_words(k, top_n=40)
    # Each entry is a (word, score) pair; only the word is kept.
    words = ", ".join(word for (word, _score) in top_words)
    data.append([words, slda_coefficients[k]])

indices = np.array(slda_coefficients).argsort()
# dtype=object keeps the coefficients numeric; without it NumPy coerces the
# mixed str/float rows into an all-string array.
data = np.array(data, dtype=object)
data = data[indices]

pd.DataFrame(data, columns=["Topic", "Suicidality Coefficient"])

In [None]:
# Print the top words of the last row of the topic table, i.e. the topic with
# the largest regression coefficient once the table is sorted ascending.
print(data[-1][0])

## sLDA Features: Train

In [None]:
# Compute sLDA features for every filtered training post. Later cells read
# vector_train[post_id][0] as the feature vector and [1] as the label.
vector_train = slda.get_topic_vecs(model, filtered_data)

In [None]:
# Print one example sLDA feature vector. 'hw4uh' is the post id used as the
# example so far; fall back to the first available post when that id is not in
# the current data, so the cell does not raise KeyError on other subsets.
example_post_id = 'hw4uh' if 'hw4uh' in vector_train else next(iter(vector_train))
print(vector_train[example_post_id][0])

In [None]:
# Dump the whole post-id -> sLDA feature mapping (one entry per post, so the
# output can be long).
print(vector_train)

# BOW

In [None]:
# Build the vocabulary from the training posts, then compute the BOW/PCA
# vectors for them. `pca_model` and `word2index` are kept because the test
# feature cell passes both back into get_PCA_vectors_from_post_set.
word2index,index2word = bow.generate_vocabulary(filtered_data)
pca_model, vector_train_bow = bow.get_PCA_vectors_from_post_set(filtered_data, word2index)

In [None]:
# Dump the whole post-id -> BOW feature mapping (one entry per post, so the
# output can be long).
print(vector_train_bow)

# Post Classifier: Train

## Post Classifier: Train

In [None]:
def minmax_norm(arr):
    """Rescale ``arr`` linearly so that its values span [0, 1].

    The minimum and maximum are taken over the whole array, not per axis.

    Args:
        arr: Array-like of numbers.

    Returns:
        A float ndarray with the same shape as ``arr``. A constant input
        (max == min) maps to all zeros instead of NaN, and an empty input is
        returned as an empty array.
    """
    arr = np.asarray(arr, dtype=float)
    if arr.size == 0:
        # np.min / np.max raise on empty input; there is nothing to scale.
        return arr
    lowest = np.min(arr)
    span = np.max(arr) - lowest
    if span == 0:
        # All values are equal: dividing by the zero range would produce NaN.
        return np.zeros_like(arr)
    return (arr - lowest) / span

In [None]:
# Feature set option: sLDA topic vectors only.
# Rows follow the key order of vector_train; labels are flattened to 1-D.
X_train = np.array([vector_train[post_id][0] for post_id in vector_train])
y_train = np.array([vector_train[post_id][1] for post_id in vector_train])
y_train = y_train.reshape(len(y_train))

In [None]:
# Feature set option: BOW/PCA vectors only.
# Rows follow the key order of vector_train_bow; labels are flattened to 1-D.
X_train = np.array([vector_train_bow[post_id][0] for post_id in vector_train_bow])
y_train = np.array([vector_train_bow[post_id][1] for post_id in vector_train_bow])
y_train = y_train.reshape(len(y_train))


In [None]:
# Feature set option: sLDA + BOW. For every post (in vector_train key order)
# each vector is min-max scaled on its own, then the two are joined end to end.
X_train = np.array([
    np.concatenate([
        minmax_norm(vector_train[post_id][0]),
        minmax_norm(vector_train_bow[post_id][0]),
    ])
    for post_id in vector_train
])
y_train = np.array([vector_train[post_id][1] for post_id in vector_train])

# Flatten the labels to a 1-D array.
y_train = y_train.reshape(len(y_train))

### Oversampling

In [None]:
# Balance the classes with imbalanced-learn's RandomOverSampler
# (random_state=0 fixes the sampling). X_train / y_train are rebound to the
# resampled arrays, so they may hold more rows than there are posts.
# NOTE(review): later cells pair predictions with posts by position; confirm
# the resampled output keeps the original rows first, in their original order.
ros = imblearn.over_sampling.RandomOverSampler(random_state=0)
X_train, y_train = ros.fit_resample(X_train, y_train)

In [None]:
# Sanity check: shapes of the training features and labels as they currently
# stand (after oversampling, if that cell was run).
print(np.shape(X_train))
print(np.shape(y_train))

### Logistic Regression

In [None]:
# Logistic-regression post classifier. Every classifier cell in this section
# rebinds `p_clf`, so the most recently run cell decides which model the
# prediction cells use.
#UNCOMMENT TO RUN GRID SEARCH CV
#p_clf = post_classifier.PostClassification("LogReg")
#param_dict = {'C':[0.2,0.5,0.7,1,1.5,2,5]}
#p_clf.train_grid_search_CV(X_train, y_train, param_dict, groups=5)

#RUN WITH OPTIMAL PARAMETERS
# NOTE(review): no hyperparameters are passed here; presumably the tuned
# values are defaults inside PostClassification — confirm.
p_clf = post_classifier.PostClassification("LogReg")
p_clf.train(X_train, y_train)


### Linear SVM

In [None]:
# Linear SVM post classifier; rebinds `p_clf`, replacing any classifier
# trained by an earlier cell.
#UNCOMMENT TO RUN GRID SEARCH CV
#p_clf = post_classifier.PostClassification("LinearSVM")
#param_dict = {'C':[0.2,0.5,1,2]}
#p_clf.train_grid_search_CV(X_train, y_train, param_dict, groups=5)

p_clf = post_classifier.PostClassification("LinearSVM")
p_clf.train(X_train, y_train)

### RBF SVM

In [None]:
# RBF SVM post classifier; rebinds `p_clf`, replacing any classifier trained
# by an earlier cell.
#UNCOMMENT TO RUN GRID SEARCH CV
#p_clf = post_classifier.PostClassification("RbfSVM")
#param_dict = {'C':[0.5,1,2,5]}
#p_clf.train_grid_search_CV(X_train, y_train, param_dict, groups=5)

p_clf = post_classifier.PostClassification("RbfSVM")
p_clf.train(X_train, y_train)

### AdaBoost

In [None]:
# AdaBoost post classifier; rebinds `p_clf`, replacing any classifier trained
# by an earlier cell.
p_clf = post_classifier.PostClassification("AdaBoost")
p_clf.train(X_train, y_train)

### Random Forest

In [None]:
# Random-forest post classifier; rebinds `p_clf`, replacing any classifier
# trained by an earlier cell.
p_clf = post_classifier.PostClassification("RandomForest")
p_clf.train(X_train, y_train)

### Multi-layer Perceptron

In [None]:
# Multi-layer perceptron post classifier; rebinds `p_clf`, replacing any
# classifier trained by an earlier cell.
#UNCOMMENT TO RUN GRID SEARCH CV
#p_clf = post_classifier.PostClassification("MLP")
#param_dict = {'hidden_layer_sizes':[(64,64),(64,64,64),(32,32), (32,32,32)], 'learning_rate': ('constant', 'adaptive')}
#p_clf.train_grid_search_CV(X_train, y_train, param_dict, groups=5)

p_clf = post_classifier.PostClassification("MLP")
p_clf.train(X_train, y_train)

## Predict Post Classifier: Train

In [None]:
# Predict post labels for the training matrix with whichever classifier is
# currently bound to `p_clf`. X_train includes any oversampled rows.
y_pred_train = p_clf.test(X_train)

In [None]:
# Post-level metrics on the training data (true labels vs. predictions).
p_clf.get_metrics(y_train, y_pred_train)

In [None]:
# Sum of the predicted and of the true labels.
# NOTE(review): presumably labels are 0/1, making these the positive counts —
# confirm.
print(sum(y_pred_train))
print(sum(y_train))

# User Classifier: Train

## Aggregate: Train

In [None]:
# Collapse the post labels (a, b, c, d, control) into one binary label per
# user: 1 when the label is 'd', 0 otherwise.
user_to_y_train = defaultdict(int)
# The loop variable is named post_id rather than `data` so it does not
# overwrite the global topic table `data` built in the Topic Model Outputs cell.
for post_id in tqdm.tqdm(filtered_data.keys()):
    # Index 0 of an entry is used as the user id and index 2 as the label.
    user_to_y_train[filtered_data[post_id][0]] = (1 if filtered_data[post_id][2] == 'd' else 0)

In [None]:
# Map each post id to [user id, predicted post label].
post_to_uypred_train = defaultdict(list)

# Predictions are matched to posts by position: the i-th key of vector_train
# is paired with y_pred_train[i].
# NOTE(review): this assumes the leading rows of y_pred_train follow the key
# order of vector_train — confirm this holds after oversampling and for
# whichever feature-set cell was run.
for i, post_id in enumerate(vector_train.keys()):
    user_id = filtered_data[post_id][0]
    post_to_uypred_train[post_id] = [user_id, y_pred_train[i]]

In [None]:
# Aggregate the post predictions for the user classifier, using the processed
# training folder.
# NOTE(review): presumably groups the predicted post labels by user — confirm
# in aggregate.aggregate_posts.
user_to_post_label_train = aggregate.aggregate_posts(FOLDERPATH, post_to_uypred_train)

## Argmax: Train

In [None]:
# User-level classification from the aggregated post labels. The result of
# argmax() is iterated by user id and indexed per user in the next cell.
u_clf_train = user_classifier.UserClassification(user_to_post_label_train)
user_to_ypred_train = u_clf_train.argmax()

In [None]:
# Line up true and predicted user labels in the same user order (the order of
# user_to_ypred_train) so the two lists can be compared element by element.
user_y_train = [user_to_y_train[uid] for uid in user_to_ypred_train]
user_y_pred_train = [user_to_ypred_train[uid] for uid in user_to_ypred_train]

In [None]:
# User-level metrics on the training data (true labels vs. predictions).
u_clf_train.get_metrics(user_y_train, user_y_pred_train)

# Process Data: Test

In [None]:
# Raw input CSVs for the crowd test split.
# NOTE(review): USERPATH2 is defined but not referenced anywhere in the
# visible code; unlike the training cell, no user subset is applied here.
POSTPATH2 = './Data/crowd/test/shared_task_posts_test.csv'
LABELPATH2 = './Data/crowd/test/crowd_test_C.csv'
USERPATH2 = './Data/crowd/test/task_C_test.posts.csv'
    
# Same processing steps as the training cell, applied to all test users.
user_to_post_test, post_to_words_test, post_to_metadata_test = dataloader.load_posts(POSTPATH2, append_title = True)
post_to_label_test = dataloader.load_classification(LABELPATH2, user_to_post_test, post_to_words_test, post_to_metadata_test)
# The two prints show the post count before and after filter_near_SW.
filtered_data_test, sw_posts_test, sw_timestamps_test = dataloader.filter_posts(post_to_label_test, post_to_metadata_test, filter_images = True)
print(len(filtered_data_test))
filtered_data_test = dataloader.filter_near_SW(filtered_data_test, post_to_metadata_test, sw_timestamps_test)
print(len(filtered_data_test))

# Apply stop-word filtering to both post collections.
filtered_data_test = dataloader.filter_stopwords(filtered_data_test)
sw_posts_test = dataloader.filter_stopwords(sw_posts_test)

In [None]:
# Persist the processed test structures so later sessions can reload them from
# FOLDERPATH2 instead of re-running the processing cell.
FOLDERPATH2 = './Processing/crowd_processed_test/'
dataloader.save_to_folder(FOLDERPATH2, user_to_post_test, post_to_metadata_test, filtered_data_test, sw_posts_test, sw_timestamps_test)

# Load Processed Data: Test

In [None]:
# Reload the processed test structures from the folder written by
# dataloader.save_to_folder. FOLDERPATH2 is redefined here so this cell can
# run without the processing/saving cells.
FOLDERPATH2 = './Processing/crowd_processed_test/'
user_to_post_test, post_to_metadata_test, filtered_data_test, sw_posts_test, sw_timestamps_test = dataloader.load_from_folder(FOLDERPATH2)

In [None]:
# Report how many filtered test posts carry each label ('a' through 'd'),
# followed by the total number of filtered posts.
print(sum(1 for post_id in filtered_data_test if filtered_data_test[post_id][2] == 'a'))
print(sum(1 for post_id in filtered_data_test if filtered_data_test[post_id][2] == 'b'))
print(sum(1 for post_id in filtered_data_test if filtered_data_test[post_id][2] == 'c'))
print(sum(1 for post_id in filtered_data_test if filtered_data_test[post_id][2] == 'd'))
print(len(filtered_data_test))

# Feature Extraction: Test

## sLDA

In [None]:
# Build test features and labels from the processed test folder with the
# trained sLDA model.
# NOTE(review): X_test / y_test are reassigned by the feature-set cells further
# down, so this result is only used if none of those cells is run.
X_test, y_test = slda.vectorize_data_set(model, FOLDERPATH2)

In [None]:
# Compute sLDA features for every filtered test post with the model trained
# on the training data. Later cells read [0] as the vector and [1] as the label.
vector_test = slda.get_topic_vecs(model, filtered_data_test)

## BOW

In [None]:
# Compute BOW/PCA vectors for the test posts, passing in the training
# vocabulary (word2index) and the PCA model returned by the training call.
# The first return value is discarded.
# NOTE(review): presumably pca_model= makes the function reuse the fitted PCA
# instead of fitting a new one — confirm in bow.
_, vector_test_bow = bow.get_PCA_vectors_from_post_set(filtered_data_test, word2index, pca_model=pca_model)

In [None]:
# Display the whole post-id -> BOW feature mapping for the test posts as the
# cell output.
vector_test_bow

In [None]:
# Feature set option: BOW/PCA vectors only.
# Rows follow the key order of vector_test_bow; labels are flattened to 1-D.
X_test = np.array([vector_test_bow[post_id][0] for post_id in vector_test_bow])
y_test = np.array([vector_test_bow[post_id][1] for post_id in vector_test_bow])
y_test = y_test.reshape(len(y_test))

In [None]:
# Feature set option: sLDA + BOW. For every post (in vector_test key order)
# each vector is min-max scaled on its own, then the two are joined end to end.
X_test = np.array([
    np.concatenate([
        minmax_norm(vector_test[post_id][0]),
        minmax_norm(vector_test_bow[post_id][0]),
    ])
    for post_id in vector_test
])
y_test = np.array([vector_test[post_id][1] for post_id in vector_test])

# Flatten the labels to a 1-D array.
y_test = y_test.reshape(len(y_test))

In [None]:
# Sanity check: shapes of the test features and labels.
print(np.shape(X_test))
print(np.shape(y_test))

# Post Classifier: Test

## Predict Post Classifier: Test

In [None]:
# Predict post labels for the test matrix with whichever classifier is
# currently bound to `p_clf`.
y_pred_test = p_clf.test(X_test)

In [None]:
# Post-level metrics on the test data (true labels vs. predictions).
p_clf.get_metrics(y_test, y_pred_test)

In [None]:
# Sum of the predicted and of the true labels.
# NOTE(review): presumably labels are 0/1, making these the positive counts —
# confirm.
print(sum(y_pred_test))
print(sum(y_test))

# User Classifier: Test

## Aggregate: Test

In [None]:
# Collapse the post labels (a, b, c, d, control) into one binary label per
# user: 1 when the label is 'd', 0 otherwise.
user_to_y_test = defaultdict(int)
# The loop variable is named post_id rather than `data` so it does not
# overwrite the global topic table `data` built in the Topic Model Outputs cell.
for post_id in tqdm.tqdm(filtered_data_test.keys()):
    # Index 0 of an entry is used as the user id and index 2 as the label.
    user_to_y_test[filtered_data_test[post_id][0]] = (1 if filtered_data_test[post_id][2] == 'd' else 0)

In [None]:
# Number of filtered test posts, for comparison with the feature count in the
# next cell.
len(filtered_data_test)

In [None]:
# Number of test posts that have an sLDA feature entry.
len(vector_test)

In [None]:
# Map each test post id to [user id, predicted post label].
post_to_uypred_test = defaultdict(list)

# Predictions are matched to posts by position: the i-th key of vector_test
# is paired with y_pred_test[i].
# NOTE(review): this assumes X_test was built in vector_test key order —
# confirm for whichever feature-set cell was run.
for i, post_id in enumerate(vector_test.keys()):
    user_id = filtered_data_test[post_id][0]
    post_to_uypred_test[post_id] = [user_id, y_pred_test[i]]

In [None]:
# Aggregate the post predictions for the user classifier, using the processed
# test folder.
# NOTE(review): presumably groups the predicted post labels by user — confirm
# in aggregate.aggregate_posts.
user_to_post_label_test = aggregate.aggregate_posts(FOLDERPATH2, post_to_uypred_test)

## Argmax: Test

In [None]:
# User-level classification from the aggregated test post labels. The result
# of argmax() is iterated by user id and indexed per user in the next cell.
u_clf_test = user_classifier.UserClassification(user_to_post_label_test)
user_to_ypred_test = u_clf_test.argmax()

In [None]:
# Line up true and predicted user labels in the same user order (the order of
# user_to_ypred_test) so the two lists can be compared element by element.
user_y_test = [user_to_y_test[uid] for uid in user_to_ypred_test]
user_y_pred_test = [user_to_ypred_test[uid] for uid in user_to_ypred_test]

In [None]:
# User-level metrics on the test data for the argmax predictions.
u_clf_test.get_metrics(user_y_test, user_y_pred_test)

## Threshold: Test

In [None]:
# Re-create the user classifier from the same aggregated test labels for the
# threshold-based variant; this rebinds u_clf_test.
u_clf_test = user_classifier.UserClassification(user_to_post_label_test)

In [None]:
# Search for a threshold using the true test-user labels. The return value is
# not stored, so it is only shown as the cell output.
# NOTE(review): tuning a threshold on test labels leaks label information into
# the reported test metrics; consider tuning on the training users — confirm
# what find_threshold does.
u_clf_test.find_threshold(user_to_y_test)

In [None]:
# User predictions from the minimum() rule with a hard-coded argument of 1.
# This overwrites the argmax predictions held in user_to_ypred_test.
# NOTE(review): presumably 1 is the threshold chosen from the find_threshold
# output — confirm.
user_to_ypred_test = u_clf_test.minimum(1)

In [None]:
# Rebuild the aligned true/predicted user label lists for the threshold-based
# predictions, in the order of user_to_ypred_test.
user_y_test = [user_to_y_test[uid] for uid in user_to_ypred_test]
user_y_pred_test = [user_to_ypred_test[uid] for uid in user_to_ypred_test]

In [None]:
# User-level metrics on the test data for the threshold-based predictions.
u_clf_test.get_metrics(user_y_test, user_y_pred_test)