# Installs

In [None]:
# Libraries you might not have
# !python3 -m pip install --upgrade nbconvert 
# !python3 -m pip install --upgrade nbstripout 
# !python3 -m pip install tomotopy
# !python3 -m pip install scikit-learn

# Imports

In [None]:
import nltk
# NLTK resources fetched by name at notebook start-up: 'punkt' and 'stopwords'.
# NOTE(review): presumably consumed by the dataloader's tokenising / stopword
# filtering steps — confirm against dataloader.py.
nltk.download('punkt')
nltk.download('stopwords')

In [None]:
%load_ext autoreload
%autoreload 2

import tomotopy as tp
from itertools import chain
import tqdm
import pandas as pd
import numpy as np
from collections import defaultdict
import pickle
import os.path

import dataloader
import bow
import slda
import post_classifier
import aggregate
import user_classifier

# Get Data

In [None]:
def process_data(POSTPATH, LABELPATH, USERPATH, FOLDERPATH, subset = 1000, append_title = False, filter_images=True):
    """Build the processed data set from the raw files and save it to FOLDERPATH.

    The steps, all delegated to the ``dataloader`` module, run in this order:
    pick a user subset, load their posts, attach classification labels,
    filter posts, drop posts flagged by ``filter_near_SW``, strip stopwords,
    and finally persist everything with ``save_to_folder``.

    Args:
        POSTPATH: path handed to ``dataloader.load_posts``.
        LABELPATH: path handed to ``dataloader.load_classification``.
        USERPATH: path handed to ``dataloader.load_user_subset_from_train``.
        FOLDERPATH: destination folder handed to ``dataloader.save_to_folder``.
        subset: forwarded as ``subset`` when selecting users.
        append_title: forwarded to ``dataloader.load_posts``.
        filter_images: forwarded to ``dataloader.filter_posts``.

    Returns:
        Tuple ``(user_to_post, post_to_metadata, filtered_data, sw_posts,
        sw_timestamps)`` — the same objects that were written to FOLDERPATH.
    """
    print('START: Processing data')
    selected_users = dataloader.load_user_subset_from_train(USERPATH, subset=subset)

    user_to_post, post_to_words, post_to_metadata = dataloader.load_posts(
        POSTPATH, user_subset=selected_users, append_title=append_title)
    labelled_posts = dataloader.load_classification(
        LABELPATH, user_to_post, post_to_words, post_to_metadata, user_subset=selected_users)

    # The two bare prints report how many posts survive each filtering stage.
    kept_posts, sw_posts, sw_timestamps = dataloader.filter_posts(
        labelled_posts, post_to_metadata, filter_images=filter_images)
    print(len(kept_posts))
    kept_posts = dataloader.filter_near_SW(kept_posts, post_to_metadata, sw_timestamps)
    print(len(kept_posts))

    # Stopword removal is applied to both the retained posts and the SW posts.
    filtered_data = dataloader.filter_stopwords(kept_posts)
    sw_posts = dataloader.filter_stopwords(sw_posts)

    dataloader.save_to_folder(FOLDERPATH, user_to_post, post_to_metadata, filtered_data, sw_posts, sw_timestamps)
    print('DONE: Processing data')
    return user_to_post, post_to_metadata, filtered_data, sw_posts, sw_timestamps

def load_processed_data(FOLDERPATH):
    """Announce the load and return whatever ``dataloader.load_from_folder`` yields.

    Args:
        FOLDERPATH: folder handed to ``dataloader.load_from_folder``.

    Returns:
        The value returned by ``dataloader.load_from_folder(FOLDERPATH)``;
        callers in this notebook unpack it into five objects.
    """
    print('IN PROGRESS: Loading data')
    processed = dataloader.load_from_folder(FOLDERPATH)
    return processed

In [None]:
def print_label_size(data):
    """Print how many records carry each label 'a'..'d', plus controls and total.

    Args:
        data: mapping whose values are indexable; element ``[2]`` of each
            value is compared against the labels 'a', 'b', 'c' and 'd'.

    Records whose label matches none of the four are reported as controls.
    """
    tracked = ('a', 'b', 'c', 'd')
    tallies = {name: 0 for name in tracked}

    # Single pass over the records instead of one scan per label.
    for record in data.values():
        label = record[2]
        for name in tracked:
            if label == name:
                tallies[name] += 1

    total_len = len(data)
    control_len = total_len
    for name in tracked:
        control_len -= tallies[name]

    for name in tracked:
        print(name + ': ', tallies[name])
    print('controls: ', control_len)
    print('total: ', total_len)

# Feature Extraction

In [None]:
def train_feature_model(data, data_type='filter_normal', sLDA=True, BOW=True, num_topics=40):
    """Train (or load from the on-disk cache) the sLDA and BOW/PCA feature models.

    Every artefact is cached in the current working directory under a name
    derived from ``data_type`` (and ``num_topics`` for sLDA); an existing
    cache is loaded instead of retraining.

    Args:
        data: training posts, passed to ``slda.train_slda_model_from_data``,
            ``bow.generate_vocabulary`` and ``bow.get_PCA_vectors_from_post_set``.
        data_type: tag embedded in the cache file names.
        sLDA: when truthy, produce the sLDA model and its training vectors.
        BOW: when truthy, produce the PCA model, BOW vectors and vocabulary maps.
        num_topics: topic count for sLDA training; also part of the sLDA
            cache file names.

    Returns:
        Tuple ``(slda_model, slda_vectors, pca_model, bow_vectors, word2index,
        index2word)``. Entries belonging to a disabled feature type are the
        string ``'None'`` (not the ``None`` object).
    """
    if sLDA:
        slda_model_filename = 'slda_'+data_type+'_'+str(num_topics)+'_model.bin'
        slda_vectors_filename = 'slda_'+data_type+'_'+str(num_topics)+'_vectors.pickle'

        if os.path.exists(slda_model_filename):
            print('IN PROGRESS: Loading sLDA model')
            slda_model = tp.SLDAModel.load(slda_model_filename)
        else:
            print('START: Training sLDA model')
            slda_model = slda.train_slda_model_from_data(data, topics=num_topics)
            print('IN PROGRESS: Saving sLDA model')
            slda_model.save(slda_model_filename)
        # print out topics and top n words
        get_topics(slda_model)

        slda_vectors = extract_slda_features(data, slda_model, slda_vectors_filename, test=False)
    else:
        slda_model = 'None'
        slda_vectors = 'None'

    if BOW:
        pca_model_filename = 'train_pca_'+data_type+'_'+'model.pickle'
        bow_vectors_filename = 'train_bow_'+data_type+'_'+'vectors.pickle'
        # NOTE(review): the two vocabulary files are not keyed by data_type,
        # so different data_type runs overwrite each other's vocabulary.
        word2index_filename = 'train_bow_word_word2index.pickle'
        index2word_filename = 'train_bow_word_index2word.pickle'
        bow_cache_files = (pca_model_filename, bow_vectors_filename,
                           word2index_filename, index2word_filename)

        # All four artefacts are loaded together, so all four must exist;
        # a partial cache falls back to retraining instead of crashing.
        if all(os.path.exists(path) for path in bow_cache_files):
            print('IN PROGRESS: Loading PCA model')
            loaded = []
            for path in bow_cache_files:
                # Caches are produced by this notebook; do not point these
                # names at untrusted files (pickle.load executes code).
                with open(path, 'rb') as f:
                    loaded.append(pickle.load(f))
            pca_model, bow_vectors, word2index, index2word = loaded
        else:
            print('START: Training BOW model')
            word2index, index2word = bow.generate_vocabulary(data)
            pca_model, bow_vectors = bow.get_PCA_vectors_from_post_set(data, word2index)
            print('IN PROGRESS: Saving BOW model')
            artifacts = (pca_model, bow_vectors, word2index, index2word)
            for path, artifact in zip(bow_cache_files, artifacts):
                with open(path, 'wb') as f:
                    pickle.dump(artifact, f)
    else:
        pca_model = 'None'
        bow_vectors = 'None'
        word2index = 'None'
        index2word = 'None'

    return slda_model, slda_vectors, pca_model, bow_vectors, word2index, index2word

In [None]:
def extract_slda_features(data, slda_model, slda_vectors_filename, test=False):
    """Return sLDA vectors for ``data``, using a pickle cache on disk.

    The cache name is ``slda_vectors_filename`` prefixed with ``'test_'`` when
    ``test`` is truthy and ``'train_'`` otherwise. If that file exists it is
    unpickled and returned; otherwise the vectors are computed with
    ``slda.get_topic_vecs(slda_model, data)``, pickled to that file and
    returned.

    Args:
        data: posts handed to ``slda.get_topic_vecs`` on a cache miss.
        slda_model: model handed to ``slda.get_topic_vecs`` on a cache miss.
        slda_vectors_filename: base cache file name (before prefixing).
        test: selects the ``'test_'`` or ``'train_'`` prefix.
    """
    prefix = 'test_' if test else 'train_'
    cache_path = prefix + slda_vectors_filename

    if not os.path.exists(cache_path):
        print('START: Getting topic sLDA vecs')
        slda_vectors = slda.get_topic_vecs(slda_model, data)
        print('IN PROGRESS: Saving sLDA vectors')
        with open(cache_path, 'wb') as cache_file:
            pickle.dump(slda_vectors, cache_file)
        return slda_vectors

    with open(cache_path, 'rb') as cache_file:
        print('IN PROGRESS: Loading sLDA vectors')
        return pickle.load(cache_file)

def extract_bow_features(data, word2index, pca_model):
    """Compute BOW vectors for ``data`` using an existing PCA model.

    Calls ``bow.get_PCA_vectors_from_post_set`` with the supplied vocabulary
    and ``pca_model`` and returns only the second element of its result (the
    vectors); the first element is discarded.
    """
    _unused_pca, projected_vectors = bow.get_PCA_vectors_from_post_set(
        data, word2index, pca_model=pca_model)
    return projected_vectors

In [None]:
def get_topics(model):
    """Print a table of topics and their regression coefficients.

    For each of the ``model.k`` topics the top 40 words (from
    ``model.get_topic_words``) are joined into one comma-separated string and
    paired with that topic's entry of ``model.get_regression_coef(0)``. Rows
    are ordered by ascending coefficient and printed as a DataFrame with the
    columns "Topic" and "Suicidality Coefficient".

    Args:
        model: object exposing ``k``, ``get_topic_words(k, top_n=...)`` and
            ``get_regression_coef(var_id)``.
    """
    slda_coefficients = np.asarray(model.get_regression_coef(0))

    rows = []
    for k in range(model.k):
        top_words = model.get_topic_words(k, top_n=40)
        # Each entry is a (word, weight) pair; only the word is displayed.
        words = ", ".join(word for word, _weight in top_words)
        rows.append((words, slda_coefficients[k]))

    # Reorder the rows as plain Python tuples so the coefficient column stays
    # numeric (a mixed str/float ndarray would coerce it to strings).
    order = slda_coefficients.argsort()
    topics = pd.DataFrame([rows[i] for i in order],
                          columns=["Topic", "Suicidality Coefficient"])
    print(topics)

In [None]:
def slda_vectorize_data(model, FOLDERPATH):
    """Thin wrapper around ``slda.vectorize_data_set``.

    Returns:
        The three values produced by ``slda.vectorize_data_set(model,
        FOLDERPATH)``, in the same order: ``(X, y, post_vectors)``.
    """
    features, labels, per_post_vectors = slda.vectorize_data_set(model, FOLDERPATH)
    return features, labels, per_post_vectors
    
def minmax_norm(arr):
    """Rescale ``arr`` linearly so its minimum maps to 0 and its maximum to 1.

    Args:
        arr: non-empty numeric array (``np.min`` / ``np.max`` are taken over
            all elements).

    Returns:
        ``(arr - min) / (max - min)``. When every element is equal the range
        is zero; an all-zero array is returned instead of dividing by zero
        (which would yield NaNs).
    """
    low = np.min(arr)
    span = np.max(arr) - low
    shifted = arr - low
    if span == 0:
        # Constant input: `shifted` is already all zeros. Dividing by 1 keeps
        # the same result dtype the normal path would produce.
        return shifted / 1
    return shifted / span

def format_features(slda_model, slda_vectors, pca_model, bow_vectors, FOLDERPATH, num_topics=0,
                    data_type='filter_normal', sLDA=True, BOW=True):
    """Assemble the feature matrix X and label vector y, cached as pickles.

    Three cache files (X, y, post_vectors) are named from ``data_type``,
    ``sLDA``, ``num_topics`` and ``BOW``. If all three exist they are loaded;
    otherwise the features are built and the three files are written.

    Feature construction:
        * sLDA and BOW: for every key of ``slda_vectors``, the min-max
          normalised ``slda_vectors[key][0]`` concatenated with the min-max
          normalised ``bow_vectors[key][0]``; labels are ``slda_vectors[key][1]``.
        * sLDA only: delegated to ``slda_vectorize_data(slda_model, FOLDERPATH)``.
        * BOW only: ``bow_vectors[key][0]`` / ``bow_vectors[key][1]`` per key.

    Args:
        slda_model: used only in the sLDA-only branch.
        slda_vectors: mapping key -> (vector, label, ...) for sLDA features.
        pca_model: accepted for signature compatibility; not used here.
        bow_vectors: mapping key -> (vector, label, ...) for BOW features.
        FOLDERPATH: used only in the sLDA-only branch.
        num_topics: only affects the cache file names.
        data_type: only affects the cache file names.
        sLDA, BOW: select which feature types are combined.

    Returns:
        ``(X, y, post_vectors)``. ``post_vectors`` is the empty string unless
        it came from the sLDA-only branch or from a cache file.

    Raises:
        ValueError: if both ``sLDA`` and ``BOW`` are falsy — there is nothing
            to build, and previously this failed with an UnboundLocalError
            after a truncated cache file had already been created.
    """
    if not (sLDA or BOW):
        raise ValueError('format_features needs at least one of sLDA or BOW to be enabled')

    post_vectors = ''
    suffix = data_type+'_sLDA='+str(sLDA)+'_'+str(num_topics)+'_BOW='+str(BOW)+'.pickle'
    X_file = 'X_'+suffix
    y_file = 'y_'+suffix
    post_vectors_file = 'post_vectors_'+suffix

    if os.path.exists(X_file) and os.path.exists(y_file) and os.path.exists(post_vectors_file):
        with open(X_file,'rb') as f:
            X = pickle.load(f)
        with open(y_file,'rb') as f:
            y = pickle.load(f)
        with open(post_vectors_file,'rb') as f:
            post_vectors = pickle.load(f)
    else:
        if sLDA and BOW:
            print('START: Vectorize sLDA and BOW')
            # Row order follows slda_vectors' key order; bow_vectors must
            # contain every one of those keys.
            keys = list(slda_vectors.keys())
            X = np.array([np.concatenate([minmax_norm(slda_vectors[key][0]),
                                          minmax_norm(bow_vectors[key][0])]) for key in keys])
            y = np.array([slda_vectors[key][1] for key in keys])
        elif sLDA:
            print('START: Vectorize sLDA')
            X, y, post_vectors = slda_vectorize_data(slda_model, FOLDERPATH)
        else:
            print('START: Vectorize BOW')
            keys = list(bow_vectors.keys())
            X = np.array([bow_vectors[key][0] for key in keys])
            y = np.array([bow_vectors[key][1] for key in keys])
        # Flatten the labels to one entry per row, whichever branch built them.
        y = y.reshape(np.shape(y)[0])
        print('DONE: Vectorizing')
        print('IN PROGRESS: Saving vectors')
        with open(X_file,'wb') as f:
            pickle.dump(X, f)
        with open(y_file,'wb') as f:
            pickle.dump(y, f)
        with open(post_vectors_file,'wb') as f:
            pickle.dump(post_vectors, f)
    return X, y, post_vectors

# Post-level Classification Train

In [None]:
def train_post_classifier(X_train, y_train, post_clf_type='RbfSVM', num_topics=40, 
                          data_type='filter_normal', sLDA=False, BOW=False):
    """Return a trained post-level classifier, cached as a pickle on disk.

    The cache file name is built from ``data_type``, ``sLDA``, ``num_topics``,
    ``BOW`` and ``post_clf_type``. If the file exists the classifier is
    unpickled; otherwise a ``post_classifier.PostClassification(post_clf_type)``
    is created, trained with ``.train(X_train, y_train)``, pickled and returned.

    Note: a grid-search variant (``train_grid_search_CV`` with per-classifier
    parameter grids) was sketched here in commented-out form; it is not used.
    """
    cache_path = '_'.join([
        'p_clf', data_type, 'sLDA='+str(sLDA), str(num_topics), 'BOW='+str(BOW), post_clf_type,
    ]) + '.pickle'

    if not os.path.exists(cache_path):
        print('START: Training post classifier')
        p_clf = post_classifier.PostClassification(post_clf_type)
        p_clf.train(X_train, y_train)
        print('IN PROGRESS: Saving post classifier')
        with open(cache_path, 'wb') as cache_file:
            pickle.dump(p_clf, cache_file)
        return p_clf

    print('IN PROGRESS: Loading post classifier')
    with open(cache_path, 'rb') as cache_file:
        return pickle.load(cache_file)

# Post-level Classification Test

In [None]:
def predict_posts(p_clf, X_test, y_test, print_metrics=True):
    """Predict labels for ``X_test`` and optionally report metrics.

    Args:
        p_clf: classifier exposing ``test(X)`` and ``get_metrics(y, y_pred)``.
        X_test: features passed to ``p_clf.test``.
        y_test: reference labels, used only when ``print_metrics`` is truthy.
        print_metrics: when truthy, call ``p_clf.get_metrics(y_test, y_pred)``.

    Returns:
        The value returned by ``p_clf.test(X_test)``.
    """
    predictions = p_clf.test(X_test)
    if not print_metrics:
        return predictions
    p_clf.get_metrics(y_test, predictions)
    return predictions

# User-level Classification Test

In [None]:
def format_post_labels(data, post_vectors, y_pred, FOLDERPATH, sLDA=True, BOW=False):
    """Attach users to post-level predictions and build per-user gold labels.

    Args:
        data: mapping post_id -> record, where ``record[0]`` is the user id
            and ``record[2]`` is the label.
        post_vectors: mapping whose key order lines up with ``y_pred``
            (the i-th key receives ``y_pred[i]``).
        y_pred: post-level predictions, indexable by position.
        FOLDERPATH: forwarded to ``aggregate.aggregate_posts``.
        sLDA, BOW: accepted but not used by this function.

    Returns:
        ``(user_to_post_label, user_to_y)`` where ``user_to_post_label`` is the
        result of ``aggregate.aggregate_posts`` and ``user_to_y`` is a
        ``defaultdict(int)`` mapping user id -> 1 if the label is 'd', else 0
        (when a user has several posts, the last one iterated wins).
    """
    # Binary gold label per user: 'd' -> 1, anything else -> 0.
    user_to_y = defaultdict(int)
    for post_id in tqdm.tqdm(data.keys()):
        record = data[post_id]
        is_d = record[2] == 'd'
        user_to_y[record[0]] = 1 if is_d else 0

    # Pair each predicted post with its author.
    post2user_ypred = defaultdict(list)
    for position, post_id in enumerate(post_vectors.keys()):
        author = data[post_id][0]
        post2user_ypred[post_id] = [author, y_pred[position]]

    user_to_post_label = aggregate.aggregate_posts(FOLDERPATH, post2user_ypred)
    return user_to_post_label, user_to_y

In [None]:
def format_user_pred(user_to_y, user_to_ypred):
    """Turn two user-keyed mappings into aligned gold / predicted lists.

    Iterates the users of ``user_to_ypred`` in order and looks each one up in
    both mappings.

    Returns:
        ``(user_y, user_y_pred)`` — two lists of equal length, one entry per
        user in ``user_to_ypred``.
    """
    users = list(user_to_ypred)
    user_y = [user_to_y[user_id] for user_id in users]
    user_y_pred = [user_to_ypred[user_id] for user_id in users]
    return user_y, user_y_pred

def predict_users(user_to_post_label, user_to_y, user_clf_type='Max'):
    """Aggregate post-level labels into user-level predictions and report metrics.

    Args:
        user_to_post_label: input for ``user_classifier.UserClassification``.
        user_to_y: gold user labels; also passed to ``find_threshold`` for the
            'Threshold' strategy.
        user_clf_type: aggregation strategy —
            'Max' -> ``u_clf.argmax()``,
            'Threshold' -> ``u_clf.find_threshold(user_to_y)``,
            'Minimum' -> ``u_clf.minimum(1)``.

    Raises:
        ValueError: if ``user_clf_type`` is not one of the three strategies
            (previously this surfaced later as an UnboundLocalError).
    """
    supported = ('Max', 'Threshold', 'Minimum')
    # Validate before constructing the classifier so a typo fails immediately.
    if user_clf_type not in supported:
        raise ValueError('Unknown user_clf_type %r; expected one of %s'
                         % (user_clf_type, ', '.join(supported)))

    u_clf = user_classifier.UserClassification(user_to_post_label)
    if user_clf_type == 'Max':
        user_to_ypred = u_clf.argmax()
    elif user_clf_type == 'Threshold':
        user_to_ypred = u_clf.find_threshold(user_to_y)
    else:  # 'Minimum'
        user_to_ypred = u_clf.minimum(1)

    user_y, user_ypred = format_user_pred(user_to_y, user_to_ypred)
    u_clf.get_metrics(user_y, user_ypred)

# Wrapped

In [None]:
def wrapped(train_data_processed=True, dev_data_processed=True, sLDA=True, 
            data_type='filter_normal', num_topics=40, BOW=True, post_clf_type='RbfSVM',
            user_clf_types=('Max',)):
    """Run the full pipeline: train on the training set, evaluate on the dev set.

    Steps: load/process training data, train feature models, build training
    features, train the post classifier; then load/process dev data, extract
    dev features, predict posts, and run each requested user-level strategy.

    Args:
        train_data_processed: if truthy, load processed training data from
            TRAIN_FOLDERPATH instead of re-processing the raw files.
        dev_data_processed: same, for the dev data and DEV_FOLDERPATH.
        sLDA, BOW: which feature types to use; at least one must be truthy.
        data_type: tag embedded in cache file names.
        num_topics: sLDA topic count (also part of cache file names).
        post_clf_type: classifier name passed to ``train_post_classifier``.
        user_clf_types: iterable of strategies passed one by one to
            ``predict_users``.

    Raises:
        ValueError: if both ``sLDA`` and ``BOW`` are falsy.
    """
    if not (sLDA or BOW):
        raise ValueError('wrapped() needs at least one feature type: set sLDA=True and/or BOW=True')

    # Make sure you have created TRAIN_FOLDERPATH and DEV_FOLDERPATH directories on local
    TRAIN_POSTPATH = './Data/crowd/train/shared_task_posts.csv'
    TRAIN_LABELPATH = './Data/crowd/train/crowd_train.csv'
    TRAIN_USERPATH = './Data/crowd/train/task_C_train.posts.csv'

    DEV_POSTPATH = './Data/crowd/test/shared_task_posts_test.csv'
    DEV_LABELPATH = './Data/crowd/test/crowd_test_C.csv'
    DEV_USERPATH = './Data/crowd/test/task_C_test.posts.csv'

    TRAIN_FOLDERPATH = './Processing/crowd_processed/'
    DEV_FOLDERPATH = './Processing/crowd_processed_test/'

    # Process or load processed training data
    if not train_data_processed:
        user_to_post, post_to_metadata, filtered_data, sw_posts, sw_timestamps = process_data(
            TRAIN_POSTPATH, TRAIN_LABELPATH, TRAIN_USERPATH, TRAIN_FOLDERPATH)
    else:
        user_to_post, post_to_metadata, filtered_data, sw_posts, sw_timestamps = load_processed_data(TRAIN_FOLDERPATH)

    # print class sizes
    print_label_size(filtered_data)
    slda_model, slda_vectors, pca_model, bow_vectors, word2index, index2word = train_feature_model(
        filtered_data, data_type=data_type, sLDA=sLDA, BOW=BOW, num_topics=num_topics)

    # format_features caches by (data_type, sLDA, num_topics, BOW) only. Pass
    # the real num_topics so a changed topic count invalidates the training
    # cache, and tag data_type with 'train_' so the training cache can never
    # share a file name with the dev cache written further down.
    X_train, y_train, post_vectors = format_features(slda_model, slda_vectors, pca_model,
                                                     bow_vectors, TRAIN_FOLDERPATH,
                                                     num_topics=num_topics,
                                                     data_type='train_'+data_type,
                                                     sLDA=sLDA, BOW=BOW)
    y_train = y_train.reshape(np.shape(y_train)[0])

    p_clf = train_post_classifier(X_train, y_train, post_clf_type=post_clf_type, num_topics=num_topics,
                                  data_type=data_type, sLDA=sLDA, BOW=BOW)

    print('DONE: Training complete')

    # Process or load processed development data
    if not dev_data_processed:
        user_to_post, post_to_metadata, filtered_data, sw_posts, sw_timestamps = process_data(
            DEV_POSTPATH, DEV_LABELPATH, DEV_USERPATH, DEV_FOLDERPATH)
    else:
        user_to_post, post_to_metadata, filtered_data, sw_posts, sw_timestamps = load_processed_data(DEV_FOLDERPATH)

    print_label_size(filtered_data)

    if sLDA:
        slda_vectors_filename = 'slda_'+data_type+'_'+str(num_topics)+'_vectors.pickle'
        slda_vectors = extract_slda_features(filtered_data, slda_model, slda_vectors_filename, test=True)
    if BOW:
        bow_vectors = extract_bow_features(filtered_data, word2index, pca_model)

    X_test, y_test, post_vectors = format_features(slda_model, slda_vectors, pca_model, bow_vectors,
                                                   DEV_FOLDERPATH, data_type=data_type,
                                                   sLDA=sLDA, BOW=BOW, num_topics=num_topics)

    y_pred = predict_posts(p_clf, X_test, y_test, print_metrics=True)

    # The i-th prediction belongs to the i-th key of the mapping X_test was
    # built from. With BOW only, slda_vectors is the placeholder string 'None',
    # so the BOW vectors must supply the post ids.
    # NOTE(review): in the sLDA-only case X_test comes from
    # slda.vectorize_data_set; confirm its row order matches slda_vectors.
    post_key_source = slda_vectors if sLDA else bow_vectors
    user_to_post_label, user_to_y = format_post_labels(filtered_data, post_key_source, y_pred, DEV_FOLDERPATH)

    for user_clf_type in user_clf_types:
        print('START: User classification post:', user_clf_type)
        predict_users(user_to_post_label, user_to_y, user_clf_type=user_clf_type)
        

In [None]:
# Post-level classifiers understood by the pipeline; the index below picks one.
post_clf_types = ['LogReg', 'LinearSVM', 'RbfSVM', 'AdaBoost', 'RandomForest', 'MLP']
# User-level strategies of interest. Not passed to wrapped() here, so the run
# below uses wrapped()'s own default for user_clf_types.
user_clf_types = ['Max', 'Threshold']
wrapped(train_data_processed=True, BOW=True, post_clf_type=post_clf_types[0])