In [None]:
# Default Packages
import os
import sys
import pickle
import numpy as np
import pandas as pd
import os.path as path

# Torch
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
# Using standard huggingface tokenizer for compatibility
from transformers import (BertTokenizer, BertModel, 
                          get_linear_schedule_with_warmup)

# PyTorch Lightning
import pytorch_lightning as pl
from pytorch_lightning.metrics.functional import accuracy, f1, auroc, recall, precision
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import WandbLogger

# Internal Packages
from datasets import *
from models import *

In [None]:

def prepare_wandb (project, display_name, lr):
    """Build the Weights & Biases logger for one training run.

    Args:
        project: Forwarded to ``WandbLogger`` as ``project``.
        display_name: Forwarded to ``WandbLogger`` as ``name``.
        lr: Learning rate stored under ``learning_rate`` in the run config.

    Returns:
        A ``WandbLogger`` whose ``config`` records the notebook-level
        settings (read from module globals) together with ``lr``.
    """
    # Snapshot of the hyper-parameters in effect when the logger is created.
    run_config = dict(
        model=BERT_MODEL,
        classes=CLASSES,
        batch_size=BATCH_SIZE,
        num_epochs=NUM_EPOCHS,
        learning_rate=lr,
        early_stopping_patience=PATIENCE,
        max_example_len=MAX_EXAMPLE_LEN,
        n_examples=N_EXAMPLES,
    )
    return WandbLogger(name=display_name, project=project, config=run_config)

def generate_callbacks (display_name):
    """Create the checkpoint and early-stopping callbacks for one run.

    A directory ``<RUNNING_DIR>/model_checkpoints/<display_name>`` is created
    (if it does not exist yet) and used as the checkpoint location.

    Args:
        display_name: Run name; also the name of the checkpoint sub-directory.

    Returns:
        Tuple ``(checkpoint_callback, early_stopping_callback)``. Both monitor
        ``valid_loss``; the checkpoint callback keeps only the single best
        (lowest) checkpoint under the file name ``best-checkpoint``.
    """
    save_dir = path.join (RUNNING_DIR, 'model_checkpoints', display_name)
    # Display names are drawn at random from a word list, so a name can repeat
    # across runs; exist_ok avoids aborting with FileExistsError in that case.
    os.makedirs(save_dir, exist_ok=True)

    early_stopping_callback = EarlyStopping(
        monitor='valid_loss', patience=PATIENCE)

    checkpoint_callback = ModelCheckpoint(
        dirpath=save_dir,
        filename="best-checkpoint",
        save_top_k=1,
        verbose=True,
        monitor="valid_loss",
        mode="min"
    )

    return checkpoint_callback, early_stopping_callback

def generate_trainer_params (project, lr):
    """Assemble the logger and callback arguments for a ``pl.Trainer``.

    A run name is built from two words drawn at random (with replacement)
    from ``<RUNNING_DIR>/words.txt``, printed, and then used for both the
    W&B logger and the checkpoint directory.

    Args:
        project: W&B project name.
        lr: Learning rate recorded in the W&B run config.

    Returns:
        Dict with the keys ``logger``, ``checkpoint_callback`` and
        ``callbacks``, suitable for unpacking into ``generate_trainer``.

    Raises:
        ValueError: If ``words.txt`` contains no non-blank line.
    """
    with open (path.join(RUNNING_DIR, 'words.txt')) as f:
        # Ignore blank lines: splitting the raw text on '\n' would keep an
        # empty entry for a trailing newline, producing names such as "foo-".
        words = [line.strip() for line in f if line.strip()]
    if not words:
        raise ValueError('words.txt contains no words to build a display name from')

    display_name = '-'.join (np.random.choice (words, size=2))
    print ('Using display_name: {} for project: {}'.format(display_name, project))

    wandb_logger = prepare_wandb (project, display_name, lr)
    checkpoint_callback, early_stopping_callback = generate_callbacks (display_name)

    return {
        'logger':wandb_logger,
        'checkpoint_callback':checkpoint_callback,
        'callbacks':[early_stopping_callback]
    }

def generate_trainer (trainer_params):
    """Instantiate a ``pl.Trainer`` with the notebook's fixed settings.

    Args:
        trainer_params: Mapping of additional ``pl.Trainer`` keyword
            arguments (e.g. the dict from ``generate_trainer_params``).
            It must not repeat any of the fixed options below.

    Returns:
        The configured ``pl.Trainer``.
    """
    # Options shared by every trainer created in this notebook.
    fixed_options = {
        'max_epochs': NUM_EPOCHS,
        'progress_bar_refresh_rate': 30,
        'gpus': 1,
    }
    return pl.Trainer(**fixed_options, **trainer_params)

In [None]:
def read_twitterSI(data_path):
    """Load the Twitter CSV into a DataFrame with a fresh 0..n-1 index.

    Args:
        data_path: Path (or file-like object) accepted by ``pd.read_csv``.

    Returns:
        The loaded ``DataFrame``; any previous index is discarded.
    """
    tweets = pd.read_csv(data_path)
    return tweets.reset_index(drop=True)

def read_redditSI (data_path):
    """Load the Reddit CSV and return a random sample of ``N_EXAMPLES`` rows.

    Only the ``text`` and ``class`` columns are kept; ``class`` is renamed
    to ``label`` and the sampled rows get a fresh 0..n-1 index.

    Args:
        data_path: Path to the Reddit CSV file.

    Returns:
        ``DataFrame`` with the columns ``text`` and ``label``.
    """
    # Keep only the columns the classifier needs.
    posts = pd.read_csv (data_path)
    posts = posts[['text', 'class']]

    # Sampling is unseeded here, so the selected rows differ between runs.
    sampled = posts.sample (N_EXAMPLES).reset_index(drop=True)

    return sampled.rename (columns={'class': 'label'})


In [None]:
# Project root holding Datasets/, words.txt and model_checkpoints/.
# Set the SI_RUNNING_DIR environment variable to run on another machine; the
# original location stays as the fallback so existing setups are unaffected.
RUNNING_DIR = os.environ.get('SI_RUNNING_DIR', r'C:\Code\NLP\ProfileLevel_SI_Classifier')
datasets_dir = path.join(RUNNING_DIR, 'Datasets')

# Model / training configuration (also logged to W&B by prepare_wandb).
BERT_MODEL = 'bert-base-uncased'
CLASSES = ['suicidal']
BATCH_SIZE = 12
NUM_EPOCHS = 10
# LEARNING_RATE = 2e-5
R_LEARNING_RATE = 1.5e-5   # Reddit stage
T_LEARNING_RATE = 1.5e-5   # Twitter stage
PATIENCE = 2               # early-stopping patience
MAX_EXAMPLE_LEN = 100

# Number of Reddit rows sampled by read_redditSI.
N_EXAMPLES = 3000

reddit_df = read_redditSI (
    path.join (datasets_dir, 'Implicitly_Labeled_Suicide_Reddit.csv'))
twitter_df = read_twitterSI(
    path.join (datasets_dir, 'Origional Suicidal Tweets.csv'))

# Tokenization and Batching
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL)
reddit_data_module = RedditImplicitDataModule (
    reddit_df,
    tokenizer,
    splits=[0.8,0.2],
    max_example_len = MAX_EXAMPLE_LEN,
    shuffle=True,
    batch_size=BATCH_SIZE,
)

twitter_data_module = TwitterDataModule (
    twitter_df,
    tokenizer,
    splits=[0.7,0.3],
    max_example_len = MAX_EXAMPLE_LEN,
    shuffle=True,
    batch_size=BATCH_SIZE,
)


# Scheduler length per stage: full batches per epoch times number of epochs.
# NOTE(review): these counts use the full frame length, while the data modules
# receive split fractions — confirm whether the schedule should be sized from
# the training split only.
reddit_training_steps = (len(reddit_df)//BATCH_SIZE)*NUM_EPOCHS
twitter_training_steps = (len(twitter_df)//BATCH_SIZE)*NUM_EPOCHS

# Stage-1 model, configured with the Reddit schedule; warm-up is one fifth of
# the total steps (true division, so the value passed is a float).
model = SuicideClassifier (
    output_classes= CLASSES,
    training_steps = reddit_training_steps,
    warmup_steps=reddit_training_steps/5,
    lr=R_LEARNING_RATE,
    metrics=['ROC','binary_report']
)

In [None]:
# Logger/callback settings for each stage. Every call draws its own random
# display name, prints it, and creates a checkpoint directory for it.
reddit_trainer_params = generate_trainer_params ("BERT Implicitly Labeled Reddit v2", R_LEARNING_RATE)
twitter_trainer_params = generate_trainer_params ("TwitterSI Classification", T_LEARNING_RATE)

# One Trainer per stage, each with its own logger and callbacks.
reddit_trainer = generate_trainer (reddit_trainer_params)
twitter_trainer = generate_trainer (twitter_trainer_params)

In [None]:
# Stage 1: train the classifier on the Reddit data module.
reddit_trainer.fit(model, reddit_data_module)
# Evaluate with the trainer's defaults (no model or dataloader is passed).
# NOTE(review): presumably this scores the best checkpoint on the data
# module's test split — confirm against the installed pytorch_lightning version.
reddit_trainer.test()



In [None]:
# Stage 2 starts from the checkpoint the Reddit trainer's checkpoint callback
# recorded as best, with schedule length, warm-up and learning rate replaced
# by the Twitter-stage values.
# NOTE(review): assumes the remaining constructor arguments (e.g. output_classes)
# are restored from the checkpoint — confirm in SuicideClassifier.
loaded_model = SuicideClassifier.load_from_checkpoint(
  checkpoint_path = reddit_trainer.checkpoint_callback.best_model_path,
  training_steps = twitter_training_steps, 
  warmup_steps=twitter_training_steps/5,
  lr = T_LEARNING_RATE
)


In [None]:
# Stage 2: continue training the model reloaded from the Reddit checkpoint on the Twitter data module.
twitter_trainer.fit (loaded_model, twitter_data_module)