In [None]:
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, TensorDataset
import torch

In [None]:
# Sign in through Colab's auth helper so the gcloud/gsutil commands used in this
# notebook can reach the storage bucket.
from google.colab import auth
auth.authenticate_user()
# Download the Cloud SDK installer and pipe it straight into bash, then run the
# interactive `gcloud init` setup.
# NOTE(review): `curl | bash` executes a remote script unverified — confirm this is acceptable.
!curl https://sdk.cloud.google.com | bash
!gcloud init

In [None]:
# File names of the CSVs stored in the gs://masterthesisbert bucket.

# Aladag data: raw sample, preprocessed sample, and the labeled file that is
# loaded as the evaluation split.
df_name = "Aladag_sample.csv"
aladag_preprocessed = "Aladag_sample_preprocessed.csv"
aladag_preprocessed_test = "Aladag_labeled_preprocessed.csv"

# SMHD splits. The dev split used to be assigned to `mental_condition_name_test`
# a second time, which silently discarded "SMHD_test.csv"; it now has its own name.
mental_condition_name = "SMHD_train.csv"
mental_condition_name_test = "SMHD_test.csv"
mental_condition_name_valid = "SMHD_dev.csv"

model_name = ""
df_mental_preprocessed = "df_mental_balanced_preprocessed_preprocessed.csv"
df_mental_preprocessed_test = "df_mental_valid_preprocessed_preprocessed.csv"

# The download cell refers to these files through `*_valid` names, so expose them
# as aliases of the evaluation files defined above.
# NOTE(review): assumes "valid" and "test" denote the same file for these two datasets — confirm.
aladag_preprocessed_valid = aladag_preprocessed_test
df_mental_preprocessed_valid = df_mental_preprocessed_test

In [None]:
# Copy every configured CSV from the bucket into /content.
# IPython replaces each `{name}` with the value of the Python variable of that
# name, so all of these variables must be assigned before this cell runs.
!gsutil cp  gs://masterthesisbert/{df_name} /content/{df_name}
!gsutil cp  gs://masterthesisbert/{aladag_preprocessed} /content/{aladag_preprocessed}
!gsutil cp  gs://masterthesisbert/{aladag_preprocessed_valid} /content/{aladag_preprocessed_valid}
!gsutil cp  gs://masterthesisbert/{mental_condition_name} /content/{mental_condition_name}
!gsutil cp  gs://masterthesisbert/{mental_condition_name_test} /content/{mental_condition_name_test}
!gsutil cp  gs://masterthesisbert/{mental_condition_name_valid} /content/{mental_condition_name_valid}
!gsutil cp  gs://masterthesisbert/{df_mental_preprocessed} /content/{df_mental_preprocessed}
!gsutil cp  gs://masterthesisbert/{df_mental_preprocessed_valid} /content/{df_mental_preprocessed_valid}

# Mental health condition extraction

In [None]:
#only one dataset at the time
before_training = False
aladag = True
preprocess = False
reddit_500 = False
smhd = False
preprocess = False
smhd_preprocessed = False

if aladag == True:
    df_mental = pd.read_csv(aladag_preprocessed)
#df_mental = df_mental.astype({"text": str, "label":str}, errors='raise')
    df_mental = df_mental.rename(columns={"binary_annotation": "label", "selftext": "text"})

    df_mental_test = pd.read_csv(aladag_preprocessed_test)
    df_mental_test = df_mental_test.rename(columns={"binary_annotation": "label", "selftext": "text"})

elif reddit_500:

    !gsutil cp gs://masterthesisbert/reddit_500_final_val.csv  /content/reddit_500_final_val.csv
    !gsutil cp gs://masterthesisbert/reddit_500_final_train.csv  /content/reddit_500_final_train.csv

    df_name2 = "/content/reddit_500_final_val.csv"
    df_mental_test = pd.read_csv(df_name2)
    

    df_name = "/content/reddit_500_final_train.csv"
    df_mental= pd.read_csv(df_name)


    df_mental = df_mental.rename(columns={"Label": "label", "selftext": "text"})
    df_mental_test = df_mental_test.rename(columns={"Label": "label", "selftext": "text"})


elif (smhd == True and preprocess == False):

    #add the names of finaly preprocessed dataframes here 
    !gsutil cp  gs://masterthesisbert/df_mental_balanced_preprocessed_preprocessed.csv /content/df_mental_balanced_preprocessed_preprocessed.csv
    !gsutil cp  gs://masterthesisbert/df_mental_valid_preprocessed_preprocessed.csv /content/df_mental_valid_preprocessed_preprocessed.csv

!gsutil cp  gs://masterthesisbert/finetuned_BERT_10classes_epoch_3.model /content/finetuned_BERT_10classes_epoch_3.model

In [None]:
# Install the Hugging Face packages used by the tokenizer and model cells.
# NOTE(review): versions are not pinned, so a fresh runtime may pull a release
# whose API differs from the one this notebook was written against.
!pip install tokenizers
!pip install transformers

In [None]:
from transformers import BertTokenizer

In [None]:
# Lower-casing WordPiece tokenizer belonging to the `bert-base-uncased` checkpoint.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

In [None]:
def _encode_texts(texts, max_length=256):
    """Tokenize `texts` into fixed-length PyTorch tensors.

    Every sequence is padded to `max_length` and anything longer is truncated.
    Truncation is requested explicitly rather than left to the tokenizer's
    fallback, and `padding='max_length'` replaces the deprecated
    `pad_to_max_length=True`.

    Args:
        texts: sequence of strings to encode.
        max_length: number of tokens per encoded sequence.

    Returns:
        The tokenizer's batch encoding, holding `input_ids` and `attention_mask`.
    """
    return tokenizer.batch_encode_plus(
        texts,
        add_special_tokens=True,
        return_attention_mask=True,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='pt',
    )


# Training split and evaluation split go through the identical encoding.
encoded_data_train = _encode_texts(df_mental.text.values)
encoded_data_val = _encode_texts(df_mental_test.text.values)


In [None]:
# Map every distinct training label to an integer id, ids following sorted label order.
possible_labels = sorted(df_mental.label.unique())
label_dict = {possible_label: index for index, possible_label in enumerate(possible_labels)}
label_dict

In [None]:
# Add the integer `code` column to both splits by looking labels up in `label_dict`;
# `replace` leaves any label missing from the dict untouched.
for frame in (df_mental, df_mental_test):
    frame['code'] = frame['label'].replace(label_dict)

In [None]:
# Only a cell's last expression is rendered, so the evaluation-split counts are
# displayed explicitly; otherwise they are computed and thrown away.
display(df_mental_test.code.value_counts())
# Number of distinct codes in the training split.
len(df_mental.code.value_counts())

In [None]:
# Class balance of the training split.
df_mental['label'].value_counts()

In [None]:
# Training split tensors.
input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(df_mental.code.values)

# Evaluation split tensors. The encoding cell stores them as `encoded_data_val`;
# the previous `encoded_data_test` was never defined and raised a NameError.
input_ids_test = encoded_data_val['input_ids']
attention_masks_test = encoded_data_val['attention_mask']
labels_test = torch.tensor(df_mental_test.code.values)

dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_test = TensorDataset(input_ids_test, attention_masks_test, labels_test)
# The evaluation dataset is referred to under both names in this notebook, so
# keep both bound to the same object.
dataset_val = dataset_test

In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 3

# Both loaders iterate in dataset order. The predictions made from them are later
# concatenated column-wise to the source DataFrames, so row i of the logits must
# belong to row i of the frame; a RandomSampler would scramble that pairing.
# Samplers are built over the datasets themselves rather than the DataFrames.
# NOTE(review): if `dataloader_train` is ever used for actual training, give that
# step its own shuffled loader.
dataloader_train = DataLoader(dataset_train,
                              sampler=SequentialSampler(dataset_train),
                              batch_size=batch_size)

dataloader_test = DataLoader(dataset_val,
                             sampler=SequentialSampler(dataset_val),
                             batch_size=batch_size)

In [None]:
from transformers import BertForSequenceClassification
# BERT encoder with a 10-way classification head; only logits (and loss, when
# labels are passed) are returned — attentions and hidden states are switched off.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=10,
    output_attentions=False,
    output_hidden_states=False,
)

In [None]:
# `transformers.AdamW` is deprecated and absent from recent releases; the PyTorch
# optimizer is the replacement. `weight_decay=0.0` reproduces the default of the
# transformers implementation (PyTorch would otherwise default to 0.01).
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

optimizer = AdamW(model.parameters(),
                  lr=1e-5,
                  eps=1e-8,
                  weight_decay=0.0)

epochs = 3

# Linear decay of the learning rate to zero over all training steps, no warm-up.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)


from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

def f1_score_func(preds, labels):
    """Return the weighted F1 score of the arg-max class of `preds` against `labels`.

    Args:
        preds: per-class scores; the arg-max is taken along axis 1.
        labels: true class ids; flattened before scoring.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    return f1_score(y_true=labels.flatten(), y_pred=predicted_classes, average='weighted')

def accuracy_score_func(preds, labels):
    """Return the accuracy of the arg-max class of `preds` against `labels`.

    `accuracy_score` takes no `average` argument (accuracy is a single ratio over
    all samples), so passing `average='weighted'` raised a TypeError on every call.

    Args:
        preds: per-class scores; the arg-max is taken along axis 1.
        labels: true class ids; flattened before scoring.
    """
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return accuracy_score(labels_flat, preds_flat)

def precision_score_func(preds, labels):
    """Return the weighted precision of the arg-max class of `preds` against `labels`.

    Args:
        preds: per-class scores; the arg-max is taken along axis 1.
        labels: true class ids; flattened before scoring.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    return precision_score(y_true=labels.flatten(), y_pred=predicted_classes, average='weighted')

def recall_score_func(preds, labels):
    """Return the weighted recall of the arg-max class of `preds` against `labels`.

    Args:
        preds: per-class scores; the arg-max is taken along axis 1.
        labels: true class ids; flattened before scoring.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    return recall_score(y_true=labels.flatten(), y_pred=predicted_classes, average='weighted')


def accuracy_per_class(preds, labels):
    """Print, for every class present in `labels`, correct predictions over class size.

    Class ids are translated back to label names through the module-level
    `label_dict`. Nothing is returned.

    Args:
        preds: per-class scores; the arg-max is taken along axis 1.
        labels: true class ids; flattened before use.
    """
    index_to_name = {idx: name for name, idx in label_dict.items()}

    predicted = np.argmax(preds, axis=1).flatten()
    actual = labels.flatten()

    for class_id in np.unique(actual):
        in_class = actual == class_id
        hits = int((predicted[in_class] == class_id).sum())
        total = int(in_class.sum())
        print(f'Class: {index_to_name[class_id]}')
        print(f'Accuracy: {hits}/{total}\n')
# Use the GPU when one is visible to PyTorch, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model.to(device)

print(device)

In [None]:
import random
from tqdm.notebook import tqdm

# Seed Python, NumPy and PyTorch (CPU plus every CUDA device) with one value.
seed_val = 17
for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(seed_val)


def evaluate(dataloader_val, only_predict = False):
    """Run the module-level `model` over `dataloader_val` without gradient tracking.

    Each batch is a tuple whose first two items are input ids and attention mask;
    the third item holds the labels and is only passed to the model when
    `only_predict` is false.

    Args:
        dataloader_val: iterable of batches of tensors.
        only_predict: when true, skip the loss and return logits only.

    Returns:
        `only_predict=True`: array of logits for all batches, concatenated on axis 0.
        Otherwise: `(loss_val_avg, predictions, true_vals)` — the summed batch loss
        divided by the number of batches, the concatenated logits, and the
        concatenated label ids.
    """
    model.eval()

    running_loss = 0
    logit_chunks, label_chunks = [], []

    for batch in dataloader_val:
        batch = tuple(t.to(device) for t in batch)

        inputs = {'input_ids': batch[0], 'attention_mask': batch[1]}
        if not only_predict:
            inputs['labels'] = batch[2]

        with torch.no_grad():
            outputs = model(**inputs)

        if only_predict:
            # Without labels the first output is the logits.
            logits = outputs[0]
        else:
            # With labels the model returns the loss first, then the logits.
            running_loss += outputs[0].item()
            logits = outputs[1]
            label_chunks.append(inputs['labels'].cpu().numpy())

        logit_chunks.append(logits.detach().cpu().numpy())

    if only_predict:
        return np.concatenate(logit_chunks, axis=0)

    loss_val_avg = running_loss/len(dataloader_val)
    predictions = np.concatenate(logit_chunks, axis=0)
    true_vals = np.concatenate(label_chunks, axis=0)
    return loss_val_avg, predictions, true_vals

In [None]:
# Restore the fine-tuned weights; the checkpoint is deserialized onto the CPU and
# then copied into the already-constructed model.
model.load_state_dict(
    torch.load(
        'finetuned_BERT_10classes_epoch_3.model',
        map_location=torch.device('cpu'),
    )
)

#loss_val_avg, predictions, true_vals = evaluate(dataloader_validation)

In [None]:
# Logits of the fine-tuned classifier for the training split.
predictions_train = evaluate(dataloader_val=dataloader_train, only_predict=True)

In [None]:
# Logits of the fine-tuned classifier for the evaluation split.
predictions_test = evaluate(dataloader_val=dataloader_test, only_predict=True)

In [None]:
# Column names for the ten classifier outputs: the condition names lower-cased
# and sorted alphabetically.
# NOTE(review): assumes the checkpoint's class indices follow this sorted order — confirm
# against the label mapping used when the model was fine-tuned.
a = ["anxiety", "depression", "control", "ADHD", "bipolar disorder", "autism", "PTSD", "OCD", "schizophrenia", "eating disorder"]
diseases = sorted(name.lower() for name in a)

In [None]:
# One column of logits per condition for the evaluation split.
mental_results_test = pd.DataFrame(data=predictions_test, columns=diseases)

In [None]:
# One column of logits per condition for the training split.
mental_results_train = pd.DataFrame(data=predictions_train, columns=diseases)

In [None]:
# Append the condition logits as new columns, aligned on the index.
# Running this cell a second time appends the same columns again.
df_mental_test = pd.concat(objs=[df_mental_test, mental_results_test], axis="columns")

In [None]:
# Training frame plus its condition logits, aligned on the index; the original
# `df_mental` is left untouched.
df_mental_train = pd.concat(objs=[df_mental, mental_results_train], axis="columns")

# Extract emotions

In [None]:
#4 basic emotions from text

In [None]:
# `transformers` is needed for the emotion model below; installed without a pinned version.
!pip install transformers

In [None]:
# `datasets` provides the `Dataset` class used by the emotion classifier; installed
# without a pinned version.
!pip install datasets

In [None]:
# Fetch the XLM-EMO sources; the emotion classifier imports `prepare_dataset` from this checkout.
!git clone https://github.com/MilaNLProc/xlm-emo.git

Cloning into 'xlm-emo'...
remote: Enumerating objects: 90, done.[K
remote: Counting objects: 100% (90/90), done.[K
remote: Compressing objects: 100% (61/61), done.[K
remote: Total 90 (delta 40), reused 69 (delta 24), pack-reused 0[K
Unpacking objects: 100% (90/90), done.


In [None]:
# Rename the checkout: a hyphen is not valid in a Python package name, and the
# import below expects the directory to be called `xlm_emo`.
!mv /content/xlm-emo /content/xlm_emo

In [None]:
from transformers import Trainer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd
from datasets import Dataset
import numpy as np
from typing import List
from xlm_emo.xlm_emo.dataset import prepare_dataset

class EmotionClassifier:
    """Thin wrapper around the `MilaNLProc/xlm-emo-t` sequence classifier.

    Only the "t" variant is supported.
    """

    # Emotion name for each output index (0..3), as listed in the original mapping.
    LABELS = ("anger", "fear", "joy", "sadness")

    def __init__(self, model="t"):
        """Load tokenizer and weights for the requested variant.

        Args:
            model: variant identifier; only "t" is available.

        Raises:
            NotImplementedError: for any other variant. It is a subclass of
                `Exception`, so callers that catch `Exception` keep working.
        """
        if model != "t":
            raise NotImplementedError("Not Yet Implemented")

        self.tokenizer = AutoTokenizer.from_pretrained("MilaNLProc/xlm-emo-t")
        self.model = AutoModelForSequenceClassification.from_pretrained("MilaNLProc/xlm-emo-t")

    def predict(self, text: List):
        """Classify every string in `text`.

        The strings are placed in a one-column frame named "texts", converted to a
        `datasets.Dataset`, prepared with `prepare_dataset` and this instance's
        tokenizer, and passed to `Trainer.predict`.

        Args:
            text: list of input strings.

        Returns:
            The object returned by `Trainer.predict`; callers in this notebook
            read its first element to get the per-class scores.
        """
        df = pd.DataFrame({"texts": text})
        dataset = prepare_dataset(Dataset.from_pandas(df), self.tokenizer)
        return Trainer(model=self.model).predict(dataset)

In [None]:
# Instantiate the emotion classifier with its only available variant.
ec = EmotionClassifier(model="t")

In [None]:
# First element of the prediction result: per-emotion scores for the evaluation texts.
emot_test = ec.predict(text=df_mental_test["text"].tolist())[0]

In [None]:
# First element of the prediction result: per-emotion scores for the training texts.
emot_train = ec.predict(text=df_mental["text"].tolist())[0]

In [None]:
# Column names for the four emotion scores, in output order.
emotion_columns = [
    "anger",
    "fear",
    "joy",
    "sadness",
]

In [None]:
# Evaluation-split emotion scores as a labelled frame.
emot_test_df = pd.DataFrame(data=emot_test, columns=emotion_columns)

In [None]:
# Training-split emotion scores as a labelled frame.
emot_train_df = pd.DataFrame(data=emot_train, columns=emotion_columns)

In [None]:
# Append the emotion scores as new columns, aligned on the index.
# Running this cell a second time appends the same columns again.
df_mental_test = pd.concat(objs=[df_mental_test, emot_test_df], axis="columns")

In [None]:
# Append the emotion scores as new columns, aligned on the index.
# Running this cell a second time appends the same columns again.
df_mental_train = pd.concat(objs=[df_mental_train, emot_train_df], axis="columns")

In [None]:
# Preview the first rows only instead of rendering the whole frame.
df_mental_train.head()

# Extract sentiment

In [None]:
import numpy as np
from scipy.special import softmax
import csv
import urllib.request

def preprocess(text):
    """Mask user mentions and links in `text`.

    The text is split on single spaces. A token that starts with '@' and is
    longer than one character becomes '@user'; a token that starts with 'http'
    becomes 'http'. Tokens are joined back with single spaces, so the original
    spacing is preserved.
    """
    def _mask(token):
        if token.startswith('@') and len(token) > 1:
            token = '@user'
        return 'http' if token.startswith('http') else token

    return " ".join(_mask(token) for token in text.split(" "))

# Sentiment checkpoint from CardiffNLP for the selected task.
task = 'sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}-latest"
model_senti = AutoModelForSequenceClassification.from_pretrained(MODEL)

# Download the tab-separated id -> label mapping and keep the label names
# (second field of every row that has more than one field).
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    html = f.read().decode('utf-8').split("\n")
csvreader = csv.reader(html, delimiter='\t')
labels = []
for row in csvreader:
    if len(row) > 1:
        labels.append(row[1])

In [None]:
# Create the three sentiment columns, zero-filled, on both splits.
for frame in (df_mental_test, df_mental_train):
    frame[["negative", "neutral", "positive"]] = 0.0

In [None]:
def get_sentiment(df):
    """Fill the `negative`, `neutral` and `positive` columns of `df` with sentiment scores.

    Each row's `text` is masked with `preprocess`, tokenized, scored by
    `model_senti`, and the soft-maxed scores are written to the three columns in
    that order (created if missing).

    The tokenizer is loaded from `MODEL` here. The module-level `tokenizer` is the
    BERT tokenizer of the condition classifier; feeding its token ids to the
    RoBERTa sentiment model yields meaningless scores.

    Args:
        df: frame with a `text` column of strings. Modified in place.

    Returns:
        The same `df` object.
    """
    sentiment_columns = ["negative", "neutral", "positive"]
    sentiment_tokenizer = AutoTokenizer.from_pretrained(MODEL)

    # Cap the length explicitly so long posts cannot overflow the position embeddings.
    # NOTE(review): the "- 2" reflects RoBERTa reserving two position ids — confirm if MODEL changes.
    max_length = model_senti.config.max_position_embeddings - 2

    score_rows = []
    for raw_text in df["text"].tolist():
        encoded_input = sentiment_tokenizer(preprocess(raw_text),
                                            return_tensors='pt',
                                            truncation=True,
                                            max_length=max_length)
        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            output = model_senti(**encoded_input)
        score_rows.append(softmax(output[0][0].detach().numpy()))

    # Write by column name so the result does not depend on column positions.
    for position, column in enumerate(sentiment_columns):
        df[column] = [float(scores[position]) for scores in score_rows]

    return df

In [None]:
# Score the evaluation split; the function fills the sentiment columns and returns the frame.
df_mental_test = get_sentiment(df=df_mental_test)

In [None]:
# Score the training split; the function fills the sentiment columns and returns the frame.
df_mental_train = get_sentiment(df=df_mental_train)

# 3rd and 1st person pronouns ratio

In [None]:
# Pronoun patterns counted in the lower-cased post text. Entries ending in an
# apostrophe ("he'") stand for contractions such as "he's".
# Fixes: a missing comma fused "he'" and "she " into the single string "he'she ",
# and "it ", " myself " and " u " were listed twice, which double-counts them.
third = [
    "he ", "he'", "she ", "she'", "it ", "it'", "one ", "they ", "they'",
    "him ", "her ", "their ", " them ", "his ", "hers ", " theirs ",
    " himself ", " herself ", " itself ", " oneself ", " themselves ",
]
first = [
    " i ", "i'", "we ", "we'", " me ", " us ", " myself ", " mine ", " ours ",
    " ourselves ",
]
second = ["you", "you'", " u ", "u'", "yours", "yourself", "yourselves"]

In [None]:
# Create the zero-filled pronoun-ratio columns on the evaluation split.
df_mental_test[["first", "third"]] = 0.0

In [None]:
# Create the zero-filled pronoun-ratio columns on the training split.
df_mental_train[["first", "third"]] = 0.0

In [None]:
import re


def _pronoun_pattern(pronouns):
    """Compile one regex matching any entry of `pronouns` as a whole word.

    Surrounding spaces in an entry are ignored. An entry ending in an apostrophe
    ("he'") matches the start of a contraction ("he's"); every other entry must
    end at a word boundary. Returns None when there is nothing to match.
    """
    cores = sorted({p.strip() for p in pronouns if p.strip()}, key=lambda s: (-len(s), s))
    if not cores:
        return None
    alternatives = [
        re.escape(core) if core.endswith("'") else re.escape(core) + r"\b"
        for core in cores
    ]
    return re.compile(r"\b(?:" + "|".join(alternatives) + ")")


def get_ratio(df, first_person=None, third_person=None):
    """Store per-row first- and third-person pronoun ratios in `df`.

    For every row, the lower-cased `text` is scanned for pronouns and the number
    of matches is divided by the number of whitespace-separated words. Results go
    to the `first` and `third` columns (created if missing).

    Changes from the previous implementation:
      * the denominator is the word count, not the character count, and the
        division is done once rather than twice;
      * pronouns are matched as whole words, so "the " is no longer counted as
        "he ";
      * texts without words yield 0.0 rather than a ZeroDivisionError;
      * whole columns are assigned, avoiding chained `df[col][i] = ...` writes.

    Args:
        df: frame with a `text` column of strings. Modified in place.
        first_person: pronoun entries to count; defaults to the module-level `first`.
        third_person: pronoun entries to count; defaults to the module-level `third`.

    Returns:
        The same `df` object.
    """
    first_re = _pronoun_pattern(first if first_person is None else first_person)
    third_re = _pronoun_pattern(third if third_person is None else third_person)

    first_ratios, third_ratios = [], []
    for raw_text in df["text"].tolist():
        text = raw_text.lower()
        words = len(text.split())
        if words == 0:
            first_ratios.append(0.0)
            third_ratios.append(0.0)
            continue
        first_ratios.append(len(first_re.findall(text)) / words if first_re else 0.0)
        third_ratios.append(len(third_re.findall(text)) / words if third_re else 0.0)

    df["first"] = first_ratios
    df["third"] = third_ratios
    return df

In [None]:
# Compute pronoun ratios for the evaluation split.
df_mental_test = get_ratio(df=df_mental_test)

In [None]:
# Compute pronoun ratios for the training split.
df_mental_train = get_ratio(df=df_mental_train)

In [None]:
# Preview the first rows only instead of rendering the whole frame.
df_mental_train.head()

In [None]:
# Write both feature frames to CSV in the working directory and upload them to
# the bucket under the same file names.
# The gsutil commands read from /content, so this relies on the working directory
# being /content.
# NOTE(review): `to_csv` is called without `index=False`, so the row index is saved as
# an extra leading column — confirm downstream readers expect that.
# NOTE(review): there is no branch for the `smhd` flag; nothing is saved for that dataset.
if aladag:
    df_mental_test.to_csv("Aladag_labeled_preprocessed_features.csv")
    df_mental_train.to_csv("Aladag_sample_preprocessed_features.csv")
    !gsutil cp  /content/Aladag_labeled_preprocessed_features.csv gs://masterthesisbert/Aladag_labeled_preprocessed_features.csv 
    !gsutil cp   /content/Aladag_sample_preprocessed_features.csv gs://masterthesisbert/Aladag_sample_preprocessed_features.csv

elif reddit_500:
    df_mental_test.to_csv("reddit_500_final_val_features.csv")
    df_mental_train.to_csv("reddit_500_final_train_features.csv")
    !gsutil cp  /content/reddit_500_final_val_features.csv gs://masterthesisbert/reddit_500_final_val_features.csv 
    !gsutil cp   /content/reddit_500_final_train_features.csv gs://masterthesisbert/reddit_500_final_train_features.csv