# Access the MentalRiskEs data and interact with the server

This notebook has been developed by the [SINAI](https://sinai.ujaen.es/) research group for its usage in the [MentalRiskES](https://sites.google.com/view/mentalriskes/) evaluation campaign at IberLEF 2023.

**NOTE 1**: Please visit the [MentalRiskES competition website](https://sites.google.com/view/mentalriskes/evaluation) to read the instructions about how to download the data and interact with the server to send the predictions of your system.

**NOTE 2**: Along the code, please replace "URL" by the URL server and "TOKEN" by your personal token.

Remember this is a support to help you to develop your own system of communication with our server. We recommend you to download it as a Python script instead of working directly on colab and adapt the code to your needs. 

# Install CodeCarbon package

In [None]:
# -- Install libraries
!pip install llvmlite --ignore-installed
!pip install pycaret -U
!pip install empath
!python -m spacy download en_core_web_sm
!pip install codecarbon
!pip install emoji
!pip install beautifulsoup4
!pip install emosent-py
!pip install googletrans==3.1.0a0 -U
!pip install empath
!pip install spacy
!pip install typer==0.6.1
!pip install transformers
!pip install torch
!pip install syntok
!pip install NRCLex
!pip install readability
!pip install vaderSentiment
!pip install --pre torch torchvision -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html

!python -m spacy download en_core_web_lg
!python -m textblob.download_corpora

# Import libraries

In [1]:
from emosent           import get_emoji_sentiment_rank
from empath import Empath
import requests, zipfile, io
from typing import List, Dict
from requests.adapters import HTTPAdapter, Retry
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from googletrans import Translator
import os
from codecarbon import EmissionsTracker
from   nltk.stem                       import WordNetLemmatizer
from   tqdm                            import tqdm
from   nrclex                          import NRCLex
from   textblob                        import TextBlob
import transformers
import syntok.segmenter                as segmenter
import matplotlib.pyplot               as plt
import pandas                          as pd
import numpy                           as np
import readability
import lightgbm
import spacy
import json
import pickle
import torch
import emosent
import time
import nltk
import gc
import re

tqdm.pandas()

emoji_data_df = pd.read_csv('emoji_data_df.csv', sep=';')

12040 | INFO | NumExpr defaulting to 8 threads.


In [15]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

# Endpoints
These URL addresses are necessary for the connection to the server. 

**IMPORTANT:** Replace "URL" by the URL server and "TOKEN" by your user token.

In [2]:
URL = "http://s3-ceatic.ujaen.es:8036" 
TOKEN = "bb8e33a437c180e88c4b79f8aed93dc00df4f6a9" 

# Download endpoints
ENDPOINT_DOWNLOAD_MESSAGES_TRIAL = URL+"{TASK}/download_trial/{TOKEN}"
ENDPOINT_DOWNLOAD_GOLD_TRIAL = URL+"{SUBTASK}/download_trial/{TOKEN}"
ENDPOINT_DOWNLOAD_MESSAGES_TRAIN = URL+"{TASK}/download_train/{TOKEN}"
ENDPOINT_DOWNLOAD_GOLD_TRAIN = URL+"{SUBTASK}/download_train/{TOKEN}"

# Trial endpoints
ENDPOINT_GET_MESSAGES_TRIAL = URL+"/{TASK}/getmessages_trial/{TOKEN}"
ENDPOINT_SUBMIT_DECISIONS_TRIAL = URL+"/{SUBTASK}/submit_trial/{TOKEN}/{RUN}"

# Test endpoints
ENDPOINT_GET_MESSAGES = URL+"/{TASK}/getmessages/{TOKEN}"
ENDPOINT_SUBMIT_DECISIONS = URL+"/{SUBTASK}/submit/{TOKEN}/{RUN}"

# Emissions Tracker Config
config = {
    "save_to_file": True,
    "log_level": "DEBUG",
    "tracking_mode": "process",
    "output_dir": ".", 
}

# -- Variables
SUB_POL_LIST   = ['fe_subjectivity', 'fe_polarity']
SENTIMENT_LIST = ['fe_roberta_base_sentiment_negative', 'fe_roberta_base_sentiment_neutral', 'fe_roberta_base_sentiment_positive', 'fe_vader_positive_sentiment', 'fe_vader_neutral_sentiment', 'fe_vader_negative_sentiment']
EMOTION_LIST   = ['fe_distilbert_emotion_optimism', 'fe_distilbert_emotion_joy', 'fe_distilbert_emotion_sadness', 'fe_distilbert_emotion_anger', 'fe_nrclex_emotion_fear', 'fe_nrclex_emotion_anger', 'fe_nrclex_emotion_anticip', 'fe_nrclex_emotion_trust', 'fe_nrclex_emotion_surprise', 'fe_nrclex_emotion_positive', 'fe_nrclex_emotion_negative', 'fe_nrclex_emotion_sadness', 'fe_nrclex_emotion_disgust', 'fe_nrclex_emotion_joy']
TOXIC_LIST     = ['fe_toxic', 'fe_severe_toxic', 'fe_obscene', 'fe_threat', 'fe_insult', 'fe_identity_hate']

# -- Empath features
EMOTE_LIST = list(set(['hate', 'envy', 'health', 'nervousness', 'weakness', 'horror', 'suffering', 'kill', 'fear', 'friends', 'sexual', 'body', 'family',
                       'irritability','violence','sadness','disgust','exasperation','emotional','anger','poor','pain','timidity','cheerfulness', 'night', 'college', 'sports', 'neglect',
                       'medical_emergency','rage','alcohol','positive_emotion','negative_emotion','ugliness','weapon','shame','torment','office','help',
                       'sleep', 'money', 'school', 'home', 'hygiene', 'phone', 'work', 'appereance', 'optimism', 'youth', 'joy', 'valuable', 'swearing_terms',
                       'disappointment', 'children', 'contentment', 'music', 'musical', 'deception', 'blue_collar_job', 'clothing', 'white_collar_job', 'exercise']))
# -- POS features
POS_FEATURES_LIST = ['fe_pos_adjs', 'fe_pos_advs', 'fe_pos_verbs', 'fe_pos_nouns', 'fe_pos_past_tense_verbs']

# -- Subjectivity and polarity - ROUND
SUB_POL_LIST = ['fe_subjectivity', 'fe_polarity']

# -- Text features 
TEXT_FEATURES_LIST = ['fe_punct_signs', 'fe_number_uppercase_words', 'fe_num_first_person_pronouns', 'fe_num_antidepressants', 'fe_num_words', 'fe_num_sentences', 'fe_num_paragraphs', 'fe_num_long_words', 'fe_num_complex_words', 'fe_num_emojis_emoticons', 'fe_num_negations', 'fe_num_pos_emojis', 'fe_num_neut_emojis', 'fe_num_neg_emojis']

# -- Date features
DATE_FEATURES = ['fe_posted_early_morning', 'fe_posted_morning', 'fe_posted_afternoon', 'fe_posted_night', 'fe_posted_first_season', 'fe_posted_second_season', 'fe_posted_third_season', 'fe_posted_fourth_season']

# -- Readability index
READABILITY_LIST = ['fe_kincaid_readability_index', 'fe_ari_readability_index', 'fe_coleman_readability_index', 'fe_flesch_readability_index', 'fe_gunning_fog_readability_index', 'fe_lix_readability_index', 'fe_smog_index']

# -- Depression terms
TERMS_LIST = ['fe_num_anhedonia_terms', 'fe_num_concentration_terms', 'fe_num_eating_terms', 'fe_num_fatigue_terms', 'fe_num_mood_terms', 'fe_num_psychomotor_terms', 'fe_num_self-esteem_terms', 'fe_num_self-harm_terms', 'fe_num_sleep_disorder_terms', 'fe_num_panic_attacks_terms']

# -- Transformers - Sentiment - ROUND
SENTIMENT_LIST = ['fe_roberta_base_sentiment_negative', 'fe_roberta_base_sentiment_neutral', 'fe_roberta_base_sentiment_positive', 'fe_vader_positive_sentiment', 'fe_vader_neutral_sentiment', 'fe_vader_negative_sentiment']

# -- Transformers - Emotion - ROUND
EMOTION_LIST = ['fe_distilbert_emotion_optimism', 'fe_distilbert_emotion_joy', 'fe_distilbert_emotion_sadness', 'fe_distilbert_emotion_anger', 'fe_nrclex_emotion_fear', 'fe_nrclex_emotion_anger', 'fe_nrclex_emotion_anticip', 'fe_nrclex_emotion_trust', 'fe_nrclex_emotion_surprise', 'fe_nrclex_emotion_positive', 'fe_nrclex_emotion_negative', 'fe_nrclex_emotion_sadness', 'fe_nrclex_emotion_disgust', 'fe_nrclex_emotion_joy']

# -- Transformers - Toxic - ROUND
TOXIC_LIST = ['fe_toxic', 'fe_severe_toxic', 'fe_obscene', 'fe_threat', 'fe_insult', 'fe_identity_hate']

ORIGINAL_SUM_FEATURE_COLS = EMOTE_LIST + POS_FEATURES_LIST + TEXT_FEATURES_LIST + DATE_FEATURES + TERMS_LIST
ORIGINAL_MEAN_FEATURE_COLS= SENTIMENT_LIST + EMOTION_LIST + TOXIC_LIST + SUB_POL_LIST
ORIGINAL_REDABILITY_LIST  = READABILITY_LIST

# Load pre-trained transformers

In [None]:
task='sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}-latest"
pipe_sentiment = transformers.pipeline("text-classification", model=MODEL, return_all_scores=True, max_length=511)

In [None]:
pipe_emotion = transformers.pipeline("text-classification", model="cardiffnlp/roberta-base-emotion", return_all_scores=True)

In [None]:
pipe_toxicity = transformers.pipeline(model="unitary/toxic-bert", return_all_scores=True)

In [3]:
# -- Load models and features per model
import pickle
with open('./lightgbm_2023_05_13_google_translator.pkl', 'rb') as f:
    lightgbm_model = pickle.load(f)

sum_feature_cols_lightgbm  = [feature for feature in ORIGINAL_SUM_FEATURE_COLS if feature in lightgbm_model.feature_name_]
mean_feature_cols_lightgbm = [feature + '_mean' for feature in ORIGINAL_MEAN_FEATURE_COLS if feature + '_mean' in lightgbm_model.feature_name_]
readability_list_lightgbm  = [feature for feature in ORIGINAL_REDABILITY_LIST if feature in lightgbm_model.feature_name_]
features_lightgbm          = [sum_feature_cols_lightgbm, mean_feature_cols_lightgbm, readability_list_lightgbm]

with open('./rf_2023_05_13_google_translator.pkl', 'rb') as f:
    rf_model = pickle.load(f)

sum_feature_cols_rf  = [feature for feature in ORIGINAL_SUM_FEATURE_COLS if feature in rf_model.feature_names_in_]
mean_feature_cols_rf = [feature + '_mean' for feature in ORIGINAL_MEAN_FEATURE_COLS if feature + '_mean' in rf_model.feature_names_in_]
readability_list_rf  = [feature for feature in ORIGINAL_REDABILITY_LIST if feature in rf_model.feature_names_in_]
features_rf          = [sum_feature_cols_rf, mean_feature_cols_rf, readability_list_rf]

with open('./lr_2023_05_13_google_translator.pkl', 'rb') as f:
    lr_model = pickle.load(f)

sum_feature_cols_lr  = [feature for feature in ORIGINAL_SUM_FEATURE_COLS if feature in lr_model.feature_names_in_]
mean_feature_cols_lr = [feature + '_mean' for feature in ORIGINAL_MEAN_FEATURE_COLS if feature + '_mean' in lr_model.feature_names_in_]
readability_list_lr  = [feature for feature in ORIGINAL_REDABILITY_LIST if feature in lr_model.feature_names_in_]
features_lr          = [sum_feature_cols_lr, mean_feature_cols_lr, readability_list_lr]

In [None]:
# -- Aux functions
import typer
import spacy
empath = Empath()
lemmatizer = WordNetLemmatizer()
nlp = spacy.load("en_core_web_lg")

diccionario = {k:v for v,k in emoji_data_df.values if len(k.split(' ')) > 1}
def emojize(texto):
  # Paso 1: ordenar claves por longitud en orden descendente
  claves = sorted(diccionario, key=len, reverse=True)

  # Paso 2: reemplazar claves por valores en orden descendente
  for clave in claves:
      valor = diccionario[clave]
      texto = texto.replace(clave, valor)

  return texto.replace('signo de interrogación', '?').replace('signo de exclamación', '!')

def remove_emojis(data):
    emoj = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese char
        u"\U00002702-\U000027B0"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # dingbats
        u"\u3030"
                      "]+", re.UNICODE)
    return re.sub(emoj, '', data)

def get_empath_features(df):
    for empath_ in ['suffering', 'violence', 'timidity', 'fear', 'body', 'youth', 'pain', 'sadness', 'nervousness', 'money']:
        df[empath_] = df['text'].apply(lambda x: empath.analyze(lemmatizer.lemmatize(str(x)), categories=[empath_])[empath_])
    return df

# -- Function to get Part of Speech (POS) tags, using Spacy library (filtering by POS)
def get_pos_tags(df, tag, colname):
    pos = []
    for doc in nlp.pipe(df['cleaned_text'].values, batch_size=16):
        pos_list = [token.pos_ for token in doc if token.pos_ == tag]
        pos.append(len(pos_list))
    df[colname] = pos
    return df

# -- Function to get other text features
def other_text_features(df):
    # -- Punctuation signs
    df["fe_punct_signs"] = df["text"].apply(lambda x: len(re.findall(r"[\?¿!¡]", str(x))))

    # -- Get first person pronouns
    forms = r"|".join(regex for regex in [r'\bi\b' , r'\bmy+\b', r'\bme+\b', r'\bmine+\b'])
    df['fe_num_first_person_pronouns'] = df['text'].apply(lambda x: len(re.findall(forms, str(x).lower())))

    # -- Get number of long words
    readability_preprocessing = lambda x: '\n\n'.join('\n'.join(' '.join(token.value for token in sentence) for sentence in paragraph) for paragraph in segmenter.analyze(x))
    df['fe_num_long_words'] = df['text'].apply(lambda x: readability.getmeasures(readability_preprocessing(clean_text_no_lower(str(x))), lang='en')['sentence info']['long_words']\
                                                if re.search(r"[a-zA-Z]", clean_text_no_lower(str(x))) != None else 0)
    
    return df

# -- Function to get mood-problem terms
def get_mood_terms(df):
    ## Load negative forms
    neg_forms = ["rarely","never","no more","regain control of","not","don[\']t","didn[\']t","wouldn[\']t","not very","scarcely","no longer","hardly","contradictorily","invalidly","seldom","barely ever","hardly ever","under no circumstances","in no way","on no condition","wasn[\']t","isn[\']t","was not","is not","aren[\']t","are not","maybe","stop","stopped","end of","doesn[\']t","doesn not","couldn[\']t","could not","can[\']t","can not","cannot","pretend to","feel no"]
    neg_forms_joined = "".join("(?<!" + neg_regex + ")" for neg_regex in neg_forms)
    ## Sample with mood
    mood_df = pd.read_table('mood_problem.txt', header=None)
    mood_df.drop_duplicates(inplace=True)

    # -- but these past few weeks I've had random crying phases that pop up several times a day; I feel completely alone; 
    mood_list = list(mood_df[0].apply(lambda x: x.replace("(", "(?:").replace("( )", "").replace("?:?<!", "?<!").replace("?:?!", "?!").replace("(?: )", "").replace("#", "").replace("?(\w+)?", "?(?:[a-zA-Z ]*)").replace('’', "'").lower()))
    mood_list.extend(["i\'?m so+ sad", "i’?\'?m so+ depressed", "i\'?m so+ unhappy", "i cry", "am( so+)? sad", r"my life\'s (pretty much|like) a roller coaster with( major)? ups and downs", r"anti\-depressant",
                    r"it tears me up", r"i just break down", r"i( just)? feel emotionless", r"i wish i could find some magical cure that can make me happy but i don\'t think there is one",
                    r"i even had a major mental breakdown at work where all my co\-workers saw me cry",
                    r"i[\' ](m|am) so tired of being so sad", r"i[\' ](have|ve) noticed moodiness", r"i[\' ](m|am)( feeling| just| still| so| stuck in this| fucking| stuck and| feel just as| feel)? miserable",
                    r"i spent most of the rest of the day feeling miserable", r"i can(\'t|not) stop crying", "i feel very sad", "i feel a mix of anger and sadness", "i sat downstairs alone at night crying to myself",
                    r"i\'?m alone", r"i( just)? feel( so)? alone", r"i\'ve had a relapse of sadness", r"i feel like i have no one", r"i can\'t feel happy", r"i\'?( am|m) (at|in) the end of my rope", r"i am sitting here with tears",
                    r"i feel totally wrong and get emotional when i get back home, crying and anxious, the whole bit", r"i\'ve find myself crying when the memory hits"])
    mod_list = [regex for regex in mood_list if "depress" not in regex]
    df['fe_num_mood_terms_aux'] = df['text'].apply(lambda x: sum([len(list(set(re.findall(term, str(x).lower().replace('’', "'"))))) \
                                                                                                                    for term in mood_list])
                                                                                                        )
    # -- Apply to mood
    mood_list = list(mood_df[0].apply(lambda x: neg_forms_joined + " ?" + x.replace("(", "(?:").replace("( )", "").replace("?:?<!", "?<!").replace("?:?!", "?!").replace("(?: )", "").replace("#", "").replace("?(\w+)?", "?(?:[a-zA-Z ]*)").replace('’', "'").lower()))
    df['fe_num_mood_terms'] = df.apply(lambda x: sum([len(list(set(re.findall(term, str(x['text']).lower().replace('’', "'"))))) \
                                                                                                   for term in mood_list])\
                                                                                                        if x['fe_num_mood_terms_aux'] > 0 else 0,
                                                                                                    axis=1
                                                                                            )
    df.drop(['fe_num_mood_terms_aux'], axis=1, inplace=True)
    return df

# -- Function to preprocess text (based on HugginFace transformer)
def preprocess(text):
    new_text = []
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if 'http' in t else t
        new_text.append(t)
    return " ".join(new_text)

# -- Function to get RoBERTa base sentiment + VADER
def get_sentiment_features(df):
    with torch.no_grad():
        df["fe_sentiment"] = df["text"].progress_apply(lambda x: pipe_sentiment(preprocess(str(x).replace("\n", " ").replace("\t", " "))))
    emotions = ['negative', 'neutral', 'positive']
    for i, emotion in enumerate(emotions):
        df['fe_roberta_base_sentiment_' + emotion + '_mean'] =  df['fe_sentiment'].apply(lambda x: x[i]['score'])
    df.drop(["fe_sentiment"], axis=1, inplace=True)

    sid_obj = SentimentIntensityAnalyzer()
    df['fe_vader_positive_sentiment_mean'] = df['text'].apply(lambda x: sid_obj.polarity_scores(clean_text_no_lower(str(x)))['pos'])
    df['fe_vader_neutral_sentiment_mean']  = df['text'].apply(lambda x: sid_obj.polarity_scores(clean_text_no_lower(str(x)))['neu'])
    df['fe_vader_negative_sentiment_mean'] = df['text'].apply(lambda x: sid_obj.polarity_scores(clean_text_no_lower(str(x)))['neg'])
    
    # -- Emotion features
    emotion_list = ['optimism', 'joy', 'sadness']
    with torch.no_grad():
        df["fe_distilbert_emotion"] = df["text"].apply(lambda x: pipe_emotion(preprocess(str(x).replace("\n", " ").replace("\t", " ")), max_length=511))
    
    for i, emotion in enumerate(emotion_list):
        df['fe_distilbert_emotion_' + emotion + '_mean'] =  df['fe_distilbert_emotion'].apply(lambda x: x[i]['score'])

    df.drop('fe_distilbert_emotion', axis=1, inplace=True)
    
    # -- NRCLex emotion
    df['fe_nrclex_emotions'] = df['text'].apply(lambda x: NRCLex(clean_text_no_lower(str(x)).replace("\n", " ").replace("\t", " ")).affect_frequencies)
    
    for emotion in ['surprise', 'positive', 'sadness', 'joy']:
        df['fe_nrclex_emotion_' + emotion + '_mean'] = df['fe_nrclex_emotions'].apply(lambda x: x[emotion])
    df.drop('fe_nrclex_emotions', axis=1, inplace=True)
    
    # -- Toxicity
    with torch.no_grad():
        df["fe_toxicity"] = df["text"].apply(lambda x: pipe_toxicity(preprocess(str(x).replace("\n", " ").replace("\t", " "))))

    df['fe_toxic_mean'] = df['fe_toxicity'].apply(lambda x: x[0]['score'])
    df['fe_insult_mean'] = df['fe_toxicity'].apply(lambda x: x[4]['score'])
    df['fe_identity_hate_mean'] = df['fe_toxicity'].apply(lambda x: x[5]['score'])
    df.drop(['fe_toxicity'], axis=1, inplace=True)
    
    # -- Polarity
    df["fe_polarity_mean"] = df["text"].apply(lambda x: TextBlob(clean_text_no_lower(str(x))).sentiment.polarity)
    return df

def get_readability_features(df):
    # 'fe_coleman_readability_index', 'fe_smog_index'
    readability_preprocessing = lambda x: '\n\n'.join('\n'.join(' '.join(token.value for token in sentence) for sentence in paragraph) for paragraph in segmenter.analyze(x))
    df['fe_coleman_readability_index'] = df['text'].apply(lambda x: readability.getmeasures(readability_preprocessing(clean_text_no_lower(str(x))), lang='en')['readability grades']['Coleman-Liau'] if re.search(r"[a-zA-Z]", clean_text_no_lower(str(x))) != None else 0)
    df['fe_smog_index'] = df['text'].apply(lambda x: readability.getmeasures(readability_preprocessing(clean_text_no_lower(str(x))), lang='en')['readability grades']['SMOGIndex'] if re.search(r"[a-zA-Z]", clean_text_no_lower(str(x))) != None else 0)
    return df

# -- Function to remove URLs from text
def clean_text_no_lower(x):
    return re.sub(r"(#[a-zA-Z0-9]+;)|(\/r\/[a-zA-Z]+)|([a-zA-Z%_0-9]+=[a-zA-Z0-9%&_\.-]* ?\)?\]?|(%[a-zA-Z0-9_\.=,\-\+%]+(watch|facebook|reddit|http|youtube\.)[a-zA-Z0-9_\.=,\-%\+]+( \+O%27Nymous)?\)?\]?))|(!\/?[a-zA-Z0-9.\/_]+\.[a-zA-Z]{2,3}\)?\]?)", "",
                  re.sub(r'\(?\[?(?:(?:http|https|www)\:*(\/+\/+\**|\.))[a-zA-Z0-9\.\/\?\:@\-_=#\*\\\.]+\.((?:[a-zA-Z]){2,6}(?:[a-zA-Z0-9\.\&\/\?\:@\-_=#\*\\\.])*)?\)?\]?', '', 
                  str(re.sub(' +', ' ', str(x))), flags=re.MULTILINE))

# -- Function (general) to clean texts (removing URLs, hashtags, punctuation sings, words with less than 2 chars) + lowercase
def clean_text(x):
    x = str(x).lower()
    x = re.sub(r"(#[a-zA-Z0-9]+;)|(\/r\/[a-zA-Z]+)|([a-zA-Z%_0-9]+=[a-zA-Z0-9%&_\.-]* ?\)?\]?|(%[a-zA-Z0-9_\.=,\-\+%]+(watch|facebook|reddit|http|youtube\.)[a-zA-Z0-9_\.=,\-%\+]+( \+O%27Nymous)?\)?\]?))|(!\/?[a-zA-Z0-9.\/_]+\.[a-zA-Z]{2,3}\)?\]?)", "",
                  re.sub(r'\(?\[?(?:(?:http|https|www)\:*(\/+\/+\**|\.))[a-zA-Z0-9\.\/\?\:@\-_=#\*\\\.]+\.((?:[a-zA-Z]){2,6}(?:[a-zA-Z0-9\.\&\/\?\:@\-_=#\*\\\.])*)?\)?\]?', '', 
                  str(re.sub(' +', ' ', str(x))), flags=re.MULTILINE))
    return x

# Download Data
To download the data, you can make use of the **functions defined in the following**.

The following function download the trial data. To adapt it to download the train and test data, follow the instructions given in the [website of the competition](https://sites.google.com/view/mentalriskes/evaluation).

In [None]:
def download_messages_trial(task: str,subtasks:List[str], token: str) -> List[Dict]:
    response = requests.get(ENDPOINT_DOWNLOAD_MESSAGES_TRIAL.format(TASK=task, TOKEN=token))

    if response.status_code != 200:
        print("Trial - Status Code " + task + ": " + str(response.status_code) + " - Error: " + str(response.text))
    else:
      z = zipfile.ZipFile(io.BytesIO(response.content))
      os.makedirs("./data/{task}/trial/subjects_trial/".format(task=task))
      z.extractall("./data/{task}/trial/subjects_trial/".format(task=task))

    for subtask in subtasks:
        response = requests.get(ENDPOINT_DOWNLOAD_GOLD_TRIAL.format(SUBTASK=subtask, TOKEN=token))
        
        if response.status_code != 200:
            print("Trial - Status Code " + subtask + ": " + str(response.status_code) + " - Error: " + str(response.text))
        else:
          file_object = open("./data/{task}/trial/gold_trial_{subtask}.txt".format(task=task, subtask=subtask), "w")
          file_object.write(response.text)

# Client Server
This class simulates communication with our server. The following code established the conection with the server client and simulate the GET and POST requests. 

**IMPORTANT NOTE:** Please pay attention to the basic functions and remember that it is only a base for your system. 

In [109]:
class Client_taskX:
    def __init__(self, task: str, subtasks: List[str], token: str, number_of_runs: int, tracker: EmissionsTracker):
        # Task in which you participate
        self.task = task
        # Subtasks in which you participate
        self.subtasks = subtasks
        # Token identifier
        self.token = token
        # Number of runs (Max: 3)
        self.number_of_runs = number_of_runs
        # Object to calculate CO2 emissions
        self.tracker = tracker
        self.relevant_cols = ['duration', 'emissions', 'cpu_energy', 'gpu_energy', 'ram_energy', 
            'energy_consumed', 'cpu_count', 'gpu_count', 'cpu_model', 'gpu_model', 'ram_total_size']

    # Here a GET request is sent to the server to extract the data.
    def get_messages(self, retries: int, backoff: float) -> Dict:
        session = requests.Session()
        retries = Retry( 
                        total = retries,
                        backoff_factor = backoff,
                        status_forcelist = [500, 502, 503, 504]
                        )
        session.mount('https://', HTTPAdapter(max_retries=retries))
        response = session.get(ENDPOINT_GET_MESSAGES_TRIAL.format(TASK=self.task, TOKEN=self.token))
        if response.status_code != 200:
          print("GET - Status Code " + self.task + ": " + str(response.status_code) + " - Error: " + str(response.text))
          return []
        else:
          return json.loads(response.content)

    # The POST requests are sent to the server to send predictions and carbon emission data
    def submit_decission(self, run: int, subtask: int, decisions: Dict, emissions:Dict, retries, backoff):

        data = {
            "predictions": decisions,
            "emissions": emissions
        }

        data = json.dumps(data)
        ## Session to POST request
        session = requests.Session()
        retries = Retry(
                        total = retries,
                        backoff_factor = backoff,
                        status_forcelist = [500, 502, 503, 504]
                        )
        session.mount('https://', HTTPAdapter(max_retries=retries))
        response = session.post(ENDPOINT_SUBMIT_DECISIONS.format(SUBTASK=self.subtasks[subtask], TOKEN=self.token, RUN=run), json=[data])
        if response.status_code != 200:
            print("POST - Status Code " + self.task + ": " + str(response.status_code) + " - Error: " + str(response.text))
        else:
            print("Subtask {}: - run {}".format(self.subtasks[subtask], run))

    # Main thread
    def run_taskX(self, retries: int, backoff: float):
        # -- Load googletranslator
        translator = Translator()

        # Get messages for taskX
        messages = self.get_messages(retries, backoff)
        #messages = [{"id_message": 123,"round": 1,"nick": "subject1","message": "Hola a todos! La verdad es que me alegro de estar aquí!","date": "2021/02/10"},
        #            {"id_message": 124,"round": 1,"nick": "subject1","message": "Adiós a todos! Ha sido un verdadero placer! cara llorando","date": "2021/02/10"},
        #            {"id_message": 125,"round": 1,"nick": "subject2","message": "Adiós a todos! Ha sido un verdadero placer! cara llorando","date": "2021/02/10"}]

        # If there are no messages
        if len(messages) == 0:
            print("All rounds processed")
            return

        # -- Main while loop
        all_messages = pd.DataFrame(columns=['id_message', 'round', 'nick', 'message'])
        while len(messages) > 0:
            print("------------------- Processing round {}".format(messages[0]["round"]))
            # Save subjects
            with open('./data/rounds_trial/round{}.json'.format(messages[0]["round"]), 'w+', encoding='utf8') as json_file:
                json.dump(messages, json_file, ensure_ascii=False)
                            
            self.tracker.start()
            # -- Step 1: convert dict to Pandas DataFrame
            messages_df["message_emojized"]       = messages_df["message"].apply(emojize)
            messages_df['message_without_emojis'] = messages_df['message_emojized'].apply(lambda x: remove_emojis(x))

            # -- Step 2: translate it using googletrans library
            messages_df["text"] = messages_df["message_without_emojis"].apply(lambda x: translator.translate(str(x), dest="en").text)

            # -- Step 3: feature engine
            messages_df = get_empath_features(messages_df)

            # -- Step 4: POS features - adverbs
            messages_df['cleaned_text'] = messages_df['text'].apply(clean_text_no_lower)
            messages_df = get_pos_tags(messages_df, 'ADV', 'fe_pos_advs')

            # -- Step 5: Get other text features
            messages_df = other_text_features(messages_df)

            # -- Step 6: Get mood problems terms
            messages_df = get_mood_terms(messages_df)

            # -- Step 7: Get sentiment + toxicity + emotion features
            messages_df = get_sentiment_features(messages_df)

            # -- Step 8: Get readability features
            messages_df = get_readability_features(messages_df)
            
            # -- Concat with all_messages
            all_messages = pd.concat([all_messages, messages_df], axis=0)
            
            if not all_messages.empty():
                messages_df = pd.concat([messages_df, all_messages[all_messages['nick'].isin(messages_df['nick'])]],
                                        axis=0)

            for i, (model, model_features) in enumerate(zip([lightgbm_model, rf_model, lr_model],
                                                            [features_lightgbm, features_rf, features_lr])):
                
                
                messages_df_grouped = messages_df.groupby(['nick'])[model_features[0]].apply(lambda x : x.astype(int).sum()).reset_index().merge(
                                                                    messages_df.groupby(['nick'])[model_features[2]].apply(lambda x : x.mean()).reset_index(),
                                                                    on=['nick']
                                                            ).merge(
                                                                messages_df.groupby(['nick'])[model_features[1]].apply(lambda x : x.mean()).reset_index(),
                                                                on=['nick']
                                                            )
                
                ## -- Calculate prediction
                if type(model).__name__ in ['RandomForestClassifier', 'LogisticRegression']:
                    predictions = model.predict(messages_df_grouped[model.feature_names_in_])
                else:
                    predictions = model.predict(messages_df_grouped[model.feature_name_])
                decissions  = {id: int(pred) for id, pred in zip(messages_df_grouped['nick'], predictions)}

                emissions = self.tracker.stop()
                emissions_df = pd.read_csv("emissions.csv")
                measurements = emissions_df.iloc[-1][self.relevant_cols].to_dict()
                self.submit_decission(subtask=0, run=i, decisions=decissions, emissions=measurements, retries=retries, backoff=backoff)
                time.sleep(3)
        print("All rounds processed")
        return decissions

In [None]:
all_messages = pd.DataFrame(columns=['id_message', 'round', 'nick', 'message'])
all_messages.empty()

# Main

Please, replace the symbol 'X' by the desired task. For example, for task 1 it would be: task1, task1a and task1b.

In [110]:
tracker = EmissionsTracker(**config)

number_runs = 3 # Max: 3

# Prediction period
client_taskX = Client_taskX("task2", ["task2a"], TOKEN, number_runs, tracker)
client_taskX.run_taskX(5, 0.1)

[codecarbon INFO @ 14:33:15] [setup] RAM Tracking...
[codecarbon INFO @ 14:33:15] [setup] GPU Tracking...
[codecarbon INFO @ 14:33:15] No GPU found.
[codecarbon INFO @ 14:33:15] [setup] CPU Tracking...
[codecarbon DEBUG @ 14:33:15] Not using PowerGadget, an exception occurred while instantiating IntelPowerGadget : Intel Power Gadget executable not found on darwin
[codecarbon DEBUG @ 14:33:15] Not using the RAPL interface, an exception occurred while instantiating IntelRAPL : Platform not supported by Intel RAPL Interface
[codecarbon INFO @ 14:33:17] CPU Model on constant consumption mode: Intel(R) Core(TM) i5-7360U CPU @ 2.30GHz
[codecarbon INFO @ 14:33:17] >>> Tracker's metadata:
[codecarbon INFO @ 14:33:17]   Platform system: macOS-10.16-x86_64-i386-64bit
[codecarbon INFO @ 14:33:17]   Python version: 3.8.3
[codecarbon INFO @ 14:33:17]   CodeCarbon version: 2.2.1
[codecarbon INFO @ 14:33:17]   Available RAM : 8.000 GB
[codecarbon INFO @ 14:33:17]   CPU count: 4
[codecarbon INFO @ 14:

------------------- Processing round 1


100%|██████████| 3/3 [00:00<00:00, 12.33it/s]
[codecarbon INFO @ 14:33:21] Energy consumed for RAM : 0.000000 kWh. RAM Power : 0.5084638595581055 W
[codecarbon DEBUG @ 14:33:21] RAM : 0.51 W during 1.13 s [measurement time: 0.0118]
[codecarbon INFO @ 14:33:21] Energy consumed for all CPUs : 0.000014 kWh. Total CPU Power : 42.5 W
[codecarbon DEBUG @ 14:33:21] CPU : 42.50 W during 1.14 s [measurement time: 0.0008]
[codecarbon INFO @ 14:33:21] 0.000014 kWh of electricity used since the beginning.
[codecarbon DEBUG @ 14:33:21] last_duration=1.144747018814087
------------------------
[codecarbon DEBUG @ 14:33:21] We apply an energy mix of 190 g.CO2eq/kWh for Spain
[codecarbon DEBUG @ 14:33:21] EmissionsData(timestamp='2023-05-20T14:33:21', project_name='codecarbon', run_id='f944fe69-1dca-479c-a1dd-3186000aec01', duration=1.158128023147583, emissions=2.5980071092583097e-06, emissions_rate=2.243281448451092e-06, cpu_power=42.5, gpu_power=0.0, ram_power=0.5084638595581055, cpu_energy=1.3514374

{"predictions": {"subject1": 0, "subject2": 0}, "emissions": {"duration": 1.158128023147583, "emissions": 2.5980071092583097e-06, "cpu_energy": 1.3514374527666304e-05, "gpu_energy": 0, "ram_energy": 1.5934710000901004e-07, "energy_consumed": 1.3673721627675314e-05, "cpu_count": 4, "gpu_count": NaN, "cpu_model": "Intel(R) Core(TM) i5-7360U CPU @ 2.30GHz", "gpu_model": NaN, "ram_total_size": 8.0}}


[codecarbon DEBUG @ 14:33:24] We apply an energy mix of 190 g.CO2eq/kWh for Spain
[codecarbon DEBUG @ 14:33:24] EmissionsData(timestamp='2023-05-20T14:33:24', project_name='codecarbon', run_id='f944fe69-1dca-479c-a1dd-3186000aec01', duration=4.277153015136719, emissions=2.5980071092583097e-06, emissions_rate=6.074150492311215e-07, cpu_power=42.5, gpu_power=0.0, ram_power=0.5084638595581055, cpu_energy=1.3514374527666304e-05, gpu_energy=0, ram_energy=1.5934710000901002e-07, energy_consumed=1.3673721627675314e-05, country_name='Spain', country_iso_code='ESP', region='madrid', cloud_provider='', cloud_region='', os='macOS-10.16-x86_64-i386-64bit', python_version='3.8.3', codecarbon_version='2.2.1', cpu_count=4, cpu_model='Intel(R) Core(TM) i5-7360U CPU @ 2.30GHz', gpu_count=None, gpu_model=None, longitude=-3.4719, latitude=40.4586, ram_total_size=8.0, tracking_mode='process', on_cloud='N')


{"predictions": {"subject1": 0, "subject2": 0}, "emissions": {"duration": 4.277153015136719, "emissions": 2.5980071092583097e-06, "cpu_energy": 1.3514374527666304e-05, "gpu_energy": 0, "ram_energy": 1.5934710000901004e-07, "energy_consumed": 1.3673721627675314e-05, "cpu_count": 4, "gpu_count": NaN, "cpu_model": "Intel(R) Core(TM) i5-7360U CPU @ 2.30GHz", "gpu_model": NaN, "ram_total_size": 8.0}}


[codecarbon DEBUG @ 14:33:27] We apply an energy mix of 190 g.CO2eq/kWh for Spain
[codecarbon DEBUG @ 14:33:27] EmissionsData(timestamp='2023-05-20T14:33:27', project_name='codecarbon', run_id='f944fe69-1dca-479c-a1dd-3186000aec01', duration=7.3230791091918945, emissions=2.5980071092583097e-06, emissions_rate=3.547697724577771e-07, cpu_power=42.5, gpu_power=0.0, ram_power=0.5084638595581055, cpu_energy=1.3514374527666304e-05, gpu_energy=0, ram_energy=1.5934710000901002e-07, energy_consumed=1.3673721627675314e-05, country_name='Spain', country_iso_code='ESP', region='madrid', cloud_provider='', cloud_region='', os='macOS-10.16-x86_64-i386-64bit', python_version='3.8.3', codecarbon_version='2.2.1', cpu_count=4, cpu_model='Intel(R) Core(TM) i5-7360U CPU @ 2.30GHz', gpu_count=None, gpu_model=None, longitude=-3.4719, latitude=40.4586, ram_total_size=8.0, tracking_mode='process', on_cloud='N')


{"predictions": {"subject1": 0, "subject2": 0}, "emissions": {"duration": 7.3230791091918945, "emissions": 2.5980071092583097e-06, "cpu_energy": 1.3514374527666304e-05, "gpu_energy": 0, "ram_energy": 1.5934710000901004e-07, "energy_consumed": 1.3673721627675314e-05, "cpu_count": 4, "gpu_count": NaN, "cpu_model": "Intel(R) Core(TM) i5-7360U CPU @ 2.30GHz", "gpu_model": NaN, "ram_total_size": 8.0}}
All rounds processed


{'subject1': 0, 'subject2': 0}