# Normalizing data

### Importing libraries

In [1]:
import pandas as pd
import os 
import json 
import spacy
import re
nlp = spacy.load("es_core_news_sm")
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Fetch the 'vader_lexicon' NLTK data package (needed by SentimentIntensityAnalyzer below).
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /home/bruno-
[nltk_data]     rg/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

### Normalize function

In [3]:
# Normalizes a text by lemmatizing it and dropping tokens whose tag is in an exclusion list.
# Pattern a token's text must match to be kept: ASCII letters only.
# NOTE(review): this also discards Spanish words containing accented letters or "ñ"
# (e.g. "está", "año"), as well as digits — confirm this is intended.
reGex = r"^[a-zA-Z]+$"
def normalize(text:str) -> str:
    """Return the lemmas of the purely alphabetic tokens of ``text``.

    The text is parsed with the module-level spaCy pipeline ``nlp``. A token
    is kept only when its tag is not in the exclusion list and its surface
    text matches ``reGex``; for each kept token the lemma is emitted.

    Args:
        text: Raw text to normalize.

    Returns:
        The kept lemmas, each followed by a single space (so a non-empty
        result ends with a trailing space; an empty string is returned when
        no token is kept). Callers that need a clean string should ``strip()``.
    """
    # Must be a real tuple: ('PUNCT') is just the string 'PUNCT', which turns
    # ``not in`` into a substring test (an empty tag or 'P', 'UN', ... would
    # wrongly be treated as excluded).
    tags = ('PUNCT',)
    doc = nlp(text)
    normalized_text = [
        token.lemma_
        for token in doc
        # The token must not carry an excluded tag and must be letters only.
        if token.tag_ not in tags and re.match(reGex, token.text)
    ]
    # Keep the historical output format: every lemma followed by one space.
    return "".join(word + " " for word in normalized_text)

# ADP : adposition = preposicion
# CCONJ : coordinating conjunction = conjuncion coordinante
# SCONJ :  subordinating conjunction = conjuncion subordinante
# PRON : pronoun = pronombre
# DET : determiner = articulo

# Shared SentimentIntensityAnalyzer, created on first use and then reused.
_sentiment_analyzer = None

def _get_sentiment_analyzer():
    """Return the shared SentimentIntensityAnalyzer, creating it on first call."""
    global _sentiment_analyzer
    if _sentiment_analyzer is None:
        _sentiment_analyzer = SentimentIntensityAnalyzer()
    return _sentiment_analyzer

def analyzer(text:str):
    """Return the VADER polarity scores for ``text``.

    Args:
        text: Text to score.

    Returns:
        The result of ``SentimentIntensityAnalyzer.polarity_scores(text)``;
        callers in this notebook read its 'neg', 'neu' and 'pos' entries.

    NOTE(review): the VADER lexicon is English-oriented, while the rest of the
    notebook processes Spanish text — confirm the scores are meaningful here.
    """
    # Reuse one analyzer instead of constructing a new one for every message.
    return _get_sentiment_analyzer().polarity_scores(text)

### Extract data and normalize

In [4]:
# Extracting data and cleaning
def extract_and_clean(path:str, path_labels:str):
    """Load every subject file in ``path``, normalize its messages and score them.

    For each regular file in ``path`` the subject name is taken as the part of
    the file name before the first '.', and its label is looked up in the
    labels CSV (columns 'Subject' and 'label'). Each file is parsed as JSON and
    iterated; every item's 'message' is normalized with ``normalize`` and, if
    the normalized text is not empty, scored with ``analyzer`` (on the raw,
    un-normalized message).

    Args:
        path: Directory containing one JSON file per subject.
        path_labels: CSV file with a header row and 'Subject' / 'label' columns.

    Returns:
        A tuple of five parallel lists ``(messages, labels, negatives,
        neutrals, positives)``: the lower-cased normalized message, the
        subject's label, and the 'neg', 'neu' and 'pos' scores.

    Raises:
        IndexError: If a file's subject name has no row in the labels CSV.
    """
    # Creating the dataframe for labels
    df_labels = pd.read_csv(path_labels, sep=',', header=0)

    # Creating the lists that will hold the extracted data
    messages = []
    labels = []
    negatives = []
    neutrals = []
    positives = []

    # os.listdir order is arbitrary; sort so the output rows are reproducible.
    for name_file in sorted(os.listdir(path)):
        # os.path.join works whether or not ``path`` ends with a separator.
        file_path = os.path.join(path, name_file)
        # Skip sub-directories (e.g. notebook checkpoint folders); they are not subject files.
        if not os.path.isfile(file_path):
            continue

        # Getting the index using the username (file name up to the first '.')
        subject = str(name_file.split('.')[0])
        matches = df_labels.index[df_labels.loc[:, 'Subject'] == subject]
        if len(matches) == 0:
            raise IndexError(
                f"No label found for subject {subject!r} (file {name_file!r}) in {path_labels!r}"
            )
        # To get the respective label (first matching row)
        label = df_labels.loc[matches[0], "label"]

        # Opening JSON file; explicit UTF-8 so decoding does not depend on the platform locale
        with open(file_path, encoding='utf-8') as json_file:
            data = json.load(json_file)

        for item in data:
            text_norm = normalize(item['message']).strip()
            # Messages that are empty after normalization are dropped entirely.
            if len(text_norm) != 0:
                # Sentiment is computed on the original message, not the lemmatized one.
                scores = analyzer(item['message'])
                messages.append(text_norm.lower())
                labels.append(label)
                negatives.append(scores['neg'])
                neutrals.append(scores['neu'])
                positives.append(scores['pos'])

    return ( messages , labels , negatives, neutrals, positives)

### Calling functions

In [5]:
# Input locations for the two data splits, then extraction of both.
# Directory of trial
path_dir_trial = './data/task2/trial/subjects_trial/'
# File with labels for trial
path_labels_trial = './data/task2/trial/gold_trial_task2a.csv'

# Directory of train
# NOTE(review): the train paths also live under '.../trial/' — confirm this is the real layout.
path_dir_train = './data/task2/trial/subjects_train/'
# File with labels for train
path_labels_train = './data/task2/trial/gold_train_task2a.csv'


# Each call returns five parallel lists: messages, labels, negative, neutral and positive scores.
# NOTE(review): 'messages_trail' looks like a typo for 'messages_trial'; the next cell uses the same spelling.
messages_trail, labels_trial, negatives_trial, neutrals_trial, positives_trial = extract_and_clean(path_dir_trial, path_labels_trial)
messages_train, labels_train, negatives_train, neutrals_train, positives_train = extract_and_clean(path_dir_train, path_labels_train)

In [7]:
# Concatenate the trial and train results (plain Python lists, trial rows first).
messages_df = messages_trail + messages_train
labels_df = labels_trial + labels_train
negatives_df = negatives_trial + negatives_train
neutrals_df = neutrals_trial + neutrals_train
positives_df = positives_trial + positives_train

# Final dataframe cleaned, built in one step with the same column order as before
df = pd.DataFrame({
    'Message': messages_df,
    'Negatives': negatives_df,
    'Neutrals': neutrals_df,
    'Positives': positives_df,
    'Labels': labels_df,
})

# Saving the dataframe to a csv file
output_path = './data_processed/task2_depression.csv'
# Create the output directory if needed so the save does not fail on a fresh checkout.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
# The row index is still written as an unnamed first column (to_csv default),
# so readers of this file keep seeing the same layout.
df.to_csv(output_path, encoding='utf-8')

In [None]:
# Preview the first five rows of the cleaned dataframe.
df.head(5)

Unnamed: 0,Message,Negatives,Neutrals,Positives,Labels
0,haber ser confuso de momento bien y,0.0,1.0,0.0,0
1,espero ser disfrutar tu estancia en el grupo c...,0.0,1.0,0.0,0
2,ser bueno ese hacer cosa nuevo y que ayudar a ...,0.0,1.0,0.0,0
3,pensar hacer tarea y no yo concentrar,0.306,0.694,0.0,0
4,yo alguno vez él hacer pero este vez tener el ...,0.103,0.794,0.103,0
