# Exploratory Data Analysis

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Majority 

In [None]:
# Load the training qrels CSV for the "majority" labelling into `dt`.
dt = pd.read_csv('T1/train/g_qrels_majority_2.csv')

In [None]:
# Preview the first rows to check the column layout.
dt.head()

In [None]:
# Report the table dimensions; a DataFrame's shape is (rows, columns).
n_rows, n_cols = dt.shape
print("Number of rows: ", n_rows)
print("Number of columns: ", n_cols)

In [None]:
# Column dtypes, non-null counts and memory usage.
dt.info()

In [None]:
# Count missing values per column (`isna` is the same check as `isnull`).
dt.isna().sum()

In [None]:
# Histogram of the relevance labels in the majority qrels.
rel = dt['rel']

# Unit-wide bins centred on 0 and 1, so each label gets its own bar.
counts, bins, patches = plt.hist(rel, bins=[-0.5, 0.5, 1.5], edgecolor='white')

# Put the tick labels under the centre of each bar.
plt.xticks([0, 1])

# Annotate every bar with its count; left edge + 0.5 is the bar centre.
for bar_count, left_edge in zip(counts, bins):
    plt.text(left_edge + 0.5, bar_count + 0.5, int(bar_count), ha='center', va='bottom')

plt.xlabel('Relevancia')
plt.ylabel('Frecuencia')
plt.show()


Let's now look at how the sentences labelled as relevant (relevance score 1) under the majority criterion are distributed across the symptoms (queries) in the training set.

In [None]:
# Keep only the rows judged relevant (rel == 1) and take their query ids.
dt_filtered = dt.loc[dt['rel'].eq(1)]
query = dt_filtered['query']

plt.figure(figsize=(10, 6))

# Integer bin edges 1..22 give 21 unit-wide bins, one per integer 1..21.
counts, bins, patches = plt.hist(query, bins=range(1, 23), edgecolor='white', rwidth=0.7)

# Ticks sit at the bin centres (k + 0.5) but are labelled with the integer k.
plt.xticks(np.arange(1, 22) + 0.5, range(1, 22))

# Annotate every bar with its count, centred above the bar.
for bar_count, left_edge in zip(counts, bins):
    plt.text(left_edge + 0.5, bar_count + 0.5, int(bar_count), ha='center', va='bottom')

plt.xlabel('Síntoma')
plt.ylabel('Frecuencia')
plt.show()


## Unanimity

In [None]:
# Load the training relevance CSV for the "unanimity" (consensus) labelling into `dt2`.
dt2 = pd.read_csv('T1/train/g_rels_consenso.csv')

In [None]:
# Preview the first rows to check the column layout.
dt2.head()

In [None]:
# Report the table dimensions; a DataFrame's shape is (rows, columns).
n_rows2, n_cols2 = dt2.shape
print("Number of rows: ", n_rows2)
print("Number of columns: ", n_cols2)

In [None]:
# Column dtypes, non-null counts and memory usage.
dt2.info()

In [None]:
# Count missing values per column (`isna` is the same check as `isnull`).
dt2.isna().sum()

In [None]:
# Histogram of the relevance labels in the unanimity qrels.
rel = dt2['rel']

# Unit-wide bins centred on 0 and 1, so each label gets its own bar.
counts, bins, patches = plt.hist(rel, bins=[-0.5, 0.5, 1.5], edgecolor='white')

# Put the tick labels under the centre of each bar.
plt.xticks([0, 1])

# Annotate every bar with its count; left edge + 0.5 is the bar centre.
for bar_count, left_edge in zip(counts, bins):
    plt.text(left_edge + 0.5, bar_count + 0.5, int(bar_count), ha='center', va='bottom')

plt.xlabel('Relevancia')
plt.ylabel('Frecuencia')
plt.show()


Let's now look at how the sentences labelled as relevant (relevance score 1) under the unanimity criterion are distributed across the symptoms (queries) in the training set.

In [None]:
# Keep only the rows judged relevant (rel == 1) and take their query ids.
dt_filtered2 = dt2[dt2['rel'] == 1]
query = dt_filtered2['query']

plt.figure(figsize=(10, 6))

# Integer bin edges 1..22 give 21 unit-wide bins, one per integer 1..21.
counts, bins, patches = plt.hist(query, bins=range(1, 23), edgecolor='white', rwidth=0.7)

# Ticks sit at the bin centres (k + 0.5) but are labelled with the integer k.
plt.xticks(np.arange(1.5, 22.5, 1), range(1, 22))

# Annotate every bar with its count, centred above the bar.
for bar_count, left_edge in zip(counts, bins):
    plt.text(left_edge + 0.5, bar_count + 0.5, int(bar_count), ha='center', va='bottom')

plt.xlabel('Síntoma')
plt.ylabel('Frecuencia')
# Render explicitly, like the other histogram cells, so the cell does not
# end on a bare `plt.ylabel(...)` expression and echo its Text repr.
plt.show()

### Difference between majority and unanimity

In [None]:

# Number of rows per relevance label under each labelling criterion.
rel1 = dt['rel'].value_counts()
rel2 = dt2['rel'].value_counts()

# Unanimity minus majority, aligned on the label. fill_value=0 treats a label
# that is absent from one of the two series as a count of zero instead of
# producing NaN (which would plot as a missing bar annotated "nan").
diff = rel2.sub(rel1, fill_value=0)

diff_df = pd.DataFrame({'Rel': diff.index, 'Diferencia': diff.values})

plt.figure(figsize=(10, 6))
barplot = sns.barplot(x='Rel', y='Diferencia', data=diff_df, hue='Rel', palette='pastel', legend=False)

plt.xlabel('Relevancia')
plt.ylabel('Diferencia')

# Label every bar with its (signed) height, centred horizontally on the bar.
for p in barplot.patches:
    barplot.annotate(format(p.get_height(), '.0f'), (p.get_x() + p.get_width() / 2., p.get_height()),
                     ha='center', va='top', xytext=(0, 10), textcoords='offset points')
plt.show()


# Actual Training Data

In [None]:
import os
import xml.etree.ElementTree as ET
# Do not truncate long cell contents when DataFrames are displayed.
pd.set_option('display.max_colwidth', None)

In [None]:
# Count users, posts and sentences ("phrases") in the training corpus.
directory = 'T1/train/data/'
total_posts = 0
total_phrases = 0
total_users = 0

# Every regular file in the directory is counted as one user.
for path in os.listdir(directory):
    filename = os.path.join(directory, path)
    # Skip sub-directories and other non-file entries; opening them would raise.
    if not os.path.isfile(filename):
        continue
    total_users += 1
    with open(filename, 'r', encoding='utf8') as file:
        # Wrap the content in a synthetic root element before parsing
        # (presumably the files hold sibling <DOC> elements with no single root).
        content = '<ROOT>' + file.read() + '</ROOT>'
    root = ET.fromstring(content)
    last_post_id = None
    for doc in root.findall('DOC'):
        # The third '_'-separated field of DOCNO is used as the post id.
        post_id = doc.find('DOCNO').text.split('_')[2]
        # A new post is counted whenever the post id changes between
        # consecutive <DOC> entries of the same file.
        if post_id != last_post_id:
            total_posts += 1
            last_post_id = post_id
        total_phrases += 1

# Averages; NaN instead of ZeroDivisionError when there is nothing to average.
average_posts_per_user = total_posts / total_users if total_users else float('nan')
average_phrases_per_post = total_phrases / total_posts if total_posts else float('nan')
average_phrases_per_user = total_phrases / total_users if total_users else float('nan')

In [None]:
# Print the corpus summary, one statistic per line.
print(
    f'Número total de usuarios: {total_users}',
    f'Número total de frases: {total_phrases}',
    f'Número medio de posts por usuario: {average_posts_per_user}',
    f'Número medio de frases por post: {average_phrases_per_post}',
    f'Número medio de frases totales por usuario: {average_phrases_per_user}',
    sep='\n',
)

# Text Size Analysis

Vamos a crear un dataset con las oraciones que son relevantes y vamos a estudiar la distribución de la longitud de las oraciones para cada síntoma, para el dataset de mayoría.

In [None]:
# Re-read the majority qrels CSV (same file loaded at the top of the notebook).
dt = pd.read_csv('T1/train/g_qrels_majority_2.csv')

In [None]:
# Rows whose relevance label is 1.
relevant = dt.loc[dt['rel'].eq(1)]

In [None]:
# Working table: symptom (the query id), docid, and an empty `text` column
# that is filled in later from the corpus files.
data = (
    relevant[['query', 'docid']]
    .rename(columns={'query': 'symptom'})
    .assign(text='')
)

In [None]:
# Renumber the rows 0..n-1, discarding the index inherited from the filter.
data = data.reset_index(drop=True)

In [None]:
# Preview the first 10 rows of the working table.
data.head(10)

In [None]:
from collections import defaultdict

# Map each docid to the row positions in `data` that reference it, so the
# corpus scan can locate the rows for a document without searching the frame.
docid_dict = defaultdict(list)
for position, doc_key in enumerate(data['docid']):
    docid_dict[doc_key].append(position)

In [None]:
# Fill data['text'] with the sentence text of every relevant document.
directory = 'T1/train/data/'
# Walk every file in the directory.
for path in os.listdir(directory):
    filename = os.path.join(directory, path)
    with open(filename, 'r', encoding='utf8') as file:
        try:
            # Wrap the content in a synthetic root element before parsing.
            content = '<ROOT>' + file.read() + '</ROOT>'
            root = ET.fromstring(content)
            for doc in root.findall('DOC'):
                docid = doc.find('DOCNO').text
                # .get() does not insert missing keys into the mapping.
                rows = docid_dict.get(docid)
                if not rows:
                    continue
                text_node = doc.find('TEXT')
                raw_text = text_node.text if text_node is not None else None
                # An empty or missing <TEXT> becomes '' rather than the
                # literal string 'None', which would count as one token.
                text = (raw_text or '').strip()
                # Write the text to every row that references this docid.
                data.loc[rows, 'text'] = text
        except Exception as e:
            # Best effort: report the failing file and continue with the rest.
            print(f"Error processing file {filename}: {e}")


In [None]:
# Sentence length measured as the number of whitespace-separated chunks.
data["length_text"] = [len(sentence.split()) for sentence in data['text']]

In [None]:
# Preview the first rows, now including the text and its length.
data.head()

In [None]:
# Inspect 10 randomly chosen rows (no random_state is set, so the selection changes between runs).
data.sample(10)

In [None]:
# Summary statistics of the whitespace-based lengths at the listed percentiles.
data['length_text'].describe(percentiles=[0, 0.25, 0.50, 0.75, 0.95])

Vamos a mostrar los datos de manera más visual.

In [None]:
# Fixed palette of 21 colours, assigned to the symptom ids in ascending order.
colors = [
    'plum', 'violet', 'red', 'coral', 'darksalmon', 'olivedrab', 'yellowgreen',
    'darkseagreen', 'lightgreen', 'steelblue', 'skyblue', 'navy', 'blue', 'purple',
    'magenta', 'pink', 'crimson', 'orange', 'gold', 'yellow', 'lime',
]
labels = sorted(data['symptom'].unique().tolist())
# zip stops at the shorter sequence, so symptoms beyond the 21st get no colour.
dict_color = {label: color for label, color in zip(labels, colors)}

In [None]:
# Histogram of the whitespace-based sentence lengths over all relevant sentences.
fig = plt.figure()
# Axes rectangle [left, bottom, width, height] spanning the whole figure.
ax = fig.add_axes([0, 0, 1, 1])
ax.hist(x=data['length_text'], color='purple', edgecolor='white')
ax.set_xlabel('Número de tokens', fontsize=14)
ax.set_ylabel('Número de textos', fontsize=14)

plt.show()

In [None]:
# Density of the whitespace-based sentence length, one curve per symptom.
fig_size = (20, 10)
plt.figure(figsize=fig_size)
labels = sorted(data["symptom"].unique())

for name in labels:
    # Select the lengths of the sentences that belong to this symptom.
    is_symptom = data['symptom'] == int(name)
    # Draw that symptom's curve in its assigned colour.
    sns.kdeplot(data.loc[is_symptom, 'length_text'], color=dict_color[name], label=name, legend=True)

plt.xlabel('Número de tokens del texto original', fontsize=14 )
plt.ylabel('Densidad', fontsize=14)
plt.legend(prop={'size': 15}, title='Síntomas', loc="upper right")
plt.show()

Sin embargo, esto lo hemos hecho con el método split(), que no es lo más preciso. Por eso, ahora utilizaremos la librería spaCy para tokenizar y lematizar las oraciones, eliminando stop words y signos de puntuación, y así obtener el número de tokens de manera más precisa.

In [None]:
import spacy
import string
# The model must be installed beforehand: python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

In [None]:
# Stop words taken from the loaded pipeline's language defaults.
stop_words = nlp.Defaults.stop_words
# string.punctuation is a single str, so `x in punctuations` is a substring test.
punctuations = string.punctuation

In [None]:
def spacy_tokenizer(sentence):
    """Return `sentence` lemmatized, lower-cased and filtered, as one string.

    The text is run through the module-level `nlp` pipeline. Each token's
    lemma is lower-cased and stripped of surrounding whitespace; lemmas found
    in `stop_words` or in `punctuations` are dropped, and the remaining ones
    are joined with single spaces.
    """
    kept = []
    for token in nlp(sentence):
        lemma = token.lemma_.lower().strip()
        # Skip stop words and anything matching the punctuation string.
        if lemma in stop_words or lemma in punctuations:
            continue
        kept.append(lemma)
    return " ".join(kept)

In [None]:
# Preprocess every sentence with spaCy; the result is a space-joined string of lemmas.
data['tokens'] = data['text'].map(spacy_tokenizer)

In [None]:
# Number of lemmas kept per sentence after preprocessing.
data["length_tokens"] = [len(lemmas.split()) for lemmas in data['tokens']]

In [None]:
# Preview the first rows, now including the processed tokens and their count.
data.head()

In [None]:
# Inspect 10 randomly chosen rows (no random_state is set, so the selection changes between runs).
data.sample(10)

In [None]:
# Summary statistics of the processed-token counts at the listed percentiles.
data['length_tokens'].describe(percentiles=[0, 0.25, 0.50, 0.75, 0.95])

In [None]:
# Histogram of the processed-token counts over all relevant sentences.
fig = plt.figure()
# Axes rectangle [left, bottom, width, height] spanning the whole figure.
ax = fig.add_axes([0, 0, 1, 1])
ax.hist(x=data['length_tokens'], color='purple', edgecolor='white')
ax.set_xlabel('Número de tokens', fontsize=14)
ax.set_ylabel('Número de textos', fontsize=14)

plt.show()

In [None]:
# Density of the processed-token count, one curve per symptom.
fig_size = (20, 10)
plt.figure(figsize=fig_size)
labels = sorted(data["symptom"].unique())

for name in labels:
    # Select the token counts of the sentences that belong to this symptom.
    is_symptom = data['symptom'] == int(name)
    # Draw that symptom's curve in its assigned colour.
    sns.kdeplot(data.loc[is_symptom, 'length_tokens'], color=dict_color[name], label=name, legend=True)

plt.xlabel('Número de tokens del texto procesado por Spacy', fontsize=14 )
plt.ylabel('Densidad', fontsize=14)
plt.legend(prop={'size': 15}, title='Síntomas', loc="upper right")
plt.show()


In [None]:
#data.to_csv('T1/train/relevant_texts.csv', index=False)