# Proyecto de PLN

## Realizado por: David Sanchis Morales, Pablo Rubio Asensi y Raúl Pina Fornés

### Funciones de preprocesado de texto

In [14]:
import re
import spacy


# Load the `es_core_news_lg` spaCy pipeline once, when this cell runs;
# `_normalizarTexto` reuses this shared `nlp` object for every text it processes.
nlp = spacy.load("es_core_news_lg")

def normalizarDoc(doc):
    """Normalize every text of a collection.

    Args:
        doc: Iterable of raw texts.

    Returns:
        list: The result of ``_normalizarTexto`` for each item, in input order.
    """
    return [_normalizarTexto(tweet) for tweet in doc]


def _normalizarTexto(texto):
    """Turn one raw text into a space-separated string of lowercase lemmas.

    The text first goes through ``_quitarSignos`` and ``_quitarNumeros`` and is
    then analysed with the module-level spaCy pipeline ``nlp``. Punctuation
    tokens, whitespace tokens and tokens whose text is two characters long or
    shorter are discarded.

    Args:
        texto: Raw text to normalize.

    Returns:
        str: Lowercased lemmas of the kept tokens, joined by single spaces.
    """
    limpio = _quitarNumeros(_quitarSignos(texto))

    lemas = []
    for token in nlp(limpio):
        if token.is_punct or token.is_space:
            continue
        # The length filter looks at the token's surface text, not its lemma.
        if len(token.text) <= 2:
            continue
        lemas.append(token.lemma_.lower())

    return ' '.join(lemas)


def _quitarSignos(texto):
    texto = re.sub(r"([\.\?])", r"\1 ", texto)
    return texto

def _quitarNumeros(texto):
    texto = re.sub(r'(\d+|\n)','',texto)
    return texto



### Funciones para cargar y guardar los datos

In [15]:
import pandas as pd
     
def cargarDatos(ruta):
    """Read a CSV file into a DataFrame.

    As a side effect, every call sets pandas' global ``display.max_colwidth``
    option to ``None``.

    Args:
        ruta: Location of the CSV file, handed to ``pd.read_csv``.

    Returns:
        pandas.DataFrame: The parsed data, read with ``index_col=None``.
    """
    # Global display option, not specific to the returned frame.
    pd.set_option('display.max_colwidth', None)
    return pd.read_csv(ruta, index_col=None)
def guardarDatos(ruta, pandas, index=True):
    """Write a DataFrame to a CSV file.

    Args:
        ruta: Destination path or buffer, handed to ``DataFrame.to_csv``.
        pandas: The DataFrame to write. The name is kept for backward
            compatibility; inside this function it refers to the frame, not to
            the pandas module.
        index: Whether the row index is written as the first column. Defaults
            to ``True``, which is what this function always did; pass ``False``
            to omit the index column from the file.
    """
    pandas.to_csv(ruta, index=index)

### Función para obtener las estadísticas de los resultados

In [16]:
from sklearn import metrics
import numpy as np

def get_metrics(true_labels, predicted_labels):
    """Compute several performance metrics for a model's predictions.

    Args:
        true_labels: Ground-truth labels.
        predicted_labels: Labels predicted by the model.

    Returns:
        dict: The keys ``'Accuracy'``, ``'Precision'``, ``'Recall'`` and
        ``'F1 Score'``, each mapped to its score rounded to 3 decimals.
        Precision, recall and F1 are computed with ``average='weighted'`` and
        ``zero_division=0``.
    """
    # Options shared by the three averaged scores.
    averaged = {'average': 'weighted', 'zero_division': 0}

    accuracy = metrics.accuracy_score(true_labels, predicted_labels)
    precision = metrics.precision_score(true_labels, predicted_labels, **averaged)
    recall = metrics.recall_score(true_labels, predicted_labels, **averaged)
    f1 = metrics.f1_score(true_labels, predicted_labels, **averaged)

    return {
        'Accuracy': np.round(accuracy, 3),
        'Precision': np.round(precision, 3),
        'Recall': np.round(recall, 3),
        'F1 Score': np.round(f1, 3),
    }

### Entrenamiento y evaluación del modelo (TF-IDF + LinearSVC multietiqueta)

In [17]:
from scipy import sparse
from sklearn.calibration import LinearSVC
from sklearn.multioutput import MultiOutputClassifier

from sklearn.model_selection import  train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# --- Data: load the labelled training set and normalize the tweets ---
dfCsv = cargarDatos('./Datos_test/sem_eval_train_es.csv')
corpus = normalizarDoc(list(dfCsv['Tweet']))

# Every column other than the identifier and the text is used as a label.
dfTags = dfCsv.drop(columns=['ID', 'Tweet'])
labels = dfTags.to_numpy()

# --- Hold-out split: 30 % of the rows are kept for testing, fixed seed ---
train_corpus, test_corpus, y_train, y_test = train_test_split(
    corpus, labels, test_size=0.3, random_state=0)

# --- Features: the TF-IDF vocabulary is fitted on the training part only ---
tfidfVectorizer = TfidfVectorizer()
tfidf_train = tfidfVectorizer.fit_transform(train_corpus)
tfidf_test = tfidfVectorizer.transform(test_corpus)
# NOTE(review): the labels go dense -> CSR here and back to dense in fit();
# the round trip looks redundant.
y_train_sparse = sparse.csr_matrix(y_train)

# --- Model: an L1-penalized LinearSVC wrapped for the multi-column labels ---
modelSvc = LinearSVC(C=0.7, penalty='l1', dual=False)
clf = MultiOutputClassifier(estimator=modelSvc)
clf.fit(tfidf_train, y_train_sparse.toarray())

# --- Evaluation on the held-out part ---
y_pred = clf.predict(tfidf_test)
metrica = get_metrics(true_labels=y_test, predicted_labels=y_pred)
# NOTE(review): the 'Modelo' label does not match any parameter set in this cell.
data = pd.DataFrame({
    'Modelo': ["multi_class='ovr'"],
    'Accuracy': [metrica['Accuracy']],
    'F1 Score': [metrica['F1 Score']],
    'Precision': [metrica['Precision']],
    'Recall': [metrica['Recall']],
})
print('Nuestro mejor resultado obtenido')
print(data)

Nuestro mejor resultado obtenido
              Modelo  Accuracy  F1 Score  Precision  Recall
0  multi_class='ovr'     0.244     0.437      0.636   0.355


In [18]:
# Training data: the whole labelled set is used this time (no hold-out split).
dfCsv = cargarDatos('./Datos_test/sem_eval_train_es.csv')
train_corpus = normalizarDoc(list(dfCsv['Tweet']))

# Tweets to predict, normalized exactly like the training tweets.
dfCsvTest = cargarDatos('./Datos_test/sem_eval_test_grupo_8.csv')
test_corpus = normalizarDoc(list(dfCsvTest['Tweet']))

# Every column other than the identifier and the text is used as a label.
dfTags = dfCsv.drop(columns=['ID', 'Tweet'])
y_train = dfTags.to_numpy()

# TF-IDF vocabulary fitted on the training tweets, then applied to the test tweets.
tfidfVectorizer = TfidfVectorizer()
tfidf_train = tfidfVectorizer.fit_transform(train_corpus)
tfidf_test = tfidfVectorizer.transform(test_corpus)
y_train_sparse = sparse.csr_matrix(y_train)

# Same model configuration as in the evaluation cell.
modelSvc = LinearSVC(C=0.7, penalty='l1', dual=False)
clf = MultiOutputClassifier(estimator=modelSvc)
clf.fit(tfidf_train, y_train_sparse.toarray())

y_pred = clf.predict(tfidf_test)


### Creamos el DataFrame para guardar los datos con los IDs y mostramos la info

In [19]:
# One row per test tweet: its ID followed by the predicted value of each label.
ids = list(dfCsvTest['ID'])
df = pd.DataFrame({'ID': ids})
df[dfTags.columns] = y_pred

# Persist the predictions and show a summary of the resulting frame.
guardarDatos('./resultados_grupo_8.csv', df)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 679 entries, 0 to 678
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   ID            679 non-null    object
 1   anger         679 non-null    bool  
 2   anticipation  679 non-null    bool  
 3   disgust       679 non-null    bool  
 4   fear          679 non-null    bool  
 5   joy           679 non-null    bool  
 6   love          679 non-null    bool  
 7   optimism      679 non-null    bool  
 8   pessimism     679 non-null    bool  
 9   sadness       679 non-null    bool  
 10  surprise      679 non-null    bool  
 11  trust         679 non-null    bool  
dtypes: bool(11), object(1)
memory usage: 12.7+ KB
