##### Detección de tópicos en ciencia básica: topicos con modelo entrenado

1. Filtrar documentos
2. Aplicar tf-idf
3. Guardar



#### **To do** 
- detectar en inngles
- Documentos repetidos entre repositorios y convocatorias
- mismo proyecto 
-


In [1]:
import pickle

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from os import listdir
from math import sqrt

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

import nltk
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from nltk.tokenize import sent_tokenize, word_tokenize

import heapq
import seaborn as sns

stemmer = SnowballStemmer('spanish')

pd.options.mode.chained_assignment = None  # default='warn'

from sklearn.externals import joblib 
import warnings




# 0. Clean and Steamming

In [2]:
def clean_str_series(s):

    """
    Convierte caracteres de utf8 a ascii y elimina errores


    Parameters:
    -----------
    s: string


    Returns:
    --------
    s: string
    """

    s = s.str.normalize('NFKD').str.encode(
        'ascii', errors='ignore').str.decode('utf-8') \
        .str.capitalize().str.strip().str.replace('[^\w\s]', '')
    return s

def text_cleaner(df, columns_to_clean, columns_not_na):

    """
    Elimina filas de un df en caso de ser vacías y aplica la función
    clean_str_series


    Parameters:
    -----------
    df: dataframe a limpiar
    columns_to_clean: columnas a aplicar clean_str_series
    columns_not_na: columnas a deshechar en caso de que sean NA


    Returns:
    --------
    df: dataframe con columnas limpias
    """

    # Quitar registros no validos
    df = df.dropna(subset=columns_not_na, axis=0)

    # Formato texto
    for d in columns_to_clean:
        if df[d].dtype == object:
            df[d] = clean_str_series(df[d])

    return df

def stemSentence(sentence, min_len=4):

    """
    Aplica steamming a un string


    Parameters:
    -----------
    sentence: string a aplicar steamming
    min_len: mínimo de caracteres en palabras


    Returns:
    --------
    stem_sentence: string con steamming
    """

    token_words = word_tokenize(sentence)
    stem_sentence = []

    for word in token_words:
        if len(word) > min_len:
            stem_sentence.append(stemmer.stem(
                WordNetLemmatizer().lemmatize(word, pos='v')))
            stem_sentence.append(" ")
    return "".join(stem_sentence)


# 1. TF-IDF
 regresa matriz documentos raices

In [3]:
def tfidf_train(texto, max_df, min_df, n_features):

    """
    Genera el vocabulario y la matriz de pesos usando tf-idf


    Parameters:
    -----------
    texto: string a aplicar tfidf

    max_df : float in range [0.0, 1.0] or int (default=1.0)
        When building the vocabulary ignore terms that have a document
        frequency strictly higher than the given threshold (corpus-specific
        stop words).
        If float, the parameter represents a proportion of documents,
        integer absolute counts. This parameter is ignored if vocabulary
        is not None.

    min_df : float in range [0.0, 1.0] or int (default=1)
        When building the vocabulary ignore terms that have a document
        frequency strictly lower than the given threshold. This value is
        also called cut-off in the literature.
        If float, the parameter represents a proportion of documents,
        integer absolute counts. This parameter is ignored if vocabulary
        is not None.

    n_features : int or None (default=None)
        If not None, build a vocabulary that only consider the top
        max_features ordered by term frequency across the corpus.
        This parameter is ignored if vocabulary is not None.

    Returns:
    --------
    tfidf_vectorizer:  TfidfVectorizer fiteado

    tfidf : sparse matrix, [n_samples, n_features]
        Tf-idf-weighted document-term matrix.

    tfidf_vectorizer: feature names de tfidf_vectorizer
    """

    print("...tfidf_train")
    
    #definimos las stop words
    with open("./pipeline/stop_words_spanish.txt", 'r') as f:
        stop_words_spanish = f.readlines()[0].split(" ")
    
    tfidf_vectorizer = TfidfVectorizer(max_df=max_df, min_df=min_df,
                                       max_features=n_features,stop_words=stop_words_spanish)
    tfidf = tfidf_vectorizer.fit_transform(texto)
    
    
    nombre_archivo_tf_idf = str("_max_df" + str(max_df) + "_min_df" +  str(min_df) + "_n_features" + str(n_features))
    #joblib.dump(tfidf_vectorizer, str('./trained_models/tfidf_vectorizer' + nombre_archivo_tf_idf + '.pkl'))
    #joblib.dump(tfidf, str('./trained_models/tfidf' + nombre_archivo_tf_idf + '.pkl'))
    
    joblib.dump(tfidf_vectorizer, str('./trained_models/tfidf_vectorizer.pkl'))
    joblib.dump(tfidf, str('./trained_models/tfidf.pkl'))
    

    return tfidf_vectorizer, tfidf, tfidf_vectorizer.get_feature_names()

def tfidf_test(texto, tfidf_vectorizer):

    """
    Transforma nuevos textos usando tfidf_vectorizer


    Parameters:
    -----------
    texto: list of strings a aplicar transformación usando tfidf_vectorizer

    tfidf_vectorizer : model_tfidf


    Returns:
    --------
    tfidf_test: matriz de TFIDF para nuevos textos.
    """

    print("...tfidf_test")

    tfidf_test = tfidf_vectorizer.transform(texto)
    return tfidf_test


# 2. Tópicos por NMF

Fit the NMF model (generalized Kullback-Leibler divergence)

In [4]:
    def train_nmf(tfidf, n_components, beta_loss='kullback-leibler',
                  solver='mu', max_iter=100, alpha=.1, l1_ratio=.5):
        """
        Genera tópicos usando matríz de tfidf


        Parameters:
        -----------
        tfidf: string
            matriz de pesos generado en tfidf

        n_components : int or None
            Number of components, if n_components is not set all features
            are kept.

        beta_loss : float or string, default ‘frobenius’
            String must be in {‘frobenius’, ‘kullback-leibler’,
             ‘itakura-saito’}. Beta divergence to be minimized, measuring
             the distance between X and the dot product WH. Note that
             values different from ‘frobenius’ (or 2) and
             ‘kullback-leibler’ (or 1) lead to significantly slower fits.
             Note that for beta_loss <= 0 (or ‘itakura-saito’), the input
             matrix X cannot contain zeros. Used only in ‘mu’ solver.


        solver : ‘cd’ | ‘mu’
            Numerical solver to use: ‘cd’ is a Coordinate Descent solver.
            ‘mu’ is a Multiplicative Update solver.

        max_iter : integer, default: 100
            Maximum number of iterations before timing out.

        max_iter : integer, default: 200
            Maximum number of iterations before timing out.

        1_ratio : double, default: 0.
            The regularization mixing parameter, with 0 <= l1_ratio <= 1
            For l1_ratio = 0 the penalty is an elementwise L2 penalty
            (aka Frobenius Norm). For l1_ratio = 1 it is an elementwise L1
            penalty. For 0 < l1_ratio < 1, the penalty is a combination
            of L1 and L2.

        Returns:
        --------
        topic_model: matrix or sparse array
            matriz de tópicos generados por NMF.
        """
        print("...train_nmf")

        topic_model = NMF(n_components, random_state=123,
                          beta_loss=beta_loss,
                          solver=solver, max_iter=max_iter,
                          alpha=alpha, l1_ratio=l1_ratio)

        topic_model.fit(tfidf)
        
        joblib.dump(topic_model, str('./trained_models/topic_model.pkl'))

        return topic_model

#### Guardar vocabulario y pesos los topicos generados por NMF

In [5]:
def vocabulario_nmf(topic_model, feature_names):
    """
    Genera un diccionario con el index,palabra y peso de cada topico y lo
    pasa a un DF

    Parameters:
    -----------
    topic_model: matrix
        matriz generado usando NMF matriz de pesos generado en tfidf

    feature_names: list
        nombre de palabras de diccionario.

    Returns:
    --------
    df_topicos: matrix or sparse array
        matriz de tópicos generados por NMF.
    """
    print("...vocabulario_nmf")

    topic_data = []
    for topic_idx, topic in enumerate(topic_model.components_):
        index = [i for i in range(len(topic))]
        words = [feature_names[i] for i in index]
        value = [topic[i] for i in index]

        topic_data.append({"index": index, "words": words, "value": value})

    filter_id = "topic-"
    df_topicos = pd.DataFrame([t['value'] for t in topic_data])
    df_topicos.index = [filter_id + str(t).zfill(3)
                        for t in df_topicos.index]

    df_topicos.columns = topic_data[0]['words']
    joblib.dump(df_topicos, './trained_models/df_topicos.pkl')
    return df_topicos, topic_data

## 2.2 filtro de valores de NMF por threshold

Nos quedamos con las topk palabras de la matriz de componentes de NMF

In [6]:
def filtro_vactores_nmf(thresh_percentile, topic_data):
    """
    filtra los valores de peso de cada tópico de nmf segun el
    percentil del topico para cada topico

    Parameters:
    -----------
    thresh_percentile: float in [0,100]
        percentil para filtrar

    topic_data: NMF model
        Matriz NMF

    Returns:
    --------
    topic_data: NMF model
        Matriz NMF
    """

    print("...filtro_vactores_nmf")

    for topic in range(len(topic_data)):

        # valor de el filtro a partir de percentile
        thresh_filter = np.percentile(np.array(
            topic_data[topic]["value"]), thresh_percentile)

        values_filtered = [row if row > thresh_filter else 0
                           for row in topic_data[topic]["value"]]
        topic_data[topic]["value"] = values_filtered

    joblib.dump(topic_data, './trained_models/topic_model_threshold.pkl')

    return topic_data

# 3. topicos por texto


Asignamos un vectore de topicos para cada texto

In [7]:
def topico_a_texto(df_topicos, tfidf, topic_data, status="test"):

    """
    Añade el vector de topicos a partir del producto punto entre la matriz
    de topicos de  NMF y la matrz de pesos de TFidf de cada documento.


    Parameters:
    -----------
    df_topicos: matrix or sparse array
        matriz de tópicos generados por NMF.

    tfidf : sparse matrix, [n_samples, n_features]
        Tf-idf-weighted document-term matrix.

    topic_data: NMF model
        Matriz NMF
    Returns:
    --------
    dataframe_values: df
        dataframe con topico por columna para cada texto
    """

    print("...topico_a_texto")

    lista_topicos = df_topicos.index.tolist()
    dataframe_values = pd.DataFrame(columns=lista_topicos,
                                    index=[row for row in range(
                                        tfidf[:].shape[0])])

    topics_results = []

    # computa el producto punto
    for i_doc in range(tfidf[:].shape[0]):
        valor_topico = [
            np.dot(
                tfidf[i_doc].todense().tolist()[0],
                topic_data[topic_id]["value"])
            for topic_id in range(len(topic_data))]  # pesos de topicos

        dataframe_values.iloc[i_doc, :] = valor_topico
        topics_results.append(str(valor_topico))

        # guardamos el vector generado para elastic solo en enntrenamiento
    if status == "train":
        df_texto_eval["topic_vector"] = topics_results
        # TF_idf_vector
        df_texto_eval["tfidf_vector"] = [tfidf.toarray()[row] for row
                                         in range(len(df_texto_eval))]

        joblib.dump(df_texto_eval, './trained_models/df_texto_eval.pkl')

    return dataframe_values

#### 4.  Vector por evaluador

converetimos a numeriico cols de peso y pegamos la columna ID_PROYECTO

In [None]:
def topico_a_texto(df_topicos, tfidf, topic_data, status="test"):

    """
    Añade el vector de topicos a partir del producto punto entre la matriz
    de topicos de  NMF y la matrz de pesos de TFidf de cada documento.


    Parameters:
    -----------
    df_topicos: matrix or sparse array
        matriz de tópicos generados por NMF.

    tfidf : sparse matrix, [n_samples, n_features]
        Tf-idf-weighted document-term matrix.

    topic_data: NMF model
        Matriz NMF
    Returns:
    --------
    dataframe_values: df
        dataframe con topico por columna para cada texto
    """

    print("...topico_a_texto")

    lista_topicos = df_topicos.index.tolist()
    dataframe_values = pd.DataFrame(columns=lista_topicos,
                                    index=[row for row in range(
                                        tfidf[:].shape[0])])

    topics_results = []

    # computa el producto punto
    for i_doc in range(tfidf[:].shape[0]):
        valor_topico = [
            np.dot(
                tfidf[i_doc].todense().tolist()[0],
                topic_data[topic_id]["value"])
            for topic_id in range(len(topic_data))]  # pesos de topicos

        dataframe_values.iloc[i_doc, :] = valor_topico
        topics_results.append(str(valor_topico))

        # guardamos el vector generado para elastic solo en enntrenamiento
    if status == "train":
        df_texto_eval["topic_vector"] = topics_results
        # TF_idf_vector
        df_texto_eval["tfidf_vector"] = [tfidf.toarray()[row] for row
                                         in range(len(df_texto_eval))]

        joblib.dump(df_texto_eval, './trained_models/.pkl')

    return dataframe_values

In [8]:


def texto_a_evaluador(dataframe_values, status="test"):

    """
    Asigna el vector de topicos de cada texto revisado por un evaluador y
    devuelve el vector promedio de todos los textos que ha evaluado.

    Parameters:
    -----------
    df_topicos: matrix or sparse array
        matriz de tópicos generados por NMF.

    tfidf : sparse matrix, [n_samples, n_features]
        Tf-idf-weighted document-term matrix.

    topic_data: NMF model
        Matriz NMF
    Returns:
    --------
    dataframe_values: df
        dataframe con topico por columna para cada texto
    """

    """
    Asigna el vector de topicos de cada texto revisado por un evaluador y
    devuelve el vector promedio
    de todos los textos que ha evaluado.
    """
    print("...texto_a_evaluador")

    index_dataframe_values = dataframe_values.columns.tolist()[:]

    dataframe_values[index_dataframe_values] = dataframe_values[
        index_dataframe_values].apply(
        pd.to_numeric, errors='coerce').reset_index(drop=True)

# informacion de proyecto y evaluador

    dataframe_values["ID_PROYECTO"] = df_texto_eval["ID_PROYECTO"
                                                    ].reset_index(
        drop=True)
    # poner campos en primera posicion
    dataframe_values = dataframe_values.set_index(
        ["ID_PROYECTO"]).reset_index(drop=False)

    if status == "train":
        # merge con evaluadores
        df_info_eval = pd.read_csv(
            "./data/data_training.csv").reset_index(drop=True)
        df_info_eval = df_info_eval[["ID_PROYECTO", "USUARIO",
                                    "CVU", "CVE_RCEA"]]

        dataframe_values = df_info_eval.merge(dataframe_values,
                                              on="ID_PROYECTO",
                                              how="inner")

    # groupby de los vectores por evaluador
        topics_evaluador = dataframe_values.groupby(
            ["CVE_RCEA", "USUARIO"])[index_dataframe_values[3:]].mean()

        joblib.dump(topics_evaluador,
                    './trained_models/topics_evaluador.pkl')
        return topics_evaluador
    else:
        joblib.dump(dataframe_values,
                    './trained_models/topicos_port_texto_test.pkl')
        return dataframe_values




___

### Train pipeline

In [9]:
def pipeline_topic_train(texto, max_df, min_df,n_features,   #tfidf
                    n_components, beta_loss,solver,l1_ratio, # NMF
                  thresh_percentile):   # filtro de vectores por topico 
    """
    Pipeline de entreneamiento: 
        con el conjunto de entrenamiento:
        1. TFIDF
        2. NMF
        3. ajusta treshhold de vectores de NMF
    
    Output:
        1. tfidf_vectorizer
        2. topic_model
        
    """
    # tfidf
    tfidf_vectorizer, tfidf, feature_names = tfidf_train(texto, max_df, min_df,n_features)
    
    #NMF
    topic_model = train_nmf(tfidf, n_components, beta_loss='kullback-leibler',
              solver='mu', max_iter=200, alpha=.1,l1_ratio=.5)
    
    #guardar vocabulario (no necesario)
    df_topicos, topic_data = vocabulario_nmf(topic_model,feature_names)
    
    # filtro de vector por umbral
    topic_data = filtro_vactores_nmf(thresh_percentile,topic_data)

    dataframe_values = topico_a_texto(df_topicos,tfidf,topic_data,status="train")
    
    topics_evaluador = texto_a_evaluador(dataframe_values,status="train")
    
    return tfidf_vectorizer,topic_model,topics_evaluador

### Test pipeline

In [10]:
def pipeline_cleaning_steamming(data_topics,columns_not_na,columns_interes):
    
    """
    limpieza y steammingtest dataset

    """

    data_topics_clean = text_cleaner(data_topics, columns_to_clean=columns_interes,
                                     columns_not_na=columns_not_na)
    
    
    # steamming
    text_data = data_topics_clean["DESCRIPCION_PROYECTO"]
    text_data = text_data.apply(stemSentence)

    data_topics_clean["DESCRIPCION_PROYECTO"] = text_data
    data_topics_clean.reset_index(drop=True,inplace=True)
    return data_topics_clean

In [11]:
def pipeline_topic_test(data_topics, tfidf_vectorizer, topic_data, to_clean=1):
    # filtro de vectores por topico
    """
    Pipeline de nuevos datos:
        con el conjunto de entrenamiento:
        1. TFIDF usando el vectorizador ya entrenado
        2. obtener valores de tfidf.NMF usando la matriz entrenada
        3. devuelve cada texto con su peso
    """
    # pipeline limpieza

    if to_clean == 1:
        columns_not_na = ["PALABRAS_CLAVE1", "PALABRAS_CLAVE2",
                          "PALABRAS_CLAVE3", "DESCRIPCION_PROYECTO"]
        columns_interes = ["DESCRIPCION_PROYECTO"]

        data_topics = pipeline_cleaning_steamming(
            data_topics, columns_not_na=columns_not_na,
            columns_interes=columns_interes)

    texto = data_topics["DESCRIPCION_PROYECTO"]

    tfidf_test_1 = tfidf_test(texto, tfidf_vectorizer)
    # NMF

    # guardar vocabulario (no necesario)
    # df_topicos, topic_data = vocabulario_nmf(topic_model,feature_names)

    # asigna el vector de NMF a cada texto
    dataframe_values = topico_a_texto(df_topicos,tfidf,topic_data,status="test")
    # promedia el valor de los vectores de cada texto evaluado por un evaluador
    dataframe_values = texto_a_evaluador(dataframe_values)

    return dataframe_values

---

# Train


#### df_entrenaiento


In [12]:
df_texto_eval = pd.read_csv("./data/data_training.csv")
df_texto_eval = df_texto_eval.reset_index(drop=True)
df_texto_eval = df_texto_eval.drop_duplicates(subset=["ID_PROYECTO","NUMERO_CONVOCATORIA","ANIO"], keep="last")

#### TFidf params

In [13]:
texto = df_texto_eval["DESCRIPCION_PROYECTO"]
n_features = 512 #number of max words
#n_top_words = 30 #words per topic
#doc_similarity_thr = 0.15
max_df = .15
min_df = 5
#prueba de distribucion de pesos, ponemos lo defaul

#### NMF params

In [14]:
n_components = 50
n_components # borrar con nuevos docs
max_iter = 30
beta_loss='kullback-leibler'
solver='mu'
alpha=.1
l1_ratio=.5
#### thresh
thresh_percentile = 90

In [15]:
tfidf_vectorizer,topic_model, topics_evaluador = pipeline_topic_train(texto, max_df, min_df,n_features,   #tfidf
                    n_components, beta_loss,solver,l1_ratio, # NMF
                  thresh_percentile)

...tfidf_train
...train_nmf
...vocabulario_nmf
...filtro_vactores_nmf
...topico_a_texto
...texto_a_evaluador


---

# Test

#### cargamos modelos

In [16]:
topic_data_train = joblib.load('./trained_models/topic_model_threshold.pkl')
tfidf_vectorizer_train = joblib.load('./trained_models/tfidf_vectorizer.pkl')
tfidf_train = joblib.load('./trained_models/tfidf.pkl')
df_topicos = joblib.load('./trained_models/df_topicos.pkl')
df_texto_eval_train = joblib.load('./trained_models/df_texto_eval.pkl')
topics_evaluador_train = joblib.load('./trained_models/topics_evaluador.pkl')

FileNotFoundError: [Errno 2] No such file or directory: './trained_models/df_texto_eval.pkl'

#### conjunto de entrenamiento

In [None]:
df_texto_eval = pd.read_csv("./data/data_training.csv").reset_index(drop=True)
df_texto_eval = df_texto_eval.reset_index(drop=True)
df_texto_eval = df_texto_eval.drop_duplicates(
    subset=["ID_PROYECTO", "NUMERO_CONVOCATORIA", "ANIO"], keep="last")

target = df_texto_eval[["USUARIO", "CVU", "CVE_RCEA"]]
df_texto_eval.drop(["USUARIO", "CVU", "CVE_RCEA"], axis=1, inplace=True)

In [None]:
df_texto_eval.head(3)

In [None]:
dataframe_values = pipeline_topic_test(
    df_texto_eval, tfidf_vectorizer_train, topic_data_train, to_clean=0)
print(dataframe_values.head(10))