In [1]:
import sys
import os
import pandas as pd
import numpy as np

import multiprocessing
cores = multiprocessing.cpu_count() # Count the number of cores in a computer

from nltk.tokenize import sent_tokenize, word_tokenize 
from nltk import download
import warnings
import nltk

warnings.filterwarnings(action = 'ignore')
 
from gensim.models import Word2Vec
from curses.ascii import isalpha
from gensim.models import KeyedVectors


from nltk.corpus import stopwords
spanish_stop_words = set(stopwords.words('spanish'))


import seaborn as sns
import matplotlib.pyplot as plt

# Import adjustText, initialize list of texts
from adjustText import adjust_text

from sklearn.manifold import TSNE

# Para calcular similitud de vectores de misma longitud
from gensim.matutils import unitvec

# Stemmer de palabras
from nltk.stem.snowball import SnowballStemmer

snowballstemmer = SnowballStemmer('spanish')

# Categorias relevantes
variables = ["macroeconomia","sostenibilidad","regulaciones","reputacion","alianzas","innovacion"]

In [2]:
# cambiar url para leer los datos de otro subfolder con path relativo
# os.chdir('../')

# Descarga de archivos nltk necesarios para modelar, descargue todo
# download()

In [3]:
# Load the news articles and the client-to-news table from the sibling data folder.
noticias, clientes_noticias = (
    pd.read_csv('../datos/noticias.csv'),
    pd.read_csv('../datos/clientes_noticias.csv'),
)

# Funciones

In [4]:
def clean_text_news(df):
    """Add character-length columns for the headline and body of each article.

    Writes ``len_titular`` (length of ``news_title``) and ``len_content``
    (length of ``news_text_content``) onto ``df`` itself.

    Args:
        df: DataFrame with ``news_title`` and ``news_text_content`` columns
            whose values support ``len``.

    Returns:
        The same DataFrame object, with the two new columns.
    """
    length_columns = (
        ('len_titular', 'news_title'),
        ('len_content', 'news_text_content'),
    )
    for target, source in length_columns:
        df[target] = df[source].apply(len)
    return df
    
def create_tokenization(x):
    """Tokenize Spanish text into lowercase, purely alphabetic, non-stop-word tokens.

    Args:
        x: text to tokenize.

    Returns:
        List of tokens in their original order. Stemming is not applied.
    """
    lowered = (token.lower() for token in word_tokenize(x, language='spanish'))
    return [
        term for term in lowered
        if term.isalpha() and term not in spanish_stop_words
    ]

def document_vector(word2vec_model, doc, vocab):
    """Average the word vectors of the in-vocabulary words of a document.

    Args:
        word2vec_model: object whose ``wv`` attribute can be indexed with a
            list of words.
        doc: iterable of words.
        vocab: container of known words; words outside it are ignored.

    Returns:
        The mean over axis 0 of the vectors looked up for the known words.
    """
    known_words = list(filter(lambda word: word in vocab, doc))
    vectors = word2vec_model.wv[known_words]
    return np.mean(vectors, axis=0)

# Function that will help us drop documents that have no word vectors in word2vec
def has_vector_representation(vocab, doc):
    """Return True when at least one word of ``doc`` is contained in ``vocab``.

    An empty ``doc`` yields False.
    """
    return any(word in vocab for word in doc)

# Filter out documents
def filter_docs(corpus, texts, condition_on_doc, vocab):
    """Keep only the documents that contain at least one in-vocabulary word.

    Args:
        corpus: sized collection of documents (each an iterable of words).
        texts: optional collection parallel to ``corpus``; it is filtered with
            the same keep/drop decisions. May be None.
        condition_on_doc: accepted for backward compatibility but NOT used.
            The filter is always ``has_vector_representation(vocab, doc)``.
        vocab: container of known words.

    Returns:
        Tuple ``(corpus, texts)`` with the surviving documents and, when
        ``texts`` was given, the matching texts.
    """
    number_of_docs = len(corpus)

    # Evaluate the predicate once per document and reuse the decision for both
    # sequences, instead of running it twice per document.
    keep = [has_vector_representation(vocab, doc) for doc in corpus]

    if texts is not None:
        texts = [text for text, kept in zip(texts, keep) if kept]

    corpus = [doc for doc, kept in zip(corpus, keep) if kept]

    print("{} docs removed".format(number_of_docs - len(corpus)))

    return (corpus, texts)

from itertools import islice

def take(n, iterable):
    """Return the first ``n`` items of ``iterable`` as a list (fewer if it runs out)."""
    head = islice(iterable, n)
    return [item for item in head]

def similitud(v1, v2):
    """Compute the similarity of two vectors of the same length.

    Each vector is passed through gensim's ``unitvec`` and the dot product of
    the two results is returned.

    Args:
        v1: first vector.
        v2: second vector.

    Returns:
        float: similarity of the two vectors.
    """
    v1_unit = unitvec(v1)
    v2_unit = unitvec(v2)
    return np.dot(v1_unit, v2_unit)

def df_categoria_noticia(df_noticas, lista_temas, modelo, vector_promedio_noticias):
    """Assign to every article the topic whose word vector is most similar to it.

    Args:
        df_noticas: DataFrame of articles, one row per entry of
            ``vector_promedio_noticias`` and in the same order.
        lista_temas: topic words; each one is looked up in ``modelo.wv``.
        modelo: model exposing word vectors through ``modelo.wv[word]``.
        vector_promedio_noticias: iterable with one vector per article.

    Returns:
        ``df_noticas`` with two extra columns: ``Categoria`` (the topic with
        the highest similarity) and ``Similitud`` (that highest similarity).

    Raises:
        ValueError: if the number of article vectors differs from the number
            of rows in ``df_noticas``.
    """
    similitudes_por_tema = {}
    for tema in lista_temas:
        vector_tema = modelo.wv[tema]
        similitudes_por_tema[tema] = [
            similitud(vector_noticia, vector_tema)
            for vector_noticia in vector_promedio_noticias
        ]
    similitudes = pd.DataFrame(similitudes_por_tema)

    # Derive both columns from the numeric similarity table only. Taking the
    # row-wise max after a text column has been added mixes str and float,
    # which newer pandas versions refuse to reduce.
    resultados_temas = pd.DataFrame({
        'Categoria': similitudes.idxmax(axis=1),
        'Similitud': similitudes.max(axis=1),
    })

    if len(resultados_temas) != len(df_noticas):
        raise ValueError(
            "df_noticas has {} rows but {} article vectors were given".format(
                len(df_noticas), len(resultados_temas)))

    # The similarities are positional, so reuse the articles' index; concat
    # aligns on labels and would otherwise misplace rows whenever df_noticas
    # does not carry a 0..n-1 index.
    resultados_temas.index = df_noticas.index

    df_salida = pd.concat([df_noticas, resultados_temas], axis=1)

    return df_salida

def preprocessing_noticias(df):
    """Clean the news frame and tokenize the body of every remaining article.

    Steps: drop rows whose ``news_text_content`` is exactly a single space,
    drop fully duplicated rows, then tokenize ``news_text_content`` with
    ``create_tokenization``. The frame shape is printed before and after each
    filtering step.

    Args:
        df: DataFrame with a ``news_text_content`` column.

    Returns:
        Two-item list ``[df, data]``: the cleaned frame (with a fresh
        0..n-1 index) and the Series of token lists, sharing that index.
    """
    print(df.shape)
    df = df[df['news_text_content'] != ' ']
    print(df.shape)
    # Reset the index only after the last row-dropping step, so the returned
    # frame is contiguous and lines up with positional results computed later.
    df = df.drop_duplicates().reset_index(drop=True)
    print(df.shape)
    data = df['news_text_content'].apply(create_tokenization)

    return [df, data]

def matrix_by_new(df, model, vocab, vector_size, output_name=None, run=False, save=False):
    """Build, or load from disk, the matrix of mean word vectors (one row per document).

    Args:
        df: iterable of tokenized documents; despite the name it is iterated
            document by document.
        model: model forwarded to ``document_vector``.
        vocab: container of known words forwarded to ``document_vector``.
        vector_size: length of the zero vector used for documents that cannot
            be embedded.
        output_name: path WITHOUT the ``.npy`` extension, used both for saving
            (``save=True``) and loading (``run=False``). When None, the
            default ``'array_mean_vector_model'`` is used.
        run: when True the vectors are computed; when False they are loaded
            from ``<output_name>.npy``.
        save: when computing, also write the matrix to ``<output_name>.npy``.

    Returns:
        numpy array with one row per document.
    """
    # One path for both directions, so a matrix saved under a custom name can
    # be loaded back under that same name.
    base_name = output_name if output_name is not None else 'array_mean_vector_model'
    cache_path = base_name + '.npy'

    if not run:
        return np.load(cache_path)

    rows = []
    for doc in df:
        try:
            row = document_vector(model, doc, vocab=vocab)
        except Exception:
            # Best effort: a document that cannot be embedded (e.g. no
            # in-vocabulary words) gets a zero vector. Catching Exception
            # rather than everything lets Ctrl-C still interrupt the loop.
            row = np.zeros(vector_size)
        rows.append(row)

    mean_vector = np.array(rows)  # list of vectors -> 2-D array

    if save:
        with open(cache_path, 'wb') as f:
            np.save(f, mean_vector)

    return mean_vector

In [5]:
# preprocessing_noticias returns a two-item list: data[0] is the cleaned news
# frame and data[1] the tokenized 'news_text_content' of each row.
data = preprocessing_noticias(noticias)

(23377, 6)
(23346, 6)
(23346, 6)


In [13]:
# from time import time  # To time our operations

# Set run_model to True to retrain from the tokenized corpus; otherwise the
# model stored at path_model is loaded.
run_model = False
path_model = "models/Model/word2vec.model"
# semilla = 2022
vector_size = 100

if run_model:

    model = Word2Vec(
        data[1],
        min_count=1,
        window=5,
        vector_size=vector_size,
        # sample=6e-5,
        # alpha=0.03,
        # min_alpha=0.0007,
        # hs=0,
        # negative=20,
        # Leave one core free, but never ask for zero workers on a single-core machine.
        workers=max(1, cores - 1),
        # seed=semilla,
        sg=1,
    )

    # NOTE(review): saving is disabled, so a retrained model is not what the
    # load branch reads on the next run — confirm this is intended.
    # model.save(path_model)

else:
    model = Word2Vec.load(path_model)

# A set keeps every later `word in vocab` check constant-time; with a list
# each check scans the whole vocabulary.
vocab = set(model.wv.index_to_key)

In [26]:
# Top-20 vocabulary entries most similar to 'innovacion' according to the model.
sims = model.wv.most_similar('innovacion', topn=20)
sims

[('tecnologia', 0.8312691450119019),
 ('sncti', 0.8216502666473389),
 ('iccti', 0.8028162121772766),
 ('proinnovate', 0.7993803024291992),
 ('senescyt', 0.7975841760635376),
 ('micitt', 0.7975763082504272),
 ('jannixia', 0.7932529449462891),
 ('ciencia', 0.7913185954093933),
 ('emprendimiento', 0.7911211252212524),
 ('fusionando', 0.7889145016670227),
 ('ctel', 0.7845596075057983),
 ('tecnologica', 0.7810786962509155),
 ('greentech', 0.7798160910606384),
 ('macrozona', 0.7785157561302185),
 ('mincyt', 0.7744529843330383),
 ('macrotendencias', 0.7741804718971252),
 ('habilitadores', 0.7730867266654968),
 ('addotar', 0.7724708318710327),
 ('ctci', 0.77205491065979),
 ('emprendedurismo', 0.769287109375)]

In [14]:
# Mean word vector of every tokenized article; run=True recomputes it on each execution.
vector_mean = matrix_by_new(
    df=data[1],
    model=model,
    vocab=vocab,
    vector_size=vector_size,
    run=True,
)

# Attach to each article its best-matching topic and the corresponding similarity.
salida_categorias = df_categoria_noticia(
    df_noticas=data[0],
    lista_temas=variables,
    modelo=model,
    vector_promedio_noticias=vector_mean,
)

In [15]:
# Mean best-topic similarity per assigned category. The column is selected
# before aggregating so the text columns are never averaged.
salida_categorias.groupby('Categoria')['Similitud'].mean()

Categoria
alianzas          0.597509
innovacion        0.626025
macroeconomia     0.606658
regulaciones      0.602943
reputacion        0.541312
sostenibilidad    0.617127
Name: Similitud, dtype: float64

In [24]:
# Order by category and similarity (both descending), then keep the 50
# highest-similarity articles of each category.
salida_categorias = salida_categorias.sort_values(
    ['Categoria', 'Similitud'], ascending=False
)
muestra = salida_categorias.groupby('Categoria').head(50)
muestra['Categoria'].value_counts()
# muestra.to_csv('muestra_categorias.csv')

sostenibilidad    50
reputacion        50
regulaciones      50
macroeconomia     50
innovacion        50
alianzas          50
Name: Categoria, dtype: int64

In [25]:
# Mean similarity per category within the sample. The column is selected
# before aggregating so the text columns are never averaged.
muestra.groupby('Categoria')['Similitud'].mean()

Categoria
alianzas          0.697760
innovacion        0.773155
macroeconomia     0.685679
regulaciones      0.696711
reputacion        0.584877
sostenibilidad    0.749972
Name: Similitud, dtype: float64

In [76]:
original_muestra = pd.read_csv('muestra_categorias_original.csv', index_col='Unnamed: 0')

# Not the last expression of the cell, so this result is computed but not displayed.
original_muestra.groupby('Categoria')['Similitud'].mean()

# 'val_cat' is the 4th '/'-separated piece of the URL, i.e. the first path
# segment of 'scheme://host/segment/...'. The .str accessor yields NaN for
# URLs that are missing or too short, instead of raising.
original_muestra['val_cat'] = original_muestra['news_url_absolute'].str.split('/').str[3]

In [89]:
# Values noted by the author (presumably candidate 'val_cat' values — confirm):
# nota
# 2022

# Articles categorised as 'macroeconomia' whose URL segment is 'nota'.
val = original_muestra.loc[original_muestra['Categoria'].eq('macroeconomia')]
val.loc[val['val_cat'].eq('nota')]

Unnamed: 0,news_id,news_url_absolute,news_init_date,news_final_date,news_title,news_text_content,Categoria,Similitud,val_cat
17649,news77900,https://www.letrap.com.ar/nota/2022-8-5-10-32-...,2022-07-30,2022-08-14,"Un ortodoxo con pasado en Lavagna, el vice ...",Con la designacion de Gabriel Rubinstein frent...,macroeconomia,0.688243,nota
9273,news45579,https://www.eldiariodelarepublica.com/nota/202...,2022-07-15,2022-07-30,"Batakis se reunio con la titular del FMI, que ...","La ministra de Economia, Silvina Batakis, mant...",macroeconomia,0.685096,nota
9537,news46743,https://www.letrap.com.ar/nota/2022-7-16-17-47...,2022-07-15,2022-07-30,"Entre Guzman y su sucesora, Pesce no duda: ""Es...",Las criticas que antes resonaban por los pasil...,macroeconomia,0.66493,nota


In [None]:
# Position, within the tokenized corpus, of the article to visualise.
# (Named doc_id so the builtin `id` is not shadowed.)
doc_id = 1550

# data is the [frame, tokens] list returned by preprocessing_noticias, so the
# article's tokens live in data[1].
doc_tokens = data[1].iloc[doc_id]

# Keep only the words the model has a vector for
words_filtered = [word for word in doc_tokens if word in vocab]

# Vectors of those words, in the same order.
# NOTE(review): the original referenced `model2`, which no visible cell
# defines; `model` is the only Word2Vec model in scope — confirm.
vector_list = [model.wv[word] for word in words_filtered]

# Pair the words with their vectors. Materialised as a list so the following
# cell can be re-run without finding an exhausted iterator.
word_vec_zip = list(zip(words_filtered, vector_list))

In [None]:
# Cast to a dict so we can turn it into a DataFrame; a word that appears more
# than once collapses into a single key.
# NOTE(review): if word_vec_zip is a one-shot zip iterator, re-running this
# cell on its own produces an empty dict.
word_vec_dict = dict(word_vec_zip)
# orient='index': one row per word, one column per vector component.
df = pd.DataFrame.from_dict(word_vec_dict, orient='index')

In [None]:
# 2-D t-SNE projection with a fixed random_state.
tsne = TSNE(
    n_components=2,
    init='random',
    random_state=10,
    perplexity=3,
)

# Fit on at most the first 400 rows of df to shorten processing time.
tsne_df = tsne.fit_transform(df[:400])

In [None]:
sns.set()
# Initialize figure
fig, ax = plt.subplots(figsize=(11.7, 8.27))
sns.scatterplot(x=tsne_df[:, 0], y=tsne_df[:, 1], alpha=0.5, ax=ax)

# Label every 10th projected word. The range is bounded by the rows of
# tsne_df, not df: only the first 400 rows of df were projected, so indexing
# by df's length would run past the end for longer documents.
words_to_plot = list(np.arange(0, tsne_df.shape[0], 10))

texts = [
    ax.text(tsne_df[word, 0], tsne_df[word, 1], df.index[word], fontsize=14)
    for word in words_to_plot
]

# Plot text using adjust_text (because overlapping text is hard to read)
adjust_text(texts, force_points=0.4, force_text=0.4,
            expand_points=(2, 1), expand_text=(1, 2),
            arrowprops=dict(arrowstyle="-", color='black', lw=0.5))

plt.show()

In [None]:
# Articles whose 'macroeconomia' similarity exceeds 0.6, highest first.
# NOTE(review): this needs a 'macroeconomia' column in salida_categorias, but
# df_categoria_noticia only appends 'Categoria' and 'Similitud'. Unless the
# news frame already carries that column this raises KeyError — confirm.
salida_categorias[salida_categorias['macroeconomia']>0.6].sort_values('macroeconomia',ascending = False)

In [None]:
# Number of articles (non-null news_id) assigned to each category.
salida_categorias.groupby('Categoria')['news_id'].count()

In [None]:
# NOTE(review): selecting `variables` needs one column per topic in
# salida_categorias; df_categoria_noticia only appends 'Categoria' and
# 'Similitud' — confirm those columns exist.
noti_categorias = salida_categorias[variables]

# Pairwise correlation between the per-topic columns, shown with two decimals.
# Styler.format replaces Styler.set_precision, which newer pandas no longer provides.
corr = noti_categorias.corr()
corr.style.background_gradient(cmap='coolwarm').format('{:.2f}')

In [None]:
# NOTE(review): 'len_content' is created by clean_text_news, which no visible
# cell calls — confirm it was applied to the news frame before this point.
salida_categorias['len_content'].describe()

In [None]:
# Largest value of the 'len_content' column.
salida_categorias['len_content'].max()

In [None]:
# Subset of articles with 'len_content' below 300, followed by the total
# count, the subset count and the fraction the subset represents.
salida_categorias_filtro = salida_categorias.loc[salida_categorias['len_content'] < 300]
print(
    len(salida_categorias),
    len(salida_categorias_filtro),
    len(salida_categorias_filtro) / len(salida_categorias),
    sep='\n',
)

In [None]:
# Histogram (50 bins) of 'len_content' within the filtered subset.
salida_categorias_filtro['len_content'].hist(bins=50)

In [None]:
# Rows of the filtered subset whose 'len_content' is exactly 254.
salida_categorias_filtro[salida_categorias_filtro['len_content']==254]

In [None]:
# t-SNE no incluido dentro del procesamiento anterior

In [None]:
# Initialize a 2-D t-SNE projection with a fixed random_state
tsne = TSNE(n_components = 2, init = 'random', random_state = 10, perplexity = 100)

# Fit on all of X (no row limit is applied here).
# NOTE(review): X is not defined in any visible cell — confirm where it comes from.
tsne_df = tsne.fit_transform(X)
fig, ax = plt.subplots(figsize = (14, 10))
sns.scatterplot(x = tsne_df[:, 0], y = tsne_df[:, 1], alpha = 0.5)

texts = []
titles_to_plot = list(np.arange(0, 800, 80)) # every 80th index among the first 800 rows (10 labels)

# Add one text label per selected index.
# NOTE(review): titles_list is not defined in any visible cell — confirm.
for title in titles_to_plot:
    texts.append(plt.text(tsne_df[title, 0], tsne_df[title, 1], titles_list[title], fontsize = 14))
    
# Plot text using adjust_text
adjust_text(texts, force_points = 0.4, force_text = 0.4, 
            expand_points = (2,1), expand_text = (1,2),
            arrowprops = dict(arrowstyle = "-", color = 'black', lw = 0.5))

plt.show()
