In [1]:
import json
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfTransformer

In [2]:
# Load the export table. The file is comma-separated, so read_csv (whose
# default separator is ',') is the direct reader; read_table with sep=','
# is the same parser reached indirectly. The code columns are forced to str
# so their contents are not parsed as numbers.
df = pd.read_csv(
    '../../dataset/Export_agg_country_prod_yr.txt',
    encoding="latin-1",
    dtype={'year': int, 'HS': str, 'SITC': str, 'CUCI': str},
)

  """Entry point for launching an IPython kernel.


In [3]:
# The country codes are reused by the Shiny app; uncomment the next line to regenerate codigos_paises.csv.
#pd.DataFrame(df.groupby(["reporter","rep_iso"]).size()).reset_index()[["reporter","rep_iso"]].to_csv("codigos_paises.csv",index=None)

In [3]:
class Procesamiento_LDA():
    """LDA pipeline over a (rep_iso, year) x SITC-code matrix of export values.

    The constructor cleans the raw table, remembers the product codes that
    become the matrix columns and loads a code -> description dictionary
    used when printing topics.

    Attributes:
        df: cleaned table returned by ``clean`` (indexed by year, reporter,
            rep_iso and SITC, with a single ``value`` column).
        productos: pandas Index of the SITC codes, in matrix-column order.
        Codes: dict mapping 'Code' to 'Description' for the rows of the
            classification workbook whose 'Classification' is 'S3'.
    """

    # clean() keeps only rows whose SITC code has exactly this many
    # characters (presumably the most detailed level - TODO confirm).
    NIVEL_HOJA = 5
    # Columns that identify one row of the pivoted matrix.
    _CLAVES_FILA = ['rep_iso', 'year']

    def __init__(self, df, level=4):
        """Clean ``df`` and load the product descriptions.

        Args:
            df: raw table with at least the columns 'year', 'reporter',
                'rep_iso', 'SITC' and 'value'.
            level: number of leading SITC characters to keep per product.
        """
        df = self.clean(df, level)
        productos = self._pivot(df).columns
        product_df = pd.read_excel("../names/UN Comtrade Commodity Classifications.xlsx")
        ClassS3 = product_df[product_df["Classification"] == 'S3']
        # NOTE(review): lookups in print_components assume the 'Code' values
        # are strings comparable to the SITC codes - confirm against the file.
        Codes = ClassS3.set_index('Code')['Description'].to_dict()
        self.df = df
        self.productos = productos
        self.Codes = Codes

    def _pivot(self, df):
        """Pivot ``df`` to one row per (rep_iso, year) and one column per SITC code."""
        return df.pivot_table(index=self._CLAVES_FILA, columns='SITC', values='value')

    def clean(self, df, level):
        """Return export values summed per (year, reporter, rep_iso, SITC prefix).

        Rows without a SITC code and rows whose code is not ``NIVEL_HOJA``
        characters long are dropped; missing values count as 0; codes are then
        truncated to their first ``level`` characters and the values summed.
        The input frame is not modified.

        Args:
            df: raw table (see ``__init__``).
            level: number of leading SITC characters to keep.

        Returns:
            DataFrame indexed by (year, reporter, rep_iso, SITC) with a
            single 'value' column.
        """
        con_codigo = df.loc[df["SITC"].notnull()]
        es_hoja = con_codigo["SITC"].str.len() == self.NIVEL_HOJA
        hojas = con_codigo.loc[es_hoja, ['year', 'reporter', 'rep_iso', 'SITC', 'value']]
        # assign() builds a new frame, so nothing is written into a slice of
        # the caller's data (the in-place version raised SettingWithCopyWarning).
        hojas = hojas.assign(
            value=hojas["value"].fillna(0),
            SITC=hojas["SITC"].str[:level],  # keep only the first `level` characters
        )
        return hojas.groupby(['year', 'reporter', 'rep_iso', 'SITC']).sum()

    def matrix_transform(self, df):
        """Build the sparse (rep_iso, year) x SITC matrix of export values.

        Args:
            df: cleaned table as returned by ``clean``.

        Returns:
            DataFrame whose columns use a sparse dtype with fill value 0;
            combinations absent from ``df`` are 0.
        """
        matrix = self._pivot(df).fillna(0)
        # DataFrame.to_sparse() was removed in pandas 1.0; casting to a
        # SparseDtype is the documented replacement.
        return matrix.astype(pd.SparseDtype(float, 0))

    def lda_transform(self, k, save_model=True):
        """Fit a ``k``-topic LDA on the TF-IDF-weighted export matrix.

        Args:
            k: number of LDA components.
            save_model: when true, pickle the fitted model to ``lda{k}.pkl``
                in the current working directory.

        Returns:
            Tuple ``(lda_model, matrix, k)`` where ``matrix`` is the
            TF-IDF-transformed matrix the model was fitted on.
        """
        tfidf = TfidfTransformer()
        matrix = self.matrix_transform(self.df)
        matrix = tfidf.fit_transform(matrix)
        lda_model = LatentDirichletAllocation(
            n_components=k,
            max_iter=75,
            random_state=1234,  # fixed seed so refits are reproducible
            batch_size=200,
            learning_method='batch',
        )
        lda_model.fit(matrix)
        if save_model:
            # Context manager guarantees the file is flushed and closed.
            with open("lda{}.pkl".format(k), 'wb') as archivo:
                pickle.dump(lda_model, archivo)
        return lda_model, matrix, k

    def norm_comp(self, lda_model):
        """Return ``lda_model.components_`` with every row rescaled to sum to 1."""
        norm_components = lda_model.components_ / lda_model.components_.sum(axis=1)[:, np.newaxis]
        return norm_components

    def print_components(self, lda_model, top_n=14):
        """Print the highest-weighted products of every component.

        Args:
            lda_model: fitted model exposing ``components_``.
            top_n: number of products listed per component. The default of
                14 matches the previous hard-coded slice.

        Products with no entry in ``self.Codes`` are printed with the
        description "N/D" rather than being silently skipped.
        """
        norm_components = self.norm_comp(lda_model)
        for i, topic in enumerate(norm_components):
            print("Componente %d:" % (i + 1))
            # Sort once, largest share first.
            orden = np.argsort(topic)[::-1][:top_n]
            for j in orden:
                producto = self.productos[j]
                descripcion = self.Codes.get(producto, "N/D")
                print("{}-{}. Prop: {}%".format(producto, descripcion, np.round(topic[j] * 100, 3)))
            print("----------------------")

    def save_results(self, lda_model, matrix, k):
        """Write the topic/product and country-year/topic tables as CSV.

        Creates ``results/`` if needed and writes
        ``results/Dist_cadenas{k}.csv`` (one row per component, one column
        per product, rows summing to 1) and ``results/Dist_paises{k}.csv``
        (``lda_model.transform(matrix)`` labelled with rep_iso and year).

        Args:
            lda_model: fitted model exposing ``components_`` and ``transform``.
            matrix: matrix returned by ``lda_transform``.
            k: number of components; only used in the file names.
        """
        os.makedirs("results", exist_ok=True)

        components_df = pd.DataFrame(self.norm_comp(lda_model), columns=self.productos)
        components_df.to_csv("results/Dist_cadenas{}.csv".format(k), index=False)

        # Row labels must come from the cleaned table that produced `matrix`,
        # not from a notebook-level variable that may hold different rows.
        filas = self._pivot(self.df).index
        result = pd.DataFrame(lda_model.transform(matrix), index=filas)
        result.reset_index().to_csv("results/Dist_paises{}.csv".format(k), index=False)



In [4]:
# Build the pipeline once; products are identified by the first 4 SITC characters.
procesadora_lda = Procesamiento_LDA(df, level=4)

In [9]:
# Fit, pickle and export one model per topic count (2, 4, 6, 8 and 10
# components). A single loop replaces five copy-pasted call pairs; after it
# finishes, lda_model / matrix / k hold the 10-component run.
for n_componentes in (2, 4, 6, 8, 10):
    lda_model, matrix, k = procesadora_lda.lda_transform(n_componentes, save_model=True)
    procesadora_lda.save_results(lda_model, matrix, k)

In [None]:
# Fit, pickle and export one model per topic count (20, 30 and 40
# components). A single loop replaces three copy-pasted call pairs; after it
# finishes, lda_model / matrix / k hold the 40-component run.
for n_componentes in (20, 30, 40):
    lda_model, matrix, k = procesadora_lda.lda_transform(n_componentes, save_model=True)
    procesadora_lda.save_results(lda_model, matrix, k)

In [5]:
# 100 components: fit and pickle the model, then export both CSV tables.
lda_model, matrix, k = procesadora_lda.lda_transform(100, save_model=True)
procesadora_lda.save_results(lda_model, matrix, k)

In [6]:
# 200 components: fit and pickle the model, then export both CSV tables.
lda_model, matrix, k = procesadora_lda.lda_transform(200, save_model=True)
procesadora_lda.save_results(lda_model, matrix, k)