In [1]:
import pandas as pd
import numpy as np
import joblib

In [2]:
(df, data_samples, tfidf, tfidf_feature_names) = joblib.load( "df-data_samples-tfidf-tfidf_names.pkl" )

Get the model that we generated earlier:

In [3]:
k_topics,alpha,l1,W,H = joblib.load("model_selected.pkl") 
param = [k_topics,alpha,l1,W,H]
param_names = ["k_topics", "alpha", "l1"]
for name in param_names:
    print(name, '=', repr(eval(name)))

k_topics = 38
alpha = 0.14
l1 = 0.1


### Run the model

In [4]:
WH = W.dot(H)
WH.shape

(3813, 1262)

In [5]:
W.shape

(3813, 38)

In [6]:
liste = []
for n in range(len(H)):
    liste.append(np.sum(H[n,:]))
liste[0:10]

[15.1730907621389,
 20.227097829889743,
 13.301814321789639,
 12.978146379613381,
 15.724714185637081,
 14.96553079870673,
 44.754204124736916,
 12.107770235531934,
 17.82703621099995,
 13.624605996787592]

### Create matrix 

In [7]:
matrix = np.zeros(W.shape)
for k in range(k_topics):
    topic_index = k
    sum_terms_topic = np.sum(H[topic_index,:])
    for n in range(len(df)):
        abstract_index = n
        sum_terms_abstract = np.sum(WH[abstract_index,:])
        abstract_topic = W[abstract_index,topic_index] * sum_terms_topic
        matrix[abstract_index,topic_index] = abstract_topic/sum_terms_abstract

In [8]:
matrix[:].round(2)

array([[0.12, 0.  , 0.11, ..., 0.  , 0.  , 0.04],
       [0.05, 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
       [0.  , 0.18, 0.  , ..., 0.  , 0.06, 0.01],
       ...,
       [0.  , 0.07, 0.  , ..., 0.  , 0.01, 0.  ],
       [0.  , 0.06, 0.  , ..., 0.  , 0.12, 0.08],
       [0.  , 0.03, 0.  , ..., 0.  , 0.01, 0.  ]])

In [9]:
liste = []
for n in range(len(df)):
    liste.append(np.sum(matrix[n,:]))
liste[0:10]

[1.0,
 1.0,
 0.9999999999999999,
 1.0,
 1.0,
 1.0,
 0.9999999999999999,
 0.9999999999999998,
 0.9999999999999998,
 1.0]

In [10]:
def get_5_words( all_terms, H, topic_index):
    # reverse sort the values to sort the indices
    top_indices = np.argsort( H[topic_index,:] )[::-1]
    top_terms=[]
    for i in range(5):
        term = tfidf_feature_names[top_indices[i]]
        top_terms.append(term)
    return top_terms

In [11]:
columns_topics = []
list_words_topics = []
for k in range(k_topics):
    words_topics = get_5_words(tfidf_feature_names, H , k)
    list_words_topics.append(words_topics)
    column_name = str(k) + '_'+ "_".join(words_topics)        
    columns_topics.append(column_name)

In [12]:
liste_rows = ["tot_pap","pap_wo_top","pap_w1top","pap_w2top","pap_w3top","pap_w4top","pap_w5top"] + columns_topics

In [13]:
def liste_seuils(pas,itera):
    #arrondir à la 2eme décimale
    seuil = round(0.05,2)
    seuils = [seuil]
    seuils_columns = ["seuil_"+ str(round(seuil,2))]
    for k in range(itera):
        seuil+= pas
        seuils.append(round(seuil,2))
        seuils_columns.append("seuil_"+ str(round(seuil,2)))
    return(seuils, seuils_columns)

In [14]:
liste_s = liste_seuils(0.01,10)

In [34]:
def prop_topics(matrix, columns_topics, liste_seuils, k_topics):
    seuils = liste_seuils[0]
    seuils_columns = liste_seuils[1]
    df = pd.DataFrame(columns= seuils_columns)
    df.columns.name = 'seuil'
    df['topic']= columns_topics
    df.set_index('topic', inplace = True)
    compteur_col = 0
    for seuil in seuils:
        column = seuils_columns[compteur_col] 
        compteur_col+=1
        b = (matrix>seuil).astype(int)
        df_binaire = pd.DataFrame(data = b)
        #Total sum per row: 
        df_binaire.loc['Total',:]= df_binaire.sum(axis=0)
        #Total sum per column: 
        df_binaire.loc[:,'Total'] = df_binaire.sum(axis=1)
        #How many topics characterize how many papers
        df_paper = pd.DataFrame(df_binaire.Total.value_counts(), columns=['Total'])
        print(df_paper.head(10))
        for i in range(k_topics):
            row = columns_topics[i]
            df.loc[row,[column]] = df_binaire.loc['Total',[i]].values[0]
    return(df, df_binaire)        

In [35]:
prop = prop_topics(matrix, columns_topics, liste_s, k_topics)[0]

      Total
5.0     899
6.0     790
4.0     672
7.0     602
3.0     392
8.0     239
9.0     110
2.0      80
10.0     20
1.0       7
         Total
5.0       1001
4.0        906
6.0        725
3.0        573
7.0        343
2.0        145
8.0         87
9.0         23
1.0          9
18311.0      1
         Total
4.0       1115
5.0        943
3.0        826
6.0        501
2.0        242
7.0        140
1.0         22
8.0         22
9.0          2
16339.0      1
         Total
4.0       1188
3.0       1146
5.0        775
2.0        364
6.0        250
1.0         48
7.0         41
14636.0      1
8.0          1
         Total
3.0       1380
4.0       1170
2.0        569
5.0        514
6.0        100
1.0         73
7.0          7
13250.0      1
         Total
3.0       1547
4.0        977
2.0        831
5.0        300
1.0        123
6.0         33
7.0          2
12046.0      1
         Total
3.0       1598
2.0       1101
4.0        761
1.0        210
5.0        134
6.0          9
10974.0      

In [17]:
df_binaire = prop_topics(matrix, columns_topics, liste_s, k_topics)[1]

In [18]:
prop

seuil,seuil_0.05,seuil_0.06,seuil_0.07,seuil_0.08,seuil_0.09,seuil_0.1,seuil_0.11,seuil_0.12,seuil_0.13,seuil_0.14,seuil_0.15
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0_energi_effici_consumpt_demand_secur,1155,1022,885,760,681,595,528,453,387,326,283
1_climat_chang_impact_global_adapt,1135,1020,911,823,742,679,619,553,507,462,424
2_power_generat_plant_wind_capac,590,526,467,407,366,329,303,272,245,217,196
3_vehicl_fleet_car_hybrid_passeng,282,256,240,228,219,207,197,181,170,164,151
4_scenario_bau_altern_three_compar,990,844,718,600,524,444,381,330,286,246,205
5_forest_sink_sequestr_wood_stock,250,229,212,204,199,187,171,166,163,155,151
6_polici_develop_sustain_govern_countri,2038,1951,1869,1776,1693,1614,1556,1496,1419,1325,1264
7_oil_product_crude_export_produc,253,214,181,166,141,128,121,107,96,87,75
8_air_pollut_pm25_qualiti_health,468,434,394,374,345,319,296,279,262,244,227
9_ccs_storag_captur_plant_geolog,246,220,195,185,168,159,149,143,135,131,124


In [19]:
df1 = prop.unstack(level=1)

In [20]:
df1.name = 'number_of_papers'
df2 = df1.fillna(0)
df2 = df2.reset_index()

import plotly.express as px

fig = px.line(df2, x="seuil", y="number_of_papers", color='topic')
fig.show()

In [21]:
number_of_abs = []
for i in range(k_topics):
    value = prop.iloc[:,0:1].values[i].copy()
    number_of_abs.append(value[0])

In [22]:
seuil1 = pd.DataFrame()
#seuil1.sort_values(by = ['number_of_abs'], ascending = [True], inplace = True)

In [23]:
seuil1['topics'] = columns_topics

In [24]:
seuil1['number_of_abstracts'] = number_of_abs

In [25]:
seuil1.sort_values(by = ['number_of_abstracts'], ascending =[True], inplace = True) 

In [26]:
seuil1

Unnamed: 0,topics,number_of_abstracts
34,34_cement_product_materi_substitut_effici,69
25,25_hydrogen_cell_produc_chain_infrastructur,80
14,14_drought_precipit_extrem_frequenc_flood,105
16,16_tax_revenu_polici_model_effect,184
23,23_nuclear_power_energi_plant_govern,190
12,12_water_resourc_river_use_hydrolog,216
22,22_heat_pump_district_system_boiler,221
13,13_coal_plant_coalfir_clean_util,234
30,30_soil_deposit_organ_stock_sequestr,235
29,29_natur_consumpt_resourc_price_suppli,241


In [27]:
def get_top_abstracts( data_samples, W, topic_index, n_top_abstracts ):
    # reverse sort the values to sort the indices
    top_indices = np.argsort( W[:,topic_index] )[::-1]
    # now get the abstracts corresponding to the top-ranked indices
    top_abstracts = []
    for doc_index in top_indices[0:n_top_abstracts]:
        top_abstracts.append( data_samples[doc_index] )
    return top_abstracts

In [28]:
df['stem_abstract'] = pd.Series(data_samples)
df['topic_charact'] = np.nan
df_top_topic_abs = pd.DataFrame()

for k in range(k_topics):
    top_topic_abstracts = get_top_abstracts( data_samples, W, k, number_of_abs[k] )
    for i, abstract in enumerate(top_topic_abstracts):
        df_tempo = df[df['stem_abstract'].isin([str(abstract)])].copy()
        df_tempo['topic_charact'] = columns_topics[k]
        df_tempo['number_of_abs'] = number_of_abs[k]
        df_top_topic_abs = df_top_topic_abs.append(df_tempo)

In [29]:
df_top_topic_abs.reset_index(0,inplace = True)        
df_top_topic_abs['index1'] = df_top_topic_abs.index
col = ['index1','topic_charact','number_of_abs','title','abstract']
df_top_topic_abs = df_top_topic_abs.reindex(columns=col)

In [30]:
df_top_topic_abs.sort_values(by = ['number_of_abs','index1'], ascending = [True,True], inplace = True)

In [31]:
joblib.dump((df_top_topic_abs, list_words_topics,columns_topics), "dfseuil005_wordslist_columnstopics.pkl") 



['dfseuil005_wordslist_columnstopics.pkl']