In [1]:
import pandas as pd
import numpy as np
import joblib

### Load data and models
#### Database with additional treatment

In [2]:
# Main database with one row per unique paper, read from the treatment step's output folder.
df = pd.read_csv("../2_Treatment_database/output/database_one_row_each_paper.csv")
# df.shape is (rows, columns), which is exactly what the message reports.
print("Loaded %d X %d dataframe with unique papers" % df.shape)

Loaded 4691 X 14 dataframe with unique papers


#### Preprocessed abstracts

In [3]:
# Preprocessed abstracts saved under ./interm.
abstracts_prepro = pd.read_csv("./interm/processed_abstracts.csv")
# Report the number of rows that were read.
print("%d preprocessed abstracts" % abstracts_prepro.shape[0])

4691 preprocessed abstracts


#### TF-IDF normalised document-term matrix and terms

In [4]:
# The pickle holds a pair: the document-term matrix and the list of its terms.
tfidf, tfidf_feature_names = joblib.load("./interm/tfidf_matrix-features_names.pkl")
# First two entries of the shape are (documents, terms).
print("Loaded %d X %d document-term matrix" % tfidf.shape[:2])

Loaded 4691 X 1300 document-term matrix


#### Model with optimal parameters:

In [5]:
# Selected model: number of topics, the two hyper-parameters (alpha, l1) and
# the factor matrices W and H.
# NOTE(review): joblib.load unpickles arbitrary objects — only load files
# produced by this project.
k_topics, alpha, l1, W, H = joblib.load("./output/model_selected.pkl")
param = [k_topics, alpha, l1, W, H]
param_names = ["k_topics", "alpha", "l1"]
# zip stops after the three named scalars, so W and H are not printed.
# Pairing names with values directly avoids eval() on variable names.
for name, value in zip(param_names, param):
    print(name, '=', repr(value))

k_topics = 39
alpha = 0.1
l1 = 0.9


### Run the model

In [6]:
# Product of the two factor matrices W and H.
WH = W @ H
# Show its dimensions (the recorded output matches the document-term matrix shape).
WH.shape

(4691, 1300)

In [7]:
# Dimensions of W; the recorded output is (4691, 39), i.e. 4691 papers by k_topics = 39.
W.shape

(4691, 39)

### Normalize W 
(the sum of coefficients of each row is equal to 1)

In [8]:
# Rescale W so that every row sums to 1. Entry (n, k) is the share of the
# reconstructed term weight of abstract n (row n of WH) contributed by topic k:
#     W_norm[n, k] = W[n, k] * sum_j H[k, j] / sum_j WH[n, j]
#
# Both sums are computed once and broadcast, instead of recomputing the WH row
# sum for every (topic, abstract) pair inside a Python double loop. The
# per-element arithmetic (multiply, then divide) is unchanged.
# An abstract whose WH row sums to 0 still yields nan/inf, as before.
# Assumes W, H and WH are dense 2-D NumPy arrays with one row of W per row of
# df — TODO confirm.
topic_term_totals = H.sum(axis=1)        # one total per topic, shape (k_topics,)
abstract_term_totals = WH.sum(axis=1)    # one total per abstract, shape (n_abstracts,)
W_norm = (
    (W * topic_term_totals[np.newaxis, :]) / abstract_term_totals[:, np.newaxis]
).astype(float)

### Create topics columns

In [9]:
def get_5_words( all_terms, H, topic_index):
    """Return the five highest-weighted terms of one topic.

    Parameters
    ----------
    all_terms : sequence
        Vocabulary; position j must correspond to column j of ``H``.
    H : numpy.ndarray
        2-D array indexed as ``H[topic_index, :]`` (one row per topic).
    topic_index : int
        Row of ``H`` whose heaviest terms are wanted.

    Returns
    -------
    list
        The terms ordered from largest to smallest weight. Contains five
        entries, or fewer when ``H`` has fewer than five columns.
    """
    # argsort is ascending, so reverse it to rank the heaviest terms first.
    ranked_term_indices = np.argsort(H[topic_index, :])[::-1]
    # Look terms up in the vocabulary passed by the caller (not in a notebook
    # global), so the function works with any term list.
    return [all_terms[term_index] for term_index in ranked_term_indices[:5]]

In [10]:
# Five top terms of every topic, in topic order.
list_words_topics = [
    get_5_words(tfidf_feature_names, H, topic) for topic in range(k_topics)
]
# Column label = topic number followed by its terms, all joined with underscores.
columns_topics = [
    str(topic) + '_' + "_".join(top_words)
    for topic, top_words in enumerate(list_words_topics)
]

### Create a dataframe (df_binaire) with binary topic variables (1 if normalized coefficient $>$ 0.02, else 0)

In [11]:
# 1 where the normalised weight is strictly greater than 0.02, otherwise 0.
df_binaire = pd.DataFrame((W_norm > 0.02).astype(int))
# Extra row 'Total': number of papers flagged for each topic.
df_binaire.loc['Total'] = df_binaire.sum(axis=0)
# Extra column 'Total': number of topics flagged for each paper. It is computed
# after the 'Total' row exists, so that row gets a value in this column too.
df_binaire['Total'] = df_binaire.sum(axis=1)

### Create a dataframe (prop) containing the number of publications per topic

In [12]:
# One row per topic label, with the count column initialised to NaN.
prop = pd.DataFrame({'topic': columns_topics, 'seuil_0.02': np.nan}).set_index('topic')
# Copy the per-topic totals from the 'Total' row of df_binaire in one assignment;
# at this point the columns of df_binaire are still labelled 0 .. k_topics-1.
topic_totals = df_binaire.loc['Total', list(range(k_topics))].values
prop.loc[columns_topics[:k_topics], 'seuil_0.02'] = topic_totals

#### Rename columns with the five highest-weighted terms characterizing each topic

In [13]:
# Label for the per-paper total column of df_binaire. The membership check
# keeps this cell idempotent: re-running it no longer appends the label a
# second time, which would make the list longer than the number of columns.
if 'Total_topics_in_paper' not in columns_topics:
    columns_topics.append('Total_topics_in_paper')

In [14]:
# Replace the positional column labels (topic numbers plus 'Total') with the
# descriptive names; columns_topics must hold exactly one label per column.
df_binaire.columns = columns_topics
# Show the first row to check the renamed columns.
df_binaire.head(1)

Unnamed: 0,0_energi_effici_consumpt_save_demand,1_climat_chang_temperatur_futur_project,2_power_generat_plant_sector_capac,3_vehicl_fleet_car_hybrid_passeng,4_industri_cement_sector_product_process,5_forest_sequestr_wood_sink_stock,6_steel_iron_product_materi_save,7_build_residenti_stock_sector_construct,8_water_resourc_basin_river_irrig,9_air_pollut_pm2_qualiti_health,...,30_price_et_market_trade_polici,31_drought_precipit_sever_frequenc_index,32_crop_yield_soil_wheat_fertil,33_peak_around_reach_earlier_non,34_oil_product_crude_natur_export,35_land_use_soil_area_chang,36_target_achiev_indc_meet_ndc,37_wast_landfil_solid_municip_treatment,38_system_model_transit_integr_pathway,Total_topics_in_paper
0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0


### How many publications per topic

In [15]:
# Display the number of papers flagged for each topic (column 'seuil_0.02').
prop

Unnamed: 0_level_0,seuil_0.02
topic,Unnamed: 1_level_1
0_energi_effici_consumpt_save_demand,1732.0
1_climat_chang_temperatur_futur_project,2000.0
2_power_generat_plant_sector_capac,797.0
3_vehicl_fleet_car_hybrid_passeng,387.0
4_industri_cement_sector_product_process,497.0
5_forest_sequestr_wood_sink_stock,314.0
6_steel_iron_product_materi_save,105.0
7_build_residenti_stock_sector_construct,356.0
8_water_resourc_basin_river_irrig,291.0
9_air_pollut_pm2_qualiti_health,674.0


### How many topics characterize how many papers

Remove "Total" row

In [16]:
# Drop the 'Total' summary row so that only per-paper rows remain.
# errors='ignore' keeps the cell re-runnable: a second execution finds no
# 'Total' row and leaves the frame unchanged instead of raising KeyError.
df_binaire = df_binaire.drop(index='Total', errors='ignore')

In [17]:
# Distribution of papers by number of topics: the index is the number of topics
# in a paper, the value is how many papers have that number.
# The Series is renamed explicitly before conversion. Relying on
# pd.DataFrame(series, columns=[...]) only works when the Series name equals
# the requested column, which is no longer the case with pandas >= 2.0
# (value_counts() is then named 'count').
df_paper = (
    df_binaire['Total_topics_in_paper']
    .value_counts()
    .rename('Total_topics_in_paper')
    .to_frame()
)
df_paper

Unnamed: 0,Total_topics_in_paper
5.0,972
4.0,956
6.0,832
3.0,648
7.0,517
8.0,283
2.0,238
9.0,136
10.0,56
1.0,31


### Add processed abstracts to the main database

In [18]:
# Attach the preprocessed abstracts to the main table. pandas aligns this
# assignment on the index, so both frames are expected to list the papers in
# the same row order (both were loaded with 4691 rows) — TODO confirm.
df['stem_abstract'] = abstracts_prepro["abstracts_prepro"]

### Add topics attribution to the main database

In [19]:
# Extend the table with one new (initially empty) column per topic label,
# including the per-paper total.
df = df.reindex(columns=[*df.columns, *columns_topics])
# Fill the new columns in place from df_binaire, matched on index and column labels.
df.update(df_binaire)

In [20]:
# Show the first two rows to check that the topic columns were filled.
df.head(2)

Unnamed: 0,ISO_3,Country,Region,title,authors,source,doi,abstract,author_keywords,publication_year,...,30_price_et_market_trade_polici,31_drought_precipit_sever_frequenc_index,32_crop_yield_soil_wheat_fertil,33_peak_around_reach_earlier_non,34_oil_product_crude_natur_export,35_land_use_soil_area_chang,36_target_achiev_indc_meet_ndc,37_wast_landfil_solid_municip_treatment,38_system_model_transit_integr_pathway,Total_topics_in_paper
0,AFG,Afghanistan,Asia,The renewable energy sector in Afghanistan: Po...,"Fahimi, A; Upham, P",WILEY INTERDISCIPLINARY REVIEWS-ENERGY AND ENV...,10.1002/wene.280,Afghanistan has one of the lowest rates of acc...,,2018.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
1,AGO,Angola,Africa,Determinación del Potencial energético de los ...,"González Diaz, Yudith; Gato Clavell, Tania; Gi...",Tecnología Química,,The biological conversion of the Urban Solid R...,biogas; sanitary fillers; urban solid residual...,2015.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,7.0


In [21]:
# Save the enriched table (original fields, preprocessed abstracts, topic flags);
# index=False leaves the row index out of the file.
df.to_csv('./output/database_seuil_0.02.csv',index=False)