In [1]:
import pandas as pd
import numpy as np
import joblib
import string
from stemming.porter2 import stem

#### Database with additional treatment

In [2]:
# Main database: one row per unique paper.
df = pd.read_csv("../2_Treatment_database/output/database_one_row_each_paper.csv")
# df.shape is (n_rows, n_columns), which fills the two %d placeholders in order.
print("Loaded %d X %d dataframe with unique papers" % df.shape)

Loaded 4691 X 16 dataframe with unique papers


#### Preprocessed abstracts

In [3]:
# Preprocessed abstracts produced by an earlier step; the text column used
# further down is "abstracts_prepro".
abstracts_prepro = pd.read_csv("./interm/processed_abstracts.csv")
print( "%d preprocessed abstracts" % (len(abstracts_prepro)) )

4691 preprocessed abstracts


#### TF-IDF normalised document-term matrix and list of terms

In [4]:
# The pickle holds a 2-tuple: the document-term matrix and the list of terms.
# NOTE(review): joblib.load unpickles arbitrary objects; only load files from a trusted source.
(tfidf, tfidf_feature_names) = joblib.load("./interm/tfidf_matrix-features_names.pkl" )
print( "Loaded %d X %d document-term matrix" % (tfidf.shape[0], tfidf.shape[1]) )

Loaded 4691 X 1300 document-term matrix


#### Model with optimal parameters

In [5]:
# Selected model: number of topics, the two hyper-parameters, and the factor
# matrices W and H (W is indexed [document, topic] and H [topic, term] below).
# NOTE(review): joblib.load unpickles arbitrary objects; only load model files from a trusted source.
k_topics, alpha, l1, W, H = joblib.load("./output/model_selected.pkl")
param = [k_topics, alpha, l1, W, H]
param_names = ["k_topics", "alpha", "l1"]
# Pair each name with its value directly instead of eval()-ing the name;
# zip stops after the three scalar parameters, so W and H are not printed.
for name, value in zip(param_names, param):
    print(name, '=', repr(value))

k_topics = 39
alpha = 0.1
l1 = 0.9


#### Topics and how many abstracts are associated to each topic

In [6]:
# Per-topic table; the cells below read its 'topic' and 'seuil_0.02' columns.
prop = pd.read_csv("./output/nb_papers_seuil_0.02.csv")

In [7]:
# Topic labels, in the row order of prop (indexed by topic number k further down).
column_topics = prop['topic'].to_list()

In [8]:
# Number of abstracts to keep for each topic, cast to int so it can be used as a slice bound.
# NOTE(review): presumably the count of abstracts above the 0.02 weight threshold ("seuil") — confirm.
number_of_abs = prop['seuil_0.02'].astype(int).to_list()

### Associate abstracts to topics 

Function that returns the `n_top_abstracts` abstracts most strongly associated with topic `topic_index`, ranked by decreasing weight in W

In [9]:
def get_top_abstracts( data_samples, W, topic_index, n_top_abstracts ):
    """Return the abstracts carrying the largest weights for one topic.

    Parameters
    ----------
    data_samples : sequence
        Abstracts, positionally aligned with the rows of ``W``.
    W : 2-D array
        Weight matrix indexed as ``W[document, topic]``.
    topic_index : int
        Column of ``W`` to rank the documents by.
    n_top_abstracts : int
        Maximum number of abstracts to return.

    Returns
    -------
    list
        Up to ``n_top_abstracts`` items of ``data_samples``, heaviest first.
    """
    # Ascending argsort, flipped: document indices from heaviest to lightest.
    ranked_docs = np.argsort(W[:, topic_index])[::-1]
    return [data_samples[doc] for doc in ranked_docs[:n_top_abstracts]]

Add the preprocessed abstracts to the database, together with an empty column that will later hold the topic each paper is related to

In [10]:
# Column assignment aligns on the index, so this assumes df and abstracts_prepro
# share the same row index/order — TODO confirm.
df['stem_abstract'] = abstracts_prepro["abstracts_prepro"]
# Placeholder column; the topic label is filled in on per-topic copies of the rows below.
df['topic_charact'] = np.nan

#### Create a dataframe that contains, for each topic, all the rows of the main database describing its papers
Start by creating an empty dataframe

In [11]:
# Empty accumulator; the next cell fills it with one block of rows per (topic, abstract).
df_topic_abs = pd.DataFrame()

Double loop over all topics and their associated abstracts (those with a non-null coefficient in W)

In [12]:
# For every topic, fetch the database rows of its top-weighted abstracts and
# tag them with the topic label and the topic's abstract count.
# Frames are collected in a list and concatenated once: DataFrame.append was
# removed in pandas 2.0 and growing a frame inside a loop is quadratic.
# Rebuilding from a fresh list also makes the cell safe to re-run.
prepro_abstracts = abstracts_prepro["abstracts_prepro"].to_list()  # loop-invariant
topic_frames = []
for k in range(k_topics):
    topic_abstracts = get_top_abstracts(prepro_abstracts, W, k, number_of_abs[k])
    for abstract in topic_abstracts:
        # Match rows on the preprocessed text; every row with that exact text is kept.
        df_tempo = df[df['stem_abstract'].isin([str(abstract)])].copy()
        df_tempo['topic_charact'] = column_topics[k]
        df_tempo['number_of_abs'] = number_of_abs[k]
        topic_frames.append(df_tempo)
# pd.concat raises on an empty list, so fall back to an empty frame.
df_topic_abs = pd.concat(topic_frames) if topic_frames else pd.DataFrame()

#### Keep the row position as a column to remember the weight-ranked order of the papers

In [13]:
# Columns kept for the classification table, in display order.
col = ['index1','topic_charact','number_of_abs','title','abstract']
df_topic_abs = (
    df_topic_abs
    .reset_index(level=0)                      # fresh 0..n-1 index in concatenation order
    .assign(index1=lambda frame: frame.index)  # store that position as a regular column
    .reindex(columns=col)                      # keep only the listed columns, in this order
)

In [14]:
# Order by number_of_abs ascending, then by index1 (the stored row position) within equal counts.
df_topic_abs.sort_values(by = ['number_of_abs','index1'], ascending = [True,True], inplace = True)

In [15]:
# Preview the first row after sorting.
df_topic_abs.head(1)

Unnamed: 0,index1,topic_charact,number_of_abs,title,abstract
21387,21387,31_drought_precipit_sever_frequenc_index,89,Projected Changes of Future Extreme Drought Ev...,Effective drought prediction methods are essen...


In [16]:
# Persist the topic/paper table without writing the DataFrame index.
df_topic_abs.to_csv("./output/df_topic_abs_classification.csv",index=False)

### Preprocess titles

In [17]:
# Translation table that deletes every character listed in string.punctuation.
table = str.maketrans('','', string.punctuation)

In [18]:
# Normalise titles: lower-case, strip punctuation with `table`, then
# stem every space-separated token and join the stems back with single spaces.
df_topic_abs['stem_title'] = (
    df_topic_abs['title']
    .str.lower()
    .str.translate(table)
    .apply(lambda title: " ".join(stem(token) for token in title.split(" ")))
)

### Get the 5 highest-weighted words for each topic

In [19]:
def get_5_words( all_terms, H, topic_index):
    """Return the (up to) five highest-weighted terms of one topic.

    Parameters
    ----------
    all_terms : sequence of str
        Vocabulary, positionally aligned with the columns of ``H``.
    H : 2-D array
        Weight matrix indexed as ``H[topic, term]``.
    topic_index : int
        Row of ``H`` to rank the terms by.

    Returns
    -------
    list of str
        Terms ordered from heaviest to lightest; fewer than five when the
        vocabulary itself has fewer than five terms.
    """
    # reverse sort the values to sort the indices
    top_indices = np.argsort( H[topic_index,:] )[::-1]
    # Look terms up in the `all_terms` argument rather than a notebook global,
    # and slice instead of range(5) so a short vocabulary cannot raise IndexError.
    return [all_terms[term_index] for term_index in top_indices[:5]]

In [20]:
# One list of top terms per topic, in topic order.
list_words_topics = [
    get_5_words(tfidf_feature_names, H, topic_index)
    for topic_index in range(k_topics)
]

### Check stemmed titles by topic

In [21]:
# Empty accumulator for the title-checked rows built in the next cell.
new_df = pd.DataFrame()

In [22]:
# For every topic, keep the rows assigned to it whose stemmed title contains
# at least one of the topic's top words; one copy of the row per matching word.
matched_frames = []
# zip pairs each topic label with its word list (replaces the manual k counter).
for column, topic_words in zip(column_topics, list_words_topics):
    # Exact label comparison: a regex/substring test could also match another
    # topic whose label merely contains this one.
    in_topic = df_topic_abs['topic_charact'] == column
    for word in topic_words:
        # Literal substring search; na=False keeps the mask boolean when a title is missing.
        in_title = df_topic_abs['stem_title'].str.contains(word, regex=False, na=False)
        df_tempo = df_topic_abs.loc[in_title & in_topic].copy()
        df_tempo['word'] = word
        matched_frames.append(df_tempo)
# Single concat instead of DataFrame.append in a loop (removed in pandas 2.0);
# pd.concat raises on an empty list, so fall back to an empty frame.
new_df = pd.concat(matched_frames) if matched_frames else pd.DataFrame()
print("%d X %d checked dataframe" % (len(new_df), len(new_df.columns) ))

14177 X 7 checked dataframe


In [23]:
# A paper matched by several words of the same topic appears several times in new_df;
# keep only the first row per (topic, title) pair.
df_sort = new_df.drop_duplicates(['topic_charact','title'], keep='first')
print("%d X %d dataframe" % (len(df_sort), len(df_sort.columns) ))

10388 X 7 dataframe


### Count how many papers remain 

In [24]:
# Number of title-checked papers per topic, as a two-column frame
# ('topic', 'checked_topic_count') ready to be merged with prop on 'topic'.
# Built from the Series directly: pandas >= 2.0 names the value_counts() result
# 'count', so selecting it through DataFrame(..., columns=[<old column name>])
# no longer returns the counts.
checked_topic_count = (
    df_sort['topic_charact']
    .value_counts()
    .rename_axis('topic')                       # the index holds the topic labels
    .reset_index(name='checked_topic_count')    # the values hold the counts
)

In [25]:
# Make sure the label column is called 'topic', the key used for the merge with prop
# (rename ignores the mapping when no 'index' column is present).
checked_topic_count.rename(columns={'index':'topic'},inplace=True)

In [26]:
# Default (inner) merge: only topics present in both tables are kept.
tab = pd.merge(prop, checked_topic_count, on = 'topic')

In [27]:
# Display the per-topic counts before and after the title check.
tab

Unnamed: 0,topic,seuil_0.02,checked_topic_count
0,0_energi_effici_consumpt_save_demand,1732.0,1018
1,1_climat_chang_temperatur_futur_project,2000.0,1038
2,2_power_generat_plant_sector_capac,797.0,416
3,3_vehicl_fleet_car_hybrid_passeng,387.0,227
4,4_industri_cement_sector_product_process,497.0,301
5,5_forest_sequestr_wood_sink_stock,314.0,188
6,6_steel_iron_product_materi_save,105.0,70
7,7_build_residenti_stock_sector_construct,356.0,180
8,8_water_resourc_basin_river_irrig,291.0,133
9,9_air_pollut_pm2_qualiti_health,674.0,252


In [28]:
# Persist the comparison table without writing the DataFrame index.
tab.to_csv("./output/Table_topics_count.csv",index=False)

### How many topics characterize how many papers

In [29]:
# Number of topics each title is attached to after the check, as a two-column
# frame ('index' = title, 'title_count' = number of topics).
# Built from the Series directly: pandas >= 2.0 names the value_counts() result
# 'count', so selecting it through DataFrame(..., columns=['title']) no longer
# returns the counts.
checked_title_count = (
    df_sort['title']
    .value_counts()
    .rename_axis('index')               # keep the historical name of the title column
    .reset_index(name='title_count')
)

In [30]:
# Distribution of papers by number of attached topics:
# index = number of topics, 'title_count' = number of papers with that many topics.
# The Series is renamed explicitly so the result does not depend on the name
# pandas gives to value_counts() output (it became 'count' in pandas 2.0).
number_papers = (
    checked_title_count['title_count']
    .value_counts()
    .rename('title_count')
    .rename_axis(None)      # unnamed index, so the table shows only the count column header
    .to_frame()
)
# Extra row holding the total number of papers.
number_papers.loc['Total'] = number_papers.sum()

In [31]:
# Display the distribution, including the 'Total' row.
number_papers

Unnamed: 0,title_count
2,1410
1,1104
3,1050
4,539
5,177
6,37
7,5
8,2
Total,4324


### Add topics attribution with title check to the main database

In [32]:
column_topics.append('Total_topics_in_paper')

In [33]:
# Title x topic contingency table: one row per title, one column per topic label.
topics = pd.crosstab(df_sort.title, df_sort.topic_charact)
# Use the column order of column_topics; labels absent from the crosstab
# (including 'Total_topics_in_paper', if present in the list) become zero-filled columns.
topics =topics.reindex(columns= column_topics, fill_value=0)
# Extra row labelled 'Total' with the column sums (papers per topic).
topics.loc['Total',:]= topics.sum(axis=0)
# Row sums (topics per paper). Statement order matters: the 'Total' row is
# already present here, so it also receives a row sum.
topics.loc[:,'Total_topics_in_paper'] = topics.sum(axis=1)

In [34]:
# Append one zero-filled column per label in column_topics to the main database.
# NOTE(review): running this cell twice would request duplicate column labels — run once per kernel.
df = df.reindex(columns= np.append(df.columns,column_topics), fill_value=0)

In [35]:
# Index by title so that df and topics (also indexed by title) can be aligned.
df.set_index('title', inplace = True)

In [36]:
# In-place overwrite of the cells of df whose title (index) and column label
# also exist in topics; rows of topics with no matching title in df are ignored.
df.update(topics)

In [37]:
# Move 'title' back from the index to a regular column.
df.reset_index(0, inplace = True)

In [38]:
# Persist the database enriched with the per-topic columns, without the DataFrame index.
df.to_csv('./output/database_titre_seuil0.02.csv',index=False)