In [1]:
PYTHONHASHSEED=0
np.random.seed(100)

In [33]:
import pandas as pd
import numpy as np
import datetime
import gensim
import re
import nltk
import contractions
from gensim.utils import simple_preprocess
from gensim.models.phrases import Phrases, Phraser
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer

# Load the NLTK stopwords
stop_words = set(stopwords.words('english'))
stop_words.update(['http', 'https', 'www', 'com', 'org', 'net', 'sciencedirect', 'et', 'al', 'doi', 'elsevier', 'wa', 'cid', 'ha', 'also', 'need']) # add URLs and other words to the set

# Define a lemmatizer
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Wei'En\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [44]:
# Define a function to clean and preprocess the text
def preprocess_text(text):
    # Expand contractions
    text = contractions.fix(text)
    
    # Remove non-alphabetic characters and multiple whitespaces
    text = ' '.join(simple_preprocess(text, deacc=True))
    
    # Tokenize the text
    tokens = text.split()
    
    # Lemmatize the tokens
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    # POS tagging and keep only nouns, comparative and superlative adjectives
    # 'NN', 'NNS', 'NNP', 'JJ', 'JJR', 'JJS'
    tokens_pos = nltk.pos_tag(tokens)
    tokens = [token for token, pos in tokens_pos if pos in ['NN', 'NNS', 'NNP', 'JJ', 'JJR', 'JJS']]
    
    # Remove stopwords
    tokens = [token for token in tokens if token not in stop_words]
    
    return tokens

In [45]:
%%time
# Apply the preprocessing function to the 'text' column
contentdf['clean_text'] = contentdf['content'].apply(preprocess_text)

Wall time: 7min 1s


In [None]:
contentdf.to_excel('contentdf_preprocessed.xlsx')

In [46]:
contentdf.head()

Unnamed: 0,document name,source,year,category,content,clean_text
0,What is antimicrobial resistance and how can w...,World Economic Forum,2023,GLOBAL HEALTH,"['In 1928, Alexander Fleming discovered the fi...","[alexander, fleming, first, antibiotic, penici..."
1,Green subsidy race? 5 experts explain what to ...,World Economic Forum,2023,TRADE AND INVESTMENT,"['In recent months, the United States and Euro...","[recent, month, united, state, european, union..."
2,4 ways to ensure the future of aquaculture is ...,World Economic Forum,2023,"AGRICULTURE, FOOD AND BEVERAGE","['As our global population expands, so too doe...","[global, population, expands, doe, demand, nut..."
3,Ocean currents drive climate patterns - so wha...,World Economic Forum,2023,OCEAN,['Although we as humans like to think we are t...,"[human, main, character, world, ocean, u, beat..."
4,Fossil fuels 101: Everything you need to know,World Economic Forum,2023,OIL AND GAS,['Fossil fuels are fuels that literally come f...,"[fossil, fuel, fuel, fossil, ten, year, plant,..."


In [48]:
type(contentdf['clean_text'].dtype)

numpy.dtype[object_]

In [36]:
# texts for CoherenceModel

texts = []
for tokens in contentdf['clean_text']:
    texts.append(tokens)

In [37]:
from gensim import  models

lda_model = models.ldamodel.LdaModel.load('lda_modelv9_20')

In [58]:
lda_model.show_topics(20)

[(0,
  '0.050*"team" + 0.049*"material" + 0.045*"cell" + 0.045*"research" + 0.029*"researcher" + 0.027*"university" + 0.026*"process" + 0.025*"chemical" + 0.024*"science" + 0.022*"mit"'),
 (1,
  '0.021*"change" + 0.019*"study" + 0.018*"model" + 0.016*"data" + 0.016*"climate" + 0.012*"research" + 0.011*"analysis" + 0.010*"impact" + 0.010*"scenario" + 0.009*"future"'),
 (2,
  '0.042*"health" + 0.028*"social" + 0.025*"household" + 0.021*"people" + 0.020*"woman" + 0.016*"education" + 0.016*"gender" + 0.015*"human" + 0.014*"community" + 0.013*"income"'),
 (3,
  '0.056*"farmer" + 0.042*"crop" + 0.039*"food" + 0.027*"agricultural" + 0.023*"production" + 0.021*"yield" + 0.021*"agriculture" + 0.020*"farm" + 0.018*"soil" + 0.014*"household"'),
 (4,
  '0.049*"airport" + 0.036*"transport" + 0.033*"road" + 0.033*"vehicle" + 0.032*"car" + 0.027*"electric" + 0.021*"infrastructure" + 0.019*"hydrogen" + 0.019*"transportation" + 0.016*"fuel"'),
 (5,
  '0.024*"business" + 0.021*"technology" + 0.020*"comp

In [39]:
from gensim import corpora
dictionary = corpora.Dictionary.load("lda_modelv9_20.id2word")

# Assign topic

In [49]:
# Create a function to get the most likely topic for a given document
def get_document_topic(lda_model, dictionary, document):
    doc_bow = dictionary.doc2bow(document)
    topic_probs = lda_model.get_document_topics(doc_bow)
    if len(topic_probs) > 0:
        return max(topic_probs, key=lambda x: x[1])[0]
    else:
        return -1

# Add a new column to the dataframe containing the most likely topic for each document
contentdf['Dominant_Topic'] = contentdf['clean_text'].apply(lambda x: get_document_topic(lda_model, dictionary, x))

In [50]:
contentdf['Dominant_Topic'].value_counts()

17    1177
19    1125
13    1059
7      926
5      869
15     799
16     717
1      519
11     464
3      417
12     224
10     202
8      176
4      127
2      119
9       46
18      37
0       19
6        1
Name: Dominant_Topic, dtype: int64

In [51]:
contentdf['Dominant_Topic'].value_counts().sum()

9023

# Reassign topic 19

In [53]:
df_topic19 = contentdf[contentdf['Dominant_Topic'] == 19]

In [54]:
df_topic19.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1125 entries, 10 to 9022
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   document name   1125 non-null   object
 1   source          1125 non-null   object
 2   year            1125 non-null   int64 
 3   category        1125 non-null   object
 4   content         1125 non-null   object
 5   clean_text      1125 non-null   object
 6   Dominant_Topic  1125 non-null   int64 
dtypes: int64(2), object(5)
memory usage: 70.3+ KB


In [55]:
# Create a function to get the most likely topic for a given document
def second_document_topic(lda_model, dictionary, document):
    doc_bow = dictionary.doc2bow(document)
    topic_probs = lda_model.get_document_topics(doc_bow)
    if len(topic_probs) > 1:
        sorted_probs = sorted(topic_probs, key=lambda x: x[1], reverse=True)
        return sorted_probs[1][0]
    elif len(topic_probs) == 1:
        return topic_probs[0][0]
    else:
        return -1

# Add a new column to the dataframe containing the most likely topic for each document
df_topic19['Second_Topic'] = df_topic19['clean_text'].apply(lambda x: second_document_topic(lda_model, dictionary, x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_topic19['Second_Topic'] = df_topic19['clean_text'].apply(lambda x: second_document_topic(lda_model, dictionary, x))


In [57]:
df_topic19['Second_Topic'].value_counts()

17    233
15    181
11    113
13    108
5     102
2      67
16     56
1      51
3      48
12     46
7      32
10     24
18     24
4      17
8      14
9       8
0       1
Name: Second_Topic, dtype: int64

# Topic distribution

In [66]:
contentdf.head()

Unnamed: 0,document name,source,year,category,content,clean_text,Dominant_Topic
0,What is antimicrobial resistance and how can w...,World Economic Forum,2023,GLOBAL HEALTH,"['In 1928, Alexander Fleming discovered the fi...","[alexander, fleming, first, antibiotic, penici...",2
1,Green subsidy race? 5 experts explain what to ...,World Economic Forum,2023,TRADE AND INVESTMENT,"['In recent months, the United States and Euro...","[recent, month, united, state, european, union...",13
2,4 ways to ensure the future of aquaculture is ...,World Economic Forum,2023,"AGRICULTURE, FOOD AND BEVERAGE","['As our global population expands, so too doe...","[global, population, expands, doe, demand, nut...",5
3,Ocean currents drive climate patterns - so wha...,World Economic Forum,2023,OCEAN,['Although we as humans like to think we are t...,"[human, main, character, world, ocean, u, beat...",16
4,Fossil fuels 101: Everything you need to know,World Economic Forum,2023,OIL AND GAS,['Fossil fuels are fuels that literally come f...,"[fossil, fuel, fuel, fossil, ten, year, plant,...",13


In [59]:
corpus = [dictionary.doc2bow(doc) for doc in contentdf['clean_text']]
get_document_topics = [lda_model.get_document_topics(item) for item in corpus]

In [68]:
get_document_topics

[[(1, 0.11077774),
  (2, 0.32911146),
  (3, 0.07862916),
  (5, 0.10608411),
  (9, 0.08215332),
  (10, 0.08523458),
  (13, 0.12465869),
  (15, 0.02455998),
  (19, 0.03509191)],
 [(4, 0.044798087),
  (5, 0.11032057),
  (7, 0.023163281),
  (11, 0.020060288),
  (12, 0.012296204),
  (13, 0.4513592),
  (15, 0.13249815),
  (17, 0.17598917),
  (19, 0.023988832)],
 [(1, 0.018410958),
  (2, 0.012647596),
  (3, 0.07356724),
  (5, 0.3107493),
  (10, 0.015088142),
  (11, 0.024754042),
  (15, 0.30480656),
  (17, 0.016226353),
  (18, 0.1279136),
  (19, 0.09481468)],
 [(1, 0.092955776),
  (7, 0.013775442),
  (10, 0.15945996),
  (12, 0.057395186),
  (16, 0.4502538),
  (17, 0.09032562),
  (18, 0.10368386),
  (19, 0.021114083)],
 [(0, 0.010519109),
  (1, 0.029618647),
  (2, 0.019214401),
  (4, 0.03366934),
  (5, 0.037918765),
  (7, 0.098354116),
  (8, 0.019608395),
  (10, 0.13395922),
  (12, 0.037347563),
  (13, 0.32147637),
  (16, 0.040800706),
  (17, 0.17088804),
  (19, 0.036976967)],
 [(1, 0.018162852

In [60]:
# create an empty DataFrame
topicdistribution = pd.DataFrame(columns=['Document', 'Topic', 'Probability'])

In [61]:
# loop through the document_topics list and append each row to the DataFrame
for doc_id, topics in enumerate(get_document_topics):
    for topic_id, topic_prob in topics:
        row = {'Document': doc_id, 'Topic': topic_id, 'Probability': topic_prob}
        topicdistribution = topicdistribution.append(row, ignore_index=True)
        
# convert columns to integers
topicdistribution['Document'] = topicdistribution['Document'].astype(int)
topicdistribution['Topic'] = topicdistribution['Topic'].astype(int)

In [75]:
pivoted_topicdistribution = topicdistribution.pivot(index='Document', columns='Topic', values='Probability')
pivoted_topicdistribution

Topic,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
Document,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,,0.110778,0.329111,0.078629,,0.106084,,,,0.082153,0.085235,,,0.124659,,0.024560,,,,0.035092
1,,,,,0.044798,0.110321,,0.023163,,,,0.020060,0.012296,0.451359,,0.132498,,0.175989,,0.023989
2,,0.018411,0.012648,0.073567,,0.310749,,,,,0.015088,0.024754,,,,0.304807,,0.016226,0.127914,0.094815
3,,0.092956,,,,,,0.013775,,,0.159460,,0.057395,,,,0.450254,0.090326,0.103684,0.021114
4,0.010519,0.029619,0.019214,,0.033669,0.037919,,0.098354,0.019608,,0.133959,,0.037348,0.321476,,,0.040801,0.170888,,0.036977
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9018,,,0.013130,0.016725,,,,,,0.028009,0.012001,0.251700,,0.142658,,,,0.275274,,0.256057
9019,,,0.027412,,,,,,,,,0.074674,,0.041945,,0.168570,,0.154576,,0.499505
9020,,,,,,0.019867,,,,,,0.122315,,0.160043,,0.098924,,0.212997,,0.372849
9021,,,,,,,,0.029979,,,0.019534,0.096524,0.028151,0.071917,,0.087963,,0.246314,,0.412389


In [78]:
topicdistribution_df = pd.merge(contentdf, pivoted_topicdistribution, left_index=True, right_index=True)

In [82]:
topicdistribution_df = topicdistribution_df.drop(columns=['source', 'year', 'category', 'content', 'clean_text'])

In [91]:
topicdistribution_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9023 entries, 0 to 9022
Data columns (total 23 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   document name   9023 non-null   object 
 1   Dominant_Topic  9023 non-null   int64  
 2   0               1530 non-null   float64
 3   1               5668 non-null   float64
 4   2               3823 non-null   float64
 5   3               3207 non-null   float64
 6   4               1870 non-null   float64
 7   5               4930 non-null   float64
 8   6               59 non-null     float64
 9   7               4215 non-null   float64
 10  8               2479 non-null   float64
 11  9               1124 non-null   float64
 12  10              3715 non-null   float64
 13  11              4783 non-null   float64
 14  12              2765 non-null   float64
 15  13              7322 non-null   float64
 16  14              6 non-null      float64
 17  15              6543 non-null   f

In [92]:
topicdistribution_df['total'] = topicdistribution_df[[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19]].sum(axis=1)

In [94]:
sorted = topicdistribution_df.sort_values(by='total')

In [102]:
topicdistribution_df

Unnamed: 0,document name,Dominant_Topic,0,1,2,3,4,5,6,7,...,11,12,13,14,15,16,17,18,19,total
0,What is antimicrobial resistance and how can w...,2,,0.110778,0.329111,0.078629,,0.106084,,,...,,,0.124659,,0.024560,,,,0.035092,0.976301
1,Green subsidy race? 5 experts explain what to ...,13,,,,,0.044798,0.110321,,0.023163,...,0.020060,0.012296,0.451359,,0.132498,,0.175989,,0.023989,0.994474
2,4 ways to ensure the future of aquaculture is ...,5,,0.018411,0.012648,0.073567,,0.310749,,,...,0.024754,,,,0.304807,,0.016226,0.127914,0.094815,0.998978
3,Ocean currents drive climate patterns - so wha...,16,,0.092956,,,,,,0.013775,...,,0.057395,,,,0.450254,0.090326,0.103684,0.021114,0.988964
4,Fossil fuels 101: Everything you need to know,13,0.010519,0.029619,0.019214,,0.033669,0.037919,,0.098354,...,,0.037348,0.321476,,,0.040801,0.170888,,0.036977,0.990352
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9018,ZIMBABWE_ENG,17,,,0.013130,0.016725,,,,,...,0.251700,,0.142658,,,,0.275274,,0.256057,0.995554
9019,"Simon Stiell, Executive Secretary UN Climate C...",19,,,0.027412,,,,,,...,0.074674,,0.041945,,0.168570,,0.154576,,0.499505,0.966683
9020,Secretary-General's remarks to High-Level open...,19,,,,,,0.019867,,,...,0.122315,,0.160043,,0.098924,,0.212997,,0.372849,0.986995
9021,Statement by the UN Secretary-General,19,,,,,,,,0.029979,...,0.096524,0.028151,0.071917,,0.087963,,0.246314,,0.412389,0.992770


In [110]:
pivoted_topicdistribution

Topic,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
Document,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,,0.110778,0.329111,0.078629,,0.106084,,,,0.082153,0.085235,,,0.124659,,0.024560,,,,0.035092
1,,,,,0.044798,0.110321,,0.023163,,,,0.020060,0.012296,0.451359,,0.132498,,0.175989,,0.023989
2,,0.018411,0.012648,0.073567,,0.310749,,,,,0.015088,0.024754,,,,0.304807,,0.016226,0.127914,0.094815
3,,0.092956,,,,,,0.013775,,,0.159460,,0.057395,,,,0.450254,0.090326,0.103684,0.021114
4,0.010519,0.029619,0.019214,,0.033669,0.037919,,0.098354,0.019608,,0.133959,,0.037348,0.321476,,,0.040801,0.170888,,0.036977
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9018,,,0.013130,0.016725,,,,,,0.028009,0.012001,0.251700,,0.142658,,,,0.275274,,0.256057
9019,,,0.027412,,,,,,,,,0.074674,,0.041945,,0.168570,,0.154576,,0.499505
9020,,,,,,0.019867,,,,,,0.122315,,0.160043,,0.098924,,0.212997,,0.372849
9021,,,,,,,,0.029979,,,0.019534,0.096524,0.028151,0.071917,,0.087963,,0.246314,,0.412389


In [113]:
# create a list of column names for the three new columns
new_cols = ['dominant_topic', 'second_dominant_topic', 'third_dominant_topic']

# create a new dataframe to store the dominant topics
dominant_topics_df = pd.DataFrame(index=pivoted_topicdistribution.index, columns=new_cols)

# loop through each row of the dataframe
for index, row in pivoted_topicdistribution.iterrows():
    
    # sort the row values in descending order and get the corresponding indices
    sorted_indices = row.sort_values(ascending=False).index
    
    # get the dominant topic (i.e., the index of the highest value)
    dominant_topic = sorted_indices[0]
    
    # check if the value of the second highest topic is above 0.3
    if row[sorted_indices[1]] > 0.3:
        second_dominant_topic = sorted_indices[1]
    else:
        second_dominant_topic = np.nan
    
    # check if the value of the third highest topic is above 0.2
    if row[sorted_indices[2]] > 0.2:
        third_dominant_topic = sorted_indices[2]
    else:
        third_dominant_topic = np.nan
    
    # store the dominant topics in the new dataframe
    dominant_topics_df.loc[index, 'dominant_topic'] = dominant_topic
    dominant_topics_df.loc[index, 'second_dominant_topic'] = second_dominant_topic
    dominant_topics_df.loc[index, 'third_dominant_topic'] = third_dominant_topic

# print the resulting dataframe with the dominant topics
print(dominant_topics_df)

         dominant_topic second_dominant_topic third_dominant_topic
Document                                                          
0                     2                   NaN                  NaN
1                    13                   NaN                  NaN
2                     5                    15                  NaN
3                    16                   NaN                  NaN
4                    13                   NaN                  NaN
...                 ...                   ...                  ...
9018                 17                   NaN                   11
9019                 19                   NaN                  NaN
9020                 19                   NaN                  NaN
9021                 19                   NaN                  NaN
9022                 19                    15                  NaN

[9023 rows x 3 columns]


In [115]:
dominant_topics_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9023 entries, 0 to 9022
Data columns (total 3 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   dominant_topic         9023 non-null   object
 1   second_dominant_topic  770 non-null    object
 2   third_dominant_topic   856 non-null    object
dtypes: object(3)
memory usage: 540.0+ KB
