# Installation and libraries

In [9]:
!pip install tweet-preprocessor -q

# Installing Gensim and PyLDAvis
!pip install -qq -U gensim
!pip install -qq pyLDAvis

In [10]:
# explainability (why did the model say it's related to this author)
!pip install eli5

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [11]:
import pandas as pd
import numpy as np
import preprocessor as prepro # text prepro
import tqdm #progress bar

import spacy #spacy for quick language prepro
nlp = spacy.load('en_core_web_sm') #instantiating English module

# sampling, splitting
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split


# loading ML libraries
from sklearn.pipeline import make_pipeline #pipeline creation
from sklearn.feature_extraction.text import TfidfVectorizer #transforms text to sparse matrix
from sklearn.linear_model import LogisticRegression #Logit model
from sklearn.metrics import classification_report #that's self explanatory
from sklearn.decomposition import TruncatedSVD #dimensionality reduction
from xgboost import XGBClassifier

import altair as alt #viz

#explainability
import eli5
from eli5.lime import TextExplainer

# topic modeling

from gensim.corpora.dictionary import Dictionary # Import the dictionary builder
from gensim.models import LdaMulticore # we'll use the faster multicore version of LDA

# Import pyLDAvis
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

%matplotlib inline
pyLDAvis.enable_notebook()

In [12]:
# prepro settings: the token categories handed to tweet-preprocessor,
# one per line so options are easy to add or remove
prepro.set_options(
    prepro.OPT.URL,
    prepro.OPT.NUMBER,
    prepro.OPT.RESERVED,
    prepro.OPT.MENTION,
    prepro.OPT.SMILEY,
)

# Data loading and preparation

In [19]:
# Load the Scopus export straight from GitHub into a DataFrame.
# NOTE(review): the file name ends in ".crdownload", which looks like a partially
# downloaded browser file that was committed by accident. Confirm the upload is
# complete and point this at the final "scopus.csv" if one exists.
data = pd.read_csv("https://github.com/Alphambarushimana/Sustainability/raw/main/scopus.csv.crdownload")

In [20]:
data

Unnamed: 0,Authors,Author(s) ID,Title,Year,Source title,Volume,Issue,Art. No.,Page start,Page end,...,Affiliations,Authors with affiliations,Abstract,Author Keywords,Index Keywords,Document Type,Publication Stage,Open Access,Source,EID
0,"Ma Z., Cheah W.Y., Ng I.-S., Chang J.-S., Zhao...",8399413600;56603907000;57190488178;8567368700;...,Microalgae-based biotechnological sequestratio...,2022,Trends in Biotechnology,,,,,,...,Zhejiang Provincial Key Laboratory for Subtrop...,"Ma, Z., Zhejiang Provincial Key Laboratory for...",Excessive carbon dioxide (CO2) emissions into ...,carbon dioxide; genetic engineering; microalga...,Carbon dioxide; Economics; Genetic engineering...,Review,Article in Press,,Scopus,2-s2.0-85140270725
1,"Ayeb-Karlsson S., Baldwin A.W., Kniveton D.",57189902301;35878004700;6701656335;,Who is the climate-induced trapped figure?,2022,Wiley Interdisciplinary Reviews: Climate Change,,,,,,...,Institute for Risk and Disaster Reduction (IRD...,"Ayeb-Karlsson, S., Institute for Risk and Disa...",Many will remember the 1990s alarmist narrativ...,climate change; climate policy; climate refuge...,Climate policy; Climate refugee; Displacement;...,Review,Article in Press,,Scopus,2-s2.0-85140270517
2,Norheim-Hansen A.,56344365300;,Green supplier development: What's in it for y...,2022,Business Horizons,,,,,,...,"KEDGE Business School, Domaine de Luminy BP 92...","Norheim-Hansen, A., KEDGE Business School, Dom...","Greening suppliers, or cultivating environment...",Environmental responsibility; Environmental su...,,Article,Article in Press,,Scopus,2-s2.0-85140269066
3,"Sierra J., Yassim M., Suárez-Collado Á.",57208398978;57200634622;55813389100;,Together we can: enhancing key 21st-century sk...,2022,Education and Training,64.0,6.0,,826.0,843.0,...,"Department of Applied Economics, Research Cent...","Sierra, J., Department of Applied Economics, R...",Purpose: This research reveals how a virtual e...,Active learning; Awareness; Internationalizati...,,Article,Final,,Scopus,2-s2.0-85140267529
4,"Sati H., Khandelwal A., Pareek S.",57262816100;57934375800;26039431800;,Effect of exogenous melatonin in fruit posthar...,2022,Food Frontiers,,,,,,...,Department of Agriculture and Environmental Sc...,"Sati, H., Department of Agriculture and Enviro...","Derived from tryptophan, melatonin (MT; N-acet...",food technology and sustainability,,Article,Article in Press,,Scopus,2-s2.0-85140267140
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
955,"Basu R J., Abdulrahman M.D., Yuvaraj M.",56520286600;55345988900;57881298700;,Improving agility and resilience of automotive...,2022,Socio-Economic Planning Sciences,,,101401,,,...,"Department of Mechanical Engineering, Presiden...","Basu R, J., Department of Mechanical Engineeri...",The global supply chain disruption by the COVI...,Additive manufacturing; Automotive spares; Onl...,,Article,Article in Press,,Scopus,2-s2.0-85137644164
956,"Lu R., Shi T.-Q., Lin L., Ledesma-Amaro R., Ji...",57664460000;57190862202;57201654934;5579317290...,Advances in metabolic engineering of yeasts fo...,2022,Green Chemical Engineering,,,,,,...,State Key Laboratory of Materials-Oriented Che...,"Lu, R., State Key Laboratory of Materials-Orie...",The reliance of the transport sector on fossil...,Biofuel; Fatty acid; Hydrocarbon; Metabolic en...,,Review,Article in Press,,Scopus,2-s2.0-85137617166
957,"Hu M., Zhou K., Zhao T., Li Z., Zeng X., Yu D....",57345124600;57225741223;57777542300;5628587880...,Facile preparation and efficient MnxCoy porous...,2022,Green Energy and Environment,,,,,,...,Flavors and Fragrance Engineering & Technology...,"Hu, M., Flavors and Fragrance Engineering & Te...",The pursuit of high-performance is worth consi...,Contact efficiency; DFT; Intrinsic activity; M...,Catalysts; Combustion; Design for testability;...,Article,Article in Press,,Scopus,2-s2.0-85137614591
958,"Kotzé L.J., Adelman S.",23469821500;56471454200;,Environmental Law and the Unsustainability of ...,2022,Law and Critique,,,,,,...,"Faculty of Law, North-West University, 11 Hoff...","Kotzé, L.J., Faculty of Law, North-West Univer...",In this article we argue that sustainable deve...,Anthropocene; Buen vivir; Developmentalism; Ec...,,Article,Article in Press,,Scopus,2-s2.0-85137613095


In [21]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 960 entries, 0 to 959
Data columns (total 24 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Authors                    960 non-null    object 
 1   Author(s) ID               959 non-null    object 
 2   Title                      960 non-null    object 
 3   Year                       960 non-null    int64  
 4   Source title               960 non-null    object 
 5   Volume                     9 non-null      float64
 6   Issue                      9 non-null      float64
 7   Art. No.                   26 non-null     object 
 8   Page start                 19 non-null     float64
 9   Page end                   19 non-null     float64
 10  Page count                 0 non-null      float64
 11  Cited by                   16 non-null     float64
 12  DOI                        960 non-null    object 
 13  Link                       960 non-null    object 

In [28]:
# Keep only the bibliographic columns used downstream.
# .copy() makes this an independent frame, so the column assignments and row drops
# in later cells do not operate on a slice of the originally loaded frame
# (avoids pandas' SettingWithCopyWarning).
data = data[['Authors', 'Author(s) ID','Title', 'Abstract','Year', 'Source title']].copy()

In [29]:
data.isnull().sum()

Authors         0
Author(s) ID    0
Title           0
Abstract        0
Year            0
Source title    0
dtype: int64

In [30]:
# Drop papers without an author id. Reassigning the returned frame (rather than
# inplace=True) avoids mutating a sliced frame in place and hands later cells a
# fresh object to add columns to.
data = data.dropna(subset=['Author(s) ID'])

In [31]:
data.head()

Unnamed: 0,Authors,Author(s) ID,Title,Abstract,Year,Source title
0,"Ma Z., Cheah W.Y., Ng I.-S., Chang J.-S., Zhao...",8399413600;56603907000;57190488178;8567368700;...,Microalgae-based biotechnological sequestratio...,Excessive carbon dioxide (CO2) emissions into ...,2022,Trends in Biotechnology
1,"Ayeb-Karlsson S., Baldwin A.W., Kniveton D.",57189902301;35878004700;6701656335;,Who is the climate-induced trapped figure?,Many will remember the 1990s alarmist narrativ...,2022,Wiley Interdisciplinary Reviews: Climate Change
2,Norheim-Hansen A.,56344365300;,Green supplier development: What's in it for y...,"Greening suppliers, or cultivating environment...",2022,Business Horizons
3,"Sierra J., Yassim M., Suárez-Collado Á.",57208398978;57200634622;55813389100;,Together we can: enhancing key 21st-century sk...,Purpose: This research reveals how a virtual e...,2022,Education and Training
4,"Sati H., Khandelwal A., Pareek S.",57262816100;57934375800;26039431800;,Effect of exogenous melatonin in fruit posthar...,"Derived from tryptophan, melatonin (MT; N-acet...",2022,Food Frontiers


# Topic modelling

In [32]:
# One document per paper: the title, a ". " separator, then the abstract.
title_with_stop = data['Title'] + '. '
data['text'] = title_with_stop + data['Abstract']

In [33]:
# Run tweet-preprocessor over every document, then remove any '#' characters.
cleaned = data['text'].map(prepro.clean)
data['text_clean'] = cleaned.str.replace('#','')

In [34]:
# run progress bar and clean up using spacy but without some heavy parts of the pipeline

clean_text = []

# Using the bar as a context manager closes it when the loop finishes (or fails),
# so the final state is flushed instead of the display stopping short of 100%.
with tqdm.tqdm(total=len(data['text_clean']), position=0, leave=True) as pbar:

  for text in nlp.pipe(data['text_clean'], disable=["tagger", "parser", "ner"]):

    # keep lower-cased lemmas of purely alphabetic, non-stop-word tokens
    txt = [token.lemma_.lower() for token in text
           if token.is_alpha
           and not token.is_stop
           and not token.is_punct]

    # store each document as one space-joined string
    clean_text.append(" ".join(txt))

    pbar.update(1)

 99%|█████████▉| 953/959 [00:17<00:00, 136.48it/s]

In [35]:
# write everything into one function that can be re-used later
def text_prepro(texts):
  """
  Clean and normalise a pandas Series of raw texts.

  Steps, in order:
    1. run tweet-preprocessor's ``prepro.clean`` on every text (what it handles
       is whatever was configured through ``prepro.set_options``);
    2. remove literal '#' characters;
    3. pipe the texts through the module-level spaCy pipeline ``nlp`` with the
       tagger, parser and NER disabled, keeping only alphabetic, non-stop-word
       tokens as lower-cased lemmas.

  Parameters
  ----------
  texts : pandas.Series
      One raw text per row (``.map`` and ``.str`` are used on it).

  Returns
  -------
  list of str
      One space-joined string of kept tokens per input text, in input order.

  NOTE(review): with the tagger disabled the lemmas look like plain lower-cased
  word forms (e.g. "skills" stays "skills" in the notebook output) - confirm
  whether real lemmatisation is wanted here.
  """
  texts_clean = texts.map(lambda t: prepro.clean(t))
  # '#' is a literal character, so make the replacement explicitly non-regex
  texts_clean = texts_clean.str.replace('#', '', regex=False)

  clean_container = []

  # Context manager guarantees the bar is closed (and its final state drawn)
  # even if spaCy raises part-way through.
  with tqdm.tqdm(total=len(texts_clean), position=0, leave=True) as pbar:

    for text in nlp.pipe(texts_clean, disable=["tagger", "parser", "ner"]):

      txt = [token.lemma_.lower() for token in text
             if token.is_alpha
             and not token.is_stop
             and not token.is_punct]

      clean_container.append(" ".join(txt))
      pbar.update(1)

  return clean_container

In [36]:
# apply all prepro-pipeline to texts
data['text_clean'] = text_prepro(data['text'])

100%|██████████| 959/959 [00:14<00:00, 67.53it/s] 


In [37]:
data

Unnamed: 0,Authors,Author(s) ID,Title,Abstract,Year,Source title,text,text_clean
0,"Ma Z., Cheah W.Y., Ng I.-S., Chang J.-S., Zhao...",8399413600;56603907000;57190488178;8567368700;...,Microalgae-based biotechnological sequestratio...,Excessive carbon dioxide (CO2) emissions into ...,2022,Trends in Biotechnology,Microalgae-based biotechnological sequestratio...,microalgae based biotechnological sequestratio...
1,"Ayeb-Karlsson S., Baldwin A.W., Kniveton D.",57189902301;35878004700;6701656335;,Who is the climate-induced trapped figure?,Many will remember the 1990s alarmist narrativ...,2022,Wiley Interdisciplinary Reviews: Climate Change,Who is the climate-induced trapped figure?. Ma...,climate induced trapped figure remember s alar...
2,Norheim-Hansen A.,56344365300;,Green supplier development: What's in it for y...,"Greening suppliers, or cultivating environment...",2022,Business Horizons,Green supplier development: What's in it for y...,green supplier development buyer greening supp...
3,"Sierra J., Yassim M., Suárez-Collado Á.",57208398978;57200634622;55813389100;,Together we can: enhancing key 21st-century sk...,Purpose: This research reveals how a virtual e...,2022,Education and Training,Together we can: enhancing key 21st-century sk...,enhancing key st century skills international ...
4,"Sati H., Khandelwal A., Pareek S.",57262816100;57934375800;26039431800;,Effect of exogenous melatonin in fruit posthar...,"Derived from tryptophan, melatonin (MT; N-acet...",2022,Food Frontiers,Effect of exogenous melatonin in fruit posthar...,effect exogenous melatonin fruit postharvest c...
...,...,...,...,...,...,...,...,...
955,"Basu R J., Abdulrahman M.D., Yuvaraj M.",56520286600;55345988900;57881298700;,Improving agility and resilience of automotive...,The global supply chain disruption by the COVI...,2022,Socio-Economic Planning Sciences,Improving agility and resilience of automotive...,improving agility resilience automotive spares...
956,"Lu R., Shi T.-Q., Lin L., Ledesma-Amaro R., Ji...",57664460000;57190862202;57201654934;5579317290...,Advances in metabolic engineering of yeasts fo...,The reliance of the transport sector on fossil...,2022,Green Chemical Engineering,Advances in metabolic engineering of yeasts fo...,advances metabolic engineering yeasts producti...
957,"Hu M., Zhou K., Zhao T., Li Z., Zeng X., Yu D....",57345124600;57225741223;57777542300;5628587880...,Facile preparation and efficient MnxCoy porous...,The pursuit of high-performance is worth consi...,2022,Green Energy and Environment,Facile preparation and efficient MnxCoy porous...,facile preparation efficient mnxcoy porous nan...
958,"Kotzé L.J., Adelman S.",23469821500;56471454200;,Environmental Law and the Unsustainability of ...,In this article we argue that sustainable deve...,2022,Law and Critique,Environmental Law and the Unsustainability of ...,environmental law unsustainability sustainable...


In [38]:
# Tokenise the cleaned texts: run spaCy (NER switched off) and keep, per document,
# the lower-cased lemmas of nouns, proper nouns, adjectives and adverbs that are
# neither stop words nor punctuation.
keep_pos = {'NOUN', 'PROPN', 'ADJ', 'ADV'}

tokens = [
  [tok.lemma_.lower() for tok in doc
   if tok.pos_ in keep_pos and not (tok.is_stop or tok.is_punct)]
  for doc in nlp.pipe(data['text_clean'], disable=["ner"])
]

In [39]:
data['tokens'] = tokens

In [40]:
# Build the gensim vocabulary from the per-paper token lists
dictionary = Dictionary(data['tokens'])
# Prune the vocabulary with the thresholds below (rare / very common tokens,
# capped at 1000 entries)
dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=1000)
# Convert every token list into its bag-of-words representation
corpus = list(map(dictionary.doc2bow, data['tokens']))

In [41]:
corpus

[[(0, 2),
  (1, 1),
  (2, 3),
  (3, 1),
  (4, 2),
  (5, 1),
  (6, 2),
  (7, 1),
  (8, 1),
  (9, 5),
  (10, 2),
  (11, 1),
  (12, 1),
  (13, 2),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 1),
  (19, 3),
  (20, 1),
  (21, 1),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1)],
 [(15, 2),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 3),
  (29, 1),
  (30, 1),
  (31, 5),
  (32, 11),
  (33, 1),
  (34, 1),
  (35, 1),
  (36, 1),
  (37, 1),
  (38, 1),
  (39, 1),
  (40, 9),
  (41, 1),
  (42, 1),
  (43, 1),
  (44, 1),
  (45, 2),
  (46, 1),
  (47, 1),
  (48, 1),
  (49, 1),
  (50, 1),
  (51, 1),
  (52, 1),
  (53, 2),
  (54, 1),
  (55, 1),
  (56, 2),
  (57, 1),
  (58, 1),
  (59, 1),
  (60, 2),
  (61, 1),
  (62, 2),
  (63, 1),
  (64, 1),
  (65, 1),
  (66, 1),
  (67, 1),
  (68, 1),
  (69, 1),
  (70, 1),
  (71, 1),
  (72, 1),
  (73, 1),
  (74, 1)],
 [(0, 1),
  (10, 2),
  (28, 1),
  (35, 3),
  (75, 1),
  (76, 3),
  (77, 1),
  (78, 1),
  (79, 1),
  (80, 1),
  (81, 3),
  (82, 1),
  (83, 1),
  (84, 1),
  (85, 

In [42]:
# Training the model
# random_state fixes the seed of the topic initialisation so the topics (and the
# visualisation built from them) are comparable across reruns.
# NOTE(review): with several workers small run-to-run differences may remain.
lda_model = LdaMulticore(corpus, id2word=dictionary, num_topics=5, workers=4, passes=10, random_state=42)

In [45]:
# Prepare the pyLDAvis data for the trained LDA model
# (gensimvis is the alias of pyLDAvis.gensim_models imported at the top)
lda_display = gensimvis.prepare(lda_model, corpus, dictionary)

  by='saliency', ascending=False).head(R).drop('saliency', 1)


In [46]:
 # Let's Visualize
pyLDAvis.display(lda_display)

# EDA and dynamic effects (over time)

# Sparse paper-keywords matrix

In [47]:
corpus

[[(0, 2),
  (1, 1),
  (2, 3),
  (3, 1),
  (4, 2),
  (5, 1),
  (6, 2),
  (7, 1),
  (8, 1),
  (9, 5),
  (10, 2),
  (11, 1),
  (12, 1),
  (13, 2),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 1),
  (19, 3),
  (20, 1),
  (21, 1),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1)],
 [(15, 2),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 3),
  (29, 1),
  (30, 1),
  (31, 5),
  (32, 11),
  (33, 1),
  (34, 1),
  (35, 1),
  (36, 1),
  (37, 1),
  (38, 1),
  (39, 1),
  (40, 9),
  (41, 1),
  (42, 1),
  (43, 1),
  (44, 1),
  (45, 2),
  (46, 1),
  (47, 1),
  (48, 1),
  (49, 1),
  (50, 1),
  (51, 1),
  (52, 1),
  (53, 2),
  (54, 1),
  (55, 1),
  (56, 2),
  (57, 1),
  (58, 1),
  (59, 1),
  (60, 2),
  (61, 1),
  (62, 2),
  (63, 1),
  (64, 1),
  (65, 1),
  (66, 1),
  (67, 1),
  (68, 1),
  (69, 1),
  (70, 1),
  (71, 1),
  (72, 1),
  (73, 1),
  (74, 1)],
 [(0, 1),
  (10, 2),
  (28, 1),
  (35, 3),
  (75, 1),
  (76, 3),
  (77, 1),
  (78, 1),
  (79, 1),
  (80, 1),
  (81, 3),
  (82, 1),
  (83, 1),
  (84, 1),
  (85, 

In [48]:
# load models and utility
from gensim.models import LsiModel, TfidfModel
from gensim.matutils import corpus2dense

In [49]:
# fit tfidf weights
tfidf = TfidfModel(corpus)

In [50]:
# apply tfidf to data
corpus_tfidf = tfidf[corpus]

In [51]:
corpus[2]

[(0, 1),
 (10, 2),
 (28, 1),
 (35, 3),
 (75, 1),
 (76, 3),
 (77, 1),
 (78, 1),
 (79, 1),
 (80, 1),
 (81, 3),
 (82, 1),
 (83, 1),
 (84, 1),
 (85, 1),
 (86, 1),
 (87, 2),
 (88, 1),
 (89, 1),
 (90, 1),
 (91, 1),
 (92, 1),
 (93, 1),
 (94, 1),
 (95, 1),
 (96, 1),
 (97, 1),
 (98, 1),
 (99, 1),
 (100, 1),
 (101, 1)]

In [52]:
corpus_tfidf[2]

[(0, 0.06354130095293722),
 (10, 0.10384679652339407),
 (28, 0.10461115246833314),
 (35, 0.17698108290821973),
 (75, 0.1687855147592421),
 (76, 0.4268816208575597),
 (77, 0.11871939425197454),
 (78, 0.1311027936083887),
 (79, 0.16168439100264023),
 (80, 0.22979044255115935),
 (81, 0.3703492184320838),
 (82, 0.21923521394007373),
 (83, 0.07074859330737833),
 (84, 0.09872815611619043),
 (85, 0.10159011557430792),
 (86, 0.11299150406761761),
 (87, 0.17950729830023918),
 (88, 0.1544250002494525),
 (89, 0.20280181260267982),
 (90, 0.13503448286599884),
 (91, 0.08154239465429333),
 (92, 0.15741830713781071),
 (93, 0.14977561509104825),
 (94, 0.16394991359597777),
 (95, 0.22979044255115935),
 (96, 0.16511909145182643),
 (97, 0.2004880729995904),
 (98, 0.0994296455256046),
 (99, 0.17550574729239912),
 (100, 0.13711236667645801),
 (101, 0.15158793874227697)]

In [53]:
data['tokens'].iloc[2]

['green',
 'supplier',
 'development',
 'buyer',
 'supplier',
 'environmental',
 'responsibility',
 'supply',
 'chain',
 'important',
 'strategic',
 'issue',
 'firm',
 'approach',
 'green',
 'supplier',
 'development',
 'supplier',
 'negative',
 'environmental',
 'impact',
 'key',
 'method',
 'significant',
 'gain',
 'party',
 'multiple',
 'level',
 'benefit',
 'sufficiently',
 'nebulous',
 'manager',
 'particularly',
 'buyer',
 'article',
 'benefit',
 'intrafirm',
 'interfirm',
 'market',
 'level',
 'barrier',
 'realization',
 'benefit',
 'managerial',
 'guidance',
 'supplier',
 'prioritization',
 'green',
 'supplier',
 'development',
 'kelley',
 'school',
 'business',
 'indiana',
 'university']

In [54]:
# train LSI model
lsi = LsiModel(corpus_tfidf, num_topics=10, id2word=dictionary)

In [55]:
lsi.print_topics()

[(0,
  '0.144*"energy" + 0.122*"environmental" + 0.116*"sustainable" + 0.110*"water" + 0.106*"food" + 0.103*"research" + 0.101*"green" + 0.099*"development" + 0.098*"performance" + 0.096*"chain"'),
 (1,
  '0.231*"energy" + 0.192*"water" + 0.173*"material" + 0.158*"soil" + 0.149*"battery" + 0.131*"carbon" + -0.131*"chain" + -0.129*"business" + -0.129*"social" + 0.128*"property"'),
 (2,
  '0.350*"chain" + 0.347*"supply" + -0.340*"energy" + -0.171*"emission" + 0.170*"food" + -0.169*"renewable" + -0.140*"tourism" + 0.125*"resilience" + -0.123*"policy" + -0.123*"innovation"'),
 (3,
  '0.304*"energy" + 0.292*"supply" + 0.285*"chain" + -0.208*"urban" + -0.205*"soil" + -0.170*"student" + -0.150*"city" + -0.147*"water" + 0.141*"firm" + 0.140*"innovation"'),
 (4,
  '-0.293*"food" + 0.258*"concrete" + -0.252*"water" + 0.223*"student" + -0.186*"supply" + -0.183*"chain" + 0.179*"battery" + 0.153*"performance" + 0.153*"material" + -0.141*"soil"'),
 (5,
  '0.323*"soil" + 0.321*"firm" + -0.239*"energy

In [56]:
# apply lsi (that's equivalent to SVD fit/transform in sklearn)
corpus_lsi = lsi[corpus_tfidf]

In [57]:
corpus_lsi[0]

[(0, 0.17866384459182455),
 (1, 0.11489981964120315),
 (2, -0.161093178899847),
 (3, 0.12359355614797805),
 (4, -0.0772594453391509),
 (5, 0.02358820644769671),
 (6, 0.07857954954004655),
 (7, -0.0035655484012929633),
 (8, -0.08241113154971592),
 (9, -0.026340516602895027)]

In [58]:
# turning it back into a dense matrix; the number of rows is taken from the
# LSI model itself so it stays in sync if num_topics is changed above
lsi_matrix = corpus2dense(corpus_lsi, num_terms=lsi.num_topics)

In [59]:
# Index of the largest LSI component for the FIRST document only
# (lsi_matrix.T[0] is row 0 of the transposed matrix).
# For every document at once, np.argmax(lsi_matrix, axis=0) would be the equivalent.
np.argmax(lsi_matrix.T[0])

0