In [1]:
#!pip install pypdf
#!pip install bertopic

In [23]:
from langchain_community.document_loaders import PyPDFLoader, PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
import os
from pathlib import Path
import pandas as pd
import nltk
from nltk.corpus import words
import re
from html import unescape


from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, TextGeneration, OpenAI
import openai

import numpy as np

# Download the words dataset from nltk
nltk.download('words')
nltk.download('punkt')



[nltk_data] Downloading package words to
[nltk_data]     /Users/eryclisrodrigues/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/eryclisrodrigues/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=150,
    length_function=len,
    is_separator_regex=False,
)

#### Extracting the data from PDFs

In [36]:
loader = PyPDFDirectoryLoader("/Users/eryclisrodrigues/Documents/Eryclis - docs/Research/Projects/AI Governance - Topic Modeling/Data/Raw Data Sampling/National Strategy")
docs = loader.load_and_split(text_splitter=splitter)





In [37]:
len(docs)

1830

In [45]:
print(docs[10].page_content)

5 
 Foreword  
Through its continued efforts toward  the early realization of the Digital Pakistan vision of the Government 
of Pakistan, the Ministry of IT & Telecom is committed to providing  its people with timely and equal access 
to opportunities by stimulating a culture of innovation through an overarching de velopmental agenda 
orchestrated to embrace cutting -edge technologies such as Artificial Intelligence efficiently and 
responsibly.   
In this regard, the  Artificial Intelligence  (AI) Policy  is a pivotal milestone for  transforming Pakistan into a  
knowledge -based economy as it spells out a national strategy  to establish an ecosystem necessary for AI 
adoption by harnessing an agile framework for addressing different aspects of unique user journeys 
encompassing different market horizontals and industry verticals by ensuring responsible use of AI.  
Furthermore, the policy aims to go beyond the meagre approach of adopting technology to fundamentally


In [41]:
def documents2Dataframe(documents) -> pd.DataFrame:
    '''
    Convert the list of docs into a dataframe
    '''
    import uuid

    rows = []
    for chunk in documents:
        row = {
            "text": chunk.page_content,
            **chunk.metadata,
            "chunk_id": uuid.uuid4().hex,
        }
        rows = rows + [row]

    df = pd.DataFrame(rows)
    return df

In [42]:
df_sampling = documents2Dataframe(docs)


In [43]:
df_sampling

Unnamed: 0,text,source,page,chunk_id
0,i\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n ...,/Users/eryclisrodrigues/Documents/Eryclis - do...,0,fe708ed904b849eca7fa29572090bda8
1,ii \n Acknowledgments \nThe Government of Pak...,/Users/eryclisrodrigues/Documents/Eryclis - do...,1,cdab6c59f36445dcac88d762b1411b93
2,iii \n Table of Contents \n1 Executive Summar...,/Users/eryclisrodrigues/Documents/Eryclis - do...,2,855980307b934541b64091bf6fd8a210
3,3.2 Scope ................................ .....,/Users/eryclisrodrigues/Documents/Eryclis - do...,2,655ee09c0d174a7bbcfe2130fd266f82
4,4.1 1st Pillar: AI Market Enablement ...........,/Users/eryclisrodrigues/Documents/Eryclis - do...,2,d764ab8bb96b4bbcb52775c166652d77
...,...,...,...,...
1825,"Joshua Gans, Avi Goldfarb https://www.amazon.c...",/Users/eryclisrodrigues/Documents/Eryclis - do...,17,d1f95358e7084cbb8577134c76060fee
1826,Oun application for municipalities services ht...,/Users/eryclisrodrigues/Documents/Eryclis - do...,17,c950ce2bd3874adaae84e7401d5979b5
1827,in the context of local cultural norms. For ex...,/Users/eryclisrodrigues/Documents/Eryclis - do...,17,1f9736467368476e90466ca77d580a02
1828,19\nNational Artificial Intelligence Strategy ...,/Users/eryclisrodrigues/Documents/Eryclis - do...,18,e7c40194b07a45fdabbdfe390787d12e


In [44]:
df_sampling['text'][10]

'5 \n Foreword  \nThrough its continued efforts toward  the early realization of the Digital Pakistan vision of the Government \nof Pakistan, the Ministry of IT & Telecom is committed to providing  its people with timely and equal access \nto opportunities by stimulating a culture of innovation through an overarching de velopmental agenda \norchestrated to embrace cutting -edge technologies such as Artificial Intelligence efficiently and \nresponsibly.   \nIn this regard, the  Artificial Intelligence  (AI) Policy  is a pivotal milestone for  transforming Pakistan into a  \nknowledge -based economy as it spells out a national strategy  to establish an ecosystem necessary for AI \nadoption by harnessing an agile framework for addressing different aspects of unique user journeys \nencompassing different market horizontals and industry verticals by ensuring responsible use of AI.  \nFurthermore, the policy aims to go beyond the meagre approach of adopting technology to fundamentally'

#### Data Cleaning

For the Data Cleaning step, we do not have to remove stopwords. It is recommended just do it after the embedding generation and clustering. For now, we will just remove potential errors raised by the langchain extraction.

In [55]:
# Load the list of English words
#english_words = set(words.words())

def clean_text(text):
    # 1. Replace \n with space
    text = text.replace('\n', ' ')
    
    # 2. Remove extra spaces
    text = re.sub(r'\s+', ' ', text)
    
    # 3. Remove leading and trailing spaces
    text = text.strip()
    
    # 4. Remove URLs
    text = re.sub(r'http\S+|www\.\S+|https\S+', '', text, flags=re.MULTILINE)
    
    # 5. Decode HTML entities
    text = unescape(text)
    
    # 6. Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    
    # 7. Remove emails
    text = re.sub(r'\S+@\S+', '', text)
    
    # 8. Remove page references (e.g., Page 1)
    text = re.sub(r'page \d+', '', text, flags=re.IGNORECASE)
    
    # 9. Remove isolated digits (keeping punctuation)
    text = re.sub(r'\b\d+\b', '', text)
    
    # 10. Remove mentions (e.g., @username)
    text = re.sub(r'@\S+', '', text)
    
    # 11. Replace '&' with 'and'
    text = text.replace('&', 'and')
    
    # 12. Remove special characters, keeping basic punctuation
    text = re.sub(r'[^a-zA-Z0-9.,!?\'\s]', '', text)
    
    # 13. Remove words not in the English vocabulary
    #words_list = text.split()
    #cleaned_words = [word for word in words_list if word.lower() in english_words]
    #text = ' '.join(cleaned_words)
    
    return text



In [56]:
copy_df_sampling = df_sampling.copy()

In [57]:
copy_df_sampling['text'] = copy_df_sampling['text'].apply(clean_text)

In [58]:
copy_df_sampling

Unnamed: 0,text,source,page,chunk_id
0,i Draft National Artificial Intelligence Polic...,/Users/eryclisrodrigues/Documents/Eryclis - do...,0,fe708ed904b849eca7fa29572090bda8
1,"ii Acknowledgments The Government of Pakistan,...",/Users/eryclisrodrigues/Documents/Eryclis - do...,1,cdab6c59f36445dcac88d762b1411b93
2,iii Table of Contents Executive Summary ........,/Users/eryclisrodrigues/Documents/Eryclis - do...,2,855980307b934541b64091bf6fd8a210
3,. Scope ................................ ........,/Users/eryclisrodrigues/Documents/Eryclis - do...,2,655ee09c0d174a7bbcfe2130fd266f82
4,. 1st Pillar AI Market Enablement ...............,/Users/eryclisrodrigues/Documents/Eryclis - do...,2,d764ab8bb96b4bbcb52775c166652d77
...,...,...,...,...
1825,"Joshua Gans, Avi Goldfarb Intelligencedp . Ar...",/Users/eryclisrodrigues/Documents/Eryclis - do...,17,d1f95358e7084cbb8577134c76060fee
1826,Oun application for municipalities services ....,/Users/eryclisrodrigues/Documents/Eryclis - do...,17,c950ce2bd3874adaae84e7401d5979b5
1827,in the context of local cultural norms. For ex...,/Users/eryclisrodrigues/Documents/Eryclis - do...,17,1f9736467368476e90466ca77d580a02
1828,National Artificial Intelligence Strategy for...,/Users/eryclisrodrigues/Documents/Eryclis - do...,18,e7c40194b07a45fdabbdfe390787d12e


In [73]:
copy_df_sampling['text'][1828]

' National Artificial Intelligence Strategy for Qatar'

## Topic Modeling - BERTopic

#### Embeddings

In [118]:
# Convert our column into a list
texts = list(df_sampling['text'].values)


In [122]:
# Pre-calculate embeddings
embedding_model = SentenceTransformer("all-mpnet-base-v2")


In [123]:
embeddings = embedding_model.encode(texts, show_progress_bar=True)

Batches:   0%|          | 0/46 [00:00<?, ?it/s]

#### Dimensionality Reduction, Clustering and Vectorizer

In [134]:
umap_model = UMAP(n_neighbors=25, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
hdbscan_model = HDBSCAN(min_cluster_size=2, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
vectorizer_model = CountVectorizer(stop_words="english", min_df=2, ngram_range=(1, 3))

#### Representation models

In [128]:

# KeyBERT
keybert_model = KeyBERTInspired()

# MMR
mmr_model = MaximalMarginalRelevance(diversity=0.3)

# # GPT-3.5
# prompt = """
# I have a topic that contains the following documents:
# [DOCUMENTS]
# The topic is described by the following keywords: [KEYWORDS]

# Based on the information above, extract a short but highly descriptive topic label of at most 5 words. Make sure it is in the following format:
# topic: <topic label>
# """
# client = openai.OpenAI(api_key=read_key_from_file())
# openai_model = OpenAI(client, model="gpt-3.5-turbo", exponential_backoff=True, chat=True, prompt=prompt)

# All representation models
representation_model = {
    "KeyBERT": keybert_model,
    #"OpenAI": openai_model,  # Uncomment if you will use OpenAI
    "MMR": mmr_model
}

#### Training

In [135]:
topic_model = BERTopic(

  # Pipeline models
  embedding_model=embedding_model,  # while we use pre-computed embeddings, internally BERTopic still needs this parameter
  umap_model=umap_model,
  hdbscan_model=hdbscan_model,
  vectorizer_model=vectorizer_model,
  representation_model=representation_model,

  # Hyperparameters
  top_n_words=10,
  verbose=True
)

# Train model
topics, probs = topic_model.fit_transform(texts, embeddings)

2024-05-24 13:02:56,945 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-24 13:03:00,028 - BERTopic - Dimensionality - Completed ✓
2024-05-24 13:03:00,029 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-24 13:03:00,065 - BERTopic - Cluster - Completed ✓
2024-05-24 13:03:00,069 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-05-24 13:04:40,267 - BERTopic - Representation - Completed ✓


#### Visualizations

In [139]:
# Show topics
freq = topic_model.get_topic_info(); freq.head(15)

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,MMR,Representative_Docs
0,-1,334,-1_data_ai_model_organisations,"[data, ai, model, organisations, ai applicatio...","[ethical ai, ethical ai framework, ai model, a...","[data, ai, model, organisations, ai applicatio...",[ETHICAL AI FRAMEWORK APPENDIX \n \n6-16 \n ...
1,0,45,0_qatar_ai technology_businesses_guidelines,"[qatar, ai technology, businesses, guidelines,...","[national ai strategy, national artificial int...","[qatar, ai technology, businesses, guidelines,...",[5\nNational Artificial Intelligence Strategy ...
2,1,43,1_education_sw_sw ai_curriculum,"[education, sw, sw ai, curriculum, schools, sc...","[ai education, ai curriculum, ai allied techno...","[education, sw, sw ai, curriculum, schools, sc...","[customized department), and Industrial AI Pro..."
3,2,29,2_philippines_ai strategy roadmap_strategy roa...,"[philippines, ai strategy roadmap, strategy ro...","[national ai strategy, ai strategy roadmap, na...","[philippines, ai strategy roadmap, strategy ro...",[NATIONAL AI STRATEGY ROADMAP FOR THE PHILIPPI...
4,3,29,3_pakistan_policy_ai policy_ai adoption,"[pakistan, policy, ai policy, ai adoption, sec...","[ai based technologies, national artificial in...","[pakistan, policy, ai policy, ai adoption, sec...",[sharing standardized data will ensure the ena...
5,4,24,4_society_social_social principles_human centric,"[society, social, social principles, human cen...","[social principles ai, society ai, ai ready so...","[society, social, social principles, human cen...",[7 \n 4 Social Principles of Human -Centric A...
6,5,23,5_korea_korean_artificial_strategy,"[korea, korean, artificial, strategy, intellig...","[ai world leader, artificial intelligence poli...","[korea, korean, artificial, strategy, intellig...",[21\n2. Where is Korea aiming? (Vision and Goa...
7,6,18,6_india_consultations_paper_ai systems,"[india, consultations, paper, ai systems, prin...","[ai ethics, responsible ai, ai india, artifici...","[india, consultations, paper, ai systems, prin...",[improving access and quality and higher effic...
8,7,17,7_job_workforce_changes_automation,"[job, workforce, changes, automation, awarenes...","[ai, skills ai, job automation, basic skills a...","[job, workforce, changes, automation, awarenes...","[Global companies such as Microsoft, Amazon, A..."
9,8,16,8_roles_project_responsibilities_roles respons...,"[roles, project, responsibilities, roles respo...","[ai governance structure, ai governance, ethic...","[roles, project, responsibilities, roles respo...",[their corresponding responsibilities . \n \n...


In [148]:
topic_model.get_topic(0, full=True)  # Select the most frequent topic

{'Main': [('qatar', 0.05982546440352836),
  ('ai technology', 0.012312849799269034),
  ('businesses', 0.011800167422276846),
  ('guidelines', 0.010257391343404351),
  ('world', 0.009444092587170239),
  ('strategy', 0.008981548866182259),
  ('gov qa', 0.008906928435222835),
  ('local', 0.008872691665470058),
  ('language', 0.008811053514518646),
  ('qa', 0.008718900920147907)],
 'KeyBERT': [('national ai strategy', 0.6385046),
  ('national artificial intelligence', 0.57856965),
  ('ai future', 0.5761137),
  ('national ai', 0.57600844),
  ('ai technology', 0.5291699),
  ('ai', 0.5124967),
  ('modern ai', 0.5113133),
  ('ai solutions', 0.5000566),
  ('ai based', 0.47077513),
  ('ai enabled', 0.4680573)],
 'MMR': [('qatar', 0.05982546440352836),
  ('ai technology', 0.012312849799269034),
  ('businesses', 0.011800167422276846),
  ('guidelines', 0.010257391343404351),
  ('world', 0.009444092587170239),
  ('strategy', 0.008981548866182259),
  ('gov qa', 0.008906928435222835),
  ('local', 0.00

In [143]:
topic_model.visualize_topics()


In [147]:
topic_model.topic_representations_

{-1: [('data', 0.004711170867681569),
  ('ai', 0.0046521437913720634),
  ('model', 0.004365704358676696),
  ('organisations', 0.00408052819013431),
  ('ai application', 0.003887949739809465),
  ('application', 0.0037846240457796256),
  ('ethical', 0.0035796752448587054),
  ('ensure', 0.0035776489355264078),
  ('process', 0.003497337450093672),
  ('applications', 0.003384670635564553)],
 0: [('qatar', 0.05982546440352836),
  ('ai technology', 0.012312849799269034),
  ('businesses', 0.011800167422276846),
  ('guidelines', 0.010257391343404351),
  ('world', 0.009444092587170239),
  ('strategy', 0.008981548866182259),
  ('gov qa', 0.008906928435222835),
  ('local', 0.008872691665470058),
  ('language', 0.008811053514518646),
  ('qa', 0.008718900920147907)],
 1: [('education', 0.029263260672741716),
  ('sw', 0.022714438251729814),
  ('sw ai', 0.020810223448869774),
  ('curriculum', 0.016793457761493002),
  ('schools', 0.01613701491805157),
  ('school', 0.01585348827627249),
  ('training', 0

In [161]:
topic_model.visualize_barchart(top_n_topics=20)