# Inspired by and uses code from https://towardsdatascience.com/end-to-end-topic-modeling-in-python-latent-dirichlet-allocation-lda-35ce4ed6b3e0

In [2]:
%pip install gensim nltk pyldavis

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.1.2 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip


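### Download both of the files below and place them in the same folder as this notebook. The code reads them as `Countries_(December_2021)_UK_BGC.geojson` (first link) and `england_regions.geojson` (second link), so save them under those names.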
https://www.data.gov.uk/dataset/96f3e623-6e54-4df2-808c-48dba5c98b55/countries-december-2021-boundaries-uk-bgc
https://nihr.opendatasoft.com/api/explore/v2.1/catalog/datasets/england_regions/exports/geojson?lang=en&timezone=Europe%2FLondon

In [3]:
import pandas as pd
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from SPARQLWrapper import SPARQLWrapper, JSON
from shapely.geometry import Point
import geopandas as gpd
import re
import pyLDAvis.gensim
import pickle 
import pyLDAvis
import os

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\icyhe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Connect to the UK Parliament SPARQL endpoint and ask for JSON responses
sparql_endpoint = "https://api.parliament.uk/sparql/"
sparql = SPARQLWrapper(sparql_endpoint)
sparql.setReturnFormat(JSON)

# Broad query: every written question asked inside the date window, together
# with the person who asked it and that person's MNIS ID
sparql_query = """
SELECT *
WHERE {
  ?question <https://id.parliament.uk/schema/writtenQuestionIndexingAndSearchUin> ?qnum .
  ?person <https://id.parliament.uk/schema/askingPersonHasQuestion> ?question .
  ?question <https://id.parliament.uk/schema/questionText> ?text .
  ?question <https://id.parliament.uk/schema/questionAskedAt> ?date .
  ?person <https://id.parliament.uk/schema/mnisId> ?id
  FILTER (?date >= "2023-01-01+00:00"^^xsd:dateTime && ?date < "2023-10-01+00:00"^^xsd:dateTime)
}
"""
sparql.setQuery(sparql_query)

# Run the query and decode the JSON payload
results = sparql.query().convert()

# Flatten each SPARQL binding into a plain dict. "question" and "text" are
# read directly (a missing key raises KeyError); the other fields fall back
# to an empty string when the binding does not carry them.
all_results = [
    {
        "question": binding["question"]["value"],
        "qnum": binding.get("qnum", {}).get("value", ""),
        "person": binding.get("person", {}).get("value", ""),
        "MNIS": binding.get("id", {}).get("value", ""),
        "text": binding["text"]["value"],
        "date": binding.get("date", {}).get("value", ""),
    }
    for binding in results["results"]["bindings"]
]

# Switch to the Wikidata endpoint to look up the constituency of each member.
# The query returns, per item, the identifier stored under P10428 (used below
# as the MNIS ID) and the district qualifier of a P39 statement whose start
# date falls in 2019.
sparql_endpoint = "https://query.wikidata.org/sparql"
sparql = SPARQLWrapper(sparql_endpoint)
sparql.setReturnFormat(JSON)

sparql_query = """
SELECT DISTINCT ?item ?id ?district ?districtLabel WHERE {
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
  ?item wdt:P39 wd:Q77685926.
  ?item wdt:P10428 ?id.
  ?item p:P39 [pq:P768 ?district;
                pq:P580 ?date].
    FILTER(YEAR(?date) = 2019)
}"""
sparql.setQuery(sparql_query)

constituencies = sparql.query().convert()

# One record per binding: the MNIS ID plus the constituency label and URI
constituencies_list = [
    {
        "MNIS": binding["id"]["value"],
        "ConstituencyLabel": binding["districtLabel"]["value"],
        "ConstituencyURI": binding["district"]["value"],
    }
    for binding in constituencies["results"]["bindings"]
]

# Add constituency information to questions.
# Index the constituency records by MNIS ID once, so each question is matched
# with a dictionary lookup instead of a scan over the whole constituency list.
constituency_by_mnis = {}
for const in constituencies_list:
    # setdefault keeps the FIRST record seen for an ID, which is the record a
    # front-to-back scan of the list would have returned.
    constituency_by_mnis.setdefault(const['MNIS'], const)

# Questions whose asker has no constituency record are left out of the result.
all_merged_list = []
for result in all_results:
    matching = constituency_by_mnis.get(result['MNIS'])
    if matching is not None:
        # Constituency keys are merged on top of the question keys
        all_merged_list.append({**result, **matching})

In [5]:
# Build a dataframe from the merged question records, keeping only the
# question text and the constituency label
questions = pd.DataFrame(all_merged_list).drop(
    columns=['question', 'qnum', 'person', 'MNIS', 'date', 'ConstituencyURI']
)

# Ask Wikidata for the coordinate (P625) of every item that is an instance of
# wd:Q27971968; the coordinate is OPTIONAL so some bindings lack it
endpoint_url = "https://query.wikidata.org/sparql"
sparql = SPARQLWrapper(endpoint_url)
sparql.setReturnFormat(JSON)

query = """SELECT DISTINCT ?coordLabel ?constLabel WHERE {
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
  ?const wdt:P31 wd:Q27971968.
  OPTIONAL { ?const wdt:P625 ?coord. }
}"""
sparql.setQuery(query)

results = sparql.query().convert()

# Collect name/coordinate records, turning the "Point(<x> <y>)" label text
# into a [longitude, latitude] list of floats
const_coords = []
for binding in results["results"]["bindings"]:
    # Only bindings carrying both selected variables are usable
    if len(binding) != 2:
        continue
    point_text = binding["coordLabel"]["value"]
    number_text = point_text.replace("Point(", "").replace(")", "")
    longitude, latitude = (float(part) for part in number_text.split())
    const_coords.append({
        "name": binding["constLabel"]["value"],
        "coords": [longitude, latitude],
    })

# Left join so every question row is kept, matched on the constituency name
merged_df = questions.merge(
    pd.DataFrame(const_coords),
    how='left',
    left_on='ConstituencyLabel',
    right_on='name',
)

# Create shapely points of all coordinates and place it all in a geodataframe.
# NOTE(review): merged_df comes from a left merge, so a question whose
# constituency found no coordinates has NaN in 'coords' and would fail to
# unpack on the next line -- confirm every constituency label is matched.
geometry = [Point(lon, lat) for lon, lat in merged_df['coords']]
gdf_points = gpd.GeoDataFrame(merged_df, geometry=geometry, crs='EPSG:4326')

# Assign regions to all questions: a point gets the region polygon it lies
# within. `predicate=` is the current keyword for the spatial test; the older
# `op=` spelling is deprecated.
gdf = gpd.read_file('england_regions.geojson')
gdf_merged = gpd.sjoin(gdf_points, gdf, how='left', predicate='within')
result_df = gdf_merged[["text","ConstituencyLabel","coords","rgn22nm"]].rename(columns={'rgn22nm': 'Region'})

# Give constituencies outside of the England regions the name of their country,
# allowing Northern Ireland, Scotland, and Wales to act as their own regions
gdf_countries = gpd.read_file('Countries_(December_2021)_UK_BGC.geojson')
gdf_merged = gpd.sjoin(gdf_points, gdf_countries, how='left', predicate='within')

# Fill the missing regions column-wise. Assigning to the row objects yielded by
# iterrows() only changes temporary copies and leaves the dataframe untouched,
# so the fill has to be written back to the column itself.
country_by_index = gdf_merged["CTRY21NM"]
# Keep one country per point index so the index-aligned fill is unambiguous
country_by_index = country_by_index[~country_by_index.index.duplicated(keep='first')]
result_df = result_df.assign(Region=result_df["Region"].fillna(country_by_index))

  srs = pd.Series(*args, **kwargs)
  srs = pd.Series(*args, **kwargs)
  if await self.run_code(code, result, async_=asy):
  srs = pd.Series(*args, **kwargs)
  srs = pd.Series(*args, **kwargs)
  srs = pd.Series(*args, **kwargs)
  srs = pd.Series(*args, **kwargs)
  srs = pd.Series(*args, **kwargs)
  srs = pd.Series(*args, **kwargs)
  srs = pd.Series(*args, **kwargs)
  srs = pd.Series(*args, **kwargs)
  srs = pd.Series(*args, **kwargs)
  srs = pd.Series(*args, **kwargs)
  srs = pd.Series(*args, **kwargs)
  srs = pd.Series(*args, **kwargs)
  srs = pd.Series(*args, **kwargs)
  srs = pd.Series(*args, **kwargs)
  srs = pd.Series(*args, **kwargs)
  srs = pd.Series(*args, **kwargs)
  srs = pd.Series(*args, **kwargs)
  srs = pd.Series(*args, **kwargs)
  srs = pd.Series(*args, **kwargs)
  srs = pd.Series(*args, **kwargs)
  srs = pd.Series(*args, **kwargs)
  srs = pd.Series(*args, **kwargs)
  srs = pd.Series(*args, **kwargs)
  srs = pd.Series(*args, **kwargs)
  srs = pd.Series(*args, **kwargs)
  s

In [6]:
# Split the master dataframe into one dataframe per region, keyed by the
# region name (rows whose Region is missing belong to no group)
dfs_by_region = {
    region_name: region_df
    for region_name, region_df in result_df.groupby('Region')
}

In [7]:
# Code in this cell mostly taken from article cited at top

# Preprocesses the text, removing punctuation, html tags, and making text lowercase
def processtext(result):
    """Add a cleaned 'processed_text' column derived from the 'text' column.

    Each value of ``result['text']`` is cleaned in three steps, in this order:
    the characters ``,`` ``.`` ``!`` ``?`` are removed, the text is lowercased,
    and every ``<...>`` span (HTML tag) is removed.

    Args:
        result: DataFrame with a 'text' column whose values are strings.

    Returns:
        The same DataFrame object that was passed in; the 'processed_text'
        column is written onto it in place.
    """
    # Raw-string patterns: the earlier non-raw '[,\.!?]' literal contained an
    # invalid string escape and made Python emit a warning. Inside a character
    # class the dot needs no escaping, so the pattern matches the same set.
    punctuation = re.compile(r'[,.!?]')
    html_tag = re.compile(r'<.*?>')

    def clean(raw_text):
        without_punctuation = punctuation.sub('', raw_text)
        return html_tag.sub('', without_punctuation.lower())

    result['processed_text'] = result['text'].map(clean)
    return result

  result['text'].map(lambda x: re.sub('[,\.!?]', '', x))


In [8]:
# All code in this cell is adapted from tutorial at top of notebook

# Define stop words that will be removed from the text before LDA
def sent_to_words(sentences):
    """Lazily tokenise each sentence with gensim's ``simple_preprocess``.

    Args:
        sentences: iterable of sentences; each item is converted with ``str()``
            before tokenising.

    Yields:
        One token list per input sentence, in input order.
    """
    for raw_sentence in sentences:
        # deacc=True asks simple_preprocess to strip accent marks from tokens
        tokens = gensim.utils.simple_preprocess(str(raw_sentence), deacc=True)
        yield tokens
def remove_stopwords(texts,stop_words):
    """Tokenise every document and drop the tokens listed in ``stop_words``.

    Args:
        texts: iterable of documents. Each one is converted with ``str()`` and
            passed through ``simple_preprocess``, so an already-tokenised list
            is re-tokenised from its string form.
        stop_words: iterable of words to discard.

    Returns:
        A list with one list of kept tokens per document, in input order.
    """
    # Build the lookup set once: membership tests against a list rescan the
    # whole stop-word list for every single token.
    stop_set = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_set]
        for doc in texts
    ]
def get_stop_words(result):
    """Tokenise the 'processed_text' column and strip stop words from it.

    Despite the name, this returns the tokenised documents, not the stop-word
    list. The stop words are NLTK's English list plus a handful of extra words
    hard-coded below.

    Args:
        result: DataFrame with a 'processed_text' column.

    Returns:
        A list of token lists, one per row, with the stop words removed.
    """
    extra_stop_words = ['ask', 'state', 'secretary', 'department', 'many', 'whether', "make", "made"]
    stop_words = stopwords.words('english') + extra_stop_words

    documents = result["processed_text"].values.tolist()
    tokenised = list(sent_to_words(documents))
    return remove_stopwords(tokenised, stop_words)

In [9]:
# All code in this cell comes from tutorial at top of notebook

# Create a dictionary of words in the corpus
def create_dict(data_words):
    """Build a gensim dictionary and bag-of-words corpus from token lists.

    Args:
        data_words: tokenised documents (one list of tokens per document).

    Returns:
        A tuple ``(id2word, corpus)``: the ``corpora.Dictionary`` built from
        ``data_words`` and a list holding ``id2word.doc2bow(doc)`` for every
        document, in input order.
    """
    id2word = corpora.Dictionary(data_words)
    # Term-document frequency: one bag-of-words vector per document
    corpus = list(map(id2word.doc2bow, data_words))
    return id2word, corpus


In [10]:
# All code in this cell comes from tutorial at top of notebook

# Create an LDA model with gensim trained on the corpus
def buildLDA(num_topics, corpus, id2word, passes=3, random_state=None):
    """Train a gensim ``LdaMulticore`` topic model on a bag-of-words corpus.

    Args:
        num_topics: number of topics to fit.
        corpus: bag-of-words corpus (as produced by ``Dictionary.doc2bow``).
        id2word: dictionary mapping token ids to words.
        passes: number of training passes over the corpus. Defaults to 3, the
            value this function previously hard-coded.
        random_state: optional seed forwarded to ``LdaMulticore``. The default
            ``None`` leaves the model unseeded, as before; pass an int to make
            runs more repeatable.
            NOTE(review): multi-worker training may still vary between runs
            even with a seed -- confirm if exact reproducibility matters.

    Returns:
        A tuple ``(doc_lda, lda_model)``: the result of applying the trained
        model to ``corpus`` and the model itself.
    """
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=num_topics,
                                           passes=passes,
                                           random_state=random_state)
    doc_lda = lda_model[corpus]
    return doc_lda, lda_model

In [21]:
# All code in this cell comes from tutorial at top of notebook

# Create visualisations of the LDA results
def Visualize(lda_model, corpus, id2word, num_topics, region, recompute=True):
    """Prepare the pyLDAvis view of a model and save it under ./vis/<region>/.

    Two files are written into ``./vis/<region>/``:
    ``ldavis_prepared_<num_topics><region>`` (pickle of the prepared data) and
    ``ldavis_prepared_<num_topics>.html`` (the rendered visualisation).

    Args:
        lda_model: trained gensim LDA model.
        corpus: bag-of-words corpus the model was trained on.
        id2word: dictionary mapping token ids to words.
        num_topics: topic count; only used to name the output files.
        region: region name; used for the output folder and pickle file name.
        recompute: when True (default, the previous behaviour) the prepared
            data is always rebuilt and the pickle overwritten. When False an
            existing pickle is reused instead of rebuilding.

    Returns:
        None.
    """
    pyLDAvis.enable_notebook()

    # Create the per-region folder up front; open() does not create missing
    # directories and would otherwise fail on a fresh checkout.
    out_dir = os.path.join('.', 'vis', region)
    os.makedirs(out_dir, exist_ok=True)
    LDAvis_data_filepath = os.path.join(out_dir, 'ldavis_prepared_' + str(num_topics) + region)

    if recompute or not os.path.exists(LDAvis_data_filepath):
        # This is a bit time consuming
        LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
        with open(LDAvis_data_filepath, 'wb') as f:
            pickle.dump(LDAvis_prepared, f)
    else:
        # Load the pre-prepared pyLDAvis data from disk.
        # NOTE: unpickling can execute arbitrary code; only reuse cache files
        # that this notebook wrote itself.
        with open(LDAvis_data_filepath, 'rb') as f:
            LDAvis_prepared = pickle.load(f)

    html_path = os.path.join(out_dir, 'ldavis_prepared_' + str(num_topics) + '.html')
    pyLDAvis.save_html(LDAvis_prepared, html_path)

In [15]:
def createLDAfromdf(df,num_topics,region):
    """Run the whole pipeline for one dataframe: clean, tokenise, model, visualise.

    Args:
        df: DataFrame handed to ``processtext`` (which reads its 'text' column).
        num_topics: number of LDA topics to fit.
        region: region name forwarded to ``Visualize`` for naming the output.

    Returns:
        None; the visualisation files are written by ``Visualize``.
    """
    processed = processtext(df)
    tokens = get_stop_words(processed)
    dictionary, bow_corpus = create_dict(tokens)
    # Only the trained model is needed; the per-document topics are discarded
    _, model = buildLDA(num_topics, bow_corpus, dictionary)
    Visualize(model, bow_corpus, dictionary, num_topics, region)

In [23]:
# Fit and visualise a model for every region at each topic count from 3 to 6
for topic_count in range(3, 7):
    for region_name, region_df in dfs_by_region.items():
        createLDAfromdf(region_df, topic_count, region_name)