# hansard scraper

the script below is responsable of scraping the hansard archive and organizing the text by politica party, in an ordarly fashion suitable for gensim word2vec function to train on

### imports

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import gensim
import nltk
import re
#nltk.download('punkt')
#nltk.download('stopwords')
import pickle
import os
from tqdm import tqdm
import shutil
from gensim.models import translation_matrix
from gensim.models import KeyedVectors
import seaborn as sns
import matplotlib.pyplot as plt

### functions for scraping and data extraction

In [52]:
all_mp_url = "https://hansard.parliament.uk/search/Members?currentFormerFilter=1&house=Commons&startDate=2013-01-01&endDate=2019-05-24&partial=False"

def results_2_links(soup):
    results = soup.find('div', class_="results-list row")
    results_inner = results.find_all('div', class_="col-sm-6 result-outer")
    debate_links = []
    for i in range(len(results_inner)):
        debate_links.append(results_inner[i].find('a')['href'])
    return(debate_links)

def debate_2_data(debate):
    debate_content_list = debate.find_all("div", class_="content-item")
    data = []
    for block in debate_content_list:
        if block.find("h2", class_="memberLink"):
            if block.find("h2").find("a"):
                mp_name = block.find("h2").find("a").text
                mp_id = block.find("h2").find("a")['href'].split("=")[-1]
                content_block = block.find_all("p")
                content = ""
                for p in content_block:
                    if p.contents:
                        content += p.text

                data.append((mp_name, mp_id, content))
    
    data_df = pd.DataFrame(data)
    return data_df

def collect_debates(start_date, end_date):
    all_debates_url = 'https://hansard.parliament.uk/search/Debates?endDate={end_date}&house=Commons&partial=True&startDate={start_date}&page={page}'
    hansard_link = 'https://hansard.parliament.uk'
    all_data = []
    for i in range(1,10000):

        #query hansard
        r = requests.get(all_debates_url.format(start_date=start_date, end_date=end_date,page=str(i)))
        soup = BeautifulSoup(r.content, "lxml")

        # stop if no results
        if soup.find("div", class_="results-heading no-results" ):
            break    

        #get results
        debate_links = results_2_links(soup)
        for link in debate_links:
            r = requests.get(hansard_link+link)
            debate = BeautifulSoup(r.content, "lxml")
            data = debate_2_data(debate)
            data['debate'] = link.split("/")[-1]
            all_data.append(data)
            # rest
            time.sleep(0.19)

        print("collected page #{}".format(str(i)))

    df_all_data = pd.concat(all_data)
    return df_all_data

## scrape and save as dataframe

In [None]:
for i in range(3,8):
    df_all_data = collect_debates('201{}-01-01'.format(str(i)),'201{}-12-31'.format(str(i)))
    df_all_data.to_csv("hansard_201{}.csv".format(str(i)))



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


collected page #1
collected page #2
collected page #3
collected page #4
collected page #5
collected page #6
collected page #7
collected page #8
collected page #9
collected page #10
collected page #11
collected page #12
collected page #13
collected page #14
collected page #15
collected page #16
collected page #17
collected page #18
collected page #19
collected page #20
collected page #21
collected page #22
collected page #23
collected page #24
collected page #25
collected page #26
collected page #27
collected page #28
collected page #29
collected page #30
collected page #31
collected page #32
collected page #33
collected page #34
collected page #35
collected page #36
collected page #37
collected page #38
collected page #39
collected page #40
collected page #41
collected page #42
collected page #43
collected page #44
collected page #45
collected page #46
collected page #47
collected page #48
collected page #49
collected page #50
collected page #51
collected page #52
collected page #53
co

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.




collected page #1
collected page #2
collected page #3
collected page #4
collected page #5
collected page #6
collected page #7
collected page #8
collected page #9
collected page #10
collected page #11
collected page #12
collected page #13
collected page #14
collected page #15
collected page #16
collected page #17
collected page #18
collected page #19
collected page #20
collected page #21
collected page #22
collected page #23
collected page #24
collected page #25
collected page #26
collected page #27
collected page #28
collected page #29
collected page #30
collected page #31
collected page #32
collected page #33
collected page #34
collected page #35
collected page #36
collected page #37
collected page #38
collected page #39
collected page #40
collected page #41
collected page #42
collected page #43
collected page #44
collected page #45
collected page #46
collected page #47
collected page #48
collected page #49
collected page #50
collected page #51
collected page #52
collected page #53
co

# Load all data and unite

In [6]:
all_data = []
for i in range(3,9):
    df_data = pd.read_csv("hansard_201{}.csv".format(str(i)), index_col=0)
    all_data.append(df_data)

df_all_data = pd.concat(all_data)
df_all_data.columns = ["mp", "mp_id", "text", "debate"]

# collect MP party affiliation

In [15]:

#get list of mp_ids
mp_ids = df_all_data.mp_id.unique().tolist()
if 0 in mp_ids:
    mp_ids.remove(0)

#scrape mp information
member_url = "https://hansard.parliament.uk/search/MemberContributions?house=Commons&memberId={id}"
party_data = []
for id_num in mp_ids:
    #query hansard
    r = requests.get(member_url.format(id=id_num))
    soup = BeautifulSoup(r.content, "lxml")
    
    # get party
    print(id_num)
    party = soup.find("div", class_="member-details").find("strong").text
    party_data.append((id_num, party))
    time.sleep(0.12)

#organize
df_id2party = pd.DataFrame(party_data)
df_id2party.columns = ['mp_id', 'party']
df_data_full = df_all_data.merge(df_id2party, on=["mp_id"], how="left")
df_data_full.head()

1470
1496
44
209
54
197
3919
4005
178
603
185
4109
4038
1583
4072
1488
467
151
17
3966
484
411
18
180
4120
385
1516
1537
366
1577
1191
463
112
328
157
4119
4137
1211
4099
4106
4118
491
123
4006
1507
384
1440
1581
146
4133
3925
1396
489
4022
1474
3960
626
1453
4130
4025
4103
1400
4079
4015
301
4140
1552
3957
1490
3996
298
4104
1524
318
4032
428
1398
3962
1610
429
1533
4269
1399
4075
1564
280
43
249
4141
333
171
4004
1562
308
4023
1411
1487
3965
1585
533
4101
1590
465
3990
3951
4029
394
46
588
3954
1426
3933
3973
1579
4136
1390
1527
3994
1481
338
242
1520
4049
210
479
133
4069
4098
3939
1504
4056
36
483
3985
47
481
12
4086
4084
4125
583
546
586
534
4094
4067
4052
4264
1503
1569
4064
513
3956
1548
163
451
1546
1521
1394
3964
4139
478
93
4131
1397
164
3981
4244
4036
3992
172
167
245
4142
4083
572
4212
3911
4051
304
4071
4031
3926
1419
1561
4059
1522
1482
390
1427
1461
4035
3930
177
206
1572
184
1567
1588
193
165
217
4082
1439
3921
116
204
227
3952
4076
4037
4122
1506
4265
602
413
4009
1431

Unnamed: 0,mp,mp_id,text,debate,party
0,Ian Lucas (Wrexham) (Lab),1470,It is a pleasure to see you in the Chair this ...,CriminalJustice(NorthWales),Labour
1,The Parliamentary Under-Secretary of State for...,1496,I thank the hon. Member for Wrexham (Ian Lucas...,CriminalJustice(NorthWales),Conservative
2,Mr Vara,1496,The work that magistrates do in north Wales an...,CriminalJustice(NorthWales),Conservative
3,Ian Lucas,1470,"On that point, the difficulty is that steps we...",CriminalJustice(NorthWales),Labour
4,Mr Vara,1496,I appreciate where the hon. Gentleman is comin...,CriminalJustice(NorthWales),Conservative


In [17]:
df_data_full.to_csv('all_data_full.csv')

In [18]:
df_data_full.party.unique()

array(['Labour', 'Conservative', 'Independent', 'Liberal Democrat',
       'Speaker', 'Labour (Co-op)', 'Non-affiliated',
       'Scottish National Party', 'Social Democratic & Labour Party',
       'Change UK - The Independent Group', 'UK Independence Party',
       'Democratic Unionist Party', 'Plaid Cymru', 'Green Party',
       'Alliance', 'Respect', 'Ulster Unionist Party', nan], dtype=object)

In [None]:

parties = {'Labour':['Labour (Co-op)', 'Labour'], 'Conservative':['Conservative']}
for P_name, P_sub_names in parties.items():
    #df_data_full = pd.read_csv('hansard_2014_full.csv')
    df_data_party = df_data_full[df_data_full['party'].isin(P_sub_names)].reset_index()

    for i in tqdm(range(df_data_party.shape[0])):

        #file_labour_text = open(,"a+") 
        article_text =df_data_party.text[i]
        # Cleaing the text
        processed_article = article_text.lower()  
        processed_article = re.sub('[^a-zA-Z]', ' ', processed_article )  
        processed_article = re.sub(r'\s+', ' ', processed_article)

        # Preparing the dataset
        all_sentences = nltk.sent_tokenize(processed_article)

        all_words = [nltk.word_tokenize(sent) for sent in all_sentences]

        # Removing Stop Words
        from nltk.corpus import stopwords  
        for j in range(len(all_words)):  
            all_words[j] = [w for w in all_words[j] if w not in stopwords.words('english')]

        file_name = "{party}_texts/{party}_{num}".format(party=P_name, num=str(i))

        with open(file_name, 'wb+') as fp:
            pickle.dump(all_words, fp)


 17%|█▋        | 118903/683815 [41:30<1:35:23, 98.70it/s] IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 24%|██▍       | 165102/683815 [58:30<2:00:57, 71.48it/s] IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 31%|███       | 211071/683815 [1:15:12<1:23:53, 93.92it/s] IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variab

In [13]:
df_data_full = pd.read_csv('all_data_full.csv', index_col=0)
df_data_full.head()

  mask |= (ar1 == a)


Unnamed: 0,mp,mp_id,text,debate,party
0,Ian Lucas (Wrexham) (Lab),1470,It is a pleasure to see you in the Chair this ...,CriminalJustice(NorthWales),Labour
1,The Parliamentary Under-Secretary of State for...,1496,I thank the hon. Member for Wrexham (Ian Lucas...,CriminalJustice(NorthWales),Conservative
2,Mr Vara,1496,The work that magistrates do in north Wales an...,CriminalJustice(NorthWales),Conservative
3,Ian Lucas,1470,"On that point, the difficulty is that steps we...",CriminalJustice(NorthWales),Labour
4,Mr Vara,1496,I appreciate where the hon. Gentleman is comin...,CriminalJustice(NorthWales),Conservative


In [14]:

parties = {'Conservative':['Conservative']}
for P_name, P_sub_names in parties.items():
    df_data_party = df_data_full[df_data_full['party'].isin(P_sub_names)].reset_index()

    for i in tqdm(range(df_data_party.shape[0])):

        #file_labour_text = open(,"a+") 
        article_text =df_data_party.text[i]
        # Cleaing the text
        processed_article = article_text.lower()  
        processed_article = re.sub('[^a-zA-Z]', ' ', processed_article )  
        processed_article = re.sub(r'\s+', ' ', processed_article)

        # Preparing the dataset
        all_sentences = nltk.sent_tokenize(processed_article)

        all_words = [nltk.word_tokenize(sent) for sent in all_sentences]

        # Removing Stop Words
        from nltk.corpus import stopwords  
        for j in range(len(all_words)):  
            all_words[j] = [w for w in all_words[j] if w not in stopwords.words('english')]

        file_name = "{party}_texts/{party}_{num}".format(party=P_name, num=str(i))

        with open(file_name, 'wb+') as fp:
            pickle.dump(all_words, fp)

100%|██████████| 683815/683815 [4:01:11<00:00, 47.25it/s]   


In [15]:
output_filename = 'Conservative_texts'
dir_name = 'Conservative_texts'
shutil.make_archive(output_filename, 'zip', dir_name)

'/data/home/doron/notebooks/Conservative_texts.zip'

In [6]:
output_filename = 'Labour_texts'
dir_name = 'Labour_texts'
shutil.make_archive(output_filename, 'zip', dir_name)

'/data/home/doron/notebooks/Labour_texts.zip'