In [2]:
# Imports
import pandas as pd
import numpy as np
import os
from tqdm import tqdm, tqdm_pandas
import matplotlib as mpl
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import datetime as datetime

In [None]:
# Lets customize some of the parameters of the matplotlib library.
def setup_mpl():
    mpl.rcParams["font.family"] = "Times New Roman"
    mpl.rcParams["lines.linewidth"] = 1
setup_mpl()

### Basic stats. Let's understand the dataset better

##### Write about your choices in data cleaning and preprocessing

First step is to collect and represent the data in a meaningful datastructure that allows for further analysis. For this project all data is collected in a single Pandas dataframe.

The raw data itself is spread over six subfolder, one for each year. In these folders there are two files for each day. One that contains the transcripts themself and another file that contains the corresponding metadata. Each of these files have a ID column that allows to match the rows of the two data files.

In [None]:
# Create temp meta dataframe
ParlaMint_meta = pd.DataFrame()
rootdir = 'data/ParlaMint-GB.txt'
for subdir, dirs, files in tqdm(os.walk(rootdir)):
    for file in files:
        filename = os.fsdecode(file)
        if filename.endswith(".tsv"):
            #Create the dataframe for metadata
            temp_meta = pd.read_csv(os.path.join(subdir, file), sep = "\t", header = None, skiprows = 1) # Skip the header row
            ParlaMint_meta = pd.concat([ParlaMint_meta, temp_meta], axis = 0) # concat along columns

In [None]:
# Create temp text dataframe
ParlaMint_text = pd.DataFrame()

rootdir = 'data/ParlaMint-GB.txt'
for subdir, dirs, files in tqdm(os.walk(rootdir)):
    for file in files:
        #print(os.fsdecode(file))
        filename = os.fsdecode(file)
        if filename.endswith(".txt"):
            #Create the dataframe for text
            temp_text = pd.read_csv(os.path.join(subdir, file), sep = "\t", header = None)
            ParlaMint_text = pd.concat([ParlaMint_text, temp_text], axis = 0, ignore_index = True) # concat along columns

In [None]:
# Test relation between the two dataframes to ensure match in index.
idx_1 = ParlaMint_meta.iloc[:, 0].tolist()
idx_2 = ParlaMint_text.iloc[:, 0].tolist()
idx_1 == idx_2

Now the two dataframes are concatenated together with their column names.

In [None]:
ParlaMint = ParlaMint_meta.copy()
# Add text as new column
ParlaMint["Text"] = ParlaMint_text[1].values
# create rename dict
column_names = ['ID', 'Title', 'From', 'To', 'House', 'Term', 'Session', 'Meeting',
       'Sitting', 'Agenda', 'Subcorpus', 'Speaker_role', 'Speaker_type',
       'Speaker_party', 'Speaker_party_name', 'Party_status', 'Speaker_name',
       'Speaker_gender', 'Speaker_birth', "Text"]
new_column_names = [(i, item) for i, item in enumerate(column_names)]
new_column_names = {key: value for (key, value) in new_column_names}
ParlaMint.rename(columns = new_column_names, inplace = True)

In [None]:
# Save to csv
ParlaMint.to_csv("data/parlamint.csv")

##### Write a short section that discusses the dataset stats (here you can recycle the work you did for Project Assignment A)

In the following section a summary of the key values and characteristics are presented. 

First step is to load in the data that was collected in the previous steps.

In [3]:
column_names = ['ID', 'Title', 'From', 'To', 'House', 'Term', 'Session', 'Meeting',
       'Sitting', 'Agenda', 'Subcorpus', 'Speaker_role', 'Speaker_type',
       'Speaker_party', 'Speaker_party_name', 'Party_status', 'Speaker_name',
       'Speaker_gender', 'Speaker_birth', "Text"]
ParlaMint = pd.read_csv("data/parlamint.csv", index_col=False, usecols = column_names)
# Show head of the data
ParlaMint.head()

Unnamed: 0,ID,Title,From,To,House,Term,Session,Meeting,Sitting,Agenda,Subcorpus,Speaker_role,Speaker_type,Speaker_party,Speaker_party_name,Party_status,Speaker_name,Speaker_gender,Speaker_birth,Text
0,ParlaMint-GB_2015-01-05-commons.u1,"Minutes of the House of Commons, Daily Session...",2015-01-05,2015-01-05,Lower house,55,,,,,Reference,Regular,MP,LD,Liberal Democrat,Coalition,"Willott, Jennifer Nancy",F,-,1. What progress her Department has made on im...
1,ParlaMint-GB_2015-01-05-commons.u2,"Minutes of the House of Commons, Daily Session...",2015-01-05,2015-01-05,Lower house,55,,,,,Reference,Regular,MP,CON,Conservative,Coalition,"May, Theresa Mary",F,-,The Government are on track to deliver their c...
2,ParlaMint-GB_2015-01-05-commons.u3,"Minutes of the House of Commons, Daily Session...",2015-01-05,2015-01-05,Lower house,55,,,,,Reference,Regular,MP,LD,Liberal Democrat,Coalition,"Willott, Jennifer Nancy",F,-,"It is clear that exit checks, which were scrap..."
3,ParlaMint-GB_2015-01-05-commons.u4,"Minutes of the House of Commons, Daily Session...",2015-01-05,2015-01-05,Lower house,55,,,,,Reference,Regular,MP,CON,Conservative,Coalition,"May, Theresa Mary",F,-,"As I indicated in my original answer, we are o..."
4,ParlaMint-GB_2015-01-05-commons.u5,"Minutes of the House of Commons, Daily Session...",2015-01-05,2015-01-05,Lower house,55,,,,,Reference,Regular,MP,LAB,Labour,Opposition,"Cunningham, Alexander",M,-,19. Given the situation at our border in Calai...


Next step is exploration of the size of the dataset

In [4]:
MB_size = os.path.getsize("data/parlamint.csv") / 1024 / 1024 # Convert from byte to MB
print(f"The dataset is {MB_size:0.2f} MB.")
print(f"The dataset consists of {ParlaMint.shape[0]} datapoints each with a feature length of {ParlaMint.shape[1]}.")

The dataset is 676.02 MB.
The dataset consists of 552103 datapoints each with a feature length of 20.


Here each datapoints should be understood as a MP's statement, while the next row in the dataset is the following MP's/speakers statement.

Next step is to explore if there are any redunctant or dirty data that needs to be filter out before further analysis.

In [5]:
ParlaMint.count()

ID                    552103
Title                 552103
From                  552103
To                    552103
House                 552103
Term                  552103
Session                    0
Meeting                    0
Sitting                    0
Agenda                     0
Subcorpus             552103
Speaker_role          552103
Speaker_type          552103
Speaker_party         550489
Speaker_party_name    551961
Party_status          548885
Speaker_name          552103
Speaker_gender        552103
Speaker_birth         552103
Text                  552103
dtype: int64

From the counts of each columns it is seen that the columns Session, Meeting, Setting, Agenda only contains NaN-values and therefore can be excluded. 

In [9]:
np.unique(ParlaMint["Speaker_birth"].values)

array(['-'], dtype=object)

Here it is seen that the Speaker Birth column only contains "-" and not the actual birthday of the speaker. Therefore this column is also excluded. 

In [34]:
ParlaMint["Party_status"][ParlaMint["Party_status"] == "Coalition"]

0        Coalition
1        Coalition
2        Coalition
3        Coalition
5        Coalition
           ...    
27457    Coalition
27458    Coalition
27459    Coalition
27460    Coalition
27461    Coalition
Name: Party_status, Length: 17219, dtype: object