In [1]:
# Runs a project shell script before anything else in the notebook.
# NOTE(review): judging by its name it presumably mounts the raadsinformatie blob storage
# via blobfuse, and the absolute path only exists on the Azure machine — confirm.
!bash /home/azureuser/cloudfiles/code/blobfuse/blobfuse_raadsinformatie.sh

In [2]:
import os
import sys

# Put the parent directory on the import path; the project modules imported below
# are presumably located there (verify against the repository layout).
sys.path.append("..")

# MAKE SURE TO SET-UP PATH -> use local to run with demo data; use azure to run with complete dataset (access required)
# Select where to run notebook: "azure" or "local"
my_run = "azure"

import my_secrets as sc
import settings as st

# Pick the configuration module for the chosen environment.
# Fail fast on any other value: previously a typo in `my_run` left `cf` and
# `running_demo` undefined and only surfaced as a NameError in a later cell.
if my_run == "azure":
    import config_azure as cf
    running_demo = False
elif my_run == "local":
    import config as cf
    running_demo = True
else:
    raise ValueError(f'my_run must be "azure" or "local", got {my_run!r}')

if my_run == "azure":
    # exist_ok=True replaces the racy exists()/mkdir() pair; makedirs also creates
    # missing parent directories instead of raising FileNotFoundError.
    os.makedirs(cf.HUGGING_CACHE, exist_ok=True)
    # Point the Hugging Face transformers cache at the configured directory.
    os.environ["TRANSFORMERS_CACHE"] = cf.HUGGING_CACHE

import pandas as pd


## Notebook overview
Goal: manually check some of the classes, especially documents that have a very low token count.
- Starting point: txtfiles_notcleaned.pkl
- Ending point: txtfiles.pkl -> only including 'good' documents

We found that there are documents with very low token count. Thus we further checked for each class how clean the dataset was. This led to the complete removal of three classes and the removal of some documents based on their length. 

In this notebook:
1. We will first display the token distribution to highlight the imbalance. 
2. Then, we will provide the code to remove the faulty documents.
3. After, we also remove duplicate documents. 
4. Then, we remove unnecessary columns. We don't need the tokens, token_count, clean_tokens, clean_tokens_count columns anymore; those were used for analysis of the raw data.
5. Finally, we will explain the reasons for the removal of the faulty documents, if applicable, for each class. The explanation is put at the end of the notebook since it takes up a lot of cells. 

Previous notebook: load_txt.ipynb

Next notebook: duplicates.ipynb

### 1. Check document length
We load in the unclean dataset and look into the document length. We can see that some documents are unusually short. 

In [3]:
# Load the uncleaned dataset produced by the previous notebook.
import ast
import pandas as pd

raw_pickle_path = f"{cf.output_path}/txtfiles_notcleaned.pkl"
txtfiles = pd.read_pickle(raw_pickle_path)
# Keep an untouched copy of the raw data; the per-class checks at the end of the notebook use `df`.
df = txtfiles.copy()

# Per-label summary statistics, first for the raw token counts and then for the
# cleaned token counts (stopwords, punctuation etc. removed; very simple cleaning).
for count_column in ('token_count', 'clean_tokens_count'):
    display(txtfiles.groupby('label')[count_column].describe())


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Actualiteit,996.0,696.773092,3462.293848,72.0,236.0,344.5,592.0,103700.0
Adviesaanvraag,2442.0,1644.871417,2993.302964,57.0,557.0,955.0,1672.0,77769.0
Agenda,3537.0,1048.904156,4244.783305,20.0,325.0,616.0,939.0,139648.0
Amendement,1969.0,2658.493652,7044.47041,30.0,236.0,359.0,1184.0,62304.0
Begroting,1967.0,13160.190646,46780.793626,40.0,248.0,385.0,2324.0,247184.0
Besluit,775.0,986.649032,1749.0065,70.0,168.5,439.0,1216.0,26523.0
Brief,1995.0,1764.259649,1728.612069,3.0,734.0,1269.0,2291.5,32957.0
Factsheets,234.0,6008.987179,14742.837685,112.0,1154.0,2753.5,5462.0,171297.0
Motie,8336.0,521.707893,1129.571368,105.0,234.0,292.5,402.0,36091.0
Onderzoeksrapport,1286.0,15329.244168,18086.586572,233.0,5896.25,10999.0,19127.5,275597.0


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Actualiteit,996.0,360.197791,1833.846209,39.0,124.0,175.0,311.25,55400.0
Adviesaanvraag,2442.0,825.560197,1517.594501,33.0,287.0,486.0,818.0,39865.0
Agenda,3537.0,559.212327,2187.277067,17.0,180.0,337.0,515.0,72471.0
Amendement,1969.0,1274.853225,3332.397979,15.0,118.0,179.0,595.0,29968.0
Begroting,1967.0,7290.902898,26004.241417,24.0,125.0,203.0,1179.0,137501.0
Besluit,775.0,513.227097,889.155749,50.0,95.5,230.0,657.0,13651.0
Brief,1995.0,881.689724,954.436225,3.0,369.0,636.0,1134.0,21285.0
Factsheets,234.0,3347.893162,9863.108583,60.0,634.75,1527.0,2937.0,131816.0
Motie,8336.0,264.074976,584.938299,60.0,120.0,149.0,204.0,21285.0
Onderzoeksrapport,1286.0,7970.667963,9994.442762,162.0,3049.25,5631.5,9927.25,143782.0


### 2. Remove the faulty documents.
Each class is checked in section 5 at the end of this notebook, which also explains why these documents are removed.

The removal includes:
- classes: 'Adviesaanvraag', 'Amendement', 'Begroting'
- some documents that are gibberish


In [4]:
# Always start from `df`, the untouched copy of the raw data made right after loading.
# Filtering `txtfiles` in place made this cell non-idempotent: on a re-run the 1% quantile
# below was recomputed on already-trimmed data and removed yet another batch of documents.
print("Original amount of docs:", len(df))

# remove Adviesaanvraag, Amendement and Begroting. These classes include docs from other classes.
txtfiles = df.loc[~df['label'].isin(['Adviesaanvraag', 'Amendement', 'Begroting'])]

# remove docs that are too short (only include gibberish)
# Schriftelijke Vragen: everything strictly below the class' own 1% quantile of clean tokens.
subdf = txtfiles.loc[txtfiles['label'] == 'Schriftelijke Vragen']
threshold = subdf['clean_tokens_count'].quantile(0.01)
# Build the mask from `subdf` itself (not from `txtfiles`) so the boolean indexer has
# exactly the same index as the frame it filters; no sorting is needed just to collect ids.
subdf = subdf.loc[subdf['clean_tokens_count'] < threshold]
txtfiles = txtfiles.loc[~txtfiles['id'].isin(subdf['id'])]

# Brief: fewer than 75 clean tokens.
subdf = txtfiles.loc[(txtfiles['label'] == 'Brief') & (txtfiles['clean_tokens_count'] < 75)]
txtfiles = txtfiles.loc[~txtfiles['id'].isin(subdf['id'])]

# Raadsadres: fewer than 25 clean tokens.
subdf = txtfiles.loc[(txtfiles['label'] == 'Raadsadres') & (txtfiles['clean_tokens_count'] < 25)]
txtfiles = txtfiles.loc[~txtfiles['id'].isin(subdf['id'])]

# Rename the label value 'Verslag' to 'Raadsnotulen' (a value in the 'label' column, not a column).
# Take an explicit copy first so the assignment does not write into a filtered slice.
txtfiles = txtfiles.copy()
txtfiles.loc[txtfiles['label'] == 'Verslag', 'label'] = 'Raadsnotulen'


print("Amount of docs after removal:", len(txtfiles))

Original amount of docs: 33117
Amount of docs after removal: 26704


### 3. Remove duplicates

goal: check data for duplicates

- Use md5 hashing to remove duplicates. 
- Data is re-split into subsets, using 2-split (train and test), 4-split (train, test, val and dev) and balance-split (train, test and val; used for research)

In [5]:
import hashlib
import sys

# load function to split data into subsets (train,test,val and dev)
sys.path.append('../src/') 
from data_split import save_split, save_balanced_split

def calculate_md5(file_path):
    """Return the hexadecimal MD5 digest of the file at ``file_path``.

    The file is opened in binary mode and fed to the hash in 4096-byte
    chunks, so it is never read into memory as a whole.
    """
    digest = hashlib.md5()
    with open(file_path, "rb") as handle:
        while True:
            block = handle.read(4096)
            if not block:
                # An empty read means end of file.
                break
            digest.update(block)
    return digest.hexdigest()

def drop_duplicates(df, file_path_column):
    """Drop rows whose file content duplicates that of an earlier row.

    An MD5 hash is calculated (via ``calculate_md5``) for each file path in
    ``df[file_path_column]`` and stored in an ``md5_hash`` column; of all rows
    sharing a hash only the first one is kept. The number of removed rows and
    the new total are printed.

    The input frame is left untouched: the hash column is added to a new frame
    instead of being written into ``df``, which may itself be a filtered slice.

    Parameters:
        df: DataFrame with one row per document.
        file_path_column: name of the column holding the path of each file to hash.

    Returns:
        A new DataFrame with the ``md5_hash`` column and without duplicate hashes.
    """
    hashed_df = df.assign(md5_hash=df[file_path_column].apply(calculate_md5))

    # remove rows with duplicate md5_hash (the first occurrence is kept)
    clean_df = hashed_df.drop_duplicates(subset=['md5_hash'])
    print(f"{len(df)-len(clean_df)} docs removed. New total: {len(clean_df)} docs.")
    return clean_df

def redo_datasplit(df):
    """Re-split the data after duplicates and faulty documents were removed.

    ``df`` is passed through ``save_split`` and the result through
    ``save_balanced_split`` (both imported from ``data_split``); the
    notebook-level ``running_demo`` flag is forwarded as ``demo``.

    Returns:
        Whatever ``save_balanced_split`` returns.
        NOTE(review): presumably the frame with the split columns added — confirm in data_split.
    """
    return save_balanced_split(save_split(df), demo=running_demo)

# Hash the file behind every entry of the 'path' column and drop rows with a duplicate hash.
hash_df = drop_duplicates(txtfiles, 'path')
# Redo the data splits on the de-duplicated frame (see redo_datasplit).
cleaned_df = redo_datasplit(hash_df)


5886 docs removed. New total: 20818 docs.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  remaining_df['balanced_split'] = 'discard'


### 4. Remove unnecessary columns & save cleaned df

In [6]:
# Columns that were only needed for analysing the raw data.
analysis_columns = ['tokens', 'token_count', 'clean_tokens', 'clean_tokens_count']

print(f"Columns before removing: {list(cleaned_df.columns)}")
# errors="ignore" keeps this cell re-runnable: `cleaned_df` is reassigned here, so on a
# second run the columns are already gone and a plain drop would raise KeyError.
cleaned_df = cleaned_df.drop(columns=analysis_columns, errors="ignore")
print(f"Columns after removing: {list(cleaned_df.columns)}")


Columns before removing: ['label', 'path', 'id', 'text', 'tokens', 'token_count', 'clean_tokens', 'clean_tokens_count', 'pdf_path', 'num_pages', 'md5_hash', '4split', '2split', 'balanced_split']
Columns after removing: ['label', 'path', 'id', 'text', 'pdf_path', 'num_pages', 'md5_hash', '4split', '2split', 'balanced_split']


In [7]:
# Persist the cleaned, de-duplicated and re-split frame in the configured output folder.
cleaned_pickle_path = f"{cf.output_path}/txtfiles.pkl"
cleaned_df.to_pickle(cleaned_pickle_path)

### 5. Explanation: Check each class

NOTE: this analysis is run on 'df', which is the uncleaned dataframe.

**Actualiteit**

The documents in the lowest 1 percent by number of tokens (10 documents, which have fewer than 58 tokens) were checked against the PDFs. These documents are simply very short; this is not a mistake of the txt extraction. 

In [36]:
# Manual length check for 'Actualiteit' on the raw frame `df`.
actualiteit = df[df['label'] == 'Actualiteit']
# Cut-off: 1% quantile of the cleaned token count within this class.
threshold = actualiteit['clean_tokens_count'].quantile(q=0.01)
# Despite the variable name, this holds the rows strictly below that 1% cut-off (unsorted).
lower_5_percentile_rows = actualiteit.loc[actualiteit['clean_tokens_count'].lt(threshold)]
# actualiteit['clean_tokens_count'].describe()


**Adviesaanvraag**

Txt extraction from PDFs went well, except for images: the images result in gibberish in the data. Additionally, the class does not seem to contain only adviesaanvragen. 

In [37]:
# Manual length check for 'Adviesaanvraag' on the raw frame `df`.
adviesaanvraag = df[df['label'] == 'Adviesaanvraag']
# Cut-off: 1% quantile of the cleaned token count within this class.
threshold = adviesaanvraag['clean_tokens_count'].quantile(q=0.01)
# Despite the variable name, this holds the rows strictly below that 1% cut-off, shortest first.
lower_5_percentile_rows = adviesaanvraag[adviesaanvraag['clean_tokens_count'].lt(threshold)].sort_values('clean_tokens_count')
# print(len(lower_5_percentile_rows))

# for index, row in lower_5_percentile_rows.iterrows():
#     print(row['clean_tokens_count'], row['path'])
#     print(row['text'])
#     print('\n\n')

# adviesaanvraag['clean_tokens_count'].describe()


**Agenda**

Agenda's can just be very short. Txt extraction went well.

In [38]:
# Manual length check for 'Agenda' on the raw frame `df`.
subdf = df[df['label'] == 'Agenda']
# Cut-off: 1% quantile of the cleaned token count within this class.
threshold = subdf['clean_tokens_count'].quantile(q=0.01)
# Despite the variable name, this holds the rows strictly below that 1% cut-off, shortest first.
lower_5_percentile_rows = subdf[subdf['clean_tokens_count'].lt(threshold)].sort_values('clean_tokens_count')
# print(len(lower_5_percentile_rows))

# for index, row in lower_5_percentile_rows.iterrows():
#     print(row['clean_tokens_count'], row['path'])
#     print(row['text'])
#     print('\n\n')

# subdf['clean_tokens_count'].describe()


**Amendement**

The class does not contain only amendement docs. Txt extraction went well.

In [39]:
# Manual length check for 'Amendement' on the raw frame `df`.
subdf = df[df['label'] == 'Amendement']
# Cut-off: 1% quantile of the cleaned token count within this class.
threshold = subdf['clean_tokens_count'].quantile(q=0.01)
# Despite the variable name, this holds the rows strictly below that 1% cut-off, shortest first.
lower_5_percentile_rows = subdf[subdf['clean_tokens_count'].lt(threshold)].sort_values('clean_tokens_count')

# print(len(lower_5_percentile_rows))

# for index, row in lower_5_percentile_rows.iterrows():
#     print(row['clean_tokens_count'], row['path'])
#     print(row['text'])
#     print('\n\n')

# subdf['clean_tokens_count'].describe()


**Begroting**

The class does not seem to contain only begrotingen. Txt extraction went well, except for tables (file:///C:/Users/femke/Documents/MasterThesis/discardfiles/1543430.pdf).

In [40]:
# Manual length check for 'Begroting' on the raw frame `df`.
subdf = df[df['label'] == 'Begroting']
# Cut-off: 1% quantile of the cleaned token count within this class.
threshold = subdf['clean_tokens_count'].quantile(q=0.01)
# Despite the variable name, this holds the rows strictly below that 1% cut-off, shortest first.
lower_5_percentile_rows = subdf[subdf['clean_tokens_count'].lt(threshold)].sort_values('clean_tokens_count')
# print(len(lower_5_percentile_rows))

# for index, row in lower_5_percentile_rows.iterrows():
#     print(row['clean_tokens_count'], row['path'])
#     print(row['text'])
#     print('\n\n')

# subdf['clean_tokens_count'].describe()


**Besluit**

Looks good

In [41]:
# Manual length check for 'Besluit' on the raw frame `df`.
subdf = df[df['label'] == 'Besluit']
# Cut-off: 1% quantile of the cleaned token count within this class.
threshold = subdf['clean_tokens_count'].quantile(q=0.01)
# Despite the variable name, this holds the rows strictly below that 1% cut-off, shortest first.
lower_5_percentile_rows = subdf[subdf['clean_tokens_count'].lt(threshold)].sort_values('clean_tokens_count')
# print(len(lower_5_percentile_rows))

# for index, row in lower_5_percentile_rows.iterrows():
#     print(row['clean_tokens_count'], row['path'])
#     print(row['text'])
#     print('\n\n')

# subdf['clean_tokens_count'].describe()


**Brief**

Remove docs with fewer than 75 clean tokens (`clean_tokens_count`); these are some odd posters. The rest looks good.

In [42]:
# Manual length check for 'Brief' on the raw frame `df`.
subdf = df[df['label'] == 'Brief']
# Cut-off: 1% quantile of the cleaned token count within this class.
threshold = subdf['clean_tokens_count'].quantile(q=0.01)
# Despite the variable name, this holds the rows strictly below that 1% cut-off, shortest first.
lower_5_percentile_rows = subdf[subdf['clean_tokens_count'].lt(threshold)].sort_values('clean_tokens_count')
# print(len(lower_5_percentile_rows))

# for index, row in lower_5_percentile_rows.iterrows():
#     print(row['clean_tokens_count'], row['path'])
#     print(row['text'])
#     print('\n\n')

# subdf['clean_tokens_count'].describe()


**Factsheets**

Lots of posters. Lots of images included

In [43]:
# Manual length check for 'Factsheets' on the raw frame `df`.
subdf = df[df['label'] == 'Factsheets']
# Cut-off: 10% quantile of the cleaned token count within this class
# (note: this cell uses 0.1, the other class checks use 0.01).
threshold = subdf['clean_tokens_count'].quantile(q=0.1)
# Despite the variable name, this holds the rows strictly below that 10% cut-off, shortest first.
lower_5_percentile_rows = subdf[subdf['clean_tokens_count'].lt(threshold)].sort_values('clean_tokens_count')
# print(len(lower_5_percentile_rows))

# for index, row in lower_5_percentile_rows.iterrows():
#     print(row['clean_tokens_count'], row['path'])
#     print(row['text'])
#     print('\n\n')

# subdf['clean_tokens_count'].describe()


**Motie**

I can't see the original PDFs. The extracted text looks good, though.

In [44]:
# Manual length check for 'Motie' on the raw frame `df`.
subdf = df[df['label'] == 'Motie']
# Cut-off: 1% quantile of the cleaned token count within this class.
threshold = subdf['clean_tokens_count'].quantile(q=0.01)
# Despite the variable name, this holds the rows strictly below that 1% cut-off, shortest first.
lower_5_percentile_rows = subdf[subdf['clean_tokens_count'].lt(threshold)].sort_values('clean_tokens_count')
# print(len(lower_5_percentile_rows))

# for index, row in lower_5_percentile_rows.iterrows():
#     print(row['clean_tokens_count'], row['path'])
#     print(row['text'])
#     print('\n\n')

# subdf['clean_tokens_count'].describe()


**Onderzoeksrapport**   
   
Include posters, presentations? (/home/azureuser/cloudfiles/code/blobfuse/raadsinformatie/OpenResearch/onderzoek-in-de-gemeenteraad/onderzoeken-rapporten-2019-gemeenteraad/5g-technische-sessie.pdf.ocr
)


Includes images.

In [45]:
# Manual length check for 'Onderzoeksrapport' on the raw frame `df`.
subdf = df[df['label'] == 'Onderzoeksrapport']
# Cut-off: 1% quantile of the cleaned token count within this class.
threshold = subdf['clean_tokens_count'].quantile(q=0.01)
# Despite the variable name, this holds the rows strictly below that 1% cut-off, shortest first.
lower_5_percentile_rows = subdf[subdf['clean_tokens_count'].lt(threshold)].sort_values('clean_tokens_count')
# print(len(lower_5_percentile_rows))

# for index, row in lower_5_percentile_rows.iterrows():
#     print(row['clean_tokens_count'], row['path'])
#     print(row['text'])
#     print('\n\n')

# subdf['clean_tokens_count'].describe()


**Raadsadres**

Remove docs with fewer than 25 clean tokens (`clean_tokens_count`). The rest looks good.

In [46]:
# Manual length check for 'Raadsadres' on the raw frame `df`.
subdf = df[df['label'] == 'Raadsadres']
# Cut-off: 1% quantile of the cleaned token count within this class.
threshold = subdf['clean_tokens_count'].quantile(q=0.01)
# Despite the variable name, this holds the rows strictly below that 1% cut-off, shortest first.
lower_5_percentile_rows = subdf[subdf['clean_tokens_count'].lt(threshold)].sort_values('clean_tokens_count')
# print(len(lower_5_percentile_rows))

# for index, row in lower_5_percentile_rows.iterrows():
#     print(row['clean_tokens_count'], row['path'])
#     print(row['text'])
#     print('\n\n')

# subdf['clean_tokens_count'].describe()


**Schriftelijke Vragen**

Can't open the PDFs. The shortest documents look like a mess; the middle of the distribution looks much better. Documents below the 0.01 quantile of clean_tokens_count should be removed.

In [47]:
# Manual length check for 'Schriftelijke Vragen' on the raw frame `df`.
subdf = df[df['label'] == 'Schriftelijke Vragen']
# Cut-off: 1% quantile of the cleaned token count within this class.
threshold = subdf['clean_tokens_count'].quantile(q=0.01)
# threshold2 = subdf['clean_tokens_count'].quantile(0.02)

# Despite the variable name, this holds the rows strictly below that 1% cut-off, shortest first.
lower_5_percentile_rows = subdf[subdf['clean_tokens_count'].lt(threshold)].sort_values('clean_tokens_count')
# lower_5_percentile_rows = subdf[(subdf['clean_tokens_count'] < threshold2) & (subdf['clean_tokens_count']>threshold)].sort_values(by=['clean_tokens_count'])

# print(len(lower_5_percentile_rows))

# for index, row in lower_5_percentile_rows.iterrows():
#     print(row['clean_tokens_count'], row['path'])
#     print(row['text'])
#     print('\n\n')

# subdf['clean_tokens_count'].describe()



**Termijnagenda**

Does include many tables.
Should be combined with agenda.

In [48]:
# Manual length check for 'Termijnagenda' on the raw frame `df`.
subdf = df[df['label'] == 'Termijnagenda']
# Cut-off: 1% quantile of the cleaned token count within this class.
threshold = subdf['clean_tokens_count'].quantile(q=0.01)
# Despite the variable name, this holds the rows strictly below that 1% cut-off, shortest first.
lower_5_percentile_rows = subdf[subdf['clean_tokens_count'].lt(threshold)].sort_values('clean_tokens_count')
# print(len(lower_5_percentile_rows))

# for index, row in lower_5_percentile_rows.iterrows():
#     print(row['clean_tokens_count'], row['path'])
#     print(row['text'])
#     print('\n\n')

# subdf['clean_tokens_count'].describe()


**Verslag**

Should be renamed to Raadsnotulen. Looks good.

In [51]:
# Manual length check for 'Verslag' on the raw frame `df`.
subdf = df[df['label'] == 'Verslag']
# Cut-off: 1% quantile of the cleaned token count within this class.
threshold = subdf['clean_tokens_count'].quantile(q=0.01)
# Despite the variable name, this holds the rows strictly below that 1% cut-off, shortest first.
lower_5_percentile_rows = subdf[subdf['clean_tokens_count'].lt(threshold)].sort_values('clean_tokens_count')
# print(len(lower_5_percentile_rows))

# for index, row in lower_5_percentile_rows.iterrows():
#     print(row['clean_tokens_count'], row['path'])
#     print(row['text'])
#     print('\n\n')

# subdf['clean_tokens_count'].describe()


**Voordracht**

Looks good

In [54]:
# Manual length check for 'Voordracht' on the raw frame `df`.
subdf = df[df['label'] == 'Voordracht']
# Cut-off: 1% quantile of the cleaned token count within this class.
threshold = subdf['clean_tokens_count'].quantile(q=0.01)
# Despite the variable name, this holds the rows strictly below that 1% cut-off, shortest first.
lower_5_percentile_rows = subdf[subdf['clean_tokens_count'].lt(threshold)].sort_values('clean_tokens_count')
# print(len(lower_5_percentile_rows))

# for index, row in lower_5_percentile_rows.iterrows():
#     print(row['clean_tokens_count'], row['path'])
#     print(row['text'])
#     print('\n\n')

# subdf['clean_tokens_count'].describe()
