In [50]:
"""
This note book cleans the data for the network visuslisation.
"""
import pandas as pd
import numpy as np
import os

In [51]:

# Specify the path to the folder containing the CSV files
folder_path = 'data/Preprocessed_Facet_Datasets'

# Get the list of files in the folder
file_list = os.listdir(folder_path)

# Initialize an empty DataFrame to store the merged data
merged_archival_data = pd.DataFrame()

# Loop through the files and merge them
for file_name in file_list:
    if file_name.endswith('.csv'):
        file_path = os.path.join(folder_path, file_name)
        df = pd.read_csv(file_path)
        merged_archival_data = pd.concat([merged_archival_data, df], ignore_index=True)

#print shape of merged data
print(merged_archival_data.shape)

(14248, 11)


In [52]:
#use this one! just fix it!!! Fix duplicate records
drop_dupes_df = merged_archival_data[merged_archival_data['PERSON'] != merged_archival_data['cleaned_author_name']]
print(drop_dupes_df.shape)

(13347, 11)


In [53]:
#remove rows where there is just one word in the PERSON column. For example, 'Porter' and retain any entries with a space eg. "M. Pitt" or "Marie Cowan"
# Remove rows where there is just one word in the PERSON column.
# Retain entries with a space in the PERSON column, such as "M. Pitt" or "Marie Cowan".
try:
    drop_singles_df = drop_dupes_df[drop_dupes_df['PERSON'].str.contains(' ', na=False)]
except KeyError as e:
    print("Error: 'PERSON' column not found in the DataFrame.")
except AttributeError as e:
    print("Error: 'PERSON' column is not a string type.")

print(drop_singles_df.shape)

(9529, 11)


In [55]:
#if the 'Bib ID' column and the 'PERSON' column are duplicated, remove the duplicate row
#we don't care about if the same name is mentioned in the same catalogue record.

try:
    drop_person_dupes_df = drop_singles_df.drop_duplicates(subset=['Bib ID', 'PERSON'], keep='first')
except KeyError as e:
    print("Error: 'Bib ID' or 'PERSON' column not found in the DataFrame.")

print(drop_person_dupes_df.shape)

(6625, 11)


In [56]:
print("there are {} entries in the final dataframe".format(drop_person_dupes_df.shape[0]))
print("there are {} individual archives from the NLA in the final dataframe".format(drop_person_dupes_df['Bib ID'].nunique()))
print("there are {} different individuals mentioned in the catalogue meta-data for these archives in the final dataframe".format(drop_person_dupes_df['PERSON'].nunique()))

there are 6625 entries in the final dataframe
there are 1106 individual archives from the NLA in the final dataframe
there are 4776 different individuals mentioned in the catalogue meta-data for these archives in the final dataframe


In [78]:
#iterate through items in 'archival_extent' column and convert anything that is in cm to metres


#create new column called 'archival_extent_metres' that is a copy of the 'archival_extent' column
drop_person_dupes_df['archival_extent_metres'] = drop_person_dupes_df['archival_extent']

#itterate through the rows in the 'archival_extent_metres' column and convert anything that is in cm to metres
for index, row in drop_person_dupes_df.iterrows():
    if 'cm' in row['archival_extent_metres']:
        #split on ' cm' and take the first item in the list
        extent = (row['archival_extent_metres'].split(' cm')[0])
        #if extent contains letters (eg. '1 box') then set the value to 0
        if any(c.isalpha() for c in extent):
            drop_person_dupes_df.loc[index, 'archival_extent_metres'] = 0
        else:
            #convert to float
            extent = float(extent)
            #if extent is less than 1 then retain the value
            if extent < 1:
                drop_person_dupes_df.loc[index, 'archival_extent_metres'] = extent
            else:
                #convert to metres
                drop_person_dupes_df.loc[index, 'archival_extent_metres'] = extent/100
    elif 'm' in row['archival_extent_metres']:
        #split on ' m' and take the first item in the list
        extent = (row['archival_extent_metres'].split(' m')[0])
        #if extent contains letters (eg. '1 box') then set the value to 0
        if any(c.isalpha() for c in extent):
            drop_person_dupes_df.loc[index, 'archival_extent_metres'] = 0
        else:
            #convert to float and set the value
            drop_person_dupes_df.loc[index, 'archival_extent_metres'] = float(extent)
    else:
        #if any extent is not a float then set the value to 0
        if any(c.isalpha() for c in row['archival_extent_metres']):
            drop_person_dupes_df.loc[index, 'archival_extent_metres'] = 0
        pass




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  drop_person_dupes_df['archival_extent_metres'] = drop_person_dupes_df['archival_extent']


In [79]:
#drop rows where the 'archival_extent_metres' column is 0
final_df = drop_person_dupes_df[drop_person_dupes_df['archival_extent_metres'] != 0]

In [80]:
print("there are {} entries in the final dataframe".format(final_df.shape[0]))
print("there are {} individual archives from the NLA in the final dataframe".format(final_df['Bib ID'].nunique()))
print("there are {} different individuals mentioned in the catalogue meta-data for these archives in the final dataframe".format(final_df['PERSON'].nunique()))

there are 6094 entries in the final dataframe
there are 967 individual archives from the NLA in the final dataframe
there are 4444 different individuals mentioned in the catalogue meta-data for these archives in the final dataframe


In [81]:
#save the final dataframe to a csv file and a pickle file 

final_df.to_csv('data/final/processed_df.csv', index=False)
final_df.to_pickle('data/final/processed_df.pkl')