# Part III: Data repairing with imputation
## Exrcise 6: Do it from scratch at home and upload it on Moodle
Now you have new datasets ”Olympics.csv” and ”flicker.csv” available on Moodle. You have to define your own
strategy to clean these data and comment your notebook at each step.


In [1]:
# Load the datasets
import pandas as pd

flickers_df = pd.read_csv('data/flicker.csv')
olympics_df = pd.read_csv('data/olympics.csv')

# Inspect Flicker dataset
print("Flicker Dataset Info:")
flickers_df.info()

# Check for missing values in the Flicker dataset
print("\nMissing values in Flicker Dataset:")
print(flickers_df.isnull().sum())

# Inspect Olympics dataset
print("\nOlympics Dataset Info:")
olympics_df.info()

# Check for missing values in the Olympics dataset
print("\nMissing values in Olympics Dataset:")
print(olympics_df.isnull().sum())


Flicker Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8287 entries, 0 to 8286
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Identifier              8287 non-null   int64  
 1   Edition Statement       773 non-null    object 
 2   Place of Publication    8287 non-null   object 
 3   Date of Publication     8106 non-null   object 
 4   Publisher               4092 non-null   object 
 5   Title                   8287 non-null   object 
 6   Author                  6509 non-null   object 
 7   Contributors            8287 non-null   object 
 8   Corporate Author        0 non-null      float64
 9   Corporate Contributors  0 non-null      float64
 10  Former owner            1 non-null      object 
 11  Engraver                0 non-null      float64
 12  Issuance type           8287 non-null   object 
 13  Flickr URL              8287 non-null   object 
 14  Shelfmarks        

In [2]:
# Cleaning the Flicker Dataset
# Dropping irrelevant or empty columns
flickers_cleaned_df = flickers_df.drop(columns=['Corporate Author', 'Corporate Contributors', 'Former owner', 'Engraver'])

# Filling missing 'Edition Statement' with 'Unknown'
flickers_cleaned_df['Edition Statement'].fillna('Unknown', inplace=True)

# Cleaning the Olympics Dataset
# Renaming columns for clarity
olympics_df.columns = [
    'Country', 'Summer Games', 'Summer Gold', 'Summer Silver', 'Summer Bronze', 'Total Summer Medals', 
    'Winter Games', 'Winter Gold', 'Winter Silver', 'Winter Bronze', 'Total Winter Medals', 
    'Games Total', 'Gold Total', 'Silver Total', 'Bronze Total', 'Combined Total'
]

# Removing the first row (header descriptions)
olympics_cleaned_df = olympics_df.drop(index=0)

# Converting numeric columns to correct types
numeric_columns = [
    'Summer Games', 'Summer Gold', 'Summer Silver', 'Summer Bronze', 'Total Summer Medals', 
    'Winter Games', 'Winter Gold', 'Winter Silver', 'Winter Bronze', 'Total Winter Medals', 
    'Games Total', 'Gold Total', 'Silver Total', 'Bronze Total', 'Combined Total'
]

olympics_cleaned_df[numeric_columns] = olympics_cleaned_df[numeric_columns].apply(pd.to_numeric, errors='coerce')

# Checking cleaned data
print("Flicker Dataset Info After Cleaning:")
flickers_cleaned_df.info()

print("\nOlympics Dataset Info After Cleaning:")
olympics_cleaned_df.info()


Flicker Dataset Info After Cleaning:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8287 entries, 0 to 8286
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Identifier            8287 non-null   int64 
 1   Edition Statement     8287 non-null   object
 2   Place of Publication  8287 non-null   object
 3   Date of Publication   8106 non-null   object
 4   Publisher             4092 non-null   object
 5   Title                 8287 non-null   object
 6   Author                6509 non-null   object
 7   Contributors          8287 non-null   object
 8   Issuance type         8287 non-null   object
 9   Flickr URL            8287 non-null   object
 10  Shelfmarks            8287 non-null   object
dtypes: int64(1), object(10)
memory usage: 712.3+ KB

Olympics Dataset Info After Cleaning:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 147 entries, 1 to 147
Data columns (total 16 columns):
 #   Colu

In [3]:
# Handling missing data in Flicker Dataset
flickers_cleaned_df['Publisher'].fillna('Unknown Publisher', inplace=True)
flickers_cleaned_df['Author'].fillna('Unknown Author', inplace=True)
flickers_cleaned_df['Date of Publication'].fillna('Unknown Year', inplace=True)

# Olympics Dataset doesn't have missing values, so no action required.

# Checking final state after handling missing data
print("Final Flicker Dataset Info After Handling Missing Data:")
flickers_cleaned_df.info()


Final Flicker Dataset Info After Handling Missing Data:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8287 entries, 0 to 8286
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Identifier            8287 non-null   int64 
 1   Edition Statement     8287 non-null   object
 2   Place of Publication  8287 non-null   object
 3   Date of Publication   8287 non-null   object
 4   Publisher             8287 non-null   object
 5   Title                 8287 non-null   object
 6   Author                8287 non-null   object
 7   Contributors          8287 non-null   object
 8   Issuance type         8287 non-null   object
 9   Flickr URL            8287 non-null   object
 10  Shelfmarks            8287 non-null   object
dtypes: int64(1), object(10)
memory usage: 712.3+ KB


In [4]:
# Exercise 4: Normalizing and Formatting Data

# 1. For the Flicker Dataset
# Normalize 'Date of Publication' (extract year and clean inconsistent data)
# Example: Removing non-numeric characters and retaining the year only
flickers_cleaned_df['Date of Publication'] = flickers_cleaned_df['Date of Publication'].str.extract(r'(\d{4})')

# Strip any leading/trailing whitespaces in 'Author', 'Publisher', and 'Place of Publication'
flickers_cleaned_df['Author'] = flickers_cleaned_df['Author'].str.strip()
flickers_cleaned_df['Publisher'] = flickers_cleaned_df['Publisher'].str.strip()
flickers_cleaned_df['Place of Publication'] = flickers_cleaned_df['Place of Publication'].str.strip()

# 2. For the Olympics Dataset
# Standardizing country names if needed (strip leading/trailing spaces, handle inconsistencies)
olympics_cleaned_df['Country'] = olympics_cleaned_df['Country'].str.strip()

# Verifying the unique country names to detect inconsistencies
print("Unique Country Names in Olympics Dataset:")
print(olympics_cleaned_df['Country'].unique())

# Display the cleaned columns for checking
print("\nSample Flicker Dataset (After Normalization):")
print(flickers_cleaned_df[['Date of Publication', 'Author', 'Publisher', 'Place of Publication']].head())

print("\nSample Olympics Dataset (After Normalization):")
print(olympics_cleaned_df[['Country']].head())


Unique Country Names in Olympics Dataset:
['Afghanistan\xa0(AFG)' 'Algeria\xa0(ALG)' 'Argentina\xa0(ARG)'
 'Armenia\xa0(ARM)' 'Australasia\xa0(ANZ) [ANZ]'
 'Australia\xa0(AUS) [AUS] [Z]' 'Austria\xa0(AUT)' 'Azerbaijan\xa0(AZE)'
 'Bahamas\xa0(BAH)' 'Bahrain\xa0(BRN)' 'Barbados\xa0(BAR) [BAR]'
 'Belarus\xa0(BLR)' 'Belgium\xa0(BEL)' 'Bermuda\xa0(BER)'
 'Bohemia\xa0(BOH) [BOH] [Z]' 'Botswana\xa0(BOT)' 'Brazil\xa0(BRA)'
 'British West Indies\xa0(BWI) [BWI]' 'Bulgaria\xa0(BUL) [H]'
 'Burundi\xa0(BDI)' 'Cameroon\xa0(CMR)' 'Canada\xa0(CAN)'
 'Chile\xa0(CHI) [I]' 'China\xa0(CHN) [CHN]' 'Colombia\xa0(COL)'
 'Costa Rica\xa0(CRC)' 'Ivory Coast\xa0(CIV) [CIV]' 'Croatia\xa0(CRO)'
 'Cuba\xa0(CUB) [Z]' 'Cyprus\xa0(CYP)' 'Czech Republic\xa0(CZE) [CZE]'
 'Czechoslovakia\xa0(TCH) [TCH]' 'Denmark\xa0(DEN) [Z]'
 'Djibouti\xa0(DJI) [B]' 'Dominican Republic\xa0(DOM)' 'Ecuador\xa0(ECU)'
 'Egypt\xa0(EGY) [EGY] [Z]' 'Eritrea\xa0(ERI)' 'Estonia\xa0(EST)'
 'Ethiopia\xa0(ETH)' 'Finland\xa0(FIN)' 'France\xa0(FRA) [

In [5]:
# Exercise 5: Validating the Cleaned Data

# 1. For the Flicker Dataset
# Check for any new missing values after normalization
print("Missing values in Flicker Dataset after cleaning:")
print(flickers_cleaned_df.isnull().sum())

# Ensure that the 'Date of Publication' column now contains only valid years or NaN
print("\nUnique values in 'Date of Publication' (should only be years or NaN):")
print(flickers_cleaned_df['Date of Publication'].unique())

# Check for duplicate entries based on key fields like 'Identifier'
print("\nChecking for duplicates in Flicker Dataset (based on Identifier):")
print(flickers_cleaned_df['Identifier'].duplicated().sum())

# 2. For the Olympics Dataset
# Check for any new missing values in the cleaned dataset
print("\nMissing values in Olympics Dataset after cleaning:")
print(olympics_cleaned_df.isnull().sum())

# Check that all numeric columns are correctly typed and contain no missing values
print("\nOlympics Dataset column types:")
print(olympics_cleaned_df.dtypes)

# Verifying logical consistency: e.g., 'Gold Total' + 'Silver Total' + 'Bronze Total' == 'Combined Total'
olympics_cleaned_df['Medal Total Check'] = (
    olympics_cleaned_df['Gold Total'] + olympics_cleaned_df['Silver Total'] + olympics_cleaned_df['Bronze Total']
)
print("\nChecking if 'Gold Total' + 'Silver Total' + 'Bronze Total' matches 'Combined Total':")
print((olympics_cleaned_df['Combined Total'] == olympics_cleaned_df['Medal Total Check']).all())


Missing values in Flicker Dataset after cleaning:
Identifier                0
Edition Statement         0
Place of Publication      0
Date of Publication     183
Publisher                 0
Title                     0
Author                    0
Contributors              0
Issuance type             0
Flickr URL                0
Shelfmarks                0
dtype: int64

Unique values in 'Date of Publication' (should only be years or NaN):
['1879' '1868' '1869' '1851' '1857' '1875' '1872' nan '1676' '1679' '1802'
 '1859' '1888' '1839' '1897' '1865' '1860' '1873' '1866' '1899' '1814'
 '1820' '1800' '1847' '1893' '1805' '1837' '1896' '1898' '1892' '1894'
 '1885' '1846' '1817' '1816' '1833' '1804' '1777' '1799' '1827' '1853'
 '1874' '1790' '1883' '1795' '1877' '1886' '1834' '1852' '1828' '1876'
 '1758' '1880' '1823' '1887' '1825' '1850' '1810' '1889' '1861' '1858'
 '1878' '1821' '1891' '1808' '1849' '1724' '1772' '1812' '1835' '1867'
 '1830' '1841' '1884' '1863' '1848' '1845' '1807' '1864' 