In [1]:
# dependencies
import pandas as pd
import numpy as np

# Show every column when a DataFrame is displayed (None removes the column limit).
# Uses the public `pd.set_option` entry point rather than the incidental
# `pd.pandas` self-reference.
pd.set_option('display.max_columns', None)

In [2]:
# Configuration: location of the raw artworks CSV (relative path).
ARTWORKS_DATA_PATH = "data/Artworks.csv"

In [3]:
# Load the raw artworks table from the configured CSV path (comma-separated).
data = pd.read_csv(ARTWORKS_DATA_PATH, sep=",")

FileNotFoundError: [Errno 2] No such file or directory: 'data/Artworks.csv'

In [None]:
# Work on a copy so the loaded `data` frame itself is not modified below.
df = data.copy(deep=True)
df.head(n=3)

In [None]:
# Print the column dtypes and non-null counts of the working frame before cleaning.
df.info()

### Workflow

- Format columns & remove redundant `()`
      - `ArtistBio`
      - `Nationality`
      - `BeginDate`
      - `EndDate`
      - `Gender`


- split `Title` column to get only Relevant title

- Drop `Dimensions` column   
  - There are already `length` and `width` columns with the same data
  - Resolve to use cm as a central metric for measurement


- Handle missing/null values

- Infer data-types

- Save as `parquet` file to retain inferred data types


In [None]:
# Strip the redundant "(" and ")" characters from every bracketed text column.
columns_w_redundant_brackets = [
    "ArtistBio",
    "Nationality",
    "BeginDate",
    "EndDate",
    "Gender",
]

df[columns_w_redundant_brackets] = df[columns_w_redundant_brackets].apply(
    lambda values: values.str.replace(r'\(|\)', '', regex=True)
)

In [None]:
# Keep only the text before the first comma of ArtistBio (the nationality part),
# trimmed of surrounding whitespace.
df['ArtistBio'] = (
    df['ArtistBio']
    .str.split(',')
    .str.get(0)
    .str.strip()
)

# The equivalent trimming of the Title column is currently disabled:
# df['Title'] = df['Title'].str.split(',').str[0].str.strip()

- Values in `ArtistBio` seem to exist already in the `Nationality`, `BeginDate` & `EndDate` columns
- `ArtistBio` & `Nationality` seem to hold similar info, so one could be used to fill in missing values in the other

In [None]:
# Drop the Dimensions column: the separate length and width columns already carry the same data.
df = df.drop(columns=['Dimensions'])

In [None]:
# function to find percentage of missing values
def find_percentage_missing(data):
    """Report the percentage of missing values for each column that has any.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column containing at least one null value, with the
        columns ``'Column Name'`` and ``'Missing Values Percentage'`` (rounded
        to two decimals). When no column has missing values the result is empty
        but still carries both column labels.
    """
    # A column qualifies as soon as it has a single null (count > 0); the
    # previous `> 1` threshold silently skipped columns with exactly one.
    column_na = [feature for feature in data.columns if data[feature].isnull().sum() > 0]

    result_list = []

    for feature in column_na:
        missing_percentage = np.round(data[feature].isnull().mean() * 100, 2)
        result_list.append({'Column Name': feature, 'Missing Values Percentage': missing_percentage})

    # Passing `columns` keeps the schema stable even when `result_list` is empty.
    return pd.DataFrame(result_list, columns=['Column Name', 'Missing Values Percentage'])

# Display the per-column missing-value percentages of the working frame.
find_percentage_missing(df)

In [None]:
# Remove the measurement columns that are almost entirely null (>= 85% missing).
large_na_columns = [
    "Circumference (cm)",
    "Depth (cm)",
    "Diameter (cm)",
    "Length (cm)",
    "Weight (kg)",
    "Seat Height (cm)",
    "Duration (sec.)",
]
df = df.drop(large_na_columns, axis=1)

In [None]:
# NOTE: every fill below assigns the result back to the column. The previous
# `df[col].fillna(..., inplace=True)` form is a chained in-place call on a
# temporary Series; recent pandas warns about it and, with Copy-on-Write
# enabled, it leaves `df` unchanged.

# fill Title with "No Title" (% null values - < 1%)
df['Title'] = df['Title'].fillna("No Title")

# fill Artist column with "Artist Unavailable" (% null values - < 1%)
df['Artist'] = df['Artist'].fillna("Artist Unavailable")

# ID columns are unique & cannot be filled with a placeholder (% null values - < 1%),
# so rows without a ConstituentID are dropped instead.
df = df.dropna(subset=['ConstituentID'])




# merge ArtistBio & Nationality
# Both columns carry the same kind of data; Nationality takes precedence and
# ArtistBio is only used where Nationality is null.
df['merged_Nationality_ArtistBio'] = df['Nationality'].combine_first(df['ArtistBio'])

# drop rows where neither source column provided a value
df = df[pd.notna(df['merged_Nationality_ArtistBio'])]

# discard Nationality & ArtistBio after the merge and keep the merged column
# under the Nationality name
df = df.drop(columns=["Nationality", "ArtistBio"])
df = df.rename(columns={'merged_Nationality_ArtistBio': 'Nationality'})

# replace empty strings with the most frequent nationality
mode_nationality = df['Nationality'].mode().values[0]
df['Nationality'] = df['Nationality'].replace('', mode_nationality)

# clean multiple nationalities in one sample: keep only the first
# whitespace-separated token. Entries without any token become null here and
# are filled with the mode on the next line.
df['Nationality'] = df['Nationality'].str.split().str[0]
# Assign back instead of a chained `fillna(..., inplace=True)`, which warns in
# recent pandas and does not update `df` under Copy-on-Write.
df['Nationality'] = df['Nationality'].fillna(mode_nationality)



# Most frequent value of each column; the mode is used rather than a mean so
# outliers do not influence the fill value.
mode_medium = df['Medium'].mode().iloc[0]
mode_height = df['Height (cm)'].mode().iloc[0]
mode_width = df['Width (cm)'].mode().iloc[0]

# Fill the nulls of every column with its own mode.
for column_name, fill_value in (
    ('Medium', mode_medium),
    ('Height (cm)', mode_height),
    ('Width (cm)', mode_width),
):
    df[column_name] = df[column_name].fillna(fill_value)


# replace missing URL, ThumbnailURL and CreditLine with "Unavailable"
# (a URL must be unique, so no existing value can be reused as a fill).
# The result is assigned back to the column: a chained
# `df[col].fillna(..., inplace=True)` warns in recent pandas and does not
# update `df` under Copy-on-Write.
for column_name in ('URL', 'ThumbnailURL', 'CreditLine'):
    df[column_name] = df[column_name].fillna("Unavailable")


# DateAcquired column (% null values - = 4%): fill with the most frequent value
mode_date_acquired = df['DateAcquired'].mode().values[0]
df['DateAcquired'] = df['DateAcquired'].fillna(mode_date_acquired)

# Date created column: keep the first run of four digits (the year).
# `expand=False` makes `str.extract` return a Series, so a Series (not a
# one-column DataFrame) is assigned to the column.
df['Date'] = df['Date'].str.extract(r'(\d{4})', expand=False)
mode_date = df['Date'].mode().values[0]
# Rows without a four-digit year are null after the extract; fill them with the mode.
df['Date'] = df['Date'].fillna(mode_date)
# The former `.str.replace('-10-06', mode_date)` step was removed: after the
# extract and fill every value is exactly four digits, so it could never match.


# Classification column: nulls take the most frequent classification.
mode_classification = df['Classification'].mode().iloc[0]
classification_present = df['Classification'].notna()
df['Classification'] = df['Classification'].where(classification_present, mode_classification)

# Gender handling: canonical label for each lower-cased raw value.
gender_mapping = {
    'male': 'Male',
    'female': 'Female',
    'non-binary': 'Non-Binary',
    '': "undisclosed",
}

# Normalise the casing, translate through the mapping, and label everything
# that has no mapping entry (including nulls) as "undisclosed".
cleaned_gender = (
    df['Gender']
    .str.lower()
    .map(gender_mapping)
    .fillna("undisclosed")
)

# Replace the raw column with the cleaned one; the cleaned Gender column ends
# up as the last column of the frame.
df = df.drop(columns=["Gender"]).assign(Gender=cleaned_gender)


In [None]:
# Preview the first rows after the missing-value handling above.
df.head()

In [None]:
# % of rows whose BeginDate / EndDate equals "0", the value counted as null here.
# Each column gets its own variable; previously the EndDate result overwrote
# the BeginDate one, so the same number was printed for both.
percentage_of_zeros_in_beginDate = (df['BeginDate'] == "0").mean() * 100
percentage_of_zeros_in_endDate = (df['EndDate'] == "0").mean() * 100

print(f"% of null values in dates -> \nBeginDate: {percentage_of_zeros_in_beginDate} \nEndDate: {percentage_of_zeros_in_endDate}")

In [None]:
# Frequency of each distinct BeginDate value.
df["BeginDate"].value_counts()

In [None]:
# Frequency of each distinct EndDate value.
df["EndDate"].value_counts()

In [None]:
# Clean inconsistent BeginDate & EndDate formatting:
# any BeginDate longer than a four-character year is replaced with "0".
df['BeginDate'] = df['BeginDate'].mask(df['BeginDate'].str.len() > 4, "0")
df["BeginDate"].value_counts()

In [None]:
# Any EndDate longer than a four-character year is replaced with "0".
df['EndDate'] = df['EndDate'].mask(df['EndDate'].str.len() > 4, "0")
df["EndDate"].value_counts()

In [None]:
# % of rows whose BeginDate / EndDate equals "0", the value counted as null here.
# Each column gets its own variable; previously the EndDate result overwrote
# the BeginDate one, so the same number was printed for both.
percentage_of_zeros_in_beginDate = (df['BeginDate'] == "0").mean() * 100
percentage_of_zeros_in_endDate = (df['EndDate'] == "0").mean() * 100

print(f"% of null values in dates -> \nBeginDate: {percentage_of_zeros_in_beginDate} \nEndDate: {percentage_of_zeros_in_endDate}")

In [None]:
# replace the "0" / "0 0" placeholder values with the mode of the date column
# Both placeholders are excluded before computing the mode so that neither of
# them can be selected as the fill value (previously only "0" was excluded).
missing_begin_date_markers = ["0", "0 0"]
df_w_no_zero_beginDate = df[~df["BeginDate"].isin(missing_begin_date_markers)]
mode_date_beginDate = df_w_no_zero_beginDate["BeginDate"].mode().values[0]
df['BeginDate'] = df['BeginDate'].replace(missing_begin_date_markers, mode_date_beginDate)


In [None]:
# replace the "0" / "0 0" placeholder values in EndDate with the column's mode
# Variables are named after EndDate; the previous copy-pasted `beginDate` names
# overwrote the BeginDate results with EndDate values.
# Both placeholders are excluded before computing the mode so that neither of
# them can be selected as the fill value (previously only "0" was excluded).
missing_end_date_markers = ["0", "0 0"]
df_w_no_zero_endDate = df[~df["EndDate"].isin(missing_end_date_markers)]
mode_date_endDate = df_w_no_zero_endDate["EndDate"].mode().values[0]
df['EndDate'] = df['EndDate'].replace(missing_end_date_markers, mode_date_endDate)

In [None]:
# Convert the year-only string columns to datetimes, parsing with the '%Y' format.
for year_column in ('BeginDate', 'EndDate', 'Date'):
    df[year_column] = pd.to_datetime(df[year_column], format='%Y')

In [None]:
# DateAcquired is parsed without an explicit format, letting pandas infer it.
df['DateAcquired'] = df['DateAcquired'].pipe(pd.to_datetime)

In [None]:
# Print the dtypes and non-null counts again after cleaning and type conversion.
df.info()

In [None]:
# Write the cleaned frame to parquet, leaving the index out of the file.
df.to_parquet(path='./data/cleaned_data.parquet', index=False)