# Data Cleaning


In [1]:
# Importing the libraries
import pandas as pd
import numpy as np

In [2]:
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Load the raw dataset from its parquet file using the pyarrow engine.
file_path = '../../data/raw/challenge_campus_biomedico_2024.parquet'
df = pd.read_parquet(file_path, engine= 'pyarrow')

# Print column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 484291 entries, 0 to 484290
Data columns (total 33 columns):
 #   Column                                     Non-Null Count   Dtype  
---  ------                                     --------------   -----  
 0   id_prenotazione                            484291 non-null  object 
 1   id_paziente                                484291 non-null  object 
 2   data_nascita                               484291 non-null  object 
 3   sesso                                      484291 non-null  object 
 4   regione_residenza                          484291 non-null  object 
 5   codice_regione_residenza                   484291 non-null  int64  
 6   asl_residenza                              484291 non-null  object 
 7   codice_asl_residenza                       484291 non-null  int64  
 8   provincia_residenza                        484291 non-null  object 
 9   codice_provincia_residenza                 455911 non-null  object 
 10  comune_r

The function __remove_disdette__ removes the rows of a DataFrame where the data_disdetta column is not null. This is useful for filtering the data, keeping only the rows that do not have an associated cancellation date.

In [51]:
# Count rows whose 'data_disdetta' is null (True) versus non-null (False).
df['data_disdetta'].isnull().value_counts()

data_disdetta
True     460639
False     23652
Name: count, dtype: int64

In [4]:
def remove_disdette(df) -> pd.DataFrame:
    """
    Keep only the rows that have no cancellation date.

    :param df: DataFrame containing a 'data_disdetta' column.
    :return: The rows of df whose 'data_disdetta' value is null; the
             original index labels are preserved.
    """
    not_cancelled = df['data_disdetta'].isna()
    return df.loc[not_cancelled]


In [5]:
# Keep only the rows without a 'data_disdetta' value, then re-inspect the frame.
df = remove_disdette(df)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 460639 entries, 0 to 484290
Data columns (total 33 columns):
 #   Column                                     Non-Null Count   Dtype  
---  ------                                     --------------   -----  
 0   id_prenotazione                            460639 non-null  object 
 1   id_paziente                                460639 non-null  object 
 2   data_nascita                               460639 non-null  object 
 3   sesso                                      460639 non-null  object 
 4   regione_residenza                          460639 non-null  object 
 5   codice_regione_residenza                   460639 non-null  int64  
 6   asl_residenza                              460639 non-null  object 
 7   codice_asl_residenza                       460639 non-null  int64  
 8   provincia_residenza                        460639 non-null  object 
 9   codice_provincia_residenza                 433623 non-null  object 
 10  comune_reside

In [7]:
# List the DataFrame's column names.
df.columns

Index(['id_prenotazione', 'id_paziente', 'data_nascita', 'sesso',
       'regione_residenza', 'codice_regione_residenza', 'asl_residenza',
       'codice_asl_residenza', 'provincia_residenza',
       'codice_provincia_residenza', 'comune_residenza',
       'codice_comune_residenza', 'tipologia_servizio', 'descrizione_attivita',
       'codice_descrizione_attivita', 'data_contatto', 'regione_erogazione',
       'codice_regione_erogazione', 'asl_erogazione', 'codice_asl_erogazione',
       'provincia_erogazione', 'codice_provincia_erogazione',
       'struttura_erogazione', 'codice_struttura_erogazione',
       'tipologia_struttura_erogazione',
       'codice_tipologia_struttura_erogazione', 'id_professionista_sanitario',
       'tipologia_professionista_sanitario',
       'codice_tipologia_professionista_sanitario', 'data_erogazione',
       'ora_inizio_erogazione', 'ora_fine_erogazione', 'data_disdetta'],
      dtype='object')

The function __identify_and_remove_outliers_zscore__ uses the z-score method to identify and remove outliers from a DataFrame. The z-score measures the distance of a value from the column mean in units of standard deviations: $z = (x - \mu) / \sigma$. This normalizes the data, and values whose absolute z-score exceeds a specified threshold (default is 3) are considered outliers. The filter is applied to each specified column in turn, and the function returns a DataFrame without the rows containing these anomalous values.

In [5]:
def identify_and_remove_outliers_zscore(df, columns, threshold=3):
    """
    Identifies and removes outliers using the z-score method (normalization).

    Columns are processed one after the other, so the mean and standard
    deviation of each column are computed on the rows that survived the
    previous columns.

    :param df: The original DataFrame (not modified).
    :param columns: The columns on which to apply outliers removal.
    :param threshold: The z-score threshold for outlier detection (default: 3).
    :return: A DataFrame without the rows whose absolute z-score exceeds
             ``threshold`` in any of the given columns. Rows with a missing
             value in a column are kept: a missing value is not an outlier.
    """
    for col in columns:
        values = df[col]
        std = values.std()

        # A constant column (std == 0) or a column with fewer than two values
        # (std is NaN) has no defined z-score. Dividing anyway would produce
        # NaN everywhere and the comparison below would drop every row.
        if np.isnan(std) or std == 0:
            continue

        z_scores = ((values - values.mean()) / std).abs()

        # NaN z-scores come from missing inputs; `NaN <= threshold` is False,
        # so they must be kept explicitly.
        df = df[z_scores.isna() | (z_scores <= threshold)]
    return df


The function __smooth_noisy_data__ applies a moving average to smooth noisy data in a specified column of a DataFrame. Using a defined window size, the function calculates the average of the values within this window, thereby reducing fluctuations and noise in the data. This method is useful for obtaining a clearer representation of trends in the data.

In [6]:
def smooth_noisy_data(df, column, window_size=3):
    """
    Replace a column with its moving average.

    Args:
      df: The DataFrame to smooth; its `column` is overwritten in place.
      column: The name of the column to smooth.
      window_size: The number of consecutive rows averaged for each value.

    Returns:
      The same DataFrame object, with `column` holding the rolling mean.
    """
    # min_periods=1 makes the first rows average over however many values
    # are available instead of producing NaN.
    roller = df[column].rolling(window=window_size, min_periods=1)
    smoothed = roller.mean()
    df[column] = smoothed
    return df


The function __remove_duplicates__ removes duplicates from a DataFrame. Using the drop_duplicates method from pandas, the function eliminates duplicate rows.

In [7]:
def remove_duplicates(df) -> pd.DataFrame:
    """
    Drop fully duplicated rows from df, keeping the first occurrence.

    The DataFrame is modified in place and the same object is returned.

    :param df: The DataFrame to deduplicate (mutated by this call).
    :return: df itself, without duplicate rows.
    """
    df.drop_duplicates(subset=None, keep='first', inplace=True)
    return df

The function __imputate_comune_residenza__ imputes missing values in the comune_residenza column of a DataFrame using ISTAT codes. It loads a dataset containing ISTAT codes and the names of Italian municipalities, then merges this dataset with the original DataFrame based on the municipality code. Finally, it renames the column with the municipality name and removes the excess columns, returning a DataFrame with the imputed values.

In [None]:
def imputate_comune_residenza(df, istat_path='datasets/Codici-statistici-e-denominazioni-al-30_06_2024.xlsx'):
    """
    Imputes missing values for 'comune_residenza' using ISTAT codes.

    The municipality name is looked up from 'codice_comune_residenza' and is
    only used where 'comune_residenza' is missing; existing names are kept.

    Args:
        df: The DataFrame containing the data. It must have the columns
            'codice_comune_residenza' and 'comune_residenza'. Not modified.
        istat_path: Path of the ISTAT Excel file providing the columns
            'Codice Comune formato alfanumerico' and
            'Denominazione in italiano'.

    Returns:
        A new DataFrame with the same columns and index as df, where missing
        'comune_residenza' values are filled when the code is found in the
        ISTAT file.
    """
    code_col = 'Codice Comune formato alfanumerico'
    name_col = 'Denominazione in italiano'

    # Load only the two columns needed for the lookup.
    # - dtype=str keeps the codes as text so leading zeros are not lost.
    # - keep_default_na=False: pandas' default NA tokens include 'None',
    #   which would otherwise turn a municipality literally named "None"
    #   into a missing value. Only empty cells are treated as missing.
    istat_data = pd.read_excel(
        istat_path,
        usecols=[code_col, name_col],
        dtype={code_col: str},
        keep_default_na=False,
        na_values=[''],
    )
    istat_data = istat_data.dropna(subset=[code_col])

    # Compare codes without leading zeros so that e.g. '001168' and '1168'
    # match regardless of how either source stores them.
    # NOTE(review): assumes the codes in df are stored as text or integers
    # (not floats such as 1168.0) -- confirm against the raw data.
    istat_codes = istat_data[code_col].astype(str).str.lstrip('0')
    code_to_name = pd.Series(istat_data[name_col].to_numpy(), index=istat_codes.to_numpy())
    # Series.map requires a unique index on the lookup Series.
    code_to_name = code_to_name[~code_to_name.index.duplicated(keep='first')]

    lookup_codes = df['codice_comune_residenza'].astype(str).str.lstrip('0')
    imputed_names = lookup_codes.map(code_to_name)

    # Fill only the gaps; no ISTAT helper column is added to the result.
    return df.assign(comune_residenza=df['comune_residenza'].fillna(imputed_names))

The function __fill_missing_comune_residenza__ is designed to fill missing values in the comune_residenza column of a DataFrame using a provided mapping dictionary. This dictionary maps municipality codes to municipality names. The function also handles a special case for the municipality code '1168', which corresponds to the municipality named 'None' (in the Turin area), so that this name is not confused with a missing value. The missing values in the comune_residenza column are then filled using the mapping dictionary. The function returns the DataFrame with the filled values.

In [None]:
def fill_missing_comune_residenza(df, codice_comune_to_nome):
    """
    Fills missing values in the 'comune_residenza' column using a mapping.

    Existing names are never overwritten, and the 'codice_comune_residenza'
    column is left untouched.

    Args:
      df: The DataFrame containing the data. Its 'comune_residenza' column
        is updated in place.
      codice_comune_to_nome: A dictionary mapping the municipality code to
        the municipality name.

    Returns:
      The same DataFrame object, with missing 'comune_residenza' values
      filled where a name could be determined.
    """
    codes = df['codice_comune_residenza']

    # Fill missing names from the mapping.
    filled = df['comune_residenza'].fillna(codes.map(codice_comune_to_nome))

    # Special case: code '1168' is the municipality named "None" (Turin).
    # Its name is the string 'None'. Only the name column is set here; the
    # code itself must stay '1168', otherwise the key column is corrupted.
    unresolved_none = filled.isna() & codes.eq('1168')
    df['comune_residenza'] = filled.mask(unresolved_none, 'None')

    return df

The function __check_missing_values_same_row__ is designed to identify and count the rows in a DataFrame where both ora_inizio_erogazione and ora_fine_erogazione columns have missing values. It checks for missing values in these two columns simultaneously and prints the number of rows where both columns are missing. 

In [3]:
def check_missing_values_same_row(df):
    """
    Prints how many rows have both 'ora_inizio_erogazione' and
    'ora_fine_erogazione' missing.

    Args:
        df: The DataFrame to check; it must contain both columns.

    Returns:
        None
    """
    start_missing = df['ora_inizio_erogazione'].isna()
    end_missing = df['ora_fine_erogazione'].isna()

    # A row counts only when the two values are missing together.
    num_rows_with_both_missing = (start_missing & end_missing).sum()
    print(f"Number of rows with both 'ora_inizio_erogazione' and 'ora_fine_erogazione' missing: {num_rows_with_both_missing}")
