In [17]:
# Libraries 
# -----------------------------------------------------------------------
import pandas as pd
import numpy as np


# Configuration
# -----------------------------------------------------------------------
# None removes the column limit, so wide DataFrames are displayed with all their columns.
pd.set_option('display.max_columns', None)


In [None]:
# Psss! README!
# The purpose of this code is to explore the data from the customer_flight_activity.csv and customer_loyalty_history.csv files,
# providing the user with information for decision making.
# In addition, it performs the data transformation based on the conclusions from 01_exploration_conclusions.md.
# The final output is a new file, customer_data_transformed, where the clean and transformed data from both CSV files is combined.

In [None]:
# 📂 Open data
# Load both source CSV files; the paths are relative to the notebook's working directory.

df_customer_flights = pd.read_csv("data/customer_flight_activity.csv")
df_customer_loyalty = pd.read_csv("data/customer_loyalty_history.csv")
# Maps a display label to each DataFrame. The values are the very same objects as the two
# variables above (no copy is made), so a function that modifies a DataFrame received
# through this dict also modifies the corresponding named variable.
dataframes_dict = {"df_customer_flight✈️" : df_customer_flights, "df_customer_loyalty💞" : df_customer_loyalty}

In [None]:
# ✍️ Exploratory Data Analysis functions definition

def explore_dataframes (df_dict):
    """
    Print an exploratory overview of every DataFrame in ``df_dict`` and collect its duplicated rows.

    For each DataFrame the function outputs, in this order: ``df.info()``, the first and last
    five rows, a random sample of up to five rows, the count and percentage of fully duplicated
    rows, the duplicated rows themselves (first 10, all appearances included), the number of
    rows and of columns made only of nulls, and ``describe()`` for the numerical and for the
    object (categorical) columns.

    Args:
        df_dict (dict): A dictionary where the keys are DataFrame names (strings) and the values are pandas DataFrames.

    Returns:
        dict: A dictionary with DataFrame names (with "_duplicates" suffix) as keys and DataFrames
        with the duplicated rows as values. Only DataFrames that have duplicates get an entry.

    Example:
        >>> import pandas as pd
        >>> df1 = pd.DataFrame({
        ...     'A': [1, 2, 2, 4],
        ...     'B': ['foo', 'bar', 'foo', 'baz']
        ... })
        >>> df2 = pd.DataFrame({
        ...     'C': [10, 20, 30],
        ...     'D': [1.1, None, 3.3]
        ... })
        >>> df_dict = {'df1': df1, 'df2': df2}
        >>> duplicates = explore_dataframes(df_dict)

        Output:
        Dataframe INFO:
        FIRST FIVE (5) ROWS:
        LAST FIVE (5) ROWS:
        SAMPLE (5) ROWS:
        DUPLICATES COUNT:
        DATAFRAME WITH DUPLICATED ROWS (INCLUDING ALL APPEARANCES):
        COUNT OF ROWS WITH ALL NULL VALUES IS:
        COUNT OF COLUMNS WITH ALL NULL VALUES IS:
        STATISTICAL METRICS FOR NUMERICAL COLUMNS:
        STATISTICAL METRICS FOR CATEGORICAL COLUMNS:

    Notes:
        - ``display`` is expected to be available, as it is inside a Jupyter/IPython kernel.
        - The sample shows fewer than five rows when the DataFrame itself has fewer than five rows.
        - If there are no object (categorical) columns, a message is printed instead of the metrics.
        - Rows are "duplicated" only when they match another row in every column.
    """
    duplicate_dataframes_dict = {}
    for name, df in df_dict.items():
        df_name = f"'{name.upper().replace('_',' ')}'"
        total_rows = df.shape[0]
        print(f" \n\n----------- DATAFRAME NAME: {df_name} -----------")
        print(f"\n{df_name} ---> Dataframe INFO:\n")
        # info() prints its own report and returns None, so wrapping it in display()
        # would only add a stray "None" to the output.
        df.info()
        print(f"\n{df_name} ---> FIRST FIVE (5) ROWS:")
        display(df.head())
        print(f"\n{df_name} ---> LAST FIVE (5) ROWS:")
        display(df.tail())
        print(f"\n{df_name} ---> SAMPLE (5) ROWS:")
        # Cap the sample size: sample(5) raises ValueError on frames with fewer than 5 rows.
        display(df.sample(min(5, total_rows)))
        # Count the duplicates once and reuse the result for the message and the branch below.
        duplicates_count = df.duplicated().sum()
        # Guard the percentage against a zero-row DataFrame.
        duplicates_pct = round(duplicates_count / total_rows * 100, 2) if total_rows else 0.0
        print(f" \n{df_name} ---> DUPLICATES COUNT IS: {duplicates_count}, {duplicates_pct}% OVER TOTAL ROWS\n")
        if duplicates_count > 0:
            duplicates_df_name = name + "_duplicates"
            # keep=False flags every appearance of a duplicated row, not only the repeats.
            duplicates_df = df[df.duplicated(keep=False)]
            print(f"{df_name} ---> DATAFRAME WITH DUPLICATED ROWS (INCLUDING ALL APPEARANCES):\n")
            display(duplicates_df.head(10))
            duplicate_dataframes_dict[duplicates_df_name] = duplicates_df
        print(f"\n{df_name} --> COUNT OF ROWS WITH ALL NULL VALUES IS: {df.isnull().all(axis=1).sum()}\n")
        print(f"\n{df_name} --> COUNT OF COLUMNS WITH ALL NULL VALUES IS: {df.isnull().all().sum()}\n")
        print(F"\n{df_name} --> STATISTICAL METRICS FOR NUMERICAL COLUMNS:")
        display(df.describe().T)
        print(F"\n{df_name} --> STATISTICAL METRICS FOR CATEGORICAL COLUMNS:")
        try:
            display(df.describe(include="object").T)
        except ValueError:
            # describe() raises ValueError when no column matches include="object";
            # any other exception is a real problem and is left to propagate.
            print("\nUPS... IT SEEMS LIKE THERE ARE NO COLUMNS WITH CATEGORICATL DATA")
    return duplicate_dataframes_dict

def explore_columns (df_dict):
    """
    Print a per-column report for every DataFrame in ``df_dict``.

    For each column the function prints its unique values, value counts, the number of
    repeated values and the number of nulls, followed by ``describe()`` for that column.

    Args:
        df_dict (dict): A dictionary where the keys are DataFrame names (strings) and the values are pandas DataFrames.

    Returns:
        None: This function prints detailed column information for each DataFrame but does not return any value.

    Example:
        >>> import pandas as pd
        >>> df1 = pd.DataFrame({
        ...     'Age': [25, 30, 25, 40],
        ...     'Gender': ['F', 'M', 'F', 'M']
        ... })
        >>> df2 = pd.DataFrame({
        ...     'Salary': [50000, 60000, None],
        ...     'Department': ['HR', 'Tech', 'Tech']
        ... })
        >>> explore_columns({'df1': df1, 'df2': df2})

        Output:
        UNIQUE VALUES:
        VALUES COUNT:
        COUNT OF DUPLICATES IN THE COLUMN:
        COUNT OF NULL VALUES IN THE COLUMN:
        STATISTICAL DESCRIPTION (NUMERIC):
        STATISTICAL DESCRIPTION (CATEGORICAL):

    Notes:
        - ``display`` is expected to be available, as it is inside a Jupyter/IPython kernel.
        - Any numeric dtype (not only int64/float64) is reported under the NUMERIC heading.
        - Boolean and all other non-numeric columns are reported under the CATEGORICAL heading.
        - Column labels do not need to be strings; they are converted with ``str()`` for printing.
    """
    for name, df in df_dict.items():
        df_name = f"'{name.upper().replace('_',' ')}'"
        print(f" \n\n----------- DATAFRAME NAME: {df_name} -----------")
        for index, column in enumerate(df.columns):
            series = df[column]
            # str() keeps non-string labels (e.g. integer column names) from raising AttributeError.
            print (f"\n{index}) Column {str(column).upper()} (from {(df_name)} dataframe):")
            print ("\n>>> UNIQUE VALUES:")
            print (series.unique())
            print ("\n>>> VALUES COUNT:")
            print (series.value_counts())
            print ("\n>>> COUNT OF DUPLICATES IN THE COLUMN:")
            print (series.duplicated().sum())
            print ("\n>>> COUNT OF NULL VALUES IN THE COLUMN:")
            print (series.isnull().sum())
            # Booleans count as numeric for pandas, but describe() summarises them like
            # categories (count/unique/top/freq), so they are excluded from the numeric branch.
            is_numeric = (
                pd.api.types.is_numeric_dtype(series)
                and not pd.api.types.is_bool_dtype(series)
            )
            if is_numeric:
                print("\nSTATISTICAL DESCRIPTION (NUMERIC):")
            else:
                print("\nSTATISTICAL DESCRIPTION (CATEGORICAL):")
            # Series.describe() picks the summary from the dtype; `include` is ignored for a Series.
            display(series.describe())
            print ("--------")

In [None]:
# ▶️ Exploratory Data Analysis code execution

# DataFrame-level overview. The function returns a dict with the duplicated rows of each
# DataFrame; the return value is not stored here.
explore_dataframes(dataframes_dict)

# Column-by-column report for the same DataFrames (prints only, returns None).
explore_columns(dataframes_dict)

In [None]:
# ✍️ Data transformation (cleaning and union) functions definition

def columns_to_snake_case (df_dict):
    """
    Rename the columns of every DataFrame in ``df_dict`` to snake_case, in place.

    Column names are lower-cased and their spaces replaced by underscores. After renaming,
    the resulting column index of each DataFrame is printed.

    Args:
        df_dict (dict): Maps DataFrame names (strings) to pandas DataFrames.

    Returns:
        None: The DataFrames are modified in place; nothing is returned.

    Example:
        >>> import pandas as pd
        >>> df1 = pd.DataFrame(columns=['First Name', 'Last Name'])
        >>> columns_to_snake_case({'df1': df1})
        >>> list(df1.columns)
        ['first_name', 'last_name']

    Notes:
        - Only spaces are replaced; other characters are left untouched.
        - The DataFrame name is printed upper-cased, with underscores shown as spaces.
    """
    for key in df_dict:
        frame = df_dict[key]
        label = "'" + key.upper().replace('_', ' ') + "'"
        # Two explicit steps: lower-case first, then turn spaces into underscores.
        lowered = frame.columns.str.lower()
        frame.columns = lowered.str.replace(' ', '_')
        print(f"\n{label} ---> Dataframe COLUMNS:\n")
        print(frame.columns)

def impute_nulls_as_special_category(df, column_list, category_name):
    """
    Fill the null values of the given columns with a fixed category label, in place.

    Args:
        df (pandas.DataFrame): DataFrame whose columns are updated.
        column_list (list of str): Names of the columns in which nulls are replaced.
        category_name (str): Value written wherever a null is found.

    Returns:
        None: ``df`` is modified in place; a status message and the resulting unique
        values are printed for each processed column.

    Example:
        >>> import pandas as pd
        >>> df = pd.DataFrame({'Name': ['Alice', None, 'Charlie']})
        >>> impute_nulls_as_special_category(df, ['Name'], 'Unknown')
        >>> df['Name'].tolist()
        ['Alice', 'Unknown', 'Charlie']

    Notes:
        - Columns missing from ``df`` are skipped with a warning message; no error is raised.
    """
    for column in column_list:
        # Guard clause: report unknown columns and move on to the next one.
        if column not in df.columns:
            print(f"❌ The column '{column}' does not exist in the DataFrame.")
            continue
        filled = df[column].fillna(category_name)
        df[column] = filled
        print (f"\nNull values imputed ✅ in column {column}.")
        print ("UNIQUE VALUES:")
        print (filled.unique())

def transform_negative_values(df,column_list):
    """
    Replace every value of the given columns with its absolute value, in place.

    Args:
        df (pandas.DataFrame): DataFrame whose columns are updated.
        column_list (list of str): Names of the columns to convert.

    Returns:
        None: ``df`` is modified in place; a status message and the resulting unique
        values are printed for each processed column.

    Example:
        >>> import pandas as pd
        >>> df = pd.DataFrame({'Revenue': [-100, 200, -300]})
        >>> transform_negative_values(df, ['Revenue'])
        >>> df['Revenue'].tolist()
        [100, 200, 300]

    Notes:
        - Negative values become positive; values that are already positive are unchanged.
        - Columns missing from ``df`` are skipped with a warning message; no error is raised.
    """
    for column in column_list:
        # Guard clause: report unknown columns and move on to the next one.
        if column not in df.columns:
            print(f"❌ The column '{column}' does not exist in the DataFrame.")
            continue
        df[column] = abs(df[column])
        print (f"\nNegative values transformed ✅ in column {column}.")
        print ("UNIQUE VALUES:")
        print (df[column].unique())

def impute_nulls_as_median(df, column_list):
    """
    Fill the null values of the given columns with each column's own median, in place.

    Args:
        df (pandas.DataFrame): DataFrame whose columns are updated.
        column_list (list of str): Names of the columns in which nulls are replaced.

    Returns:
        None: ``df`` is modified in place; a status message and the resulting unique
        values are printed for each processed column.

    Example:
        >>> import pandas as pd
        >>> df = pd.DataFrame({'Age': [25, None, 30]})
        >>> impute_nulls_as_median(df, ['Age'])
        >>> df['Age'].tolist()
        [25.0, 27.5, 30.0]

    Notes:
        - The median is computed from the non-null values of the column.
        - Columns missing from ``df`` are skipped with a warning message; no error is raised.
    """
    for column in column_list:
        # Guard clause: report unknown columns and move on to the next one.
        if column not in df.columns:
            print(f"❌ The column '{column}' does not exist in the DataFrame.")
            continue
        values = df[column]
        df[column] = values.fillna(values.median())
        print (f"\nNull values imputed ✅ in column {column}.")
        print ("UNIQUE VALUES:")
        print (df[column].unique())

def dfs_left_union (df_left, df_right, on='loyalty_number'):
    """
    Left-join ``df_right`` onto ``df_left`` on a key column and return the merged DataFrame.

    Every row of ``df_left`` is kept; columns from ``df_right`` are filled with NaN where the
    key has no match. The shapes of both inputs and the shape and columns of the result are
    printed.

    Args:
        df_left (pandas.DataFrame): The left DataFrame; all of its rows are preserved.
        df_right (pandas.DataFrame): The right DataFrame, joined onto the left one.
        on (str or list of str, optional): Key column(s) present in both DataFrames.
            Defaults to 'loyalty_number'.

    Returns:
        pandas.DataFrame: The result of the left join. The inputs are not modified.

    Example:
        >>> import pandas as pd
        >>> df_customer_flights = pd.DataFrame({
        ...     'loyalty_number': [1, 2, 3],
        ...     'flight_number': ['AA123', 'BB456', 'CC789']
        ... })
        >>> df_customer_loyalty = pd.DataFrame({
        ...     'loyalty_number': [1, 2],
        ...     'customer_name': ['Alice', 'Bob']
        ... })
        >>> df_final = dfs_left_union(df_customer_flights, df_customer_loyalty)

        Output:
        Dataframe on the left's SHAPE:(3, 2)
        Dataframe on the right's SHAPE:(2, 2)
        Dataframe were joined succesfully ✅
        Dataframe Final's SHAPE:(3, 3)
        Dataframe Final's COLUMNS are:Index(['loyalty_number', 'flight_number', 'customer_name'], dtype='object')

    Notes:
        - If the key column is missing from either DataFrame, pandas raises a KeyError.
        - If a key value is repeated in ``df_right``, the matching left rows are repeated as well.
    """
    print (f"\nDataframe on the left's SHAPE:{df_left.shape}")
    print (f"\nDataframe on the right's SHAPE:{df_right.shape}")
    # Merge the arguments themselves; reading notebook-level globals here would silently
    # ignore whatever DataFrames the caller passes in.
    df_final = df_left.merge(df_right, how='left', on=on)
    print("\nDataframe were joined succesfully ✅")
    print (f"\nDataframe Final's SHAPE:{df_final.shape}")
    print (f"\nDataframe Final's COLUMNS are:{df_final.columns}")
    return df_final

In [None]:
# ▶️ Data Transformation code execution
# NOTE: the steps below modify the loaded DataFrames in place and depend on their order;
# on a re-run, start again from the "Open data" cell so they apply to freshly loaded data.

# Rename columns to snake case format (the later steps use the snake_case column names)
columns_to_snake_case(dataframes_dict)

# Delete records from df customer_flights that are duplicated in all columns, keeping only the first appearance
df_customer_flights.drop_duplicates(inplace=True)

# Sanity check: this count is expected to be 0 after the drop above
print(f"DUPLICATES COUNT IS: {df_customer_flights.duplicated().sum()}")

# In columns "cancellation_year" and "cancellation_month" replace NaN with "Not Cancelled"
columns = ["cancellation_year","cancellation_month"]
category_name = "Not Cancelled"
impute_nulls_as_special_category(df_customer_loyalty,columns,category_name)

# In salary column transform negative value into positive and impute nulls assigning the median value.
# The sign fix runs first, so the median is computed from the corrected (non-negative) values.
columns = ["salary"]
transform_negative_values(df_customer_loyalty,columns)
impute_nulls_as_median(df_customer_loyalty, columns)

# Join dataframes (flights on the left, so every flight-activity row is kept)
df_final = dfs_left_union(df_customer_flights,df_customer_loyalty)

# Save data into a csv
# NOTE(review): to_csv is called without index=False, so the DataFrame index is written as
# the first, unnamed column of the file — confirm this is intended.
# NOTE(review): the file name ends in "1", while the README cell names the output
# customer_data_transformed — confirm which name is expected downstream.
df_final.to_csv('data/customer_data_transformed1.csv')

### Loyalty number column analysis (unfinished)

In [None]:
# Check the number of duplicates in the loyalty_number column.
# duplicated() marks every repeat of a value except its first appearance, so this counts
# the rows whose loyalty_number already appeared in an earlier row.
duplicates = df_customer_flights.duplicated(subset=['loyalty_number']).sum()
print(f"There are {duplicates} duplicates in the column 'loyalty_number', which represent {round((duplicates/df_customer_flights.shape[0]*100),2)}% over the total")

In [None]:
# Rows whose loyalty_number repeats an earlier row. With the default keep='first', the first
# appearance of each loyalty_number is NOT part of this DataFrame.
df_loyalty_number_duplicates = df_customer_flights[df_customer_flights.duplicated(subset=['loyalty_number'])]
print (f"\n>>>DATAFRAME WITH DUPLICATED VALUES IN loyalty_number:")
display(df_loyalty_number_duplicates)


In [None]:
# The information in the dataframe is organized so that there is one row per year and per month within a year.
# Since there is info from 2 years, at most 24 rows per loyalty_number are expected (2 years x 12 months).
# So the analysis focuses only on the loyalty numbers that appear more than 24 times.
# Calculate the number of appearances of each loyalty_number
loyalty_number_value_counts = df_customer_flights['loyalty_number'].value_counts()
print (f"\n>>> VALUES COUNT:")
print(loyalty_number_value_counts)
# Filter the Series to keep only the loyalty numbers that appear more than 24 times;
# .index holds the loyalty_number values themselves, not their counts
values_to_keep = loyalty_number_value_counts[loyalty_number_value_counts > 24].index
print (f"\n>>> VALUES WITH MORE THAN 24 ROWS:")
print(values_to_keep)

In [None]:
# Flight rows of the loyalty numbers with more than 24 rows, sorted so that the rows of the
# same customer, year and month end up next to each other for visual comparison.
filtered_df = df_customer_flights[df_customer_flights['loyalty_number'].isin(values_to_keep)].sort_values(by=['loyalty_number','year', 'month','total_flights'])
# Show only the first 20 rows of the sorted result
filtered_df.head(20)

In [None]:
# Loyalty-history rows of the same loyalty numbers (those with more than 24 flight rows)
df_customer_loyalty[df_customer_loyalty['loyalty_number'].isin(values_to_keep)]