In this notebook we preprocess the flattened dataset used for the model prediction; we mainly perform checks on the values and improve the formatting of certain columns.

### Libraries and dataset imports

In this section we import the required dataset and libraries.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
df = pd.read_csv("../datasets/flattened_dataset.csv")

In [None]:
df.columns

### Helpful functions created for use later

In this section we define the functions that we are going to use several times to perform checks and corrections.

We use this function in order to analyze the missing values of a given column

In [None]:
def analyze_missing_values(df, column):
    """Print the count and percentage of missing values in a column.

    Args:
        df: DataFrame that contains ``column``.
        column: Label of the column to inspect.

    For an empty DataFrame the percentage is reported as 0.00 instead of
    dividing by zero.
    """
    missing_count = df[column].isna().sum()
    total_records = len(df)

    # Guard against an empty frame: 0 / 0 would otherwise print "nan%".
    if total_records:
        missing_percentage = (missing_count / total_records) * 100
    else:
        missing_percentage = 0.0

    print(f"Total missing values: {missing_count}")
    print(f"Missing percentage: {missing_percentage:.2f}%")


We use this function in order to analyze numeric columns for null values and decimal places

In [None]:
def _count_decimal_places(value):
    """Return how many decimal digits the shortest repr of ``value`` needs."""
    from decimal import Decimal  # stdlib; imported locally because only this helper uses it

    number = float(value)
    if not np.isfinite(number):
        # inf / nan have no decimal expansion; report them as 0 decimals.
        return 0
    # normalize() strips trailing zeros, so 5.0 -> exponent 0 and 12.34 -> -2.
    exponent = Decimal(repr(number)).normalize().as_tuple().exponent
    return max(-exponent, 0)


def analyze_numeric_values(df, column):
    """Print basic diagnostics for a numeric column.

    Reports the dtype, the number of nulls, the distribution of decimal
    places among non-null values, and the counts of negative, zero and
    infinite values.

    Args:
        df: DataFrame that contains ``column``.
        column: Label of the numeric column to inspect.
    """
    print(f"\n=== {column} ===")
    print(f"Data type: {df[column].dtype}")
    print(f"Null values: {df[column].isnull().sum()}")

    non_null_values = df[column].dropna()
    # Count decimals on the value itself: taking `value % 1` first adds
    # binary floating-point noise (12.34 % 1 == 0.33999999999999986) that
    # would be reported as 16-17 spurious decimal places.
    decimal_places = non_null_values.apply(_count_decimal_places)
    print("\nDecimal places distribution:")
    print(decimal_places.value_counts().sort_index())

    print(f"\nNegative values: {(non_null_values < 0).sum()}")
    print(f"Zero values: {(non_null_values == 0).sum()}")
    print(f"Infinite values: {np.isinf(non_null_values).sum()}")


We use this function to check for inconsistencies in text columns, for example if a column has both 'Olten' and 'olten' as distinct values.

In [None]:
def analyze_text_case_consistency(df, column, show_examples=5):
    """Report values of a text column that differ only by letter case.

    Prints the number of distinct values before and after lower-casing.
    When the two counts differ, prints up to ``show_examples`` groups of
    spellings that collapse to the same lower-case value.

    Args:
        df: DataFrame that contains ``column``.
        column: Label of the text column to inspect.
        show_examples: Maximum number of inconsistent groups to print.
    """
    values = df[column]
    value_lower = values.str.lower()
    original_distinct = values.nunique()
    value_lower_counts = value_lower.value_counts()

    # Number of distinct values as they appear in the data
    print(f"\nValue counts - original: {original_distinct}")
    # Number of distinct values once everything is lower-cased
    print(f"Value counts - lowercase: {len(value_lower_counts)}")

    # Equal counts mean no two values differ only by case
    if original_distinct == len(value_lower_counts):
        return

    print("\nCase inconsistencies found:")
    # Group the original spellings by lower-cased key once, instead of
    # re-lowercasing and scanning the whole column for every distinct value.
    variants_by_key = values.groupby(value_lower).unique()
    shown = 0
    for key in value_lower_counts.index:
        if shown >= show_examples:
            break
        variants = variants_by_key.loc[key]
        if len(variants) > 1:
            print(f"\nVariants for '{key}':")
            print(variants)
            shown += 1


We use this function specifically for postal codes, for computing constraints, and we check for inconsistencies such as 'as 543 bs' and 'AS 543 BS' as distinct values.

In [None]:
def analyze_postal_code(df, column, show_examples=5):
    """Report postal codes that differ only by letter case.

    Only postal codes containing at least one ASCII letter are considered.
    Prints the number of distinct such codes before and after lower-casing
    and, when the counts differ, up to ``show_examples`` groups of
    spellings that collapse to the same lower-case code.

    Args:
        df: DataFrame that contains ``column``.
        column: Label of the postal code column to inspect.
        show_examples: Maximum number of inconsistent groups to print.
    """
    # Keep only postal codes that contain letters (missing values are excluded)
    postal_mask = df[column].str.contains('[A-Za-z]', na=False)
    postal_with_letters = df.loc[postal_mask, column]

    # Distinct codes as written versus distinct codes once lower-cased
    postal_lower = postal_with_letters.str.lower()
    original_distinct = postal_with_letters.nunique()
    postal_lower_counts = postal_lower.value_counts()

    print(f"Postal codes with letters - original count: {original_distinct}")
    print(f"Postal codes with letters - lowercase count: {len(postal_lower_counts)}")

    # Equal counts mean no two codes differ only by case
    if original_distinct == len(postal_lower_counts):
        return

    print("\nCase inconsistencies found:")
    # Group the original spellings by lower-cased code once, instead of
    # re-lowercasing the whole selection for every distinct code.
    variants_by_code = postal_with_letters.groupby(postal_lower).unique()
    shown = 0
    for code in postal_lower_counts.index:
        if shown >= show_examples:
            break
        variants = variants_by_code.loc[code]
        if len(variants) > 1:
            print(f"\nVariants for '{code}':")
            print(variants)
            shown += 1

We use this function to standardize the postal code format when it's inconsistent.

In [None]:
def standardize_postal_code(x):
    """Upper-case a postal code string that contains at least one letter.

    Non-string inputs and strings without any alphabetic character are
    returned unchanged.
    """
    if not isinstance(x, str):
        return x
    for char in x:
        if char.isalpha():
            return x.upper()
    return x


### Price analysis

Here we can view different decimal places, we should investigate that more

In [None]:
# Price columns that all go through the same numeric checks.
price_columns = [
    'customer_price',
    'expected_carrier_price',
    'final_carrier_price',
]

for price_col in price_columns:
    analyze_numeric_values(df, price_col)

Here we can see that rounding doesn't change anything, so these decimals are just python artifacts

In [None]:
# Check whether rounding each price column to 2 decimals changes any value.
for col in price_columns:
    orig = df[col]
    rounded = orig.round(2)
    # NaN != NaN evaluates to True, so missing prices must be excluded or
    # they would be counted as "changed by rounding".
    changed_mask = orig.ne(rounded) & orig.notna()
    differences = changed_mask.sum()
    print(f"\n{col}:")
    print(f"Values changed by rounding to 2 decimals: {differences}")

    if differences > 0:
        print("Sample of changed values:")
        print("Original vs Rounded:")
        # Show the first few changed values next to their rounded form
        comp_df = pd.DataFrame({
            'Original': orig[changed_mask].head(),
            'Rounded': rounded[changed_mask].head(),
        })
        print(comp_df)

### Weight analysis

We can see that weight doesn't show any particular issues

In [None]:
from decimal import Decimal

print("=== Weight Analysis ===")
print(f"Data type: {df['weight'].dtype}")
print(f"Null values: {df['weight'].isnull().sum()}")

# We check decimal places.
# Decimals are counted from the shortest float repr of the weight itself:
# taking `weight % 1` first adds binary floating-point noise
# (e.g. 2.3 % 1 == 0.2999999999999998) that shows up as spurious decimals.
non_null_weights = df['weight'].dropna()
decimal_places = non_null_weights.apply(
    lambda x: max(-Decimal(repr(float(x))).normalize().as_tuple().exponent, 0)
    if np.isfinite(x) else 0
)
print("\nDecimal places distribution:")
print(decimal_places.value_counts().sort_index())

# We check for data issues
print(f"\nNegative values: {(non_null_weights < 0).sum()}")
print(f"Zero values: {(non_null_weights == 0).sum()}")
print(f"Infinite values: {np.isinf(non_null_weights).sum()}")

# We print some sample weights across different decimal places
print("\nSample weights with different decimal places:")
for dec in decimal_places.unique():
    # Every `dec` comes from the series itself, so the mask is never empty
    sample_mask = (decimal_places == dec)
    print(f"\n{dec} decimal places:")
    print(non_null_weights[sample_mask].head().apply(lambda x: f"{x:.{dec}f}"))

The distribution doesn't show any signs that should be investigated more

In [None]:
# Histogram restricted to shipments weighing at most 100.
weights_up_to_100 = df.loc[df['weight'] <= 100]

plt.figure(figsize=(10, 6))
sns.histplot(data=weights_up_to_100, x='weight', bins=50)
plt.xlabel('Weight (kg)')
plt.ylabel('Count')
plt.title('Weight Distribution (0-100 kg)')
plt.show()

### Shipment type analysis

The elements of shipment type don't have any formatting problems

In [None]:
# Frequency of every distinct shipment type.
shipment_types = df['shipment_type'].value_counts()
print("\nUnique shipment types and counts:", shipment_types, sep="\n")

### Insurance type analysis

The elements of insurance type don't have any formatting problems

In [None]:
# Frequency of every distinct insurance type.
insurance_types = df['insurance_type'].value_counts()
print("\nUnique insurance types and counts:", insurance_types, sep="\n")

### Booking state analysis

The elements are as described in the documentation

In [None]:
# Frequency of every distinct booking state.
booking_states = df['booking_state'].value_counts()
# The label previously said "insurance types" (copy-paste from the cell above).
print("\nUnique booking states and counts:")
print(booking_states)

### Margin features analysis

Negative margin could mean that they had to do some refunds

In [None]:
# Summary statistics and null count for the margin column.
margin_values = df['margin']
stats = margin_values.describe()
nulls = margin_values.isna().sum()

print("\nMargin Analysis:", stats, f"\nNull values: {nulls}", sep="\n")

### Segmentation analysis

High number of null values across some features can be a common pattern

In [None]:
# Distinct segmentation values (held in `stats`) and null count.
segmentation_values = df['segmentation_customer']
stats = segmentation_values.unique()
nulls = segmentation_values.isna().sum()

print("\nSegmentation Analysis:", stats, f"\nNull values: {nulls}", sep="\n")

### Main industry name analysis

A third of the industry names are missing

In [None]:
analyze_missing_values(df, 'main_industry_name_customer')

Here we verify the case correctness of the industry name, by counting the values as they are, then putting them to lower case and comparing them to see if we can spot differences. 
There aren't.

In [None]:
analyze_text_case_consistency(df, 'main_industry_name_customer')

### Sector name analysis

The same number of values as for the industry name are missing, which makes sense since they are tied together.

In [None]:
analyze_missing_values(df, 'industry_sector_name_customer')

Here we find no formatting issues.

In [None]:
analyze_text_case_consistency(df, 'industry_sector_name_customer')

### Delivery postal code analysis

A much lower percentage of missing values across the dataset.

In [None]:
analyze_missing_values(df, 'postal_code_delivery')

There are case inconsistencies among postal codes:<br>
Postal codes with letters - original count: 11635<br>
Postal codes with letters - lowercase count: 11607

In [None]:
analyze_postal_code(df, 'postal_code_delivery')

This way we handle the inconsistencies.

In [None]:
# Upper-case every delivery postal code that contains letters so that
# spellings differing only by case collapse into a single value.
df['postal_code_delivery'] = df['postal_code_delivery'].apply(standardize_postal_code)

### Delivery city analysis

No delivery city has missing values.

In [None]:
analyze_missing_values(df, 'city_delivery')

Lots of cities with inconsistencies in the case formatting:<br>
City counts - original: 69883<br>
City counts - lowercase: 59104

In [None]:
analyze_text_case_consistency(df, 'city_delivery')

In [None]:
# Standardize city case format
df['city_delivery'] = df['city_delivery'].str.lower().str.title()

###  Delivery country name analysis

In this feature, which seems to be the country, we have a small number of missing values.

In [None]:
analyze_missing_values(df, 'name_country_delivery')

No inconsistencies.

In [None]:
analyze_text_case_consistency(df, 'name_country_delivery')


In [None]:
# The label previously said "name_pickup", but the column shown is name_country_delivery.
print("\nFirst 5 name_country_delivery values:")
print(df['name_country_delivery'].head())

### Iso country code delivery analysis

In [None]:
analyze_missing_values(df, 'iso_country_code_delivery')

In [None]:
df["iso_country_code_delivery"].head()

In [None]:
df['iso_country_code_delivery'].unique()

In [None]:
analyze_text_case_consistency(df, 'iso_country_code_delivery')

### Delivery continent analysis

In [None]:
analyze_missing_values(df, 'continent_delivery')

The format is respected across the unique values.

In [None]:
print(df['continent_delivery'].unique())

### EU delivery analysis

In [None]:
analyze_missing_values(df, 'EU_delivery')

It's boolean.

In [None]:
print(df['EU_delivery'].unique())

### Domain name analysis

In [None]:
analyze_missing_values(df, 'domain_name_delivery')

The domain name has an established format which is respected.

In [None]:
print(df['domain_name_delivery'].unique())

### Postal code pickup analysis

In [None]:
analyze_missing_values(df, 'postal_code_pickup')

In [None]:
analyze_postal_code(df, 'postal_code_pickup')

In [None]:
# Upper-case every pickup postal code that contains letters so that
# spellings differing only by case collapse into a single value.
df['postal_code_pickup'] = df['postal_code_pickup'].apply(standardize_postal_code)

### City pickup analysis

In [None]:
analyze_missing_values(df, 'city_pickup')

In [None]:
analyze_text_case_consistency(df, 'city_pickup')

In [None]:
# Standardize city case format: lower-case first, then title-case.
df['city_pickup'] = df['city_pickup'].str.lower().str.title()

### Name Service Analysis

In [None]:
analyze_missing_values(df, 'name_service')

In [None]:
analyze_text_case_consistency(df, 'name_service')

In [None]:
# Standardize service name case format: lower-case first, then title-case.
df['name_service'] = df['name_service'].str.lower().str.title()

### Service Type analysis

In [None]:
analyze_missing_values(df, 'service_type')

The format is standard across all entries.

In [None]:
df['service_type'].unique()

### Transport type analysis

In [None]:
analyze_missing_values(df, 'transport_type')

The format is standard across entries.

In [None]:
df['transport_type'].unique()

### Name Carrier analysis

In [None]:
analyze_missing_values(df, 'name_carrier')

The format is consistent across all the instances.

In [None]:
df['name_carrier'].unique()

### lms plus analysis

In [None]:
analyze_missing_values(df, 'lms_plus')

The values are consistent.

In [None]:
df['lms_plus'].unique()

### Is Master analysis

In [None]:
analyze_missing_values(df, 'is_master_customer')

The boolean consistency is respected.

In [None]:
df['is_master_customer'].unique()

### Drop non needed features

They won't be seen again since they were registered in the past, and this is not a time series problem.

In [None]:
# Substrings that identify the date columns to discard.
date_markers = ('full_date', 'created_date')
date_columns_to_drop = [
    name
    for name in df.columns
    if any(marker in name for marker in date_markers)
]

# Remove the matched columns from the working frame
df = df.drop(columns=date_columns_to_drop)

In [None]:
df.columns

### Save dataset

In this section we save the dataset.

In [None]:
# NOTE(review): this path points outside the '../datasets/' directory the
# input CSV was read from — confirm that this location is intended.
output_file = '../../../00-Project/datasets/preprocessed_flattened_dataset.csv'
# Write the preprocessed frame without the index column.
df.to_csv(path_or_buf=output_file, index=False)