# Data cleaning and EDA for Santender dataset

Renaming the columns of the csv file to english readeable names, in addition the dataset used here is a subset of the original dataset, the original dataset train is 13,647,000 rows and test is 900,000 rows, loading it in will crash your computer. 
Currently the train file in spanish is named sandenter_train_small and test is sandenter_test_small. The code uses sandenter_train_small as it has enough rows to do both train and testing

### Import relevant libraries
Please remove libraries that youre not using.....

In [1]:
#imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
from statistics import median
from imblearn.over_sampling import SMOTE #Note to add this to requirements.txt, conda install -c conda-forge imbalanced-learn
from sklearn.model_selection import train_test_split

### Data ingestion

In [2]:
## Set the names of the files to clean and the name of the cleaned files here:
old_file = "recodataset.csv"
clean_train = "clean_train_reco.csv"
clean_test = "clean_test_reco.csv"

number_of_each_class = 15000 #Change according to desired size of original dataset, 
#final dataset size is this number * number of products * 2 because binary

In [3]:
csv_file_path = old_file  # input CSV file
output_file_path = clean_train  # output training CSV file

# Define the mapping of original Spanish column names to English column names
column_mapping = {
    "fecha_dato": "report_date",
    "ncodpers": "customer_id",
    "ind_empleado": "employee_index",
    "pais_residencia": "country_residence",
    "sexo": "gender",
    "age": "age",
    "fecha_alta": "contract_start_date",
    "ind_nuevo": "new_customer_index",
    "antiguedad": "seniority_months",
    "indrel": "primary_customer_status",
    "ult_fec_cli_1t": "last_primary_customer_date",
    "indrel_1mes": "customer_type_start_month",
    "tiprel_1mes": "customer_relation_type",
    "indresi": "residence_index",
    "indext": "foreigner_index",
    "conyuemp": "spouse_employee_index",
    "canal_entrada": "join_channel",
    "indfall": "deceased_index",
    "tipodom": "address_type",
    "cod_prov": "province_code",
    "nomprov": "province_name",
    "ind_actividad_cliente": "activity_index",
    "renta": "gross_income",
    "segmento": "customer_segment",
    "ind_ahor_fin_ult1": "saving_account",
    "ind_aval_fin_ult1": "guarantee",
    "ind_cco_fin_ult1": "current_account",
    "ind_cder_fin_ult1": "derivada_account",
    "ind_cno_fin_ult1": "payroll_account",
    "ind_ctju_fin_ult1": "junior_account",
    "ind_ctma_fin_ult1": "more_particular_account",
    "ind_ctop_fin_ult1": "particular_account",
    "ind_ctpp_fin_ult1": "particular_plus_account",
    "ind_deco_fin_ult1": "short_term_deposits",
    "ind_deme_fin_ult1": "medium_term_deposits",
    "ind_dela_fin_ult1": "long_term_deposits",
    "ind_ecue_fin_ult1": "e_account",
    "ind_fond_fin_ult1": "funds",
    "ind_hip_fin_ult1": "mortgage",
    "ind_plan_fin_ult1": "pensions",
    "ind_pres_fin_ult1": "loans",
    "ind_reca_fin_ult1": "taxes",
    "ind_tjcr_fin_ult1": "credit_card",
    "ind_valo_fin_ult1": "securities",
    "ind_viv_fin_ult1": "home_account",
    "ind_nomina_ult1": "payroll",
    "ind_nom_pens_ult1": "pensions_payments",
    "ind_recibo_ult1": "direct_debit"
}

# Read the original CSV file
df = pd.read_csv(csv_file_path)

#Drop the useless columns (correlation matrix used https://medium.com/@samarthjoelram/santander-recommendation-system-cab6b40596b5)

# List of columns to drop if they exist
columns_to_drop = ['ult_fec_cli_1t', 'ind_actividad_cliente', 'cod_prov', 'conyuemp', 'tipodom']

# Drop columns if they exist in the DataFrame
df = df.drop(columns=[col for col in columns_to_drop if col in df.columns])

# Rename the columns to english
df.rename(columns=column_mapping, inplace=True)

### Regrouping bank products into 4 main categories
- fixed-deposits
- loan
- accounts
- credit and debit cards

In [4]:
# Define new column names
fixed_deposits_col = 'fixed_deposits'
loan_col = 'loan'
credit_card_debit_card_col = 'credit_card_debit_card'
account_col = 'account'

# Check and create a new column for fixed deposits, if it doesn't exist
if fixed_deposits_col not in df.columns:
    deposit_columns = [
        "short_term_deposits",  # ind_deco_fin_ult1
        "medium_term_deposits",  # ind_deme_fin_ult1
        "long_term_deposits"    # ind_dela_fin_ult1
    ]
    df[fixed_deposits_col] = df[deposit_columns].any(axis=1).astype(int)

# Check and create a new column for loans, if it doesn't exist
if loan_col not in df.columns:
    loan_columns = [
        "loans",                # ind_pres_fin_ult1
        "pensions"             # ind_plan_fin_ult1
    ]
    df[loan_col] = df[loan_columns].any(axis=1).astype(int)

# Check and create a new column for credit and debit cards, if it doesn't exist
if credit_card_debit_card_col not in df.columns:
    credit_card_columns = [
        "credit_card",         # ind_tjcr_fin_ult1
        "direct_debit"        # ind_recibo_ult1
    ]
    df[credit_card_debit_card_col] = df[credit_card_columns].any(axis=1).astype(int)

# Check and create a new column for all accounts combined, if it doesn't exist
if account_col not in df.columns:
    account_columns = [
        "saving_account",      # ind_ahor_fin_ult1
        "current_account",     # ind_cco_fin_ult1
        "derivada_account",    # ind_cder_fin_ult1
        "payroll_account",     # ind_cno_fin_ult1
        "junior_account",      # ind_ctju_fin_ult1
        "more_particular_account",  # ind_ctma_fin_ult1
        "particular_account",   # ind_ctop_fin_ult1
        "particular_plus_account", # ind_ctpp_fin_ult1
        "e_account",           # ind_ecue_fin_ult1
        "funds",               # ind_fond_fin_ult1
        "home_account",        # ind_viv_fin_ult1
    ]
    df[account_col] = df[account_columns].any(axis=1).astype(int)


### Drop unnescessary columns

In [5]:
# The model used for commercial purposes does not merge the products together
columns_to_drop = [
    'saving_account', 'guarantee', 'current_account', 'derivada_account', 'payroll_account', 
    'junior_account', 'more_particular_account', 'particular_account', 'particular_plus_account', 
    'short_term_deposits', 'medium_term_deposits', 'long_term_deposits', 'e_account', 'funds', 
    'mortgage', 'pensions', 'loans', 'taxes', 'credit_card', 'securities', 'home_account', 
    'payroll', 'pensions_payments', 'direct_debit', 'employee_index'
]

# Drop the columns if they exist
df = df.drop(columns=[col for col in columns_to_drop if col in df.columns])

### Data manipulation and Dealing with missing values

In [6]:
date_columns = ['report_date', 'contract_start_date']  # Add any other date columns if necessary

for col in date_columns:
    df[col] = pd.to_datetime(df[col], errors='coerce')

# Calculate the difference in days
df['contract_length'] = (df['report_date'] - df['contract_start_date']).dt.days

# Insert 'contract_length' in the same spot as 'contract_start_date'
start_date_index = df.columns.get_loc('contract_start_date')
df.insert(start_date_index, 'contract_length', df.pop('contract_length'))

# Drop the original 'contract_start_date' and 'report_date' columns
df = df.drop(['contract_start_date', 'report_date', 'customer_id'], axis='columns')

# Replace missing values in gross income and age with median of distribution
count = df['gross_income'].isna().sum()
df['gross_income'] = df['gross_income'].fillna(df['gross_income'].median())


df['age'] = pd.to_numeric(df['age'].replace(' NA', None), errors='coerce')
med_age = df['age'].median()
df['age'] = df['age'].fillna(med_age).astype(int)

# Specify the columns to check for NA values
columns_with_na = [
    'country_residence', 'gender', 
    'customer_relation_type', 'residence_index', 
    'foreigner_index', 'join_channel', 'deceased_index', 
    'customer_segment', 'new_customer_index'
]

# Drop rows with NA values in the specified columns
df = df.dropna(subset=columns_with_na)


#### Strip leading and trailing spaces from string columns

In [7]:
def strip_spaces(column):
    if column.dtype == 'object':  # Check if the column is of string type
        return column.str.strip()  # Strip leading and trailing spaces
    return column

# Apply the strip_spaces function to all columns in the DataFrame
df = df.apply(strip_spaces)

# Remove leading/trailing whitespace of seniority_months column and convert to numeric
df['seniority_months'] = pd.to_numeric(df['seniority_months'].str.strip(), errors='coerce')

#### Regrouping province into 5 regions

In [8]:
# Changing province name to regions so less columns created via one hot encoding
region = []
for province in df['province_name']:
    if province in ['CIUDAD REAL', 'SALAMANCA', 'TOLEDO', 'SEGOVIA', 'MADRID', 'GUADALAJARA', 'ALBACETE', 'SORIA', 'CUENCA', 'AVILA']:
        region.append("CENTRAL")
    elif province in ['ALAVA', 'GIPUZKOA', 'PALENCIA', 'BURGOS', 'NAVARRA', 'CANTABRIA', 'BIZKAIA', 'RIOJA, LA', 'ZARAGOZA', 'TARRAGONA', 'LERIDA', 'HUESCA']:
        region.append("NORTH")
    elif province in ['CADIZ', 'JAEN', 'SEVILLA', 'PALMAS, LAS', 'CORDOBA', 'GRANADA', 'SANTA CRUZ DE TENERIFE', 'MELILLA', 'CEUTA', 'MALAGA']:
        region.append("SOUTH")
    elif province in ['VALENCIA', 'TERUEL', 'BALEARS, ILLES', 'CASTELLON', 'ALICANTE', 'MURCIA', 'ALMERIA', 'BARCELONA', 'GIRONA']:
        region.append("EAST")
    elif province in ['ZAMORA', 'CACERES', 'HUELVA', 'BADAJOZ', 'ASTURIAS', 'LEON', 'LUGO', 'CORUÑA, A', 'OURENSE', 'VALLADOLID', 'PONTEVEDRA']:
        region.append("WEST")
    else:
        region.append(None)  # Append None for unmatched provinces

# Assign the new region list to the DataFrame
df['region'] = region
df = df.drop(columns=['province_name'])  # Drop the original 'province_name' column

### One hot encoding

In [9]:
# One-hot encode the specified columns in columns_with_na
df = pd.get_dummies(df, columns=columns_with_na, drop_first=False)

# One-hot encode the 'region' column
if 'region' in df.columns:
    df = pd.get_dummies(df, columns=['region'], drop_first=True)
    
# Convert True/False columns to 1/0
bool_columns = df.select_dtypes(include=['bool']).columns
df[bool_columns] = df[bool_columns].astype(int)

###  Perform train_test_split

In [10]:
# Perform the 80/20 train-test split
df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Reset indices for both datasets to avoid misaligned indexing
df = df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

# Save the test dataset to CSV
test_df.to_csv("clean_test_reco.csv", index=False)

### Perform SMOTE
Rationale:
- Solve the problem of imbalanced classes

In [11]:
#select an equal amount of each class, and use SMOTE to balance the rest

# Initialize the new balanced DataFrame
balanced_df = pd.DataFrame()

# Define the label columns to balance
label_columns = ['fixed_deposits', 'loan', 'credit_card_debit_card', 'account']

# Set the maximum number of samples to take from any class
max_samples = number_of_each_class  # Adjust as needed

# Loop over each label column to balance it individually
for label in label_columns:
    print(f"Balancing for label: {label}")

    # Separate the current label and the features
    y = df[label]
    X = df.drop(columns=label_columns)  # Keep all features but exclude other labels

    # Prepare the data to balance the 0 and 1 classes for the current label
    class_0 = df[df[label] == 0]
    class_1 = df[df[label] == 1]

    # Take a max of 'max_samples' or the available samples for each class
    sampled_class_0 = class_0.sample(n=min(len(class_0), max_samples), random_state=42)
    sampled_class_1 = class_1.sample(n=min(len(class_1), max_samples), random_state=42)

    # Combine the samples to form the data for SMOTE
    df_to_balance = pd.concat([sampled_class_0, sampled_class_1], ignore_index=True)

    # Separate features and the label for SMOTE
    X_balance = df_to_balance.drop(columns=label)
    y_balance = df_to_balance[label]

    # Apply SMOTE to balance the current label
    smote = SMOTE(sampling_strategy='auto', random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_balance, y_balance)

    # Create a DataFrame from the resampled data
    resampled_df = pd.DataFrame(X_resampled, columns=X_balance.columns)
    resampled_df[label] = y_resampled  # Add the resampled label back

    # Append the resampled data to the balanced_df
    balanced_df = pd.concat([balanced_df, resampled_df], ignore_index=True)


Balancing for label: fixed_deposits
Balancing for label: loan


  resampled_df[label] = y_resampled  # Add the resampled label back
  resampled_df[label] = y_resampled  # Add the resampled label back


Balancing for label: credit_card_debit_card
Balancing for label: account


  resampled_df[label] = y_resampled  # Add the resampled label back
  resampled_df[label] = y_resampled  # Add the resampled label back


### Additional data cleaning steps

In [12]:
# Shuffle the final balanced DataFrame
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True).head(number_of_each_class*2*len(label_columns))

# Define the target columns to move to the front
columns_to_move = ['fixed_deposits', 'loan', 'credit_card_debit_card', 'account']

# Remove the target columns from the DataFrame
remaining_columns = [col for col in balanced_df.columns if col not in columns_to_move]

# Add the target columns back to the front
balanced_df = balanced_df[columns_to_move + remaining_columns]

# Save the updated DataFrame to a new CSV file
balanced_df.to_csv(output_file_path, index=False)

print(f"CSV file successfully relabelled and saved to {output_file_path}")

CSV file successfully relabelled and saved to clean_train_reco.csv


### Checking class distribution for the y values
To check if SMOTE indeed solve the class imbalance problem

In [13]:

# List of columns to check for class imbalance
columns_to_check = ['fixed_deposits', 'loan', 'credit_card_debit_card', 'account']

# Loop through each column and calculate class imbalance
for column in columns_to_check:
    class_counts = balanced_df[column].value_counts()
    total_count = class_counts.sum()
    
    # Calculate percentages
    percentages = (class_counts / total_count) * 100
    
    print(f"Class imbalance for '{column}':")
    print(f"Counts:\n{class_counts}")
    print(f"Percentages:\n{percentages}\n")


Class imbalance for 'fixed_deposits':
Counts:
fixed_deposits
0    102722
1     17278
Name: count, dtype: int64
Percentages:
fixed_deposits
0    85.601667
1    14.398333
Name: count, dtype: float64

Class imbalance for 'loan':
Counts:
loan
0    104874
1     15126
Name: count, dtype: int64
Percentages:
loan
0    87.395
1    12.605
Name: count, dtype: float64

Class imbalance for 'credit_card_debit_card':
Counts:
credit_card_debit_card
0    90461
1    29539
Name: count, dtype: int64
Percentages:
credit_card_debit_card
0    75.384167
1    24.615833
Name: count, dtype: float64

Class imbalance for 'account':
Counts:
account
1    96700
0    23300
Name: count, dtype: int64
Percentages:
account
1    80.583333
0    19.416667
Name: count, dtype: float64

