### Rename the columns of the CSV file to readable English names. The dataset used here is a subset of the original dataset: the original train set has 13,647,000 rows and the original test set has 900,000 rows, and loading them in full will crash your computer.
### Currently the Spanish train file is named sandenter_train_small and the test file is sandenter_test_small. The code uses sandenter_train_small, as it has enough rows for both training and testing.

In [9]:
#imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
from statistics import median
from imblearn.over_sampling import SMOTE #Note to add this to requirements.txt, conda install -c conda-forge imbalanced-learn


## The code block below lets us edit the settings of the code; once done, hit "Run All".

In [10]:
## Settings: set the name of the file to clean and the name of the cleaned output file here.
# old_file is the CSV that gets read; cleaned_file is the CSV the final balanced dataset is written to.
old_file = "recodataset.csv"
cleaned_file = "cleaned_reco.csv"
number_of_each_class = 15000  # Cap on the rows sampled per class (0 and 1) for each product label
# The final dataset holds at most this number * number of product labels * 2 rows (two classes per label)

In [11]:
# Input/output paths and the Spanish -> English column-name mapping for the dataset.
# NOTE(review): an earlier version of this comment said the translated names were only for
# visualisation and unused, but the later cells reference the English names
# (e.g. 'short_term_deposits', 'gross_income'), so this mapping is required.

csv_file_path = old_file  # input CSV file
output_file_path = cleaned_file  # output CSV file

# Define the mapping of original Spanish column names to English column names
column_mapping = {
    "fecha_dato": "report_date",
    "ncodpers": "customer_id",
    "ind_empleado": "employee_index",
    "pais_residencia": "country_residence",
    "sexo": "gender",
    "age": "age",
    "fecha_alta": "contract_start_date",
    "ind_nuevo": "new_customer_index",
    "antiguedad": "seniority_months",
    "indrel": "primary_customer_status",
    "ult_fec_cli_1t": "last_primary_customer_date",
    "indrel_1mes": "customer_type_start_month",
    "tiprel_1mes": "customer_relation_type",
    "indresi": "residence_index",
    "indext": "foreigner_index",
    "conyuemp": "spouse_employee_index",
    "canal_entrada": "join_channel",
    "indfall": "deceased_index",
    "tipodom": "address_type",
    "cod_prov": "province_code",
    "nomprov": "province_name",
    "ind_actividad_cliente": "activity_index",
    "renta": "gross_income",
    "segmento": "customer_segment",
    "ind_ahor_fin_ult1": "saving_account",
    "ind_aval_fin_ult1": "guarantee",
    "ind_cco_fin_ult1": "current_account",
    "ind_cder_fin_ult1": "derivada_account",
    "ind_cno_fin_ult1": "payroll_account",
    "ind_ctju_fin_ult1": "junior_account",
    "ind_ctma_fin_ult1": "more_particular_account",
    "ind_ctop_fin_ult1": "particular_account",
    "ind_ctpp_fin_ult1": "particular_plus_account",
    "ind_deco_fin_ult1": "short_term_deposits",
    "ind_deme_fin_ult1": "medium_term_deposits",
    "ind_dela_fin_ult1": "long_term_deposits",
    "ind_ecue_fin_ult1": "e_account",
    "ind_fond_fin_ult1": "funds",
    "ind_hip_fin_ult1": "mortgage",
    "ind_plan_fin_ult1": "pensions",
    "ind_pres_fin_ult1": "loans",
    "ind_reca_fin_ult1": "taxes",
    "ind_tjcr_fin_ult1": "credit_card",
    "ind_valo_fin_ult1": "securities",
    "ind_viv_fin_ult1": "home_account",
    "ind_nomina_ult1": "payroll",
    "ind_nom_pens_ult1": "pensions_payments",
    "ind_recibo_ult1": "direct_debit"
}

# Load the raw dataset (still carrying the Spanish column names)
df = pd.read_csv(csv_file_path)

# Columns considered useless, removed before renaming
# (correlation matrix used https://medium.com/@samarthjoelram/santander-recommendation-system-cab6b40596b5)
columns_to_drop = ['ult_fec_cli_1t', 'ind_actividad_cliente', 'cod_prov', 'conyuemp', 'tipodom']
# errors='ignore' skips any listed column that is not present in the DataFrame
df = df.drop(columns=columns_to_drop, errors='ignore')

# Translate the remaining column names to English
df = df.rename(columns=column_mapping)



In [12]:
#### Adapt the dataset to our needs: the many individual products are summarised into 4 product
# categories. The simplified dataset is the one we assume is collected by companies.

# Names of the four summary columns
fixed_deposits_col = 'fixed_deposits'
loan_col = 'loan'
credit_card_debit_card_col = 'credit_card_debit_card'
account_col = 'account'

# Summary column -> the individual product columns it aggregates
# (original Spanish column names are given in the trailing comments)
product_groups = {
    fixed_deposits_col: [
        "short_term_deposits",   # ind_deco_fin_ult1
        "medium_term_deposits",  # ind_deme_fin_ult1
        "long_term_deposits",    # ind_dela_fin_ult1
    ],
    loan_col: [
        "loans",     # ind_pres_fin_ult1
        "pensions",  # ind_plan_fin_ult1
    ],
    credit_card_debit_card_col: [
        "credit_card",   # ind_tjcr_fin_ult1
        "direct_debit",  # ind_recibo_ult1
    ],
    account_col: [
        "saving_account",           # ind_ahor_fin_ult1
        "current_account",          # ind_cco_fin_ult1
        "derivada_account",         # ind_cder_fin_ult1
        "payroll_account",          # ind_cno_fin_ult1
        "junior_account",           # ind_ctju_fin_ult1
        "more_particular_account",  # ind_ctma_fin_ult1
        "particular_account",       # ind_ctop_fin_ult1
        "particular_plus_account",  # ind_ctpp_fin_ult1
        "e_account",                # ind_ecue_fin_ult1
        "funds",                    # ind_fond_fin_ult1
        "home_account",             # ind_viv_fin_ult1
    ],
}

# A summary flag is 1 when any column of its group is truthy in that row.
# A summary column that already exists is left untouched.
for summary_col, source_cols in product_groups.items():
    if summary_col in df.columns:
        continue
    df[summary_col] = df[source_cols].any(axis=1).astype(int)

# The individual product columns are removed now that they are merged; merging keeps the
# dataset simple to present. The model used for commercial purposes does not merge the
# products together. Note the list also contains products that belong to no summary group
# (e.g. 'guarantee', 'mortgage', 'taxes').
columns_to_drop = [
    'saving_account', 'guarantee', 'current_account', 'derivada_account', 'payroll_account',
    'junior_account', 'more_particular_account', 'particular_account', 'particular_plus_account',
    'short_term_deposits', 'medium_term_deposits', 'long_term_deposits', 'e_account', 'funds',
    'mortgage', 'pensions', 'loans', 'taxes', 'credit_card', 'securities', 'home_account',
    'payroll', 'pensions_payments', 'direct_debit',
]

# Only the listed columns that are actually present get dropped
df = df.drop(columns=columns_to_drop, errors='ignore')


### Dealing with missing values

In [13]:
# Parse the date columns; values that cannot be parsed become NaT (errors='coerce')
date_columns = ['report_date', 'contract_start_date']  # Add any other date columns if necessary
for col in date_columns:
    df[col] = pd.to_datetime(df[col], errors='coerce')

# Contract length in days, inserted at the position 'contract_start_date' occupies
start_date_index = df.columns.get_loc('contract_start_date')
contract_length = (df['report_date'] - df['contract_start_date']).dt.days
df.insert(start_date_index, 'contract_length', contract_length)

# The raw date columns and the customer identifier are no longer needed
df = df.drop(['contract_start_date', 'report_date', 'customer_id'], axis='columns')

# Replace missing gross income with the median of the column
df['gross_income'] = df['gross_income'].fillna(df['gross_income'].median())

# Replace missing ages with the median age.
# The column is addressed by name rather than by position, and every value is normalised
# through str -> strip -> to_numeric so that padded numbers (' 35'), the textual marker
# (' NA') and real NaN values are all handled without raising.
age_numeric = pd.to_numeric(df['age'].astype(str).str.strip(), errors='coerce')
df['age'] = age_numeric.fillna(age_numeric.median())


# Function to strip leading and trailing spaces from string columns
def strip_spaces(column):
    """Strip leading and trailing whitespace from the string values of a column.

    Intended for use with ``DataFrame.apply``, which passes one column at a time.

    Args:
        column: A pandas Series.

    Returns:
        For object-dtype columns, a new Series in which every ``str`` value is
        stripped and every other value (numbers, None, NaN) is passed through
        unchanged. For string-dtype columns, the stripped Series. Any other
        column is returned as is.
    """
    if column.dtype == 'object':
        # Element-wise on purpose: `.str.strip()` would turn the non-string entries of a
        # mixed-type column into NaN, silently discarding data.
        return column.map(lambda value: value.strip() if isinstance(value, str) else value)
    if pd.api.types.is_string_dtype(column.dtype):
        # Dedicated string dtypes contain only strings / missing values
        return column.str.strip()
    return column

# Run strip_spaces over every column of the DataFrame (DataFrame.apply passes each column
# in turn) and rebind df to the result
df = df.apply(strip_spaces)


## One hot encode dataset, required to perform SMOTE

In [14]:
# Remove the customer identifier and 'employee_index' if they are still present
df = df.drop(columns=['customer_id', 'employee_index'], errors='ignore')

# Convert seniority to numeric. Going through str first means this works whether the column
# was read as text (possibly padded with spaces) or already as numbers; the `.str` accessor
# alone raises AttributeError on a numeric column. Unparseable values become NaN.
df['seniority_months'] = pd.to_numeric(
    df['seniority_months'].astype(str).str.strip(), errors='coerce'
)

# Fill any missing gross income with the column median
df['gross_income'] = df['gross_income'].fillna(df['gross_income'].median())

##### One hot encode all non numeric columns
# Categorical columns to check for NA values and then encode.
# 'employee_index' is listed but was dropped above, so the existence filter removes it.
columns_with_na = [
    'employee_index', 'country_residence', 'gender',
    'customer_relation_type', 'residence_index',
    'foreigner_index', 'join_channel', 'deceased_index',
    'customer_segment'
]

# Keep only the columns that actually exist in the DataFrame
existing_columns = [col for col in columns_with_na if col in df.columns]

if existing_columns:
    # Drop rows with NA values in any of these columns
    df = df.dropna(subset=existing_columns)

    # One-hot encode them ('province_name' is handled separately);
    # drop_first=True omits the first category of each column
    df = pd.get_dummies(df, columns=existing_columns, drop_first=True)

# Remove rows where 'province_name' has NA values
df = df.dropna(subset=['province_name'])

## Change province names to regions so fewer columns are created by one-hot encoding
provinces_by_region = {
    "CENTRAL": ['CIUDAD REAL', 'SALAMANCA', 'TOLEDO', 'SEGOVIA', 'MADRID', 'GUADALAJARA',
                'ALBACETE', 'SORIA', 'CUENCA', 'AVILA'],
    "NORTH": ['ALAVA', 'GIPUZKOA', 'PALENCIA', 'BURGOS', 'NAVARRA', 'CANTABRIA', 'BIZKAIA',
              'RIOJA, LA', 'ZARAGOZA', 'TARRAGONA', 'LERIDA', 'HUESCA'],
    "SOUTH": ['CADIZ', 'JAEN', 'SEVILLA', 'PALMAS, LAS', 'CORDOBA', 'GRANADA',
              'SANTA CRUZ DE TENERIFE', 'MELILLA', 'CEUTA', 'MALAGA'],
    "EAST": ['VALENCIA', 'TERUEL', 'BALEARS, ILLES', 'CASTELLON', 'ALICANTE', 'MURCIA',
             'ALMERIA', 'BARCELONA', 'GIRONA'],
    "WEST": ['ZAMORA', 'CACERES', 'HUELVA', 'BADAJOZ', 'ASTURIAS', 'LEON', 'LUGO',
             'CORUÑA, A', 'OURENSE', 'VALLADOLID', 'PONTEVEDRA'],
}
# Inverted lookup: province name -> region name
province_to_region = {
    province: region_name
    for region_name, provinces in provinces_by_region.items()
    for province in provinces
}

# Rows whose province is not in the lookup are dropped explicitly. Skipping them while
# building a plain list would leave the list shorter than the DataFrame and make the
# column assignment fail with a length-mismatch ValueError.
known_province = df['province_name'].isin(list(province_to_region))
if not known_province.all():
    unknown_names = sorted(df.loc[~known_province, 'province_name'].unique())
    print(f"Dropping {(~known_province).sum()} rows with unrecognised provinces: {unknown_names}")
    df = df[known_province].copy()

df['province_name'] = df['province_name'].map(province_to_region)
df = df.rename(columns={'province_name': 'region'})

# One-hot encode the 'region' column
df = pd.get_dummies(df, columns=['region'], drop_first=True)
    
# Cast every boolean column to integers so True/False become 1/0
bool_columns = df.select_dtypes(include=['bool']).columns
df = df.astype({name: int for name in bool_columns})





## Performing SMOTE, this produces enough of each class to be used for the model

In [15]:
# For each product label: sample at most `number_of_each_class` rows of class 0 and of
# class 1, let SMOTE resample that subset, and collect the result. All per-label results
# are then concatenated, shuffled, reordered and written to the output CSV.

# Label columns to balance, one at a time
label_columns = ['fixed_deposits', 'loan', 'credit_card_debit_card', 'account']

# Maximum number of rows taken from either class of a label
max_samples = number_of_each_class  # Adjust as needed

# Per-label results are gathered here and concatenated once after the loop, instead of
# growing a DataFrame (seeded with an empty one) on every iteration
resampled_frames = []

for label in label_columns:
    print(f"Balancing for label: {label}")

    # Split the rows by the value of the current label
    class_0 = df[df[label] == 0]
    class_1 = df[df[label] == 1]

    # Take 'max_samples' rows per class, or all of them when fewer are available
    sampled_class_0 = class_0.sample(n=min(len(class_0), max_samples), random_state=42)
    sampled_class_1 = class_1.sample(n=min(len(class_1), max_samples), random_state=42)
    df_to_balance = pd.concat([sampled_class_0, sampled_class_1], ignore_index=True)

    # Features are every column except the current label.
    # NOTE(review): the other three label columns therefore stay in the feature matrix and
    # are resampled along with the real features - confirm this is intended.
    X_balance = df_to_balance.drop(columns=label)
    y_balance = df_to_balance[label]

    # Apply SMOTE to balance the current label
    smote = SMOTE(sampling_strategy='auto', random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_balance, y_balance)

    # Rebuild a DataFrame from the resampled data and add the label back
    resampled_df = pd.DataFrame(X_resampled, columns=X_balance.columns)
    resampled_df[label] = y_resampled
    resampled_frames.append(resampled_df)

balanced_df = pd.concat(resampled_frames, ignore_index=True)

# Shuffle the final balanced DataFrame and cap its size
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True).head(number_of_each_class*2*len(label_columns))

## Move the y (label) columns to the front, keeping the other columns in their order
remaining_columns = [col for col in balanced_df.columns if col not in label_columns]
balanced_df = balanced_df[label_columns + remaining_columns]

# Save the updated DataFrame to a new CSV file
balanced_df.to_csv(output_file_path, index=False)

print(f"CSV file successfully relabelled and saved to {output_file_path}")



Balancing for label: fixed_deposits
Balancing for label: loan
Balancing for label: credit_card_debit_card
Balancing for label: account
CSV file successfully relabelled and saved to cleaned_reco.csv
