In [130]:
import pandas as pd
import os
import pickle

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score, roc_curve, auc


In [121]:
churners_df = pd.read_csv("../../data/BankChurners.csv", index_col='CLIENTNUM', sep= ',')

churners_df.shape
# # Assuming 'df' is your DataFrame containing the dataset
# current_month = 2024 * 12 + 5  # Convert reference date to months
# df['Customer_Tenure'] = current_month - df['Months_on_book']

(10127, 22)

In [122]:
def cleaning_data(df):
    df.drop(columns=["Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1",
                     "Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2"], 
                     inplace=True)
    # no missing values --> some 'Unknown' values
    df.isnull().sum()
    #change 'Unknown' values in columns to None or Nan
    df = df.applymap(lambda x: None if x == 'Unknown' else x)
    return df

In [123]:
def adding_features(df):
    return df

In [124]:
def imputer(train, test):
    imputer_cat = SimpleImputer(strategy='most_frequent')
    categorical_features = train.select_dtypes(include=['object']).columns
    train[categorical_features] = imputer_cat.fit_transform(train[categorical_features])
    test[categorical_features] = imputer_cat.transform(test[categorical_features])

    imputer_num = SimpleImputer(strategy='median')
    numerical_features = train.select_dtypes(include=['int64', 'float64']).columns
    train[numerical_features] = imputer_num.fit_transform(train[numerical_features])
    test[numerical_features] = imputer_num.transform(test[numerical_features])

    return train, test,imputer_cat, imputer_num


In [125]:
def encode(train, test):
    # encode categorical features
    categorical_cols = train.select_dtypes('object').columns

    encoder = LabelEncoder()
    for col in categorical_cols:
        train[col] = encoder.fit_transform(train[col])
        
        #If a value is encountered in the test data that wasn't seen during training (i.e., not present in the label encoder's classes), 
        #it's assigned a placeholder value of -1. This ensures that unseen categorical values in the test data are handled appropriately 
        #during the encoding process.
        test[col] = test[col].map(lambda s: encoder.transform([s])[0] if s in encoder.classes_ else -1)
    
    return train, test, encoder

In [126]:
'''MinMaxScaler 
--> scales values linearly, without distorting the relative relationships between the variables.
--> brings all values in the range of 0 and 1, preventing any single feature form dominating the learning process
--> Less sensitive to outliers'''
def scaler(X_train, X_test):
    scaler = MinMaxScaler()
    X_train = pd.DataFrame(scaler.fit_transform(X_train),columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test),columns=X_test.columns)
    
    return X_train, X_test, scaler

In [None]:
def save_preprocess_models(imputer_num, imputer_cat, encoder, scaler):
    # Get the current directory of the script
    current_directory = os.path.dirname(__file__)
    # Define the relative path to the CSV file
    csv_file_path = os.path.join(current_directory, "..", "preprocess_models", "BankChurners.csv")

    '''
    import pickle

def save_preprocessing(num_imp, cat_imp, encoder, scaler, type_prop):
    # Define the folder path based on the type_prop
    folder_path = 'preprocess_models/house' if type_prop == 'HOUSE' else 'preprocess_models/apartment'
    
    # Define the filenames and objects to save
    filenames_objects = [
        ('trained_num_imp', num_imp),
        ('trained_cat_imp', cat_imp),
        ('trained_encoder', encoder),
        ('trained_scaler', scaler)
    ]
    
    # Iterate over the filenames and objects
    for filename, obj in filenames_objects:
        # Construct the file path
        file_path = os.path.join(folder_path, f'{filename}_{type_prop}.pkl.gz')
        
        # Pickle and compress the object
        with gzip.open(file_path, 'wb') as f:
            pickle.dump(obj, f)
        
        print(f"{filename.capitalize()} saved successfully at: {file_path}")
    '''
    
    

In [127]:
def preprocess(df):
    df = cleaning_data(df)
    df = adding_features(df)

    X = churners_df.drop('Attrition_Flag', axis=1)
    y = churners_df['Attrition_Flag']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    X_train, X_test, imputer_cat, imputer_num = imputer(X_train, X_test)
    X_train, X_test, encoder = encode(X_train, X_test)
    X_train, X_test, scaler = scaler(X_train, X_test)

    # call def that exports preprocess models

    return X_train, X_test, y_train, y_test, 


In [128]:
X_train, X_test, y_train, y_test = preprocess(churners_df)
X_train.head()

  df = df.applymap(lambda x: None if x == 'Unknown' else x)


Unnamed: 0,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
0,0.636364,0.0,0.2,1.0,0.666667,1.0,0.0,0.534884,0.0,0.5,0.5,0.069071,0.686532,0.057717,0.22243,0.447535,0.689922,0.182553,0.464464
1,0.727273,0.0,0.8,0.5,0.333333,1.0,0.0,0.813953,0.0,0.666667,0.5,0.119649,0.716329,0.104019,0.184299,0.088851,0.224806,0.105816,0.334334
2,0.431818,0.0,0.8,1.0,0.666667,0.8,0.333333,0.534884,1.0,0.166667,0.5,0.439834,0.654748,0.41538,0.273645,0.051519,0.20155,0.336564,0.103103
3,0.181818,0.0,0.4,0.333333,0.666667,0.8,0.0,0.534884,0.6,0.5,0.666667,0.066108,1.0,0.032017,0.432897,0.117169,0.27907,0.350027,0.694695
4,0.522727,0.0,0.4,0.5,0.333333,0.2,0.0,0.604651,0.8,0.5,0.666667,0.038748,0.765197,0.022919,0.225047,0.183376,0.395349,0.213786,0.708709
