In [None]:
# Uncomment below if you want to run this file only
#%run main.ipynb
#%run data_cleaning.ipynb
#%run data_visualization.ipynb

In [None]:
# Load the VISUALIZATION-stage CSV from S3 into a DataFrame through the S3Utils helper.
# `df` is only (re)assigned when the helper reports that the object exists.
visualization_file_found = s3_utils.check_file_exists(output_file_key_data_visualization)
if visualization_file_found:
    df = s3_utils.read_csv_from_s3(output_file_key_data_visualization)

FEATURE ENGINEERING

1. Encoding categorical variables into numerical representations

In [None]:
# Fit a LabelEncoder on the "pack" column and keep the fitted encoder around
label_encoder = LabelEncoder()
pack_codes = label_encoder.fit_transform(df['pack'])

# Store the integer codes next to the original column
df['pack_encoded'] = pack_codes


In [None]:
# Row count per original "pack" category (compare with the encoded counts in the next cell)
df['pack'].value_counts()

In [None]:
# Row count per encoded "pack" value produced by the LabelEncoder
df['pack_encoded'].value_counts()

In [None]:
# Remove the raw "pack" column first, then "status" (one in-place drop per column)
for dropped_feature in ('pack', 'status'):
    df.drop(columns=[dropped_feature], inplace=True)

2. Remove or transform irrelevant or redundant features to streamline the feature space

In [None]:
# Total number of products per customer: missing counts are treated as 0,
# then the six per-product count columns are added up in order.
product_count_columns = [
    'cr_prod_cnt_il',
    'cr_prod_cnt_vcu',
    'cr_prod_cnt_tovr',
    'cr_prod_cnt_pil',
    'cr_prod_cnt_cc',
    'cr_prod_cnt_ccfp',
]
number_products = df[product_count_columns[0]].fillna(0)
for product_column in product_count_columns[1:]:
    number_products = number_products + df[product_column].fillna(0)
df['number_products'] = number_products

In [None]:
# Distribution of the aggregated product count across all customers
df['number_products'].value_counts()

TARGET (Dependent Variable)

In [None]:
# Class balance of the dependent variable: number of rows per value of "target"
# (the following cells treat target == 1 as churn and target == 0 as not churn)
df["target"].value_counts()

In [None]:
# Subset of customers who left the bank (target equals 1)
is_churn = df["target"].eq(1)
churn = df[is_churn]

In [None]:
# Subset of customers who stayed with the bank (target equals 0)
is_not_churn = df["target"].eq(0)
not_churn = df[is_not_churn]

In [None]:
# Frequency of each clnt_setup_tenor value within the not_churn group,
# ordered from the rarest to the most common value.
# NOTE(review): described as months of being a customer — confirm the unit of clnt_setup_tenor.
not_churn["clnt_setup_tenor"].value_counts().sort_values()

In [None]:
# Frequency of each clnt_setup_tenor value within the churn group,
# ordered from the rarest to the most common value.
# NOTE(review): described as months of being a customer — confirm the unit of clnt_setup_tenor.
churn["clnt_setup_tenor"].value_counts().sort_values()

AGE

In [None]:

# Summary statistics of age for the not_churn group, with extra tail percentiles
not_churn_age_percentiles = [0.05, 0.25, 0.50, 0.75, 0.90, 0.95, 0.99]
not_churn["age"].describe(percentiles=not_churn_age_percentiles)

In [None]:
# Histogram of age for the not_churn group
pyplot.figure(figsize=(8, 6))
pyplot.hist(not_churn["age"], bins=15, alpha=0.7, label='Not Churn')
pyplot.xlabel('Age')
pyplot.legend(loc='upper right')
pyplot.show()

In [None]:
# Summary statistics of age for the churn group, with extra tail percentiles
churn_age_percentiles = [0.05, 0.25, 0.50, 0.75, 0.90, 0.95, 0.99]
churn["age"].describe(percentiles=churn_age_percentiles)

In [None]:
# Histogram of age for the churn group
pyplot.figure(figsize=(8, 6))
pyplot.hist(churn["age"], bins=15, alpha=0.7, label='Churn')
pyplot.xlabel('Age')
pyplot.legend(loc='upper right')
pyplot.show()

In [None]:
# Age plotted against the target class, one categorical strip per class
sns.catplot(data=df, x='target', y='age')
plt.show()

In [None]:
# Discretizing the age feature into three groups

# Defining category limits.
# pd.cut is called with right=False, so the bins are left-closed:
#   [-inf, 20), [20, 56), [56, inf)
# The edges are chosen so that every age lands in the group its label names:
#   19 -> 'Under 20', 20 and 55 -> '20-55 years', 56 -> 'Over 55'.
# (A lower edge of 19 would put 19-year-olds into '20-55 years'.)
bins = [float('-inf'), 20, 56, float('inf')]

# Defining labels (one per bin, in bin order)
labels = ['Under 20', '20-55 years', 'Over 55']

# Creating a new feature with the categories created
df['new_age_category'] = pd.cut(df['age'], bins=bins, labels=labels, right=False)

# Assigning numerical values to categories
category_mapping = {'Under 20': 0, '20-55 years': 1, 'Over 55': 2}
df['new_age_category_numeric'] = df['new_age_category'].map(category_mapping)

print(df.head())

In [None]:
# Show which age categories occur, then how many rows fall into each of them
print(df['new_age_category'].unique())
print(df['new_age_category'].value_counts())

Number of products

In [None]:
# Frequency of each number_products value within the not_churn group,
# ordered from the rarest to the most common value
not_churn["number_products"].value_counts().sort_values()

In [None]:

# Frequency of each number_products value within the not_churn group.
# NOTE(review): this cell repeats the previous cell verbatim — consider removing one of them.
not_churn["number_products"].value_counts().sort_values()

In [None]:
# Frequency of each number_products value within the churn group,
# ordered from the rarest to the most common value
churn["number_products"].value_counts().sort_values()

3. Standardization and normalization

In [None]:
# Rescale customer tenure by a factor of 12 and store it as 'clnt_setup_tenor_years'.
# NOTE(review): multiplying by 12 is a years -> months conversion, yet the new column is
# named "_years" and earlier comments describe clnt_setup_tenor as months. Confirm the
# unit of clnt_setup_tenor (if it is in months, this should divide by 12 instead).
df['clnt_setup_tenor_years'] = df['clnt_setup_tenor'] * 12

In [None]:
# Express the rescaled tenure relative to the customer's age, giving a
# tenure-to-age ratio as an alternative view on retention.
tenor_to_age_ratio = df["clnt_setup_tenor_years"].div(df["age"])
df["new_setup_tenor"] = tenor_to_age_ratio
df.head()

In [None]:
# The ratio feature replaces its inputs: drop age and both tenure columns
redundant_age_tenor_columns = ['age', 'clnt_setup_tenor_years', 'clnt_setup_tenor']
df.drop(columns=redundant_age_tenor_columns, inplace=True)

In [None]:
# Collect the remaining columns whose name contains "age"
age_columns = []
for column_name in df.columns:
    if 'age' in column_name:
        age_columns.append(column_name)
print(age_columns)

In [None]:
# Remove the age_group column
df.drop(columns=['age_group'], inplace=True)

In [None]:
# Collect the columns whose name contains "cnt"
cnt_columns = []
for column_name in df.columns:
    if 'cnt' in column_name:
        cnt_columns.append(column_name)
print(cnt_columns)

In [None]:
# The per-product count columns are summarised by number_products, so drop them
individual_product_columns = [
    'cr_prod_cnt_il',
    'cr_prod_cnt_vcu',
    'cr_prod_cnt_tovr',
    'cr_prod_cnt_pil',
    'cr_prod_cnt_cc',
    'cr_prod_cnt_ccfp',
]
df.drop(columns=individual_product_columns, inplace=True)

In [None]:
# Collect the columns whose name contains "turnover" (candidates for the sums below)
turnover_columns = []
for column_name in df.columns:
    if 'turnover' in column_name:
        turnover_columns.append(column_name)

# Show what was found
print(turnover_columns)

In [None]:
# Row-wise sum of the two plain turnover columns
total_turnover_inputs = ['turnover_cc', 'turnover_paym']
df['total_turnover'] = df[total_turnover_inputs].sum(axis=1)

# Number of non-null totals
print(df[['total_turnover']].count())


In [None]:
# Bucket total_turnover into (up to) six quantile bins, stored as integer bin indices;
# duplicates='drop' merges bins whose edges coincide.
df['total_turnover_category'] = pd.qcut(
    df['total_turnover'],
    q=6,
    labels=False,
    duplicates='drop',
)


In [None]:
# The binned feature replaces the raw total and its two inputs
binned_turnover_sources = ['total_turnover', 'turnover_cc', 'turnover_paym']
df.drop(columns=binned_turnover_sources, inplace=True)


In [None]:
# Collect the columns whose name contains "1m" (one-month features)
one_month_columns = []
for column_name in df.columns:
    if '1m' in column_name:
        one_month_columns.append(column_name)
print(one_month_columns)

In [None]:
# Row-wise sum of the four one-month turnover-dynamic columns
turnover_1m_inputs = [
    'turnover_dynamic_il_1m',
    'turnover_dynamic_cur_1m',
    'turnover_dynamic_paym_1m',
    'turnover_dynamic_cc_1m',
]
df['total_turnover_1m'] = df[turnover_1m_inputs].sum(axis=1)

# Number of non-null totals
print(df[['total_turnover_1m']].count())

In [None]:
# Bucket total_turnover_1m into (up to) six quantile bins, stored as integer bin indices;
# duplicates='drop' merges bins whose edges coincide.
df['total_turnover_1m_category'] = pd.qcut(
    df['total_turnover_1m'],
    q=6,
    labels=False,
    duplicates='drop',
)

In [None]:
# The binned one-month feature replaces the raw total and its four inputs
turnover_1m_sources = [
    'turnover_dynamic_il_1m',
    'turnover_dynamic_cur_1m',
    'turnover_dynamic_paym_1m',
    'turnover_dynamic_cc_1m',
    'total_turnover_1m',
]
df.drop(columns=turnover_1m_sources, inplace=True)

In [None]:
# Collect the columns whose name contains "3m" (three-month features)
three_months_columns = []
for column_name in df.columns:
    if '3m' in column_name:
        three_months_columns.append(column_name)
print(three_months_columns)

In [None]:
# Row-wise sum of the four three-month turnover-dynamic columns
turnover_3m_inputs = [
    'turnover_dynamic_il_3m',
    'turnover_dynamic_cur_3m',
    'turnover_dynamic_paym_3m',
    'turnover_dynamic_cc_3m',
]
df['total_turnover_3m'] = df[turnover_3m_inputs].sum(axis=1)

# Number of non-null totals
print(df[['total_turnover_3m']].count())

In [None]:
# Bucket total_turnover_3m into (up to) six quantile bins, stored as integer bin indices;
# duplicates='drop' merges bins whose edges coincide.
df['total_turnover_3m_category'] = pd.qcut(
    df['total_turnover_3m'],
    q=6,
    labels=False,
    duplicates='drop',
)

In [None]:
# The binned three-month feature replaces the raw total and its four inputs.
# Each column is listed exactly once (the raw total used to appear twice in the list).
turnover_3m_sources = [
    'turnover_dynamic_il_3m',
    'turnover_dynamic_cur_3m',
    'turnover_dynamic_paym_3m',
    'turnover_dynamic_cc_3m',
    'total_turnover_3m',
]
df.drop(columns=turnover_3m_sources, inplace=True)

In [None]:
# Split the column names by dtype: 'object' columns are categorical,
# every other dtype is collected as numeric.
cat_columns = [name for name in df.columns if df[name].dtype == 'object']
numeric_columns = [name for name in df.columns if not df[name].dtype == 'object']

VISUALIZING HEATMAP TO FIND MORE RELATED FEATURES TO DROP 

In [None]:
# Columns used for the correlation matrix: all numeric_columns except new_age_category
numeric_columns_without_new_age = [
    name for name in df[numeric_columns].columns if name != 'new_age_category'
]

# Pairwise correlation between the selected columns
correlation_matrix = df[numeric_columns_without_new_age].corr()

# Annotated heatmap of the correlation matrix on a 12x10 figure
plt.figure(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    annot_kws={"size": 8},
    cmap='Blues',
    linewidths=1,
)
plt.show()




Based on the heatmap, the following columns are dropped: 'trans_count_nas_prc', 'trans_count_sup_prc', 'trans_count_atm_prc'.

Likewise, the 'rest_dynamic' columns that describe a 1-month window are dropped:
'rest_dynamic_fdep_1m', 'rest_dynamic_il_1m', 'rest_dynamic_cur_1m', 'rest_dynamic_paym_1m', 'rest_dynamic_cc_1m'

In [None]:
# Drop the features judged redundant from the heatmap, plus the new_age_category column
heatmap_redundant_columns = [
    'trans_count_nas_prc',
    'new_age_category',
    'trans_count_sup_prc',
    'trans_count_atm_prc',
    'rest_dynamic_fdep_1m',
    'rest_dynamic_il_1m',
    'rest_dynamic_cur_1m',
    'rest_dynamic_paym_1m',
    'rest_dynamic_cc_1m',
]
df.drop(columns=heatmap_redundant_columns, inplace=True)


4. Standardizing

In [None]:
# Scaling features using statistics that are robust to outliers.

def robust_scaler(variable):
    """Scale a numeric pandas Series with outlier-robust statistics.

    The series is centred on its median and divided by an inter-quantile
    range. The 25%-75% range is used when it is non-zero; otherwise the
    5%-95% range, and then the 1%-99% range, are tried in turn.

    The range is compared against zero directly. Truncating it to an int
    first would treat any range below 1 (for example 0.5) as zero and pick
    the wrong quantile pair.

    Parameters
    ----------
    variable : pandas.Series
        Column to scale.

    Returns
    -------
    pandas.Series
        - `variable` itself, unchanged, when its dtype is not numeric
          (e.g. object or category columns);
        - otherwise `(variable - median) / range`, rounded to 3 decimals;
        - when every candidate range is zero or NaN (e.g. a constant or
          all-NaN column), the median-centred values rounded to 3 decimals,
          so no division by zero takes place.
    """
    # Local import keeps the function self-contained
    from pandas.api.types import is_numeric_dtype

    if not is_numeric_dtype(variable):
        return variable

    centered = variable - variable.median()

    # Candidate (lower, upper) quantile pairs, from narrowest to widest
    quantile_pairs = ((0.25, 0.75), (0.05, 0.95), (0.01, 0.99))
    for lower_q, upper_q in quantile_pairs:
        spread = variable.quantile(upper_q) - variable.quantile(lower_q)
        # A NaN spread compares unequal to 0, so it is excluded explicitly
        if spread == spread and spread != 0:
            return (centered / spread).round(3)

    # No usable spread: return the centred values without scaling
    return centered.round(3)


### End of Feature Engineering
#### Send df to S3 for modelling

In [None]:
# Upload the feature-engineered DataFrame to S3 as CSV and report where it went
s3_utils.write_csv_to_s3(output_file_key_data_feature_engineering, df)
upload_message = f"File '{output_file_key_data_feature_engineering}' successfully written to bucket '{s3_utils.bucket_name}'."
print(upload_message)