# Import libraries

In [1]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Set visual style for plots: apply seaborn's "whitegrid" style to the
# figures created after this point
sns.set(style="whitegrid")

Helper function for downcasting numeric column dtypes; we will need it to reduce memory usage.

In [None]:
def _smallest_int_dtype(xmin, xmax, current):
    """Return the narrowest signed integer dtype that can hold [xmin, xmax].

    Only dtypes strictly smaller than ``current`` are considered, so the
    result is never an upcast. Returns ``None`` when nothing smaller fits.
    """
    for name in ('int8', 'int16', 'int32'):
        candidate = np.dtype(name)
        if candidate.itemsize >= current.itemsize:
            break
        limits = np.iinfo(candidate)
        # Inclusive bounds: the limit values themselves are representable.
        # A NaN min/max (empty column) fails both comparisons.
        if limits.min <= xmin and xmax <= limits.max:
            return candidate
    return None


def _smallest_float_dtype(values, xmin, xmax, current, rtol=1e-6):
    """Return the narrowest float dtype that keeps ``values`` intact.

    A candidate must cover the value range AND reproduce every value to
    within the relative tolerance ``rtol`` after a round trip; a range check
    alone would let float16 silently round most real-world values. Returns
    ``None`` when no dtype smaller than ``current`` qualifies.
    """
    for name in ('float16', 'float32'):
        candidate = np.dtype(name)
        if candidate.itemsize >= current.itemsize:
            break
        limits = np.finfo(candidate)
        # NaN or infinite min/max fail this check, leaving the column alone.
        if not (limits.min <= xmin and xmax <= limits.max):
            continue
        round_trip = values.astype(candidate).astype(values.dtype)
        if np.allclose(round_trip, values, rtol=rtol, atol=0.0, equal_nan=True):
            return candidate
    return None


def downcaste_dtype(df: pd.DataFrame) -> pd.DataFrame:
    """Downcast integer and float columns of ``df`` to smaller dtypes.

    Signed-integer columns are converted to the smallest of int8/int16/int32
    whose range contains the column's min and max. Float columns are
    converted to float16/float32 only when the values survive the conversion
    (see ``_smallest_float_dtype``). Columns of any other dtype are left
    untouched, and a column is never converted to a wider dtype.

    The memory usage before and after, and the percentage saved, are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with downcast columns.
    """
    previous_memory_consumption = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_dtype = df[col].dtype
        # Only plain NumPy signed-int / float columns are candidates; this
        # also avoids calling min()/max() on dtypes that may not support it
        # (e.g. unordered categoricals).
        if not isinstance(col_dtype, np.dtype) or col_dtype.kind not in ('i', 'f'):
            continue

        xmin = df[col].min()
        xmax = df[col].max()
        if col_dtype.kind == 'i':
            target = _smallest_int_dtype(xmin, xmax, col_dtype)
        else:
            target = _smallest_float_dtype(df[col].to_numpy(), xmin, xmax, col_dtype)

        if target is not None:
            df[col] = df[col].astype(target)

    after_memory_consumption = df.memory_usage().sum() / 1024**2
    if previous_memory_consumption:
        percentage_of_decrease = ((previous_memory_consumption - after_memory_consumption) / previous_memory_consumption) * 100
    else:
        # Nothing was allocated to begin with; avoid dividing by zero.
        percentage_of_decrease = 0.0

    print('Memory usage before downcasting: {:.2f} MB'.format(previous_memory_consumption))
    print('After Downcasting the memory usage decreased to: {:.2f} MB'.format(after_memory_consumption))
    print('Memory usage decreased by {:.3f}%'.format(percentage_of_decrease))

    return df

# Load datasets

In [None]:
# Read both CSV files into DataFrames; the relative paths are resolved
# against the current working directory.
pos_cash_loans = pd.read_csv('../DataSet/previous_pos_cash_loans.csv')
credit_cards = pd.read_csv('../DataSet/previous_credit_cards.csv')

In [None]:
# Optimizing: downcast numeric columns to smaller dtypes to cut memory use.
# downcaste_dtype prints the before/after memory usage for each frame and
# returns the frame it was given.
pos_cash_loans = downcaste_dtype(pos_cash_loans)
credit_cards = downcaste_dtype(credit_cards)

# Merging datasets

In [None]:
# Left join: keep every POS cash loan row and attach the credit-card rows
# that share the same sk_id_curr value.
# NOTE(review): if sk_id_curr repeats in both frames this is a many-to-many
# join and the row count multiplies — confirm the key cardinality.
# NOTE(review): both frames appear to carry a name_contract_status column
# (each is grouped by it later); overlapping names get pandas' default
# _x/_y suffixes in the result — confirm this is intended.
merged_data = pd.merge(pos_cash_loans, credit_cards, on='sk_id_curr', how='left')

# Data Cleaning and Preprocessing

In [None]:
# Checking for missing values: number of nulls per column
missing_values = merged_data.isnull().sum()

# Handling missing values.
# The filled column is assigned back rather than calling
# fillna(..., inplace=True) on a column selection: that form is chained
# assignment, which emits a FutureWarning on pandas 2.x and leaves
# merged_data unchanged under Copy-on-Write.
# amt_balance: fill with the column median
merged_data['amt_balance'] = merged_data['amt_balance'].fillna(merged_data['amt_balance'].median())
# cnt_instalment: fill with the most frequent value (first mode)
merged_data['cnt_instalment'] = merged_data['cnt_instalment'].fillna(merged_data['cnt_instalment'].mode()[0])

# Exploratory Data Analysis (EDA)

In [2]:
# Descriptive statistics for merged_data; the bare name on the last line makes
# the notebook display the table. Requires the merge cell to have been run
# first, otherwise merged_data is undefined.
summary = merged_data.describe()
summary

NameError: name 'merged_data' is not defined

In [None]:
# Distribution of key numerical features: one histogram per column of
# merged_data, 15 bins each
merged_data.hist(bins=15, figsize=(15, 10), edgecolor='black')
# Add vertical space between subplot rows
plt.subplots_adjust(hspace=0.5)

In [None]:
# Advanced visualization: distribution of the future installment count
# in POS cash loans
plt.figure(figsize=(10, 6))
# sns.distplot is deprecated and scheduled for removal; histplot is its
# replacement and, with kde=False, draws the same count histogram.
sns.histplot(pos_cash_loans['cnt_instalment_future'], bins=30, kde=False)
plt.title('Distribution of Future Installments Count in POS Cash Loans')


In [None]:
# Correlation analysis
# Restrict to numeric columns first: DataFrame.corr() no longer drops
# non-numeric columns silently (pandas >= 2.0) and raises when it meets one.
# The contract-status columns are presumably strings — verify against the data.
correlation_matrix = merged_data.select_dtypes(include='number').corr()
plt.figure(figsize=(12, 8))
# annot=True writes each coefficient inside its heatmap cell
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')

# Aggregation and Grouping

In [None]:
# Mean credit-card balance for each contract status
avg_balance_by_status = credit_cards.groupby('name_contract_status')['amt_balance'].agg('mean')

# Number of POS cash loan rows for each contract status
total_pos_count_by_status = (
    pos_cash_loans.groupby('name_contract_status')
    .size()
    .rename('count')
    .reset_index()
)

# Multi-level aggregation per contract status in credit cards:
# mean balance and summed actual credit limit
agg_data = (
    credit_cards.groupby('name_contract_status')
    .agg(
        amt_balance=('amt_balance', 'mean'),
        amt_credit_limit_actual=('amt_credit_limit_actual', 'sum'),
    )
    .reset_index()
)

# Visualization of Aggregated Data

In [None]:
# Bar plot for average balance by contract status
plt.figure(figsize=(10, 6))
# reset_index() turns the grouped Series into a two-column frame so the
# status labels and the balances can be referenced by column name
sns.barplot(x='name_contract_status', y='amt_balance', data=avg_balance_by_status.reset_index())
plt.title('Average Balance by Contract Status in Credit Cards')
# Rotate the x-axis tick labels by 45 degrees
plt.xticks(rotation=45)

In [None]:
# Bar plot for total POS cash loan count by status
plt.figure(figsize=(12, 6))
# total_pos_count_by_status already has 'name_contract_status' and 'count' columns
sns.barplot(x='name_contract_status', y='count', data=total_pos_count_by_status)
plt.title('Total POS Cash Loan Count by Contract Status')
# Rotate the x-axis tick labels by 45 degrees
plt.xticks(rotation=45)

In [None]:
# Outlier detection using z-scores
# The balance is converted to float64 because downcaste_dtype may have left
# the column in a lower-precision float dtype. nan_policy='omit' keeps a
# single missing balance from turning every z-score into NaN; rows with a
# missing balance get a NaN score and are never flagged.
credit_cards['balance_z_score'] = np.abs(
    stats.zscore(credit_cards['amt_balance'].to_numpy(dtype='float64'), nan_policy='omit')
)
# Flag rows whose balance lies more than 3 standard deviations from the mean
outliers = credit_cards[credit_cards['balance_z_score'] > 3]
outliers