# Exploratory Data Analysis (EDA)
## Customer loans dataset


In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

In [None]:
# Load the training and validation splits from CSV files in the working directory.
df_train = pd.read_csv("orig_train.csv")
df_valid = pd.read_csv("orig_valid.csv")

In [None]:
# Report the number of rows in each split, validation first.
for split in (df_valid, df_train):
    print(len(split))

In [None]:
# Print a concise summary of the training frame: columns, non-null counts and dtypes.
df_train.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the training frame.
df_train.describe()

In [None]:
# Preview the first rows of the training frame (head() defaults to five rows).
df_train.head()

## Dataset Description
#### Variables in order:
**PRODUCT** - Type of financial product the client has  
**AGE** - age of client  
**AREA** - The geographical area where the client resides  
**RESIDENTIAL_PLACE** - Indicates whether the client owns their living place (with further distinction on having a mortgage or not), lives with family, or is renting  
**EDUCATION** - The highest level of education attained by the client  
**MARITAL_STATUS** - Describes the client's marital status, including options such as single, married, divorced, or widowed  
**HOUSEHOLD_MEMBERS** - The number of people living in the client's household  
**NO_OF_DEPENDENTS** - The number of individuals financially dependent on the client  
**INCOME** - The client's monthly income  
**WORK_SENIORITY** - The total number of years the client has been working  
**LENGTH_RELATIONSHIP_WITH_CLIENT** - The duration (in years) of the client's relationship with the bank  
**DEBIT_CARD** - Whether the client owns a debit card  
**CURRENT_ACCOUNT** - Whether the client owns a current account  
**SAVING_ACCOUNT** - Whether the client owns a savings account  
**SALARY_ACCOUNT** - Indicates whether the client has a salary account with the bank, designed for receiving payroll  
**FOREIGN_ACCOUNT** -  Whether the client holds any bank accounts in foreign countries  
**FINALIZED_LOAN** - The number of loans the client has fully repaid  
**DEPOSIT** -   Indicates whether the client has made any fixed or term deposits with the bank  
**PENSION_FUNDS** - Whether the client has invested in any pension funds through the bank  
**DEFAULT_FLAG** - A binary indicator (e.g., Yes/No, 1/0) showing whether the client has defaulted on any financial obligation



## Rozkłady zmiennych numerycznych

In [None]:
# Histogram every numeric column, one figure per split (train first, then
# validation), using the same binning and figure size for both.
hist_options = {"bins": 40, "figsize": (18, 12)}
for split in (df_train, df_valid):
    split.hist(**hist_options)
plt.show()

## Rozkłady zmiennych kategorycznych

In [None]:
# One bar chart of label frequencies per categorical column, stacked vertically.
categorical_columns = ["PRODUCT", "AREA", "RESIDENTIAL_PLACE", "EDUCATION", "MARITAL_STATUS"]
df_strings = df_train[categorical_columns]

n_rows = len(categorical_columns)
fig, axs = plt.subplots(nrows=n_rows, ncols=1, figsize=(10, 5 * n_rows))
for name, axis in zip(categorical_columns, axs):
    counts = df_strings[name].value_counts()
    # Labels are cast to str so every category is drawn as a discrete tick.
    axis.bar(counts.index.astype(str), counts.values)
    axis.set_title(name)
    axis.set_ylabel('Counts')
    axis.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# List the distinct labels of every categorical column, one array per line.
elems = ["PRODUCT", "AREA", "RESIDENTIAL_PLACE", "EDUCATION", "MARITAL_STATUS"]
print(*(df_train[name].unique() for name in elems), sep="\n")


## Zamiana zmiennych kategorycznych na numeryczne

In [None]:
# Integer code assigned to each raw category label, per column. A single table
# is applied to both splits so the train and validation encodings cannot drift
# apart through copy-paste edits.
# NOTE(review): the codes impose an order on the labels (e.g. 'Missing' sits
# mid-scale in EDUCATION, and PRODUCT / MARITAL_STATUS look nominal). Confirm
# this ordering is intended before using order-sensitive statistics on them.
CATEGORY_CODES = {
    "MARITAL_STATUS": {"single": 1, "married": 2, "divorced": 3, "widow": 4},
    "PRODUCT": {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6},
    "AREA": {"County capital": 3, "Urban area": 2, "Rural area": 1, "Missing": 0},
    "RESIDENTIAL_PLACE": {
        "Owner without mortgage": 4,
        "Living with family": 3,
        "Owner with mortgage": 2,
        "Rental": 1,
        "Other": 0,
    },
    "EDUCATION": {
        "University": 9,
        "Post-graduate": 8,
        "Highschool": 7,
        "Missing": 6,
        "Post secondary school": 5,
        "College": 4,
        "Vocational school": 3,
        "Middle school": 2,
        "Primary school": 1,
        "Other": 0,
    },
}

for frame in (df_train, df_valid):
    for column, codes in CATEGORY_CODES.items():
        # replace() leaves any label that is not in the table untouched.
        # infer_objects() makes the numeric dtype explicit instead of relying on
        # replace()'s implicit downcast, which newer pandas releases deprecate.
        frame[column] = frame[column].replace(codes).infer_objects()

# Only the last expression of a notebook cell is displayed, so the preview has
# to be printed explicitly; a bare `df_train.head(20)` here is discarded.
print(df_train.head(20))
df_train.info()

## Macierz Korelacji zmiennych

In [None]:
# Remove the columns that are left out of the correlation analysis.
# errors="ignore" keeps this cell re-runnable: df_train is reassigned here, so
# on a second run the columns are already gone and a strict drop() would raise
# KeyError.
# NOTE(review): the reason these three columns are excluded is not visible in
# this notebook section - confirm before reusing df_train downstream.
df_train = df_train.drop(
    columns=["PENSION_FUNDS", "ECONOMIC_SECTOR", "EMPLOYEE_NO"],
    errors="ignore",
)

# Pairwise Spearman rank correlation between all remaining columns.
correlation_matrix = df_train.corr(method='spearman')

# The matrix is symmetric, so hide the upper triangle and the diagonal and
# keep only one copy of each pair in the plot.
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))

plt.figure(figsize=(20, 20))
sns.heatmap(correlation_matrix, mask=mask, annot=True, fmt=".2f", cmap='coolwarm',
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, vmin=-1, vmax=1)

plt.show()