# __Analiza Danych__

### Importy

In [24]:
import pandas as pd
import numpy as np
import math

### Załadowanie zbioru danych

In [3]:
# Read the loan dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('repo/loan_data.csv')
# Preview five random rows. No random_state is passed, so the preview differs on every run.
df.sample(5)

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amnt,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,loan_status
14839,26.0,female,Associate,140421.0,7,MORTGAGE,20000.0,VENTURE,12.21,0.14,2.0,705,No,0
40282,26.0,female,Associate,60910.0,7,RENT,6500.0,VENTURE,13.46,0.11,6.0,605,Yes,0
31784,43.0,male,Master,170084.0,21,MORTGAGE,15000.0,VENTURE,12.53,0.09,16.0,688,No,0
30629,41.0,male,High School,65925.0,16,MORTGAGE,5000.0,VENTURE,13.23,0.08,12.0,678,No,1
40388,23.0,female,Associate,90994.0,0,RENT,8000.0,DEBTCONSOLIDATION,10.22,0.09,3.0,604,Yes,0


## Kolumny do analizy
* person_age
* person_gender
* person_education
* person_income
* person_emp_exp
* person_home_ownership
* cb_person_cred_hist_length

## Użyteczne funkcje

In [None]:
def trim_quantile(df, q1_val, q2_val, verbose=True):
    """Remove outliers that lie beyond 1.5x the inter-quantile range.

    The fences are computed as ``Q_low - 1.5 * (Q_high - Q_low)`` and
    ``Q_high + 1.5 * (Q_high - Q_low)``; only values inside the fences
    (bounds included) are kept.

    Args:
        df: Numeric pandas Series to trim (despite the parameter name, the
            notebook passes single columns to its helpers).
        q1_val: Lower quantile in [0, 1], e.g. 0.25.
        q2_val: Upper quantile in [0, 1], e.g. 0.75.
        verbose: When True (default) print the lower and upper fence, as the
            original implementation always did.

    Returns:
        The subset of ``df`` whose values lie within the fences. NaN values
        fail both comparisons and are therefore dropped.
    """
    lower_quantile = df.quantile(q1_val)
    upper_quantile = df.quantile(q2_val)

    spread = upper_quantile - lower_quantile
    lower_bound = lower_quantile - 1.5 * spread
    upper_bound = upper_quantile + 1.5 * spread
    if verbose:
        print(lower_bound)
        print(upper_bound)

    # Keep what is INSIDE the fences. The previous mask
    # ((df < lower) | (df > upper)) selected only the outliers, i.e. the
    # opposite of trimming.
    return df[(df >= lower_bound) & (df <= upper_bound)]

In [78]:
def get_value_counts(df):
    """Build a frequency table for a single column.

    Args:
        df: pandas Series (one column of the dataset, despite the name).

    Returns:
        DataFrame with one row per distinct value, holding the value itself,
        its ``count`` and ``count_percent`` (share of all counted values,
        in percent, rounded to 2 decimals).
    """
    # The lookup below expects value_counts() to yield a column called
    # 'count' (pandas >= 2.0 naming).
    frequency_table = df.value_counts().to_frame().reset_index()

    occurrences = frequency_table['count']
    total = occurrences.sum()
    frequency_table['count_percent'] = np.round((occurrences / total) * 100, 2)

    return frequency_table

In [227]:
def get_value_count_distribution(df, distribution_lvls):
    """Count how many observations of a numeric column fall into each bucket.

    Bucket edges are ``[floor(min), *distribution_lvls, ceil(max)]``; the
    intervals are right-closed and the first one also includes its left edge.

    Args:
        df: Named numeric pandas Series (one column of the dataset, despite
            the parameter name). Its name is used to build the
            ``<name>_group`` column.
        distribution_lvls: Iterable of interior cut points. Duplicates and
            points that are not strictly between the outer edges are ignored,
            and the order does not matter.

    Returns:
        DataFrame with the columns ``<name>_group`` (interval), ``count`` and
        ``count_percent`` (share in percent, rounded to 2 decimals). Buckets
        that contain no observations are kept with a count of 0.

    Raises:
        ValueError: If the series contains no countable values.
    """
    df_v_counts = df.value_counts().sort_index()
    if df_v_counts.empty:
        raise ValueError('cannot build a distribution from an empty series')

    index_name = df_v_counts.index.name
    index_group_name = index_name + '_group'

    # floor/ceil instead of int(): int() truncates toward zero, so a
    # non-integer maximum (or a negative non-integer minimum) ended up outside
    # the outer bin and was silently dropped from the counts.
    df_min = math.floor(df_v_counts.index.min())
    df_max = math.ceil(df_v_counts.index.max())
    if df_max == df_min:
        # A single integral value would give two identical edges, which
        # pd.cut rejects; widen the only bucket by one unit.
        df_max = df_min + 1

    # pd.cut needs unique, increasing edges: drop cut points that coincide
    # with or lie beyond the outer edges, de-duplicate and sort the rest.
    inner_edges = sorted({lvl for lvl in distribution_lvls if df_min < lvl < df_max})
    bins = [df_min] + inner_edges + [df_max]

    df_v_counts = df_v_counts.to_frame()

    df_v_counts[index_group_name] = pd.cut(df_v_counts.index, bins=bins, include_lowest=True)
    df_v_counts = df_v_counts.reset_index().drop(columns=index_name)

    # observed=False is spelled out to keep empty buckets in the result and to
    # silence the FutureWarning pandas emits when grouping by a categorical.
    df_vc_distribution = df_v_counts.groupby(by=index_group_name, observed=False).sum().reset_index()
    df_vc_distribution['count_percent'] = np.round((df_vc_distribution['count'] / df_vc_distribution['count'].sum()) * 100, 2)

    return df_vc_distribution

## 1. _person_age_ - wiek klienta

Wszystkie wartości

In [None]:
# Frequency table of every distinct age, shown in ascending age order.
df_p_age = get_value_counts(df['person_age'])
display(df_p_age.sort_values('person_age'))


Unnamed: 0,person_age,count,count_percent
38,20.0,17,0.04
11,21.0,1289,2.86
3,22.0,4236,9.41
0,23.0,5254,11.68
1,24.0,5138,11.42
2,25.0,4507,10.02
4,26.0,3659,8.13
5,27.0,3095,6.88
6,28.0,2728,6.06
7,29.0,2455,5.46


Rozkład

In [230]:
# Bucket ages at the cut points 21/26/35/40/50/65; the outermost edges are
# taken from the minimum and maximum age present in the data.
df_p_age_dist = get_value_count_distribution(df['person_age'], [21,26,35,40,50,65])
display(df_p_age_dist)

  df_vc_distribution = df_v_counts.groupby(by=index_group_name).sum().reset_index()


Unnamed: 0,person_age_group,count,count_percent
0,"(19.999, 21.0]",1306,2.9
1,"(21.0, 26.0]",22794,50.65
2,"(26.0, 35.0]",16414,36.48
3,"(35.0, 40.0]",2647,5.88
4,"(40.0, 50.0]",1511,3.36
5,"(50.0, 65.0]",289,0.64
6,"(65.0, 144.0]",39,0.09


## 2. _person_gender_ - płeć klienta

In [231]:
# Frequency table of gender values, most frequent first.
df_p_gender = get_value_counts(df['person_gender'])
display(df_p_gender.sort_values('count', ascending=False))

Unnamed: 0,person_gender,count,count_percent
0,male,24841,55.2
1,female,20159,44.8


## 3. _person_education_ - wykształcenie klienta

In [232]:
# Frequency table of education levels, most frequent first.
df_p_education = get_value_counts(df['person_education'])
display(df_p_education.sort_values('count', ascending=False))

Unnamed: 0,person_education,count,count_percent
0,Bachelor,13399,29.78
1,Associate,12028,26.73
2,High School,11972,26.6
3,Master,6980,15.51
4,Doctorate,621,1.38


## 4. _person_income_ - dochód klienta

In [236]:
# Bucket incomes at 10k/50k/100k/200k/400k; the outermost edges are taken
# from the minimum and maximum income present in the data.
df_p_income_dist = get_value_count_distribution(df['person_income'], [10000,50000,100000,200000,400000])
display(df_p_income_dist)

  df_vc_distribution = df_v_counts.groupby(by=index_group_name).sum().reset_index()


Unnamed: 0,person_income_group,count,count_percent
0,"(7999.999, 10000.0]",31,0.07
1,"(10000.0, 50000.0]",13016,28.92
2,"(50000.0, 100000.0]",21929,48.73
3,"(100000.0, 200000.0]",8781,19.51
4,"(200000.0, 400000.0]",1078,2.4
5,"(400000.0, 7200766.0]",165,0.37


## 5. _person_emp_exp_ - doświadczenie zawodowe klienta

Wszystkie wartości

In [240]:
# Frequency table of every distinct person_emp_exp value, in ascending order.
df_p_emp_exp = get_value_counts(df['person_emp_exp'])
display(df_p_emp_exp.sort_values('person_emp_exp'))

Unnamed: 0,person_emp_exp,count,count_percent
0,0,9566,21.26
2,1,4061,9.02
1,2,4134,9.19
3,3,3890,8.64
4,4,3524,7.83
...,...,...,...
53,100,1,0.00
54,101,1,0.00
51,121,1,0.00
55,124,1,0.00


Rozkład

In [244]:
# Bucket person_emp_exp at the cut points 1/3/5/10; the outermost edges are
# taken from the minimum and maximum value present in the data.
df_p_emp_exp_dist = get_value_count_distribution(df['person_emp_exp'], [1, 3, 5, 10])
display(df_p_emp_exp_dist)

  df_vc_distribution = df_v_counts.groupby(by=index_group_name).sum().reset_index()


Unnamed: 0,person_emp_exp_group,count,count_percent
0,"(-0.001, 1.0]",13627,30.28
1,"(1.0, 3.0]",8024,17.83
2,"(3.0, 5.0]",6524,14.5
3,"(5.0, 10.0]",9769,21.71
4,"(10.0, 125.0]",7056,15.68


## 6. _person_home_ownership_ - status posiadania nieruchomości

In [246]:
# Frequency table of home-ownership categories, displayed in the order
# returned by get_value_counts (no extra sorting is applied here).
df_p_home_own = get_value_counts(df['person_home_ownership'])
display(df_p_home_own)

Unnamed: 0,person_home_ownership,count,count_percent
0,RENT,23443,52.1
1,MORTGAGE,18489,41.09
2,OWN,2951,6.56
3,OTHER,117,0.26


## 7. _cb_person_cred_hist_length_ - długość historii kredytowej w latach

Wszystkie wartości

In [250]:
# Frequency table of every distinct cb_person_cred_hist_length value, in ascending order.
df_cb_p_cred_hist_length = get_value_counts(df['cb_person_cred_hist_length'])
display(df_cb_p_cred_hist_length.sort_values('cb_person_cred_hist_length'))

Unnamed: 0,cb_person_cred_hist_length,count,count_percent
2,2.0,6537,14.53
1,3.0,8312,18.47
0,4.0,8653,19.23
3,5.0,3082,6.85
4,6.0,2966,6.59
5,7.0,2889,6.42
6,8.0,2800,6.22
7,9.0,2685,5.97
8,10.0,2457,5.46
10,11.0,712,1.58


Rozkład

In [256]:
# Bucket cb_person_cred_hist_length at the cut points 3/5/8/12; the outermost
# edges are taken from the minimum and maximum value present in the data.
df_cb_p_cred_hist_length_dist = get_value_count_distribution(df['cb_person_cred_hist_length'], [3,5,8,12])
display(df_cb_p_cred_hist_length_dist)

  df_vc_distribution = df_v_counts.groupby(by=index_group_name).sum().reset_index()


Unnamed: 0,cb_person_cred_hist_length_group,count,count_percent
0,"(1.999, 3.0]",14849,33.0
1,"(3.0, 5.0]",11735,26.08
2,"(5.0, 8.0]",8655,19.23
3,"(8.0, 12.0]",6569,14.6
4,"(12.0, 30.0]",3192,7.09
