# Pandas
> - 데이터 과학자를 위해 **테이블형태**로 데이터를 다룰 수 있게 해주는 패키지(python용 엑셀)
> - 기존 데이터처리 라이브러리인 numpy 대신 주로 사용
> - 일반인이 데이터분석을 접하기 쉽게 만들어준 결정적인 라이브러리
> - pandas만으로도 충분히 데이터 분석이 가능할 정도로 고수준의 함수들을 내장
> - 앞으로 진행하는 데이터분석 과정에서 주로 사용하게 될 데이터구조

## pandas 설치 및 import
    
> 콘솔창에서 실행 시  
**`pip install pandas`**  
**`conda install pandas`**
    
> 주피터 노트북으로 실행 시  
**`!pip install pandas`**
    
아나콘다 환경으로 python 환경설정 시 기본적으로 설치가 되어있음

In [1]:
# pandas 설치
# !pip install pandas

In [49]:
# numpy import
import numpy as np

# pandas import
import pandas as pd
# pd라는 닉네임은 많은 파이썬 유저들이 사용하고 있는 닉네임, 분석을 위한 필수는 아니지만 되도록이면 위와 같이 사용을 해줍시다.

pd.options.display.max_columns = 200
# 불러들이는 데이터에 맞춰 모든 컬럼을 확인 가능하도록 옵션값을 주었습니다.
pd.options.display.max_info_columns =200
# 그냥 실행 시키시고 지금 이해 못하셔도 좋습니다.

## DataFrame
> - 엑셀에 익숙한 사용자를 위해 제작 된 **테이블형태의 데이터 구조**  
> - 다양한 형태의 데이터를 받아 사용할 수 있으며 다양한 **통계, 시각화 함수를 제공**한다.  

실제 데이터를 불러들이고 값을 확인 해 보며 기본적인 pandas 사용법을 익혀보도록 하겠습니다.

### 데이터 불러오기
pandas는 다양한 데이터 파일 형태를 지원하며 주로 csv, xlsx, sql, json을 사용합니다.
    
> **`read_csv()`**  
**`read_excel()`**  
**`read_sql()`**  
**`read_json()`**  
**`json_normalize()`**

In [3]:
# DataFrame 의 약자로서 형식적으로 df 변수명을 사용한다.
# pandas패키지의 read_csv() 함수를 사용하여 loan1.csv 파일을 불러들여 데이터프레임을 만들고 df 이름의 변수로 저장
df= pd.read_csv('./data/loan1.csv')

In [4]:
df

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_act_il,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,revol_bal_joint,sec_app_earliest_cr_line,sec_app_inq_last_6mths,sec_app_mort_acc,sec_app_open_acc,sec_app_revol_util,sec_app_open_act_il,sec_app_num_rev_accts,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,hardship_flag,hardship_type,hardship_reason,hardship_status,deferral_term,hardship_amount,hardship_start_date,hardship_end_date,payment_plan_start_date,hardship_length,hardship_dpd,hardship_loan_status,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,,,10000,10000,10000.0,36 months,9.44,320.05,B,B1,mechanic,6 years,MORTGAGE,80000.0,Not Verified,Dec-2017,Current,n,,,credit_card,Credit card refinancing,762xx,TX,14.82,0.0,Jul-2007,0.0,34.0,,8.0,0.0,5225,73.6,30.0,w,6442.28,6442.28,4493.810000,4493.81,3557.72,936.09,0.0,0.0,0.0,Feb-2019,320.05,Mar-2019,Feb-2019,0.0,,1,Individual,,,,0.0,0.0,173110.0,0.0,2.0,0.0,2.0,23.0,12496.0,39.0,0.0,0.0,3949.0,45.0,7100.0,1.0,0.0,0.0,2.0,21639.0,1875.0,73.6,0.0,0.0,125.0,78.0,26.0,23.0,3.0,26.0,,21.0,,0.0,2.0,2.0,4.0,4.0,21.0,4.0,5.0,2.0,8.0,0.0,0.0,0.0,0.0,96.4,25.0,0.0,0.0,196130.0,17756.0,7100.0,31992.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
1,,,3500,3500,3500.0,36 months,10.42,113.63,B,B3,,,OWN,90000.0,Not Verified,Dec-2017,Current,n,,,other,Other,295xx,SC,28.51,0.0,Jun-2002,0.0,39.0,28.0,12.0,3.0,6953,51.9,38.0,w,2266.55,2266.55,1586.770000,1586.77,1233.45,353.32,0.0,0.0,0.0,Feb-2019,113.63,Mar-2019,Feb-2019,0.0,,1,Individual,,,,0.0,0.0,339028.0,0.0,4.0,0.0,3.0,22.0,76501.0,69.0,1.0,2.0,1628.0,65.0,13400.0,1.0,5.0,1.0,5.0,28252.0,808.0,82.4,0.0,0.0,164.0,186.0,7.0,7.0,2.0,7.0,39.0,7.0,39.0,0.0,4.0,7.0,4.0,10.0,19.0,7.0,17.0,7.0,12.0,0.0,0.0,0.0,1.0,97.3,75.0,0.0,3.0,416685.0,83454.0,4600.0,110595.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
2,,,5000,5000,5000.0,36 months,13.59,169.90,C,C2,Truck driver,10+ years,OWN,168000.0,Not Verified,Dec-2017,Current,n,,,other,Other,788xx,TX,11.62,0.0,Aug-2002,0.0,44.0,,4.0,0.0,3401,97.2,12.0,w,3291.95,3291.95,2371.050000,2371.05,1708.05,663.00,0.0,0.0,0.0,Feb-2019,169.90,Mar-2019,Feb-2019,0.0,44.0,1,Individual,,,,0.0,0.0,51673.0,1.0,3.0,1.0,1.0,5.0,48272.0,48.0,0.0,0.0,3401.0,53.0,3500.0,2.0,0.0,3.0,1.0,12918.0,99.0,97.2,0.0,0.0,135.0,184.0,54.0,5.0,0.0,54.0,44.0,5.0,44.0,2.0,1.0,1.0,1.0,4.0,6.0,1.0,6.0,1.0,4.0,0.0,0.0,0.0,1.0,83.3,100.0,0.0,0.0,82176.0,51673.0,3500.0,78676.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
3,,,14000,14000,14000.0,36 months,10.91,457.75,B,B4,Confidential Secretary,2 years,RENT,39000.0,Source Verified,Dec-2017,Current,n,,,credit_card,Credit card refinancing,125xx,NY,22.88,0.0,Apr-2003,0.0,74.0,,8.0,0.0,12918,59.5,16.0,w,9090.87,9090.87,6391.530000,6391.53,4909.13,1482.40,0.0,0.0,0.0,Feb-2019,457.75,Mar-2019,Feb-2019,0.0,74.0,1,Individual,,,,0.0,457.0,29103.0,1.0,1.0,1.0,2.0,4.0,16185.0,95.0,1.0,5.0,10153.0,75.0,21700.0,2.0,6.0,0.0,7.0,3638.0,7265.0,61.6,0.0,0.0,36.0,176.0,7.0,4.0,0.0,7.0,,16.0,74.0,1.0,2.0,4.0,3.0,4.0,4.0,7.0,12.0,4.0,8.0,0.0,0.0,0.0,2.0,93.8,33.3,0.0,0.0,38704.0,29103.0,18900.0,17004.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
4,,,5000,5000,5000.0,36 months,13.59,169.90,C,C2,General Manager,< 1 year,RENT,55000.0,Not Verified,Dec-2017,Current,n,,,debt_consolidation,Debt consolidation,672xx,KS,12.18,0.0,Jun-1999,0.0,,,5.0,0.0,4497,91.8,6.0,w,3291.95,3291.95,2371.050000,2371.05,1708.05,663.00,0.0,0.0,0.0,Feb-2019,169.90,Mar-2019,Feb-2019,0.0,,1,Individual,,,,0.0,471.0,7202.0,0.0,1.0,0.0,0.0,71.0,2705.0,12.0,0.0,0.0,1483.0,27.0,4900.0,0.0,1.0,0.0,0.0,1440.0,403.0,91.8,0.0,0.0,149.0,222.0,43.0,43.0,0.0,43.0,,,,0.0,4.0,4.0,4.0,4.0,2.0,4.0,4.0,4.0,5.0,0.0,0.0,0.0,0.0,100.0,100.0,0.0,0.0,26841.0,7202.0,4900.0,21941.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,,,10800,10800,10800.0,36 months,9.44,345.66,B,B1,Partner,3 years,MORTGAGE,240000.0,Not Verified,Nov-2017,Current,n,,,major_purchase,Major purchase,370xx,TN,9.25,0.0,Oct-2005,1.0,42.0,,6.0,0.0,128754,68.4,10.0,w,6667.33,6667.33,5173.570000,5173.57,4132.67,1040.90,0.0,0.0,0.0,Feb-2019,345.66,Mar-2019,Feb-2019,0.0,,1,Individual,,,,0.0,0.0,414761.0,1.0,2.0,1.0,1.0,2.0,74155.0,,1.0,1.0,911.0,68.0,152300.0,1.0,1.0,4.0,2.0,69127.0,726.0,68.4,0.0,0.0,145.0,107.0,10.0,2.0,2.0,89.0,42.0,0.0,42.0,0.0,2.0,3.0,2.0,2.0,5.0,3.0,3.0,3.0,6.0,0.0,0.0,0.0,2.0,77.8,0.0,0.0,0.0,469534.0,202909.0,2300.0,84548.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
19996,,,14000,14000,14000.0,60 months,14.08,326.34,C,C3,Senior Property Manager,10+ years,OWN,53300.0,Verified,Nov-2017,Current,n,,,major_purchase,Major purchase,481xx,MI,12.95,0.0,Mar-1987,1.0,27.0,,7.0,0.0,1536,30.7,17.0,w,11358.71,11358.71,4873.200000,4873.20,2641.29,2231.91,0.0,0.0,0.0,Feb-2019,326.34,Mar-2019,Feb-2019,0.0,40.0,1,Individual,,,,0.0,0.0,85489.0,0.0,1.0,1.0,1.0,8.0,12118.0,84.0,0.0,1.0,0.0,71.0,5000.0,2.0,0.0,2.0,2.0,14248.0,,,0.0,0.0,140.0,368.0,20.0,8.0,1.0,,40.0,0.0,40.0,3.0,0.0,2.0,0.0,3.0,5.0,5.0,11.0,2.0,7.0,0.0,0.0,0.0,1.0,52.9,,0.0,0.0,105375.0,13654.0,0.0,14375.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
19997,,,35000,35000,35000.0,60 months,12.62,789.57,C,C1,Mechanic engineer,7 years,RENT,85000.0,Source Verified,Nov-2017,Fully Paid,n,,,credit_card,Credit card refinancing,030xx,NH,16.77,0.0,Jul-2002,2.0,64.0,,11.0,0.0,23115,43.6,29.0,w,0.00,0.00,38737.465857,38737.47,35000.00,3737.47,0.0,0.0,0.0,Oct-2018,31680.42,,Oct-2018,1.0,,1,Joint App,160000.0,18.86,Source Verified,0.0,468.0,47500.0,0.0,1.0,0.0,0.0,25.0,24385.0,,0.0,0.0,9076.0,44.0,49500.0,0.0,0.0,2.0,0.0,4750.0,24385.0,45.7,0.0,0.0,184.0,91.0,27.0,25.0,0.0,27.0,,0.0,,0.0,4.0,4.0,8.0,17.0,8.0,10.0,21.0,4.0,11.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,87549.0,47500.0,47500.0,38049.0,40609.0,Oct-2006,1.0,0.0,34.0,45.9,17.0,30.0,0.0,0.0,69.0,N,,,,,,,,,,,,,,,Cash,N,,,,,,
19998,,,35225,35225,35225.0,60 months,19.03,914.34,D,D3,Teacher,8 years,MORTGAGE,42000.0,Source Verified,Nov-2017,Current,n,,,debt_consolidation,Debt consolidation,532xx,WI,32.80,0.0,May-2003,0.0,65.0,,15.0,0.0,10432,60.0,30.0,w,29254.00,29254.00,13640.620000,13640.62,5971.00,7669.62,0.0,0.0,0.0,Feb-2019,914.34,Mar-2019,Feb-2019,0.0,,1,Joint App,82000.0,25.02,Source Verified,0.0,0.0,162765.0,0.0,7.0,0.0,0.0,46.0,19208.0,35.0,0.0,0.0,4515.0,41.0,17400.0,0.0,0.0,0.0,0.0,11626.0,5315.0,65.5,0.0,0.0,169.0,174.0,52.0,40.0,1.0,89.0,,,65.0,0.0,3.0,4.0,4.0,7.0,10.0,6.0,17.0,4.0,15.0,0.0,0.0,0.0,0.0,96.4,50.0,0.0,0.0,213708.0,29838.0,15400.0,54560.0,29018.0,May-2004,0.0,1.0,11.0,56.0,1.0,11.0,0.0,0.0,,N,,,,,,,,,,,,,,,Cash,N,,,,,,


In [5]:
!pip install xlrd



In [17]:
!pip install openpyxl



In [18]:
# 만약 모듈을 찾을 수 없는 오류가 발생한다면 추가 모듈 설치
# 필요 모듈 import
!pip install pyxlsb

Collecting pyxlsb
  Downloading pyxlsb-1.0.9-py2.py3-none-any.whl (23 kB)
Installing collected packages: pyxlsb
Successfully installed pyxlsb-1.0.9


In [6]:
# 엑셀파일에 시트에 따라 데이터 구분이 지어진 경우 시트별로 데이터프레임 제작 가능
# 다른 엑셀파일형식을 가져올 때 engine파라메터 추가해주시면 됩니다.
df1 = pd.read_excel('./data/loan1.xlsx')
"""df1 = pd.read_excel('./data/loan1.xlsx', 
                    sheet_name='구매영수증상세+상품마스터포함', 
                    engine='pyxlsb',
                    encoding='utf-8') # 윈도우의 경우 cp949"""

"df1 = pd.read_excel('./data/loan1.xlsx', \n                    sheet_name='구매영수증상세+상품마스터포함', \n                    engine='pyxlsb',\n                    encoding='utf-8') # 윈도우의 경우 cp949"

In [7]:
df1

Unnamed: 0.1,Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_act_il,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,revol_bal_joint,sec_app_earliest_cr_line,sec_app_inq_last_6mths,sec_app_mort_acc,sec_app_open_acc,sec_app_revol_util,sec_app_open_act_il,sec_app_num_rev_accts,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,hardship_flag,hardship_type,hardship_reason,hardship_status,deferral_term,hardship_amount,hardship_start_date,hardship_end_date,payment_plan_start_date,hardship_length,hardship_dpd,hardship_loan_status,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,0,,,10000,10000,10000.0,36 months,9.44,320.05,B,B1,mechanic,6 years,MORTGAGE,80000.0,Not Verified,Dec-2017,Current,n,,,credit_card,Credit card refinancing,762xx,TX,14.82,0,Jul-2007,0,34.0,,8,0,5225,73.6,30,w,6442.28,6442.28,4493.810000,4493.81,3557.72,936.09,0.0,0.0,0.0,Feb-2019,320.05,Mar-2019,Feb-2019,0,,1,Individual,,,,0,0,173110,0,2,0,2,23.0,12496,39.0,0,0,3949,45.0,7100,1,0,0,2,21639.0,1875.0,73.6,0,0,125.0,78,26,23,3,26.0,,21.0,,0,2,2,4,4,21,4,5,2,8,0.0,0,0,0,96.4,25.0,0,0,196130,17756,7100,31992,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
1,1,,,3500,3500,3500.0,36 months,10.42,113.63,B,B3,,,OWN,90000.0,Not Verified,Dec-2017,Current,n,,,other,Other,295xx,SC,28.51,0,Jun-2002,0,39.0,28.0,12,3,6953,51.9,38,w,2266.55,2266.55,1586.770000,1586.77,1233.45,353.32,0.0,0.0,0.0,Feb-2019,113.63,Mar-2019,Feb-2019,0,,1,Individual,,,,0,0,339028,0,4,0,3,22.0,76501,69.0,1,2,1628,65.0,13400,1,5,1,5,28252.0,808.0,82.4,0,0,164.0,186,7,7,2,7.0,39.0,7.0,39.0,0,4,7,4,10,19,7,17,7,12,0.0,0,0,1,97.3,75.0,0,3,416685,83454,4600,110595,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
2,2,,,5000,5000,5000.0,36 months,13.59,169.90,C,C2,Truck driver,10+ years,OWN,168000.0,Not Verified,Dec-2017,Current,n,,,other,Other,788xx,TX,11.62,0,Aug-2002,0,44.0,,4,0,3401,97.2,12,w,3291.95,3291.95,2371.050000,2371.05,1708.05,663.00,0.0,0.0,0.0,Feb-2019,169.90,Mar-2019,Feb-2019,0,44.0,1,Individual,,,,0,0,51673,1,3,1,1,5.0,48272,48.0,0,0,3401,53.0,3500,2,0,3,1,12918.0,99.0,97.2,0,0,135.0,184,54,5,0,54.0,44.0,5.0,44.0,2,1,1,1,4,6,1,6,1,4,0.0,0,0,1,83.3,100.0,0,0,82176,51673,3500,78676,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
3,3,,,14000,14000,14000.0,36 months,10.91,457.75,B,B4,Confidential Secretary,2 years,RENT,39000.0,Source Verified,Dec-2017,Current,n,,,credit_card,Credit card refinancing,125xx,NY,22.88,0,Apr-2003,0,74.0,,8,0,12918,59.5,16,w,9090.87,9090.87,6391.530000,6391.53,4909.13,1482.40,0.0,0.0,0.0,Feb-2019,457.75,Mar-2019,Feb-2019,0,74.0,1,Individual,,,,0,457,29103,1,1,1,2,4.0,16185,95.0,1,5,10153,75.0,21700,2,6,0,7,3638.0,7265.0,61.6,0,0,36.0,176,7,4,0,7.0,,16.0,74.0,1,2,4,3,4,4,7,12,4,8,0.0,0,0,2,93.8,33.3,0,0,38704,29103,18900,17004,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
4,4,,,5000,5000,5000.0,36 months,13.59,169.90,C,C2,General Manager,< 1 year,RENT,55000.0,Not Verified,Dec-2017,Current,n,,,debt_consolidation,Debt consolidation,672xx,KS,12.18,0,Jun-1999,0,,,5,0,4497,91.8,6,w,3291.95,3291.95,2371.050000,2371.05,1708.05,663.00,0.0,0.0,0.0,Feb-2019,169.90,Mar-2019,Feb-2019,0,,1,Individual,,,,0,471,7202,0,1,0,0,71.0,2705,12.0,0,0,1483,27.0,4900,0,1,0,0,1440.0,403.0,91.8,0,0,149.0,222,43,43,0,43.0,,,,0,4,4,4,4,2,4,4,4,5,0.0,0,0,0,100.0,100.0,0,0,26841,7202,4900,21941,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,49995,,,13000,13000,13000.0,60 months,18.06,330.54,D,D2,Clinical Supervisor,10+ years,RENT,60000.0,Not Verified,Nov-2017,Current,n,,,debt_consolidation,Debt consolidation,554xx,MN,37.24,0,Jun-2003,1,,,25,0,12046,35.3,44,w,10748.94,10748.94,4932.010000,4932.01,2251.06,2680.95,0.0,0.0,0.0,Feb-2019,330.54,Mar-2019,Feb-2019,0,,1,Individual,,,,0,0,108816,1,15,2,4,6.0,96770,77.0,2,2,7281,68.0,34100,0,0,4,6,4534.0,20054.0,37.5,0,0,170.0,173,7,6,0,7.0,,6.0,,0,5,5,8,10,31,10,13,5,25,0.0,0,0,4,100.0,25.0,0,0,160629,108816,32100,126529,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
49996,49996,,,12000,12000,12000.0,60 months,13.59,276.68,C,C2,consultant,2 years,RENT,90000.0,Not Verified,Nov-2017,Fully Paid,n,,,credit_card,Credit card refinancing,773xx,TX,6.19,0,Sep-2003,1,,,12,0,16142,29.1,25,w,0.00,0.00,13886.761900,13886.76,12000.00,1886.76,0.0,0.0,0.0,Feb-2019,10000.07,,Feb-2019,0,,1,Individual,,,,0,52,115385,1,3,1,3,5.0,99243,88.0,0,2,8961,43.0,55400,0,0,1,5,11539.0,39258.0,29.1,0,0,130.0,125,15,5,0,15.0,,6.0,,0,4,4,7,11,8,8,16,6,12,0.0,0,0,1,100.0,14.3,0,0,151664,115385,55400,96264,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
49997,49997,,,20000,20000,20000.0,60 months,15.05,476.33,C,C4,Controls Engineer,2 years,MORTGAGE,105000.0,Not Verified,Nov-2017,Current,n,,,debt_consolidation,Debt consolidation,458xx,OH,15.17,0,Sep-2005,0,,,9,0,4791,51.0,34,w,16309.59,16309.59,7128.230000,7128.23,3690.41,3437.82,0.0,0.0,0.0,Feb-2019,476.33,Mar-2019,Feb-2019,0,,1,Individual,,,,0,2968,136467,0,4,0,2,18.0,45920,75.0,0,1,2276,71.0,9400,1,1,0,4,15163.0,24.0,99.0,0,0,146.0,141,13,13,2,85.0,,19.0,,0,1,3,1,2,17,4,15,3,9,0.0,0,0,0,100.0,100.0,0,0,159023,50711,2300,61623,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
49998,49998,,,7000,7000,7000.0,36 months,7.35,217.27,A,A4,Hair Stylist,< 1 year,RENT,30000.0,Verified,Nov-2017,Current,n,,,credit_card,Credit card refinancing,112xx,NY,32.64,1,Nov-2000,0,10.0,,11,0,6654,15.7,28,w,4268.88,4268.88,3256.190000,3256.19,2731.12,525.07,0.0,0.0,0.0,Feb-2019,217.27,Mar-2019,Feb-2019,0,,1,Joint App,90000.0,11.93,Verified,0,0,31450,0,1,0,1,24.0,24796,,0,1,1508,16.0,42500,0,0,0,2,3145.0,28772.0,8.7,0,0,188.0,204,15,15,0,15.0,,16.0,,0,2,5,4,9,5,10,23,5,11,0.0,0,0,0,96.4,0.0,0,0,77748,31450,31500,35248,11338.0,Aug-2014,0.0,0.0,4.0,62.2,0.0,4.0,0.0,0.0,,N,,,,,,,,,,,,,,,Cash,N,,,,,,


\    /\
 )  ( ')
(  /  )
 \(__)|
 


In [8]:
# 참고! 실습은 하지 않습니다만 쿼리를 사용하여 데이터베이스로부터 데이터프레임을 만드는 것도 가능합니다.
# 데이터베이스로 부터 자료 읽기

# 필요한 모듈 추가 설치 - 각 데이터베이스 별로 다릅니다.
# !pip install pymysql

# sql 모듈 로드하기
# import pymysql
# mysql, mariadb, sqlite, postgresql, ms-sql, oracle, mongodb

# 접속하기
# 접속방법 또한 DB 종류에 따라 다릅니다.
# con = pymysql.connect(host='db서버주소', port=3306, user='id', passwd='pwd', db='dbname')

# query 만들기
# query = 'select * from samples'

# 자료 불러오기
# data = pd.read_sql(query, con=con)

### 데이터 저장하기
불러들인 혹은 작업을 마친 데이터프레임을 다양한 파일형태로 저장이 가능합니다.
    
> **`to_csv()`**  
**`to_excel()`**  
**`to_sql()`**

In [18]:
# index=False 파라메터는 기존 데이터프레임의 인덱스를 무시하고 저장
df.to_csv('./data/save_test.csv', index=False)

### 사용 데이터 간략 설명
> 미국 핀테크 회사인 lending club의 대출 데이터베이스  
클라우드펀딩과 대출을 결합한 핀테크의 시초라고 부를 수 있는 회사  
방대한 양의 대출정보를 공개하면서 금융정보분석에도 기여한 공이 큰 데이터  
2007 ~ 2015 년 대출정보 및 개인정보를 담고 있음  
226만건, 145항목 정보를 담고있음  
실습데이터는 이 중 4만건을 추출한 데이터를 사용합니다.  

데이터출처: https://www.kaggle.com/wordsforthewise/lending-club

### 데이터 살펴보기

In [19]:
# 데이터를 불러들인 후 가장 처음 하는 작업
# 데이터의 구조, 형태 파악하기
# 데이터의 첫 5개 데이터 하나(샘플, 인스턴스) 확인하기
df.head()
# 10개를 확인하려면?

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_act_il,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,revol_bal_joint,sec_app_earliest_cr_line,sec_app_inq_last_6mths,sec_app_mort_acc,sec_app_open_acc,sec_app_revol_util,sec_app_open_act_il,sec_app_num_rev_accts,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,hardship_flag,hardship_type,hardship_reason,hardship_status,deferral_term,hardship_amount,hardship_start_date,hardship_end_date,payment_plan_start_date,hardship_length,hardship_dpd,hardship_loan_status,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,,,10000,10000,10000.0,36 months,9.44,320.05,B,B1,mechanic,6 years,MORTGAGE,80000.0,Not Verified,Dec-2017,Current,n,,,credit_card,Credit card refinancing,762xx,TX,14.82,0.0,Jul-2007,0.0,34.0,,8.0,0.0,5225,73.6,30.0,w,6442.28,6442.28,4493.81,4493.81,3557.72,936.09,0.0,0.0,0.0,Feb-2019,320.05,Mar-2019,Feb-2019,0.0,,1,Individual,,,,0.0,0.0,173110.0,0.0,2.0,0.0,2.0,23.0,12496.0,39.0,0.0,0.0,3949.0,45.0,7100.0,1.0,0.0,0.0,2.0,21639.0,1875.0,73.6,0.0,0.0,125.0,78.0,26.0,23.0,3.0,26.0,,21.0,,0.0,2.0,2.0,4.0,4.0,21.0,4.0,5.0,2.0,8.0,0.0,0.0,0.0,0.0,96.4,25.0,0.0,0.0,196130.0,17756.0,7100.0,31992.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
1,,,3500,3500,3500.0,36 months,10.42,113.63,B,B3,,,OWN,90000.0,Not Verified,Dec-2017,Current,n,,,other,Other,295xx,SC,28.51,0.0,Jun-2002,0.0,39.0,28.0,12.0,3.0,6953,51.9,38.0,w,2266.55,2266.55,1586.77,1586.77,1233.45,353.32,0.0,0.0,0.0,Feb-2019,113.63,Mar-2019,Feb-2019,0.0,,1,Individual,,,,0.0,0.0,339028.0,0.0,4.0,0.0,3.0,22.0,76501.0,69.0,1.0,2.0,1628.0,65.0,13400.0,1.0,5.0,1.0,5.0,28252.0,808.0,82.4,0.0,0.0,164.0,186.0,7.0,7.0,2.0,7.0,39.0,7.0,39.0,0.0,4.0,7.0,4.0,10.0,19.0,7.0,17.0,7.0,12.0,0.0,0.0,0.0,1.0,97.3,75.0,0.0,3.0,416685.0,83454.0,4600.0,110595.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
2,,,5000,5000,5000.0,36 months,13.59,169.9,C,C2,Truck driver,10+ years,OWN,168000.0,Not Verified,Dec-2017,Current,n,,,other,Other,788xx,TX,11.62,0.0,Aug-2002,0.0,44.0,,4.0,0.0,3401,97.2,12.0,w,3291.95,3291.95,2371.05,2371.05,1708.05,663.0,0.0,0.0,0.0,Feb-2019,169.9,Mar-2019,Feb-2019,0.0,44.0,1,Individual,,,,0.0,0.0,51673.0,1.0,3.0,1.0,1.0,5.0,48272.0,48.0,0.0,0.0,3401.0,53.0,3500.0,2.0,0.0,3.0,1.0,12918.0,99.0,97.2,0.0,0.0,135.0,184.0,54.0,5.0,0.0,54.0,44.0,5.0,44.0,2.0,1.0,1.0,1.0,4.0,6.0,1.0,6.0,1.0,4.0,0.0,0.0,0.0,1.0,83.3,100.0,0.0,0.0,82176.0,51673.0,3500.0,78676.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
3,,,14000,14000,14000.0,36 months,10.91,457.75,B,B4,Confidential Secretary,2 years,RENT,39000.0,Source Verified,Dec-2017,Current,n,,,credit_card,Credit card refinancing,125xx,NY,22.88,0.0,Apr-2003,0.0,74.0,,8.0,0.0,12918,59.5,16.0,w,9090.87,9090.87,6391.53,6391.53,4909.13,1482.4,0.0,0.0,0.0,Feb-2019,457.75,Mar-2019,Feb-2019,0.0,74.0,1,Individual,,,,0.0,457.0,29103.0,1.0,1.0,1.0,2.0,4.0,16185.0,95.0,1.0,5.0,10153.0,75.0,21700.0,2.0,6.0,0.0,7.0,3638.0,7265.0,61.6,0.0,0.0,36.0,176.0,7.0,4.0,0.0,7.0,,16.0,74.0,1.0,2.0,4.0,3.0,4.0,4.0,7.0,12.0,4.0,8.0,0.0,0.0,0.0,2.0,93.8,33.3,0.0,0.0,38704.0,29103.0,18900.0,17004.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
4,,,5000,5000,5000.0,36 months,13.59,169.9,C,C2,General Manager,< 1 year,RENT,55000.0,Not Verified,Dec-2017,Current,n,,,debt_consolidation,Debt consolidation,672xx,KS,12.18,0.0,Jun-1999,0.0,,,5.0,0.0,4497,91.8,6.0,w,3291.95,3291.95,2371.05,2371.05,1708.05,663.0,0.0,0.0,0.0,Feb-2019,169.9,Mar-2019,Feb-2019,0.0,,1,Individual,,,,0.0,471.0,7202.0,0.0,1.0,0.0,0.0,71.0,2705.0,12.0,0.0,0.0,1483.0,27.0,4900.0,0.0,1.0,0.0,0.0,1440.0,403.0,91.8,0.0,0.0,149.0,222.0,43.0,43.0,0.0,43.0,,,,0.0,4.0,4.0,4.0,4.0,2.0,4.0,4.0,4.0,5.0,0.0,0.0,0.0,0.0,100.0,100.0,0.0,0.0,26841.0,7202.0,4900.0,21941.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,


In [11]:
# 데이터의 마지막 5개 샘플 확인하기
# 데이터가 잘 가져왔는지 확인 할 때 보통 씁니다.
df

In [20]:
# 데이터의 갯수를 살펴봅니다
df.shape

(20000, 145)

In [21]:
# 데이터의 전반적인 정보를 확인합니다.
df.info()
# dtype 정보에서는 각 컬럼별 데이터 타입을 확인 할 수 있습니다.
# object == str 이라고 생각하셔도 무방합니다.
# verbose, null_counts

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 145 columns):
 #    Column                                      Non-Null Count  Dtype  
---   ------                                      --------------  -----  
 0    id                                          0 non-null      float64
 1    member_id                                   0 non-null      float64
 2    loan_amnt                                   20000 non-null  int64  
 3    funded_amnt                                 20000 non-null  int64  
 4    funded_amnt_inv                             20000 non-null  float64
 5    term                                        20000 non-null  object 
 6    int_rate                                    20000 non-null  float64
 7    installment                                 20000 non-null  float64
 8    grade                                       20000 non-null  object 
 9    sub_grade                                   20000 non-null  object 
 1

In [23]:
# 데이터의 기초통계량을 확인합니다.
df.describe()


Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,int_rate,installment,annual_inc,url,desc,dti,delinq_2yrs,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_amnt,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,annual_inc_joint,dti_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_act_il,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,revol_bal_joint,sec_app_inq_last_6mths,sec_app_mort_acc,sec_app_open_acc,sec_app_revol_util,sec_app_open_act_il,sec_app_num_rev_accts,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,deferral_term,hardship_amount,hardship_length,hardship_dpd,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,settlement_amount,settlement_percentage,settlement_term
count,0.0,0.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,0.0,0.0,19961.0,20000.0,20000.0,8992.0,2914.0,20000.0,20000.0,20000.0,19963.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,4862.0,20000.0,2792.0,2792.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,19242.0,20000.0,16740.0,20000.0,20000.0,20000.0,19994.0,20000.0,20000.0,20000.0,20000.0,20000.0,19997.0,19653.0,19646.0,20000.0,20000.0,19242.0,20000.0,20000.0,20000.0,20000.0,19672.0,4076.0,17495.0,5889.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,18819.0,20000.0,20000.0,20000.0,20000.0,19651.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,2792.0,2792.0,2792.0,2792.0,2745.0,2792.0,2792.0,2792.0,2792.0,998.0,64.0,64.0,64.0,64.0,50.0,64.0,64.0,61.0,61.0,61.0
mean,,,15382.56875,15382.56875,15373.741922,12.606765,446.811954,78429.26,,,19.214833,0.2464,0.4965,36.956962,77.992793,11.0579,0.16525,15415.68815,43.516676,22.1278,8377.290489,8372.905008,8031.681974,8026.753615,6232.746169,1757.637769,1.228706,40.06933,6.149841,2603.487688,0.02175,45.714726,1.0,121185.3,19.649828,0.00055,185.20045,141077.4,0.90875,2.6209,0.6365,1.5103,21.616204,33969.21395,67.57724,1.20755,2.5691,5528.2165,53.787686,36440.10545,1.0009,1.4222,1.9396,4.34895,14054.194879,14526.662291,49.069719,0.0088,11.2964,122.639487,179.0936,15.3664,8.74565,1.35425,25.995425,40.280913,7.177937,37.489896,0.4677,3.4482,5.10545,4.70355,6.9992,7.90935,7.79135,12.6573,5.029,11.0235,0.0,0.0005,0.072,1.97045,94.234615,32.322233,0.1297,0.03535,181022.8,49500.36145,25365.9079,44365.02,32759.315544,0.751791,1.413324,11.49033,59.74204,3.193768,12.423711,0.058381,0.09563,35.486974,3.0,198.6075,3.0,14.5625,582.3612,14786.18875,231.527187,8129.44459,53.103443,17.327869
std,,,10011.645757,10011.645757,10010.591629,4.929089,283.619373,83630.96,,,21.957008,0.834938,0.78238,22.055265,24.54357,5.687958,0.464277,22845.011859,25.409374,11.731788,8291.549106,8290.262906,6951.524721,6949.741126,6546.212512,1570.050467,9.534786,367.534805,59.740728,5953.233604,0.179105,21.840407,0.0,62371.47,8.079497,0.02549,1186.936786,168300.7,1.133088,2.845521,0.896944,1.533529,26.840505,43478.989027,24.003849,1.473052,2.503485,5358.375953,21.717709,37020.542365,1.451518,2.570676,2.273392,3.161263,18258.768688,19228.843119,29.377757,0.111907,725.679605,54.493694,100.822641,19.217498,9.712897,1.720931,34.820223,22.455427,5.94676,22.10069,1.379511,2.343905,3.300133,3.168765,4.472348,7.099426,4.70687,7.691516,3.145831,5.674677,0.0,0.02449,0.562612,1.840773,9.439437,35.301888,0.344505,0.299006,189996.7,51000.464205,25063.607841,47243.95,28041.003357,1.075274,1.698211,6.736632,25.905343,3.597651,8.045636,0.391363,0.445313,24.28697,0.0,146.832687,0.0,8.620527,414.623586,8707.436533,223.560399,5727.283541,10.40876,6.922719
min,,,1000.0,1000.0,1000.0,5.32,7.61,0.0,,,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,22700.0,0.28,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,14.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,10.54,3.0,0.0,31.62,1012.11,0.24,645.0,40.0,1.0
25%,,,8000.0,8000.0,7975.0,9.44,238.17,45000.0,,,10.85,0.0,0.0,19.0,63.0,7.0,0.0,4739.0,23.4,14.0,8.1825,8.1825,3590.55,3587.205,2302.92,617.76,0.0,0.0,0.0,268.09,0.0,29.0,1.0,82668.25,13.82,0.0,0.0,25257.75,0.0,1.0,0.0,0.0,7.0,7449.75,53.0,0.0,1.0,2014.0,39.0,14800.0,0.0,0.0,0.0,2.0,2850.0,2818.0,24.5,0.0,0.0,87.0,113.0,4.0,3.0,0.0,6.0,22.0,2.0,20.0,0.0,2.0,3.0,3.0,4.0,3.0,4.0,7.0,3.0,7.0,0.0,0.0,0.0,1.0,91.7,0.0,0.0,0.0,49756.75,18440.5,9000.0,14534.75,14405.0,0.0,0.0,7.0,41.5,1.0,7.0,0.0,0.0,14.0,3.0,82.605,3.0,7.0,249.9375,7370.455,52.7775,3542.96,45.0,14.0
50%,,,12500.0,12500.0,12500.0,11.99,369.0,65000.0,,,17.26,0.0,0.0,34.0,81.0,10.0,0.0,9867.5,41.5,20.0,6391.41,6391.4,5941.78,5938.74,3961.91,1265.8,0.0,0.0,0.0,466.2,0.0,46.0,1.0,109833.0,19.2,0.0,0.0,70028.5,1.0,2.0,0.0,1.0,14.0,21917.0,70.0,1.0,2.0,4167.0,55.0,26500.0,1.0,0.0,1.0,4.0,7038.0,7810.0,47.65,0.0,0.0,129.0,161.0,9.0,6.0,1.0,14.0,38.0,6.0,35.0,0.0,3.0,4.0,4.0,6.0,6.0,7.0,11.0,4.0,10.0,0.0,0.0,0.0,2.0,100.0,22.2,0.0,0.0,111175.5,35654.0,18100.0,32991.5,25584.0,0.0,1.0,10.0,62.1,2.0,11.0,0.0,0.0,33.5,3.0,147.76,3.0,15.5,478.32,13231.04,159.82,6422.0,50.0,18.0
75%,,,20156.25,20156.25,20156.25,15.05,605.67,95000.0,,,24.58,0.0,1.0,54.0,97.0,14.0,0.0,18336.0,62.0,28.0,12951.92,12950.1375,10284.0925,10278.1425,7315.54,2452.35,0.0,0.0,0.0,922.48,0.0,63.0,1.0,145000.0,25.3,0.0,0.0,211228.8,1.0,3.0,1.0,2.0,24.0,44218.25,85.0,2.0,4.0,7353.5,69.0,45900.0,1.0,2.0,3.0,6.0,19499.0,18626.0,74.0,0.0,0.0,153.0,231.0,19.0,11.0,2.0,30.0,57.0,11.0,52.0,0.0,5.0,7.0,6.0,9.0,10.0,10.0,16.0,7.0,14.0,0.0,0.0,0.0,3.0,100.0,50.0,0.0,0.0,260668.8,63291.75,33100.0,60072.25,42740.5,1.0,2.0,15.0,80.1,4.0,16.0,0.0,0.0,54.0,3.0,287.89,3.0,22.0,862.4325,21705.2425,332.5675,12162.66,60.0,24.0
max,,,40000.0,40000.0,40000.0,30.99,1618.03,6500031.0,,,999.0,36.0,5.0,150.0,120.0,56.0,18.0,629372.0,125.0,96.0,35250.64,35250.64,47101.212295,47101.21,40000.0,14105.79,279.15,9625.0,2040.0,40747.67,9.0,150.0,1.0,1058000.0,39.77,2.0,57638.0,2460868.0,11.0,35.0,6.0,16.0,374.0,827988.0,199.0,18.0,37.0,146863.0,160.0,667100.0,18.0,33.0,33.0,37.0,379822.0,281029.0,136.7,5.0,65000.0,501.0,785.0,279.0,197.0,23.0,539.0,150.0,24.0,150.0,31.0,24.0,33.0,38.0,54.0,67.0,53.0,90.0,30.0,56.0,0.0,2.0,36.0,19.0,100.0,100.0,4.0,18.0,2531600.0,897835.0,320000.0,1310923.0,324858.0,6.0,13.0,51.0,159.3,39.0,62.0,6.0,6.0,109.0,3.0,629.82,3.0,30.0,1889.46,32738.12,979.36,23000.0,94.33,24.0


In [15]:
# numpy 함수로 데이터 shape 확인


In [24]:
# 인덱스
df.index

RangeIndex(start=0, stop=20000, step=1)

In [25]:
# 컬럼
df.columns

Index(['id', 'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv',
       'term', 'int_rate', 'installment', 'grade', 'sub_grade',
       ...
       'hardship_payoff_balance_amount', 'hardship_last_payment_amount',
       'disbursement_method', 'debt_settlement_flag',
       'debt_settlement_flag_date', 'settlement_status', 'settlement_date',
       'settlement_amount', 'settlement_percentage', 'settlement_term'],
      dtype='object', length=145)

데이터셋을 살펴 본 결과 정체를 알 수 없는 많은 컬럼이 있는 걸 확인했고, 

50000개의 샘플이 불러들여진 것을 확인 할 수 있었습니다.

추가로 데이터 중간 중간 비어있는 값도 있는 것을 확인했습니다.

### 데이터접근 (인덱싱, 슬라이싱, 샘플링)

In [27]:
# 첫 샘플 혹은 레코드(대출건)에 대한 데이터를 살펴보겠습니다.
# 인덱스넘버로 데이터에 접근하는 .iloc[색인]
# 각 컬럼이나, 행단위 접근했을 때 출력되는 벡터 데이터를 Serise(시리즈) 라고 하는 자료구조
# index, values로 각각의 속성에 접근 가능
df.iloc[0]

id                           NaN
member_id                    NaN
loan_amnt                  10000
funded_amnt                10000
funded_amnt_inv          10000.0
                          ...   
settlement_status            NaN
settlement_date              NaN
settlement_amount            NaN
settlement_percentage        NaN
settlement_term              NaN
Name: 0, Length: 145, dtype: object

In [28]:
df.iloc[0].values

array([nan, nan, 10000, 10000, 10000.0, ' 36 months', 9.44, 320.05, 'B',
       'B1', 'mechanic', '6 years', 'MORTGAGE', 80000.0, 'Not Verified',
       'Dec-2017', 'Current', 'n', nan, nan, 'credit_card',
       'Credit card refinancing', '762xx', 'TX', 14.82, 0.0, 'Jul-2007',
       0.0, 34.0, nan, 8.0, 0.0, 5225, 73.6, 30.0, 'w', 6442.28, 6442.28,
       4493.81, 4493.81, 3557.72, 936.09, 0.0, 0.0, 0.0, 'Feb-2019',
       320.05, 'Mar-2019', 'Feb-2019', 0.0, nan, 1, 'Individual', nan,
       nan, nan, 0.0, 0.0, 173110.0, 0.0, 2.0, 0.0, 2.0, 23.0, 12496.0,
       39.0, 0.0, 0.0, 3949.0, 45.0, 7100.0, 1.0, 0.0, 0.0, 2.0, 21639.0,
       1875.0, 73.6, 0.0, 0.0, 125.0, 78.0, 26.0, 23.0, 3.0, 26.0, nan,
       21.0, nan, 0.0, 2.0, 2.0, 4.0, 4.0, 21.0, 4.0, 5.0, 2.0, 8.0, 0.0,
       0.0, 0.0, 0.0, 96.4, 25.0, 0.0, 0.0, 196130.0, 17756.0, 7100.0,
       31992.0, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       'N', nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
 

In [29]:
df.iloc[0].values== ' 36 months'

array([False, False, False, False, False,  True, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,

In [37]:
# 10번 인덱스 부터 20번 인덱스 샘플 접근
# start, end+1, step
df.iloc[[10, 21, 1]]

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_act_il,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,revol_bal_joint,sec_app_earliest_cr_line,sec_app_inq_last_6mths,sec_app_mort_acc,sec_app_open_acc,sec_app_revol_util,sec_app_open_act_il,sec_app_num_rev_accts,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,hardship_flag,hardship_type,hardship_reason,hardship_status,deferral_term,hardship_amount,hardship_start_date,hardship_end_date,payment_plan_start_date,hardship_length,hardship_dpd,hardship_loan_status,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
10,,,40000,40000,40000.0,60 months,16.02,973.15,C,C5,IT Manager - Business Process,10+ years,OWN,140000.0,Verified,Dec-2017,Current,n,,,debt_consolidation,Debt consolidation,292xx,SC,31.79,1.0,Aug-1999,0.0,3.0,,11.0,0.0,34632,59.5,52.0,w,33288.83,33288.83,13602.03,13602.03,6711.17,6890.86,0.0,0.0,0.0,Feb-2019,973.15,Mar-2019,Feb-2019,0.0,,1,Individual,,,,0.0,59.0,81346.0,0.0,3.0,0.0,0.0,29.0,46714.0,50.0,0.0,1.0,19100.0,53.0,104500.0,0.0,0.0,1.0,1.0,8135.0,63703.0,57.9,0.0,0.0,220.0,213.0,13.0,13.0,0.0,13.0,,9.0,3.0,0.0,2.0,6.0,2.0,17.0,22.0,8.0,30.0,6.0,11.0,0.0,0.0,0.0,0.0,98.0,0.0,0.0,0.0,197500.0,81346.0,88000.0,93000.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
21,,,20000,20000,20000.0,36 months,6.08,609.17,A,A2,Control System Designer,2 years,OWN,149000.0,Verified,Dec-2017,Current,n,,,other,Other,770xx,TX,13.31,0.0,Jul-1999,0.0,,,16.0,0.0,5049,5.3,27.0,w,12651.33,12651.33,8470.96,8470.96,7348.67,1122.29,0.0,0.0,0.0,Feb-2019,609.17,Mar-2019,Feb-2019,0.0,,1,Individual,,,,0.0,0.0,399842.0,1.0,3.0,2.0,2.0,5.0,92121.0,74.0,3.0,8.0,4983.0,26.0,94800.0,0.0,1.0,1.0,11.0,24990.0,50951.0,9.0,0.0,0.0,130.0,221.0,8.0,5.0,1.0,8.0,,12.0,,0.0,2.0,2.0,9.0,15.0,7.0,12.0,19.0,2.0,16.0,0.0,0.0,0.0,5.0,100.0,0.0,0.0,0.0,527931.0,97170.0,56000.0,110531.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
1,,,3500,3500,3500.0,36 months,10.42,113.63,B,B3,,,OWN,90000.0,Not Verified,Dec-2017,Current,n,,,other,Other,295xx,SC,28.51,0.0,Jun-2002,0.0,39.0,28.0,12.0,3.0,6953,51.9,38.0,w,2266.55,2266.55,1586.77,1586.77,1233.45,353.32,0.0,0.0,0.0,Feb-2019,113.63,Mar-2019,Feb-2019,0.0,,1,Individual,,,,0.0,0.0,339028.0,0.0,4.0,0.0,3.0,22.0,76501.0,69.0,1.0,2.0,1628.0,65.0,13400.0,1.0,5.0,1.0,5.0,28252.0,808.0,82.4,0.0,0.0,164.0,186.0,7.0,7.0,2.0,7.0,39.0,7.0,39.0,0.0,4.0,7.0,4.0,10.0,19.0,7.0,17.0,7.0,12.0,0.0,0.0,0.0,1.0,97.3,75.0,0.0,3.0,416685.0,83454.0,4600.0,110595.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,


In [30]:
# 컬럼 단위 샘플 접근
df['grade']
# df[텍스트형태의 컬럼명]
# 인덱싱이나 슬라이싱으로 데이터에 접근을 할 때 큰 단위를 선택하고 그 결과에서 인덱싱 혹은 슬라이싱을 하면
# 조금 더 편하게, 쉽게 데이터 접근이 가능하다.

0        B
1        B
2        C
3        B
4        C
        ..
19995    B
19996    C
19997    C
19998    D
19999    D
Name: grade, Length: 20000, dtype: object

In [34]:
# 여러 컬럼 동시 접근
df[['installment', 'grade', 'sub_grade']]

Unnamed: 0,installment,grade,sub_grade
0,320.05,B,B1
1,113.63,B,B3
2,169.90,C,C2
3,457.75,B,B4
4,169.90,C,C2
...,...,...,...
19995,345.66,B,B1
19996,326.34,C,C3
19997,789.57,C,C1
19998,914.34,D,D3


In [40]:
# row와 columns을 동시에 슬라이싱 하는 속성
# df.loc[인덱스, 컬럼명]
df.loc[0:30, ['grade', 'sub_grade']]

Unnamed: 0,grade,sub_grade
0,B,B1
1,B,B3
2,C,C2
3,B,B4
4,C,C2
5,C,C3
6,B,B2
7,B,B4
8,C,C5
9,A,A2


In [None]:
# df의 컬럼명을 순환하면서 컬럼단위로 접근하고 각 컬럼의 고유값을 출력해주는 코드


In [None]:
# 고윳값 갯수 출력


### 팬시인덱싱
> **`bool`** 형태의 array를 조건을 전달하여 다차원 배열을 인덱싱하는 방법.  
조건식을 사용하여 분석에 필요한 데이터샘플을 추출하기 용이합니다.

In [42]:
# 신용등급이 A인 샘플의 emp_title 확인
df.loc[df['grade']=='A']

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_act_il,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,revol_bal_joint,sec_app_earliest_cr_line,sec_app_inq_last_6mths,sec_app_mort_acc,sec_app_open_acc,sec_app_revol_util,sec_app_open_act_il,sec_app_num_rev_accts,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,hardship_flag,hardship_type,hardship_reason,hardship_status,deferral_term,hardship_amount,hardship_start_date,hardship_end_date,payment_plan_start_date,hardship_length,hardship_dpd,hardship_loan_status,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
9,,,35000,35000,35000.0,36 months,6.08,1066.04,A,A2,,,OWN,76000.0,Verified,Dec-2017,Current,n,,,credit_card,Credit card refinancing,388xx,MS,34.28,0.0,Aug-1994,1.0,,,23.0,0.0,47145,48.0,42.0,w,22139.93,22139.93,14900.920000,14900.92,12860.07,2040.85,0.0,0.0,0.0,Feb-2019,1066.04,Mar-2019,Feb-2019,0.0,,1,Individual,,,,0.0,0.0,170361.0,1.0,2.0,0.0,0.0,26.0,28292.0,64.0,2.0,2.0,11128.0,53.0,98200.0,0.0,4.0,1.0,2.0,7744.0,27309.0,63.2,0.0,0.0,148.0,280.0,6.0,6.0,4.0,6.0,,6.0,,0.0,13.0,14.0,14.0,16.0,9.0,20.0,29.0,14.0,23.0,0.0,0.0,0.0,2.0,100.0,35.7,0.0,0.0,252224.0,75437.0,74300.0,44553.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,DirectPay,N,,,,,,
21,,,20000,20000,20000.0,36 months,6.08,609.17,A,A2,Control System Designer,2 years,OWN,149000.0,Verified,Dec-2017,Current,n,,,other,Other,770xx,TX,13.31,0.0,Jul-1999,0.0,,,16.0,0.0,5049,5.3,27.0,w,12651.33,12651.33,8470.960000,8470.96,7348.67,1122.29,0.0,0.0,0.0,Feb-2019,609.17,Mar-2019,Feb-2019,0.0,,1,Individual,,,,0.0,0.0,399842.0,1.0,3.0,2.0,2.0,5.0,92121.0,74.0,3.0,8.0,4983.0,26.0,94800.0,0.0,1.0,1.0,11.0,24990.0,50951.0,9.0,0.0,0.0,130.0,221.0,8.0,5.0,1.0,8.0,,12.0,,0.0,2.0,2.0,9.0,15.0,7.0,12.0,19.0,2.0,16.0,0.0,0.0,0.0,5.0,100.0,0.0,0.0,0.0,527931.0,97170.0,56000.0,110531.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
29,,,21600,21600,21600.0,36 months,6.72,664.19,A,A3,Policy Analyst,< 1 year,RENT,44000.0,Not Verified,Dec-2017,Current,n,,,debt_consolidation,Debt consolidation,973xx,OR,17.21,0.0,Sep-2008,0.0,,,8.0,0.0,9929,38.0,25.0,w,13711.65,13711.65,9282.530000,9282.53,7888.35,1394.18,0.0,0.0,0.0,Feb-2019,664.19,Mar-2019,Feb-2019,0.0,,1,Joint App,72000.0,15.10,Not Verified,0.0,0.0,149061.0,0.0,2.0,0.0,2.0,13.0,139132.0,86.0,0.0,2.0,4084.0,60.0,26100.0,1.0,4.0,1.0,4.0,18633.0,9209.0,44.5,0.0,0.0,111.0,84.0,15.0,13.0,0.0,15.0,,11.0,,0.0,3.0,5.0,3.0,4.0,15.0,6.0,10.0,5.0,8.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,168394.0,149061.0,16600.0,142294.0,21110.0,May-1995,1.0,0.0,17.0,40.7,10.0,8.0,0.0,0.0,48.0,N,,,,,,,,,,,,,,,Cash,N,,,,,,
32,,,6500,6500,6500.0,36 months,7.97,203.60,A,A5,Science lab director,10+ years,OWN,40000.0,Source Verified,Dec-2017,Current,n,,,credit_card,Credit card refinancing,333xx,FL,32.67,0.0,Apr-1987,0.0,53.0,,10.0,0.0,18333,5.9,20.0,w,4154.40,4154.40,2834.280000,2834.28,2345.60,488.68,0.0,0.0,0.0,Feb-2019,203.60,Mar-2019,Feb-2019,0.0,53.0,1,Joint App,88000.0,31.93,Source Verified,0.0,106.0,51392.0,1.0,4.0,0.0,0.0,29.0,33059.0,62.0,3.0,4.0,184.0,53.0,60000.0,0.0,5.0,3.0,4.0,5139.0,4016.0,4.4,0.0,0.0,82.0,368.0,4.0,4.0,1.0,11.0,53.0,2.0,53.0,1.0,1.0,3.0,2.0,6.0,6.0,6.0,13.0,3.0,10.0,0.0,0.0,0.0,3.0,85.0,0.0,0.0,0.0,113599.0,51392.0,4200.0,53599.0,10593.0,Sep-2011,0.0,0.0,13.0,56.9,3.0,11.0,0.0,0.0,34.0,N,,,,,,,,,,,,,,,Cash,N,,,,,,
37,,,10000,10000,9975.0,36 months,5.32,301.15,A,A1,EHS Manager,3 years,MORTGAGE,95000.0,Not Verified,Dec-2017,Current,n,,,debt_consolidation,Debt consolidation,307xx,GA,9.55,0.0,Nov-1989,0.0,51.0,,11.0,0.0,8471,22.1,24.0,f,6299.10,6283.35,4210.190000,4199.66,3700.90,509.29,0.0,0.0,0.0,Feb-2019,301.15,Mar-2019,Feb-2019,0.0,51.0,1,Individual,,,,0.0,0.0,339796.0,1.0,2.0,1.0,1.0,7.0,19062.0,87.0,2.0,2.0,6030.0,46.0,38300.0,2.0,1.0,1.0,4.0,37755.0,23267.0,23.2,0.0,0.0,138.0,337.0,2.0,2.0,3.0,2.0,51.0,7.0,51.0,0.0,2.0,3.0,5.0,11.0,4.0,8.0,17.0,3.0,11.0,0.0,0.0,0.0,3.0,87.5,0.0,0.0,0.0,379166.0,27533.0,30300.0,21866.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,DirectPay,N,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19957,,,2400,2400,2400.0,36 months,7.97,75.18,A,A5,Talent Keyholder,6 years,OWN,24000.0,Source Verified,Nov-2017,Fully Paid,n,,,credit_card,Credit card refinancing,941xx,CA,26.55,0.0,Aug-2010,0.0,,,9.0,0.0,3345,9.8,10.0,w,0.00,0.00,2578.501235,2578.50,2400.00,178.50,0.0,0.0,0.0,Feb-2019,940.72,,Feb-2019,0.0,,1,Individual,,,,0.0,0.0,5829.0,1.0,1.0,0.0,0.0,55.0,2484.0,10.0,3.0,4.0,1830.0,10.0,34200.0,0.0,0.0,0.0,4.0,729.0,28427.0,10.3,0.0,0.0,55.0,87.0,2.0,2.0,0.0,11.0,,,,0.0,4.0,5.0,6.0,6.0,1.0,8.0,9.0,5.0,9.0,0.0,0.0,0.0,3.0,100.0,0.0,0.0,0.0,59040.0,5829.0,31700.0,24840.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
19960,,,10000,10000,10000.0,36 months,7.97,313.23,A,A5,cpourt reporter,10+ years,RENT,56000.0,Not Verified,Nov-2017,Current,n,,,credit_card,Credit card refinancing,926xx,CA,19.91,0.0,Apr-1986,0.0,,,11.0,0.0,12095,59.0,14.0,w,6120.62,6120.62,4689.590000,4689.59,3879.38,810.21,0.0,0.0,0.0,Feb-2019,313.23,Mar-2019,Feb-2019,0.0,,1,Individual,,,,0.0,0.0,27402.0,1.0,1.0,1.0,1.0,1.0,15307.0,74.0,0.0,0.0,9015.0,67.0,20500.0,0.0,0.0,0.0,2.0,2491.0,7805.0,60.8,0.0,0.0,44.0,379.0,33.0,1.0,0.0,33.0,,16.0,,0.0,7.0,7.0,8.0,8.0,3.0,9.0,10.0,7.0,11.0,0.0,0.0,0.0,1.0,100.0,12.5,0.0,0.0,41120.0,27402.0,19900.0,20620.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
19964,,,2000,2000,2000.0,36 months,7.35,62.08,A,A4,Security Guard,2 years,RENT,36000.0,Not Verified,Nov-2017,Current,n,,,vacation,Vacation,066xx,CT,34.89,0.0,Oct-2010,1.0,,,12.0,0.0,8367,12.8,14.0,w,1219.64,1219.64,929.570000,929.57,780.36,149.21,0.0,0.0,0.0,Feb-2019,62.08,Mar-2019,Feb-2019,0.0,,1,Individual,,,,0.0,0.0,32740.0,2.0,2.0,1.0,2.0,12.0,24373.0,77.0,2.0,5.0,3742.0,34.0,65500.0,2.0,0.0,2.0,7.0,2976.0,42193.0,14.4,0.0,0.0,43.0,85.0,3.0,3.0,0.0,3.0,,6.0,,0.0,3.0,6.0,5.0,5.0,3.0,10.0,11.0,6.0,12.0,0.0,0.0,0.0,3.0,100.0,0.0,0.0,0.0,97270.0,32740.0,49300.0,31770.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
19975,,,15000,15000,15000.0,36 months,5.32,451.73,A,A1,Vice President/General Manager,10+ years,MORTGAGE,170000.0,Not Verified,Nov-2017,Current,n,,,home_improvement,Home improvement,770xx,TX,19.64,0.0,Jan-2002,0.0,,,9.0,0.0,6223,10.8,28.0,w,9038.73,9038.73,6767.080000,6767.08,5961.27,805.81,0.0,0.0,0.0,Feb-2019,451.73,Mar-2019,Feb-2019,0.0,,1,Individual,,,,0.0,0.0,224613.0,1.0,3.0,2.0,3.0,6.0,92691.0,49.0,1.0,2.0,4732.0,22.0,57600.0,1.0,3.0,3.0,5.0,24957.0,14868.0,24.1,0.0,0.0,151.0,190.0,10.0,6.0,3.0,103.0,,6.0,,0.0,1.0,3.0,1.0,4.0,13.0,5.0,12.0,3.0,9.0,0.0,0.0,0.0,3.0,100.0,0.0,0.0,0.0,346720.0,98914.0,19600.0,139120.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,


In [43]:
df.loc[df['grade']=='A','emp_title']

9                                   NaN
21              Control System Designer
29                       Policy Analyst
32                Science lab director 
37                          EHS Manager
                      ...              
19957                  Talent Keyholder
19960                   cpourt reporter
19964                    Security Guard
19975    Vice President/General Manager
19980                   Project manager
Name: emp_title, Length: 4461, dtype: object

In [44]:
df.loc[df['grade']=='A','emp_title'].value_counts()

Manager                             90
Teacher                             72
Owner                               68
Registered Nurse                    40
Driver                              36
                                    ..
Senior VP                            1
Systems Development Engineer         1
Transfusion Services Supervisor      1
Sr Creative Fragrance Manager        1
Vice President/General Manager       1
Name: emp_title, Length: 2628, dtype: int64

In [45]:
# 대출금액평균
df['loan_amnt'].mean()

15382.56875

In [46]:
# 조건식 샘플링 emp_title 이 ceo인 샘플들
df.loc[df['emp_title']]=='ceo'

KeyError: "None of [Index([                   'mechanic',                           nan,\n                      'Truck driver',      'Confidential Secretary',\n                   'General Manager',                     'NDT III',\n       'Supervisory program analyst', 'Commercial Property Manager',\n              'Equipment technician',                           nan,\n       ...\n                                 nan,     'Maintenance Technician ',\n          'senior software engineer',           'Design Specialist',\n                          'Salesman',                     'Partner',\n           'Senior Property Manager',           'Mechanic engineer',\n                           'Teacher',          'Director of Design'],\n      dtype='object', length=20000)] are in the [index]"

In [None]:
# 신용등급 A와 B인 샘플접근

# 조건식을 여러개 써야 한다면 조건마다 ()로 감싸주시는 것이 좋습니다.

In [47]:
# df loan_amnt 컬럼값이 10000이상인 채권샘플의 grade
df.loc[df['loan_amnt']] >= 10000


KeyError: '[36000, 20000, 35000, 40000, 28000, 23000, 25000, 21600, 30000, 22000, 32000, 24000, 26500, 21800, 27000, 33000, 21000, 26000, 25525, 28550, 32900, 34000, 37800, 20800, 24625, 22725, 28150, 24475, 26450, 27725, 24875, 28850, 28675, 27200, 38000, 21500, 21200, 23800, 20125, 22500, 30100, 23300, 21250, 28800, 25200, 24500, 39325, 23650, 28950, 30600, 21975, 20625, 24725, 26400, 31500, 23550, 26250, 29850, 28225, 22425, 26725, 27750, 29825, 34400, 25350, 20875, 29700, 34675, 29750, 30325, 31475, 29325, 21425, 22175, 29775, 25600, 31575, 37450, 37000, 29000, 20400, 36600, 23100, 25500, 25450, 23850, 29800, 36425, 32925, 33525, 38450, 23200, 23050, 24025, 32500, 22200, 22800, 28500, 21050, 24300, 20650, 31975, 25800, 20300, 22100, 26875, 39500, 24100, 31225, 32800, 23825, 30750, 26700, 27125, 32250, 20250, 22400, 34650, 28075, 22625, 29650, 28400, 26775, 31050, 23150, 22750, 25150, 34800, 37350, 34250, 32750, 34850, 38400, 33300, 23450, 22150, 31000, 24925, 30400, 33600, 35200, 27600, 33575, 31925, 29025, 20150, 29500, 20325, 33950, 23725, 38975, 20825, 24750, 37200, 21825, 31850, 30150, 22900, 30075, 20500, 28050, 20375, 38125, 24800, 29100, 31400, 29600, 20175, 27700, 23575, 36675, 34600, 29425, 27650, 37250, 25725, 39900, 23225, 21775, 34350, 24050, 23750, 33850, 24075, 38850, 27175, 23975, 32550, 29900, 30025, 23675, 27350, 28775, 38700, 23250, 27550, 29875, 22550, 30800, 29400, 29150, 36575, 23425, 22225, 21400, 34700, 21625, 32675, 27875, 31675, 22825, 25175, 22450, 38500, 32150, 29625, 37975, 37900, 21350, 28250, 22025, 20975, 27500, 38875, 28350, 21275, 23475, 20575, 25900, 31650, 32325, 34500, 33150, 27050, 24700, 28125, 36750, 37600, 20225, 25075, 33325, 20350, 22650, 24175, 25925, 33175, 29525, 21550, 24900, 28200, 29350, 20675, 20425, 25650, 26150, 34925, 32125, 29725, 27250, 28700, 33825, 33125, 37650, 30450, 32600, 20550, 23500, 37400, 25625, 20075, 24575, 24600, 23600, 26050, 26425, 29675, 34150, 34125, 26275, 28475, 34900, 23175, 21750, 23525, 25825, 25700, 36525, 21150, 26300, 20450, 32200, 26675, 20925, 26850, 22325, 39000, 27900, 32775, 30500, 30300, 28175, 21100, 32400, 23700, 22950, 22775, 33075, 27450, 28450, 37525, 28875, 30850, 28600, 23925, 37500, 39350, 35375, 23375, 33500, 29575, 35825, 27800, 23275, 32475, 29550, 26750, 24250, 21075, 23900, 30350, 24325, 27275, 20100, 36200, 29050, 27675, 31375, 27825, 30475, 34275, 21950, 27300, 20850, 35325, 29975, 34575, 33050, 35125, 29225, 39525, 26350, 26625, 25425, 31425, 25675, 31725, 33650, 24525, 20275, 27375, 22475, 31950, 22125, 28725, 39175, 30200, 27850, 36400, 38275, 38600, 31200, 21525, 33400, 26325, 29450, 21925, 24550, 21025, 29475, 23075, 29375, 29125, 36500, 30700, 35725, 32525, 28275, 27775, 26575, 29250, 22575, 35225] not in index'

In [52]:
# df grade C 와 D 인 채권샘플 annual_inc 최대값인 인덱스 빼오기 (idxmax)
# 최대값 인덱스 빼와서 샘플까지 출력
df.loc[(df['grade']=='C' | df['grade']=='D'), 'annual_inc'].idmax()

TypeError: Cannot perform 'ror_' with a dtyped [object] array and scalar of type [bool]

In [None]:
# 컬럼 내 문자열 내에 우리가 찾고싶은 문자열이 포함되어 있는지를 기준으로 샘플링


## 데이터프레임 병합
> 실제 분석업무를 진행하다보면 데이터가 여기저기 분산되어 있을 경우가 더 많습니다.  
조각난 데이터를 분석에 필요한 데이터셋으로 만들기 위해 데이터프레임 병합을 많이 사용합니다.  
한개 이상의 데이터프레임을 병합 할 때 주로 사용하는 함수 2가지를 알아보겠습니다.    

### 데이터 병합에 사용가능한 key(병합할 기준이 되는 행 or 열)값이 있는경우
**`pd.merge`**(베이스데이터프레임, 병합할데이터프레임)  
> 사용 가능 한 파라메터
- `how` : 'left', 'right', 'inner', 'outer'
- `left_on` : key값이 다를 경우 베이스데이터프레임의 key 설정
- `right_on` : key값이 다를 경우 병합데이터프레임의 key 설정
    
### 단순 데이터 연결
**`pd.concat`**([베이스데이터프레임, 병합할데이터프레임], axis=0 or 1)
> 사용 가능 한 파라메터  
- `axis` : 축 방향 설정

### merge 예시

In [54]:
merge_df1 = pd.DataFrame({
    '이름': ['원영', '사쿠라', '유리', '예나', '유진', '나코', '은비', '혜원', '히토미', '채원', '민주', '째욘'],
    '국어': [100, 70, 70, 70, 60, 90, 90, 70, 70, 80, 100, 100],
    '영어': [100, 90, 80, 50, 70, 100, 70, 90, 100, 100, 80, 100]
    }, columns=['이름', '국어', '영어']) 

merge_df2 = pd.DataFrame({
    '일어': [80, 100, 100, 90, 70, 50, 100],
    '수학': [90, 70, 100, 80, 70, 80, 90],
    '이름': ['원영', '사쿠라', '나코', '히토미', '예나', '은비', '째욘'],
    }, columns=['일어', '수학', '이름'])

In [56]:
# 데이터프레임 확인
merge_df2

Unnamed: 0,일어,수학,이름
0,80,90,원영
1,100,70,사쿠라
2,100,100,나코
3,90,80,히토미
4,70,70,예나
5,50,80,은비
6,100,90,째욘


In [57]:
# 병합 테스트
pd.merge(merge_df1, merge_df2)

Unnamed: 0,이름,국어,영어,일어,수학
0,원영,100,100,80,90
1,사쿠라,70,90,100,70
2,예나,70,50,70,70
3,나코,90,100,100,100
4,은비,90,70,50,80
5,히토미,70,100,90,80
6,째욘,100,100,100,90


In [60]:
# 양쪽 데이터가 모두 있는경우
#pd.merge(merge_df1, merge_df2, how='inner')
# 한쪽 데이터만 있는 경우도 함침
#pd.merge(merge_df1, merge_df2, how='outer')
# 왼쪽 기준
#pd.merge(merge_df1, merge_df2, how='left')
# 오른쪽 데이터 기준
pd.merge(merge_df1, merge_df2, how='right')

Unnamed: 0,이름,국어,영어,일어,수학
0,원영,100,100,80,90
1,사쿠라,70,90,100,70
2,나코,90,100,100,100
3,히토미,70,100,90,80
4,예나,70,50,70,70
5,은비,90,70,50,80
6,째욘,100,100,100,90


In [61]:
merge_df1 = pd.DataFrame({
    '이름': ['원영', '사쿠라', '유리', '예나', '유진', '나코', '은비', '혜원', '히토미', '채원', '민주', '째욘'],
    '국어': [100, 70, 70, 70, 60, 90, 90, 70, 70, 80, 100, 100],
    '영어': [100, 90, 80, 50, 70, 100, 70, 90, 100, 100, 80, 100]
    }, columns=['이름', '국어', '영어']) 

merge_df2 = pd.DataFrame({
    '일어': [80, 100, 100, 90, 70, 50, 100],
    '수학': [90, 70, 100, 80, 70, 80, 90],
    'name': ['원영', '사쿠라', '나코', '히토미', '예나', '은비', '째욘'],
    }, columns=['일어', '수학', 'name'])

In [62]:
merge_df2

Unnamed: 0,일어,수학,name
0,80,90,원영
1,100,70,사쿠라
2,100,100,나코
3,90,80,히토미
4,70,70,예나
5,50,80,은비
6,100,90,째욘


In [64]:
pd.merge(merge_df1, merge_df2, left_on='이름', right_on='name')

Unnamed: 0,이름,국어,영어,일어,수학,name
0,원영,100,100,80,90,원영
1,사쿠라,70,90,100,70,사쿠라
2,예나,70,50,70,70,예나
3,나코,90,100,100,100,나코
4,은비,90,70,50,80,은비
5,히토미,70,100,90,80,히토미
6,째욘,100,100,100,90,째욘


### concat 예시
현재 df에 저장되어있는 데이터에 추가로 2만개의 데이터를 이어붙여보겠습니다. df1이라는 변수에 이어붙일 데이터를 불러들여 병합을 진행해보겠습니다.

In [76]:
# df1 변수에 loan2.csv 파일을 읽어들입니다.
df1 = open('./data/loan2.csv', encoding='cp949')

In [77]:
# 데이터프레임 확인
df1

<_io.TextIOWrapper name='./data/loan2.csv' mode='r' encoding='cp949'>

In [78]:
# df 와 df1 shape 확인
df1.shape()

AttributeError: '_io.TextIOWrapper' object has no attribute 'shape'

In [None]:
# 데이터프레임 행단위 병합


In [None]:
# 병합 데이터프레임 shape 확인


In [None]:
# 병합 데이터프레임 index 확인


## 인덱스 편집
방금 전 concat으로 병합한 데이터프레임의 이상한 점을 찾으셨나요?  
데이터 자체는 잘 붙였지만 인덱스가 꼬여있습니다. 인덱스 편집은 데이터분석을 위해 필요한 인덱스를 설정하기 위해 필요합니다.

In [79]:
# 인덱스리셋
concat_df.reset_index(drop=True, inplace=True)
# 원본값 변경해라 inplace

NameError: name 'concat_df' is not defined

In [None]:
# 기존 컬럼값을 취해 index로 사용


## 컬럼편집
인덱스편집과 마찬가지로 데이터프레임의 컬럼을 변경해야 할 경우도 있습니다. 데이터프레임은 컬럼단위 샘플링 및 인덱싱, 이름변경이 가능합니다.

### 컬럼선택

In [80]:
# df 컬럼명 접근
df.columns

Index(['id', 'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv',
       'term', 'int_rate', 'installment', 'grade', 'sub_grade',
       ...
       'hardship_payoff_balance_amount', 'hardship_last_payment_amount',
       'disbursement_method', 'debt_settlement_flag',
       'debt_settlement_flag_date', 'settlement_status', 'settlement_date',
       'settlement_amount', 'settlement_percentage', 'settlement_term'],
      dtype='object', length=145)

In [81]:
# columns 속성도 인덱싱 및 슬라이싱이 가능합니다.
df.columns[0:30]

Index(['id', 'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv',
       'term', 'int_rate', 'installment', 'grade', 'sub_grade', 'emp_title',
       'emp_length', 'home_ownership', 'annual_inc', 'verification_status',
       'issue_d', 'loan_status', 'pymnt_plan', 'url', 'desc', 'purpose',
       'title', 'zip_code', 'addr_state', 'dti', 'delinq_2yrs',
       'earliest_cr_line', 'inq_last_6mths', 'mths_since_last_delinq',
       'mths_since_last_record'],
      dtype='object')

저는 개인정보에 관한 부분에 관심이 많습니다. 데이터셋 중 필요한 부분만을 컬럼단위로 추려보겠습니다.

In [83]:
# df의 개인정보에 관한 컬럼만을 색인으로 df를 슬라이싱하고 person_df 변수에 할당
person_df=df[df.columns[0:25]]

In [84]:
person_df

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti
0,,,10000,10000,10000.0,36 months,9.44,320.05,B,B1,mechanic,6 years,MORTGAGE,80000.0,Not Verified,Dec-2017,Current,n,,,credit_card,Credit card refinancing,762xx,TX,14.82
1,,,3500,3500,3500.0,36 months,10.42,113.63,B,B3,,,OWN,90000.0,Not Verified,Dec-2017,Current,n,,,other,Other,295xx,SC,28.51
2,,,5000,5000,5000.0,36 months,13.59,169.90,C,C2,Truck driver,10+ years,OWN,168000.0,Not Verified,Dec-2017,Current,n,,,other,Other,788xx,TX,11.62
3,,,14000,14000,14000.0,36 months,10.91,457.75,B,B4,Confidential Secretary,2 years,RENT,39000.0,Source Verified,Dec-2017,Current,n,,,credit_card,Credit card refinancing,125xx,NY,22.88
4,,,5000,5000,5000.0,36 months,13.59,169.90,C,C2,General Manager,< 1 year,RENT,55000.0,Not Verified,Dec-2017,Current,n,,,debt_consolidation,Debt consolidation,672xx,KS,12.18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,,,10800,10800,10800.0,36 months,9.44,345.66,B,B1,Partner,3 years,MORTGAGE,240000.0,Not Verified,Nov-2017,Current,n,,,major_purchase,Major purchase,370xx,TN,9.25
19996,,,14000,14000,14000.0,60 months,14.08,326.34,C,C3,Senior Property Manager,10+ years,OWN,53300.0,Verified,Nov-2017,Current,n,,,major_purchase,Major purchase,481xx,MI,12.95
19997,,,35000,35000,35000.0,60 months,12.62,789.57,C,C1,Mechanic engineer,7 years,RENT,85000.0,Source Verified,Nov-2017,Fully Paid,n,,,credit_card,Credit card refinancing,030xx,NH,16.77
19998,,,35225,35225,35225.0,60 months,19.03,914.34,D,D3,Teacher,8 years,MORTGAGE,42000.0,Source Verified,Nov-2017,Current,n,,,debt_consolidation,Debt consolidation,532xx,WI,32.80


### 컬럼삭제
현재 데이터셋에는 개인식별정보가 지워져서 데이터가 존재하지 않습니다. 불필요한 데이터 column을 지우도록 하겠습니다.

In [85]:
# 지울 column의 데이터값이 모두 NaN인지 확인
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 145 columns):
 #    Column                                      Non-Null Count  Dtype  
---   ------                                      --------------  -----  
 0    id                                          0 non-null      float64
 1    member_id                                   0 non-null      float64
 2    loan_amnt                                   20000 non-null  int64  
 3    funded_amnt                                 20000 non-null  int64  
 4    funded_amnt_inv                             20000 non-null  float64
 5    term                                        20000 non-null  object 
 6    int_rate                                    20000 non-null  float64
 7    installment                                 20000 non-null  float64
 8    grade                                       20000 non-null  object 
 9    sub_grade                                   20000 non-null  object 
 1

In [86]:
df['id'].isna().all()

True

삭제할 컬럼 모두 데이터가 없는 것을 확인했습니다.

In [87]:
# 컬럼 삭제 (drop, del, pop)
df.drop('id', axis=1, inplace=True)
df



Unnamed: 0,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_act_il,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,revol_bal_joint,sec_app_earliest_cr_line,sec_app_inq_last_6mths,sec_app_mort_acc,sec_app_open_acc,sec_app_revol_util,sec_app_open_act_il,sec_app_num_rev_accts,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,hardship_flag,hardship_type,hardship_reason,hardship_status,deferral_term,hardship_amount,hardship_start_date,hardship_end_date,payment_plan_start_date,hardship_length,hardship_dpd,hardship_loan_status,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,,10000,10000,10000.0,36 months,9.44,320.05,B,B1,mechanic,6 years,MORTGAGE,80000.0,Not Verified,Dec-2017,Current,n,,,credit_card,Credit card refinancing,762xx,TX,14.82,0.0,Jul-2007,0.0,34.0,,8.0,0.0,5225,73.6,30.0,w,6442.28,6442.28,4493.810000,4493.81,3557.72,936.09,0.0,0.0,0.0,Feb-2019,320.05,Mar-2019,Feb-2019,0.0,,1,Individual,,,,0.0,0.0,173110.0,0.0,2.0,0.0,2.0,23.0,12496.0,39.0,0.0,0.0,3949.0,45.0,7100.0,1.0,0.0,0.0,2.0,21639.0,1875.0,73.6,0.0,0.0,125.0,78.0,26.0,23.0,3.0,26.0,,21.0,,0.0,2.0,2.0,4.0,4.0,21.0,4.0,5.0,2.0,8.0,0.0,0.0,0.0,0.0,96.4,25.0,0.0,0.0,196130.0,17756.0,7100.0,31992.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
1,,3500,3500,3500.0,36 months,10.42,113.63,B,B3,,,OWN,90000.0,Not Verified,Dec-2017,Current,n,,,other,Other,295xx,SC,28.51,0.0,Jun-2002,0.0,39.0,28.0,12.0,3.0,6953,51.9,38.0,w,2266.55,2266.55,1586.770000,1586.77,1233.45,353.32,0.0,0.0,0.0,Feb-2019,113.63,Mar-2019,Feb-2019,0.0,,1,Individual,,,,0.0,0.0,339028.0,0.0,4.0,0.0,3.0,22.0,76501.0,69.0,1.0,2.0,1628.0,65.0,13400.0,1.0,5.0,1.0,5.0,28252.0,808.0,82.4,0.0,0.0,164.0,186.0,7.0,7.0,2.0,7.0,39.0,7.0,39.0,0.0,4.0,7.0,4.0,10.0,19.0,7.0,17.0,7.0,12.0,0.0,0.0,0.0,1.0,97.3,75.0,0.0,3.0,416685.0,83454.0,4600.0,110595.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
2,,5000,5000,5000.0,36 months,13.59,169.90,C,C2,Truck driver,10+ years,OWN,168000.0,Not Verified,Dec-2017,Current,n,,,other,Other,788xx,TX,11.62,0.0,Aug-2002,0.0,44.0,,4.0,0.0,3401,97.2,12.0,w,3291.95,3291.95,2371.050000,2371.05,1708.05,663.00,0.0,0.0,0.0,Feb-2019,169.90,Mar-2019,Feb-2019,0.0,44.0,1,Individual,,,,0.0,0.0,51673.0,1.0,3.0,1.0,1.0,5.0,48272.0,48.0,0.0,0.0,3401.0,53.0,3500.0,2.0,0.0,3.0,1.0,12918.0,99.0,97.2,0.0,0.0,135.0,184.0,54.0,5.0,0.0,54.0,44.0,5.0,44.0,2.0,1.0,1.0,1.0,4.0,6.0,1.0,6.0,1.0,4.0,0.0,0.0,0.0,1.0,83.3,100.0,0.0,0.0,82176.0,51673.0,3500.0,78676.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
3,,14000,14000,14000.0,36 months,10.91,457.75,B,B4,Confidential Secretary,2 years,RENT,39000.0,Source Verified,Dec-2017,Current,n,,,credit_card,Credit card refinancing,125xx,NY,22.88,0.0,Apr-2003,0.0,74.0,,8.0,0.0,12918,59.5,16.0,w,9090.87,9090.87,6391.530000,6391.53,4909.13,1482.40,0.0,0.0,0.0,Feb-2019,457.75,Mar-2019,Feb-2019,0.0,74.0,1,Individual,,,,0.0,457.0,29103.0,1.0,1.0,1.0,2.0,4.0,16185.0,95.0,1.0,5.0,10153.0,75.0,21700.0,2.0,6.0,0.0,7.0,3638.0,7265.0,61.6,0.0,0.0,36.0,176.0,7.0,4.0,0.0,7.0,,16.0,74.0,1.0,2.0,4.0,3.0,4.0,4.0,7.0,12.0,4.0,8.0,0.0,0.0,0.0,2.0,93.8,33.3,0.0,0.0,38704.0,29103.0,18900.0,17004.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
4,,5000,5000,5000.0,36 months,13.59,169.90,C,C2,General Manager,< 1 year,RENT,55000.0,Not Verified,Dec-2017,Current,n,,,debt_consolidation,Debt consolidation,672xx,KS,12.18,0.0,Jun-1999,0.0,,,5.0,0.0,4497,91.8,6.0,w,3291.95,3291.95,2371.050000,2371.05,1708.05,663.00,0.0,0.0,0.0,Feb-2019,169.90,Mar-2019,Feb-2019,0.0,,1,Individual,,,,0.0,471.0,7202.0,0.0,1.0,0.0,0.0,71.0,2705.0,12.0,0.0,0.0,1483.0,27.0,4900.0,0.0,1.0,0.0,0.0,1440.0,403.0,91.8,0.0,0.0,149.0,222.0,43.0,43.0,0.0,43.0,,,,0.0,4.0,4.0,4.0,4.0,2.0,4.0,4.0,4.0,5.0,0.0,0.0,0.0,0.0,100.0,100.0,0.0,0.0,26841.0,7202.0,4900.0,21941.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,,10800,10800,10800.0,36 months,9.44,345.66,B,B1,Partner,3 years,MORTGAGE,240000.0,Not Verified,Nov-2017,Current,n,,,major_purchase,Major purchase,370xx,TN,9.25,0.0,Oct-2005,1.0,42.0,,6.0,0.0,128754,68.4,10.0,w,6667.33,6667.33,5173.570000,5173.57,4132.67,1040.90,0.0,0.0,0.0,Feb-2019,345.66,Mar-2019,Feb-2019,0.0,,1,Individual,,,,0.0,0.0,414761.0,1.0,2.0,1.0,1.0,2.0,74155.0,,1.0,1.0,911.0,68.0,152300.0,1.0,1.0,4.0,2.0,69127.0,726.0,68.4,0.0,0.0,145.0,107.0,10.0,2.0,2.0,89.0,42.0,0.0,42.0,0.0,2.0,3.0,2.0,2.0,5.0,3.0,3.0,3.0,6.0,0.0,0.0,0.0,2.0,77.8,0.0,0.0,0.0,469534.0,202909.0,2300.0,84548.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
19996,,14000,14000,14000.0,60 months,14.08,326.34,C,C3,Senior Property Manager,10+ years,OWN,53300.0,Verified,Nov-2017,Current,n,,,major_purchase,Major purchase,481xx,MI,12.95,0.0,Mar-1987,1.0,27.0,,7.0,0.0,1536,30.7,17.0,w,11358.71,11358.71,4873.200000,4873.20,2641.29,2231.91,0.0,0.0,0.0,Feb-2019,326.34,Mar-2019,Feb-2019,0.0,40.0,1,Individual,,,,0.0,0.0,85489.0,0.0,1.0,1.0,1.0,8.0,12118.0,84.0,0.0,1.0,0.0,71.0,5000.0,2.0,0.0,2.0,2.0,14248.0,,,0.0,0.0,140.0,368.0,20.0,8.0,1.0,,40.0,0.0,40.0,3.0,0.0,2.0,0.0,3.0,5.0,5.0,11.0,2.0,7.0,0.0,0.0,0.0,1.0,52.9,,0.0,0.0,105375.0,13654.0,0.0,14375.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
19997,,35000,35000,35000.0,60 months,12.62,789.57,C,C1,Mechanic engineer,7 years,RENT,85000.0,Source Verified,Nov-2017,Fully Paid,n,,,credit_card,Credit card refinancing,030xx,NH,16.77,0.0,Jul-2002,2.0,64.0,,11.0,0.0,23115,43.6,29.0,w,0.00,0.00,38737.465857,38737.47,35000.00,3737.47,0.0,0.0,0.0,Oct-2018,31680.42,,Oct-2018,1.0,,1,Joint App,160000.0,18.86,Source Verified,0.0,468.0,47500.0,0.0,1.0,0.0,0.0,25.0,24385.0,,0.0,0.0,9076.0,44.0,49500.0,0.0,0.0,2.0,0.0,4750.0,24385.0,45.7,0.0,0.0,184.0,91.0,27.0,25.0,0.0,27.0,,0.0,,0.0,4.0,4.0,8.0,17.0,8.0,10.0,21.0,4.0,11.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,87549.0,47500.0,47500.0,38049.0,40609.0,Oct-2006,1.0,0.0,34.0,45.9,17.0,30.0,0.0,0.0,69.0,N,,,,,,,,,,,,,,,Cash,N,,,,,,
19998,,35225,35225,35225.0,60 months,19.03,914.34,D,D3,Teacher,8 years,MORTGAGE,42000.0,Source Verified,Nov-2017,Current,n,,,debt_consolidation,Debt consolidation,532xx,WI,32.80,0.0,May-2003,0.0,65.0,,15.0,0.0,10432,60.0,30.0,w,29254.00,29254.00,13640.620000,13640.62,5971.00,7669.62,0.0,0.0,0.0,Feb-2019,914.34,Mar-2019,Feb-2019,0.0,,1,Joint App,82000.0,25.02,Source Verified,0.0,0.0,162765.0,0.0,7.0,0.0,0.0,46.0,19208.0,35.0,0.0,0.0,4515.0,41.0,17400.0,0.0,0.0,0.0,0.0,11626.0,5315.0,65.5,0.0,0.0,169.0,174.0,52.0,40.0,1.0,89.0,,,65.0,0.0,3.0,4.0,4.0,7.0,10.0,6.0,17.0,4.0,15.0,0.0,0.0,0.0,0.0,96.4,50.0,0.0,0.0,213708.0,29838.0,15400.0,54560.0,29018.0,May-2004,0.0,1.0,11.0,56.0,1.0,11.0,0.0,0.0,,N,,,,,,,,,,,,,,,Cash,N,,,,,,


In [88]:
del df['member_id']

In [91]:
df.pop('dsec')

KeyError: 'dsec'

### 컬럼명 변경
    경우에 따라서는 데이터셋 제작 중 컬럼명을 변경해야 할 경우도 있습니다.
    국내 수집 데이터 사용 시 컬럼이 한글일 경우 영어로 변경을 많이 합니다.

In [None]:
# home_ownership을 간략하게 home으로 변경
# 한글도 가능합니다만 권장하지는 않습니다.


## 데이터 샘플링 및 분석
> 데이터병합, 인덱스편집, 컬럼선택만으로도 불필요한 정보를 삭제하고 새롭게 데이터셋을 만들 수 있는것을 확인했습니다.  
위에 학습한 내용도 데이터 샘플링에 속한 내용이지만 지금부터는 데이터셋의 데이터를 살펴보면서 의미있는 데이터를 추려보도록 하겠습니다.  
    
**데이터프레임의 기본적인 인덱싱, 슬라이싱, 조건부 샘플링을 조합하면 데이터의 샘플을 확인 하는 과정만으로도 데이터분석이 가능해집니다.**

In [None]:
# 분석에 필요한 데이터프레임을 만들었으니 원본값을 사용하겠습니다. 기존 df에 person_df 값을 덮어 씌웁니다.


In [None]:
# 분석에 필요한 데이터셋을 생성했다면 파일로도 저장 해둡시다.


### 저는 채권자의 개인정보에 관심이 많습니다. 고객의 직업을 살펴보겠습니다.

In [None]:
# emp_title 접근


In [None]:
# 값을 카운트 하는 함수 value_counts()


### 데이터프레임 형변환

In [None]:
# Owner, owner 같은 직업이지만 대소문자 구분에 따라 다른 값으로 취급되는 문제가 있네요.
# 대소문자 구분을 없애기 위해 모두 소문자로 데이터값을 변경하겠습니다.
# 소문자 변환 전 혹시모를 int, float 데이터가 있을지 모를 상황에 대비해서 모두 문자열로 변경해주겠습니다.
# 형변환 함수 astype(데이터타입)


In [None]:
# 반복문을 사용한 데이터 변경도 가능
# 하지만 파이썬의 강점을 살리지 못한 코드


### 배운사람들의 코드, 고오급 python 스킬
numpy를 학습하면서 브로드캐스팅에 관하여 잠깐 언급했었습니다. 그렇다면 그 파워풀하다던 브로드캐스팅은 어떻게 사용해야할까요?
    
>기타 언어에서는 지원하지 않는 기능이니만큼 파이썬의 특징을 가장 잘 살리는 코드  
**`apply`** 함수를 사용하여 인자로 받는 모든 데이터에 함수를 적용

#### apply 함수로 컬럼에 적용시키는 코드 구조
    df['컬럼명'] = df['컬럼명'].apply(lambda x: func(x) if 조건문)
    df['컬럼명'] = df['컬럼명'].apply(func_nm)

In [93]:
# 대문자 만드는 함수
def make_upper(x):
    return x.upper()

In [95]:
make_upper('apple')

'APPLE'

In [103]:
# apply() 함수사용 반복이 가능한 데이터구조의 모든 인자에 적용
# lambda 각 인자에 적용할 함수 혹은 연산
df = df.astype({'emp_title':'str'})
df['emp_title'].apply(make_upper)

0                       MECHANIC
1                            NAN
2                   TRUCK DRIVER
3         CONFIDENTIAL SECRETARY
4                GENERAL MANAGER
                  ...           
19995                    PARTNER
19996    SENIOR PROPERTY MANAGER
19997          MECHANIC ENGINEER
19998                    TEACHER
19999         DIRECTOR OF DESIGN
Name: emp_title, Length: 20000, dtype: object

In [104]:
# 위 3칸과 같음.
df['emp_title'] = df['emp_title'].apply(lambda x: x.lower())

In [106]:
# 대소문자 구분을 처리한 값 확인
df['emp_title'].value_counts().head(20)
# 기존 value_count 값과 차이가 있음을 확인 할 수 있습니다.
# 제공 된 데이터셋이라도 이와 같은 작은 차이가 있을 수 있습니다.
# 데이터를 꼼꼼하게 살펴볼 수록 디테일한 차이를 만들 수 있습니다.

nan                   1731
owner                  474
manager                414
teacher                388
driver                 203
registered nurse       201
sales                  182
supervisor             158
rn                     139
truck driver           128
general manager        113
office manager         106
project manager         99
president               93
nurse                   79
director                78
sales manager           76
engineer                72
operations manager      70
police officer          63
Name: emp_title, dtype: int64

In [114]:
# owner인 사람들 샘플링
owner_df=df.loc[df['emp_title']=='owner']

In [115]:
# 샘플링 된 데이터프레임의 단일 컬럼 접근
owner_df

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_act_il,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,revol_bal_joint,sec_app_earliest_cr_line,sec_app_inq_last_6mths,sec_app_mort_acc,sec_app_open_acc,sec_app_revol_util,sec_app_open_act_il,sec_app_num_rev_accts,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,hardship_flag,hardship_type,hardship_reason,hardship_status,deferral_term,hardship_amount,hardship_start_date,hardship_end_date,payment_plan_start_date,hardship_length,hardship_dpd,hardship_loan_status,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
58,6000,6000,5975.0,36 months,15.05,208.14,C,C4,owner,10+ years,OWN,55000.0,Not Verified,Dec-2017,Current,n,,,debt_consolidation,Debt consolidation,372xx,TN,20.07,1.0,Mar-2004,1.0,12.0,,11.0,0.0,4655,14.5,14.0,f,3980.00,3963.41,2903.930000,2891.83,2020.00,883.93,0.0,0.00,0.0000,Feb-2019,208.14,Mar-2019,Feb-2019,0.0,,1,Individual,,,,0.0,0.0,123762.0,0.0,3.0,0.0,3.0,13.0,17532.0,75.0,0.0,2.0,2586.0,40.0,32200.0,3.0,0.0,2.0,5.0,11251.0,16261.0,19.5,0.0,0.0,17.0,165.0,14.0,13.0,1.0,15.0,12.0,0.0,12.0,0.0,4.0,6.0,5.0,7.0,3.0,7.0,10.0,6.0,11.0,0.0,0.0,0.0,0.0,92.9,0.0,0.0,0.0,168482.0,22187.0,20200.0,23365.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
176,17525,17525,17525.0,60 months,20.00,464.31,D,D4,owner,4 years,RENT,6000.0,Not Verified,Dec-2017,Fully Paid,n,,,debt_consolidation,Debt consolidation,320xx,FL,21.40,0.0,Sep-2005,0.0,66.0,,7.0,0.0,2600,25.5,11.0,w,0.00,0.00,19794.786529,19794.79,17525.00,2269.79,0.0,0.00,0.0000,Aug-2018,16564.09,,Feb-2019,0.0,66.0,1,Joint App,50000.0,17.06,Not Verified,0.0,151.0,49726.0,0.0,1.0,0.0,0.0,147.0,47126.0,,0.0,1.0,2118.0,26.0,10200.0,0.0,0.0,0.0,1.0,8288.0,6108.0,28.1,0.0,0.0,147.0,131.0,13.0,13.0,1.0,77.0,,,,1.0,2.0,3.0,3.0,3.0,1.0,6.0,9.0,3.0,7.0,0.0,0.0,0.0,0.0,90.9,0.0,0.0,0.0,39302.0,49726.0,8500.0,29102.0,20777.0,Feb-2003,1.0,1.0,14.0,37.1,7.0,10.0,0.0,0.0,56.0,N,,,,,,,,,,,,,,,Cash,N,,,,,,
327,7000,7000,7000.0,36 months,12.62,234.58,C,C1,owner,10+ years,RENT,72000.0,Source Verified,Dec-2017,Current,n,,,debt_consolidation,Debt consolidation,947xx,CA,7.18,0.0,Apr-2004,0.0,,,7.0,0.0,11504,68.9,12.0,w,4585.80,4585.80,3279.210000,3279.21,2414.20,865.01,0.0,0.00,0.0000,Feb-2019,234.58,Mar-2019,Feb-2019,0.0,,1,Individual,,,,0.0,0.0,11504.0,0.0,0.0,0.0,0.0,66.0,0.0,,1.0,1.0,3664.0,69.0,16700.0,0.0,0.0,0.0,1.0,1917.0,4072.0,62.3,0.0,0.0,107.0,164.0,10.0,10.0,0.0,10.0,,,,0.0,4.0,6.0,4.0,4.0,2.0,7.0,9.0,6.0,7.0,0.0,0.0,0.0,1.0,100.0,75.0,0.0,0.0,16700.0,11504.0,10800.0,0.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
354,8000,8000,8000.0,36 months,6.72,246.00,A,A3,owner,5 years,RENT,42000.0,Not Verified,Dec-2017,Current,n,,,credit_card,Credit card refinancing,329xx,FL,18.00,0.0,Sep-2002,0.0,,,5.0,0.0,10369,54.0,13.0,w,5078.34,5078.34,3441.010000,3441.01,2921.66,519.35,0.0,0.00,0.0000,Feb-2019,246.00,Mar-2019,Feb-2019,0.0,,1,Individual,,,,0.0,0.0,19461.0,0.0,1.0,1.0,1.0,9.0,9092.0,75.0,0.0,0.0,5699.0,62.0,19200.0,0.0,0.0,0.0,1.0,3892.0,8831.0,54.0,0.0,0.0,165.0,183.0,110.0,9.0,1.0,110.0,,16.0,,0.0,4.0,4.0,4.0,5.0,6.0,4.0,5.0,4.0,5.0,0.0,0.0,0.0,1.0,100.0,0.0,0.0,0.0,31323.0,19461.0,19200.0,12123.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
362,15000,15000,15000.0,60 months,7.35,299.51,A,A4,owner,10+ years,MORTGAGE,70000.0,Not Verified,Dec-2017,Current,n,,,small_business,Business,410xx,KY,14.44,0.0,Aug-1997,0.0,,,23.0,0.0,9936,7.4,27.0,w,11974.50,11974.50,4180.890000,4180.89,3025.50,1155.39,0.0,0.00,0.0000,Feb-2019,299.51,Mar-2019,Feb-2019,0.0,,1,Individual,,,,0.0,0.0,105375.0,2.0,1.0,0.0,0.0,26.0,4495.0,38.0,3.0,5.0,3121.0,10.0,133600.0,0.0,0.0,2.0,6.0,4582.0,103547.0,5.6,0.0,0.0,35.0,244.0,5.0,5.0,1.0,8.0,,10.0,,0.0,10.0,13.0,14.0,16.0,2.0,20.0,23.0,13.0,23.0,0.0,0.0,0.0,4.0,100.0,0.0,0.0,0.0,272310.0,14634.0,109700.0,12000.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19875,10000,10000,10000.0,60 months,20.00,264.94,D,D4,owner,4 years,RENT,24000.0,Not Verified,Dec-2017,Current,n,,,credit_card,Credit card refinancing,070xx,NJ,27.75,0.0,Jun-2012,0.0,,,8.0,0.0,18232,61.2,10.0,w,8464.72,8464.72,3686.940000,3686.94,1535.28,2151.66,0.0,0.00,0.0000,Feb-2019,264.94,Mar-2019,Feb-2019,0.0,,1,Individual,,,,0.0,0.0,18232.0,0.0,0.0,0.0,0.0,,0.0,,0.0,1.0,7216.0,61.0,29800.0,0.0,0.0,0.0,1.0,2279.0,7477.0,69.7,0.0,0.0,,65.0,17.0,17.0,0.0,17.0,,17.0,,0.0,5.0,6.0,6.0,8.0,0.0,8.0,10.0,6.0,8.0,0.0,0.0,0.0,0.0,100.0,33.3,0.0,0.0,29800.0,18232.0,24700.0,0.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
19931,20000,20000,20000.0,60 months,17.09,498.02,D,D1,owner,10+ years,OWN,240000.0,Source Verified,Nov-2017,Charged Off,n,,,home_improvement,Home improvement,117xx,NY,21.15,0.0,Oct-2004,3.0,,71.0,20.0,1.0,38397,53.3,29.0,f,0.00,0.00,3198.600000,3198.60,213.19,246.85,0.0,2738.56,492.9408,Jan-2018,498.02,,Jan-2018,0.0,,1,Individual,,,,0.0,0.0,108139.0,1.0,6.0,3.0,4.0,3.0,69742.0,64.0,4.0,5.0,10965.0,60.0,72000.0,2.0,2.0,5.0,9.0,5692.0,30520.0,54.2,0.0,0.0,157.0,134.0,7.0,3.0,0.0,7.0,,2.0,,0.0,9.0,10.0,11.0,13.0,13.0,14.0,16.0,11.0,19.0,0.0,0.0,0.0,7.0,100.0,9.1,1.0,0.0,181628.0,108139.0,66600.0,109628.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
19961,28500,28500,28500.0,36 months,9.93,918.68,B,B2,owner,6 years,OWN,70000.0,Not Verified,Dec-2017,Current,n,,,debt_consolidation,Debt consolidation,750xx,TX,38.19,0.0,Sep-1992,0.0,,,8.0,0.0,27339,24.2,22.0,w,18408.59,18408.59,12845.800000,12845.80,10091.41,2754.39,0.0,0.00,0.0000,Feb-2019,918.68,Mar-2019,Feb-2019,0.0,,1,Individual,,,,0.0,0.0,125127.0,0.0,3.0,2.0,2.0,7.0,97788.0,87.0,1.0,1.0,10108.0,55.0,87500.0,1.0,3.0,4.0,3.0,15641.0,24773.0,30.2,0.0,0.0,158.0,302.0,10.0,7.0,0.0,10.0,,7.0,,0.0,2.0,4.0,3.0,13.0,6.0,5.0,16.0,4.0,8.0,0.0,0.0,0.0,3.0,100.0,0.0,0.0,0.0,193651.0,125127.0,35500.0,106151.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
19972,11000,11000,11000.0,36 months,11.99,365.31,B,B5,owner,< 1 year,MORTGAGE,45000.0,Not Verified,Nov-2017,Current,n,,,small_business,Business,184xx,PA,19.07,1.0,Oct-1998,0.0,19.0,,7.0,0.0,11041,43.6,26.0,w,6889.07,6889.07,5465.000000,5465.00,4110.93,1354.07,0.0,0.00,0.0000,Feb-2019,365.31,Mar-2019,Feb-2019,0.0,,1,Individual,,,,0.0,0.0,23049.0,0.0,2.0,0.0,0.0,29.0,12008.0,45.0,0.0,1.0,5569.0,44.0,25300.0,0.0,0.0,0.0,1.0,3842.0,14159.0,43.8,0.0,0.0,229.0,143.0,23.0,23.0,0.0,23.0,19.0,,19.0,0.0,3.0,3.0,4.0,6.0,17.0,5.0,9.0,3.0,7.0,0.0,0.0,0.0,0.0,96.2,33.3,0.0,0.0,52300.0,23049.0,25200.0,27000.0,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,


In [116]:
# 컬럼 평균값 계산
owner_df['annual_inc'].mean()

90019.41139240506

In [119]:
# 코드 하나 변경으로 간단한 분석 가능
# owner가 아닌 사람들의 평균
df.loc[df['emp_title']!='owner']['annual_inc'].mean()

78147.9090248899

## 데이터 재구조화

In [128]:
# 각 직업별 평균연봉이 궁금하다 groupby
# 엑셀의 pivol table 과 비슷한 기능
# df.groupby('emp_title').mean()
df.groupby('emp_title').mean()['annual_inc'].sort_values(ascending=False)[:20]

emp_title
trash truck driver                    6500031.0
billing analyst                       4784000.0
tourist guide                         2416960.0
full-time rn                          1116000.0
film editor                            988000.0
manager director fixed income          900000.0
senior client partner                  850000.0
terminal operator                      810000.0
sales & marketing                      741250.0
senior partner                         712000.0
partner and managing director          700000.0
cardiologist                           650000.0
partner - attorney                     635000.0
senior trader                          550000.0
founder ceo                            550000.0
portfolio manager                      535000.0
vp valuations                          525000.0
1st vice president of wealth mgmt.     512000.0
orthopaedic surgeon                    500000.0
chief ops officer                      500000.0
Name: annual_inc, dtype: float

In [129]:
df.columns

Index(['loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'term', 'int_rate',
       'installment', 'grade', 'sub_grade', 'emp_title', 'emp_length',
       ...
       'hardship_payoff_balance_amount', 'hardship_last_payment_amount',
       'disbursement_method', 'debt_settlement_flag',
       'debt_settlement_flag_date', 'settlement_status', 'settlement_date',
       'settlement_amount', 'settlement_percentage', 'settlement_term'],
      dtype='object', length=143)

In [134]:
# pivot_table
table = pd.pivot_table(
    df,
    values='int_rate',
    index='grade',
    columns='term',
    aggfunc=len

)

In [135]:
table

term,36 months,60 months
grade,Unnamed: 1_level_1,Unnamed: 2_level_1
A,4257,204
B,4621,1689
C,3168,2178
D,1626,1394
E,221,473
F,7,130
G,3,29


## 결측치 처리
> 데이터 분석을 위해서는 데이터셋 내에 빈 값이 있는 경우 분석에 방해가 될 수 있는 여지가 많습니다.  
모든 결측치를 없애야 하는 것은 아니지만 되도록이면 결측치를 채우는 방법, 혹은 없애는 방법등으로 결측치를 처리합니다.  
몇가지 예시를 살펴보면서 결측치 처리에 대해 알아봅시다.

In [136]:
# info() 함수는 결측치에 대한 정보도 보여줍니다.
# 컬럼별 isnull() 함수를 사용해도 무방합니다.b
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 143 columns):
 #    Column                                      Non-Null Count  Dtype  
---   ------                                      --------------  -----  
 0    loan_amnt                                   20000 non-null  int64  
 1    funded_amnt                                 20000 non-null  int64  
 2    funded_amnt_inv                             20000 non-null  float64
 3    term                                        20000 non-null  object 
 4    int_rate                                    20000 non-null  float64
 5    installment                                 20000 non-null  float64
 6    grade                                       20000 non-null  object 
 7    sub_grade                                   20000 non-null  object 
 8    emp_title                                   20000 non-null  object 
 9    emp_length                                  18296 non-null  object 
 1

In [None]:
df.loc[df['dti'].isnull(), 'dti']

    확인결과 emp_title, emp_length, dti에 결측치가 존재합니다.
    해당 컬럼의 결측치 샘플들을 살펴보고 결측치를 처리해 보겠습니다.

In [141]:
# 컬럼별 결측치 확인을 위한 isnull()함수 리턴값이 bool 형태로 반환되어 조건부 샘플링이 가능합니다.
df['dti'].isnull().sum()/len(df) * 100

0.19499999999999998

In [None]:
# dti 컬럼에 결측치가 존재하는 샘플 확인


    직업과 근속연수에 관한 부분은 데이터를 통한 유추나 계산값을 통해 채워넣을 수 있는 항목은 아닌 것 같습니다.
    다만 dti의 경우 실수로 채워져 있는 부분이니 수업을 위해 평균값 혹은 근사치를 계산하여 채워보도록 하겠습니다.

### 결측치 채우기

In [145]:
# fillna() 함수로 NaN 값을 dti 컬럼의 평균으로 채우기
df['dti'].fillna(df['dti'].mean(), inplace=True)
# fillna() 함수의 다양한 채우기 방법 파라메터 확인해보기


AttributeError: 'Series' object has no attribute 'info'

### 결측치 제거

In [146]:
# emp_title 결측치가 있는 샘플 확인
df.loc[df['emp_title'].isna()]

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_act_il,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,revol_bal_joint,sec_app_earliest_cr_line,sec_app_inq_last_6mths,sec_app_mort_acc,sec_app_open_acc,sec_app_revol_util,sec_app_open_act_il,sec_app_num_rev_accts,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,hardship_flag,hardship_type,hardship_reason,hardship_status,deferral_term,hardship_amount,hardship_start_date,hardship_end_date,payment_plan_start_date,hardship_length,hardship_dpd,hardship_loan_status,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term


In [149]:
# view값으로 dropna 결과값 확인
df.dropna()

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_act_il,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,revol_bal_joint,sec_app_earliest_cr_line,sec_app_inq_last_6mths,sec_app_mort_acc,sec_app_open_acc,sec_app_revol_util,sec_app_open_act_il,sec_app_num_rev_accts,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,hardship_flag,hardship_type,hardship_reason,hardship_status,deferral_term,hardship_amount,hardship_start_date,hardship_end_date,payment_plan_start_date,hardship_length,hardship_dpd,hardship_loan_status,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term


In [150]:
# 결측치 제거
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 143 columns):
 #    Column                                      Non-Null Count  Dtype  
---   ------                                      --------------  -----  
 0    loan_amnt                                   20000 non-null  int64  
 1    funded_amnt                                 20000 non-null  int64  
 2    funded_amnt_inv                             20000 non-null  float64
 3    term                                        20000 non-null  object 
 4    int_rate                                    20000 non-null  float64
 5    installment                                 20000 non-null  float64
 6    grade                                       20000 non-null  object 
 7    sub_grade                                   20000 non-null  object 
 8    emp_title                                   20000 non-null  object 
 9    emp_length                                  18296 non-null  object 
 1