In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Plot styling is configured once through the sns.set_theme(...) call in this
# cell, which also forwards rcParams overrides to matplotlib, so no separate
# plt.rcParams assignments are needed.


# Apply one seaborn theme for the whole notebook; the entries in `rc` are
# passed along as matplotlib rcParams overrides.
# NOTE(review): the 'BM Hanna Pro' font presumably has to be installed on the
# machine running the notebook — confirm before sharing.
sns.set_theme(
    style='whitegrid',       # background style (grid included)
    font='BM Hanna Pro',     # font
    palette='pastel',        # color theme
    rc={
        'axes.unicode_minus': False,  # prevent the minus sign from rendering as a broken glyph
        'figure.figsize': (12, 6),    # figure size
        # 'axes.labelsize': 14,         # axis label size
        # 'xtick.labelsize': 12,        # x-axis tick size
        # 'ytick.labelsize': 12,        # y-axis tick size
        # 'legend.fontsize': 12,        # legend font size
        'axes.titleweight': 'bold',   # bold title
        'axes.titlesize': 16          # title size
    }
)

# IPython line magic: set the inline backend's figure format to 'retina'
# (higher-resolution inline figures).
%config InlineBackend.figure_format = 'retina'

In [2]:
# Read the loan data; column 0 of the CSV becomes the row index.
df = pd.read_csv(
    "data/loan.csv",
    index_col=0,
)
# Bare last expression so the notebook renders the frame as a table.
df

Unnamed: 0_level_0,married_or_not,self_employed_or_not,applicant_income,loan_amount,property_area,loan_status
loan_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
LP1003,Y,N,4583,128.4,Rural,N
LP1005,Y,Y,3000,66.0,City,Y
LP1006,Y,N,2583,120.5,City,Y
LP1008,N,N,6000,141.4,City,Y
LP1011,Y,Y,5417,267.4,City,Y
...,...,...,...,...,...,...
LP2974,Y,N,3232,108.6,Rural,Y
LP2979,Y,N,4106,40.7,Rural,Y
LP2983,Y,N,8072,253.4,City,Y
LP2984,Y,N,7583,187.6,City,Y


In [3]:
# Give the two numeric columns shorter names; `rename` returns a new frame,
# which is bound back to `df`.
df = df.rename(
    columns={'applicant_income': 'income', 'loan_amount': 'amount'},
)

In [4]:
# Display the frame to check the column names after the rename.
df

Unnamed: 0_level_0,married_or_not,self_employed_or_not,income,amount,property_area,loan_status
loan_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
LP1003,Y,N,4583,128.4,Rural,N
LP1005,Y,Y,3000,66.0,City,Y
LP1006,Y,N,2583,120.5,City,Y
LP1008,N,N,6000,141.4,City,Y
LP1011,Y,Y,5417,267.4,City,Y
...,...,...,...,...,...,...
LP2974,Y,N,3232,108.6,Rural,Y
LP2979,Y,N,4106,40.7,Rural,Y
LP2983,Y,N,8072,253.4,City,Y
LP2984,Y,N,7583,187.6,City,Y


In [5]:
# Filter with a query expression: keep rows whose `income` is strictly greater than 5000.
df.query('income > 5000')

Unnamed: 0_level_0,married_or_not,self_employed_or_not,income,amount,property_area,loan_status
loan_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
LP1008,N,N,6000,141.4,City,Y
LP1011,Y,Y,5417,267.4,City,Y
LP1020,Y,N,12841,349.3,Suburb,N
LP1043,Y,N,7660,104.1,City,N
LP1046,Y,N,5955,315.5,City,Y
...,...,...,...,...,...,...
LP2948,Y,N,5780,192.9,City,Y
LP2953,Y,N,5703,128.5,City,Y
LP2959,Y,N,12000,497.0,Suburb,Y
LP2983,Y,N,8072,253.4,City,Y


In [6]:
# Mean of the `income` column, stored in a regular Python variable.
income_mean = df['income'].mean()
# Inside a query string, the `@` prefix refers to a Python variable from the
# surrounding scope, so this keeps rows with above-average income.
df.query('income > @income_mean')

Unnamed: 0_level_0,married_or_not,self_employed_or_not,income,amount,property_area,loan_status
loan_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
LP1008,N,N,6000,141.4,City,Y
LP1011,Y,Y,5417,267.4,City,Y
LP1020,Y,N,12841,349.3,Suburb,N
LP1043,Y,N,7660,104.1,City,N
LP1046,Y,N,5955,315.5,City,Y
...,...,...,...,...,...,...
LP2948,Y,N,5780,192.9,City,Y
LP2953,Y,N,5703,128.5,City,Y
LP2959,Y,N,12000,497.0,Suburb,Y
LP2983,Y,N,8072,253.4,City,Y


In [7]:
# String values need their own quotes inside the query expression:
# keep rows whose `property_area` equals 'City'.
df.query("property_area == 'City'")

Unnamed: 0_level_0,married_or_not,self_employed_or_not,income,amount,property_area,loan_status
loan_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
LP1005,Y,Y,3000,66.0,City,Y
LP1006,Y,N,2583,120.5,City,Y
LP1008,N,N,6000,141.4,City,Y
LP1011,Y,Y,5417,267.4,City,Y
LP1013,Y,N,2333,95.3,City,Y
...,...,...,...,...,...,...
LP2949,N,,416,350.6,City,N
LP2953,Y,N,5703,128.5,City,Y
LP2960,Y,N,2400,,City,N
LP2983,Y,N,8072,253.4,City,Y


In [8]:
# Membership test inside a query: keep rows whose `property_area` is
# either 'City' or 'Suburb'.
df.query("property_area in ['City', 'Suburb']")

Unnamed: 0_level_0,married_or_not,self_employed_or_not,income,amount,property_area,loan_status
loan_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
LP1005,Y,Y,3000,66.0,City,Y
LP1006,Y,N,2583,120.5,City,Y
LP1008,N,N,6000,141.4,City,Y
LP1011,Y,Y,5417,267.4,City,Y
LP1013,Y,N,2333,95.3,City,Y
...,...,...,...,...,...,...
LP2960,Y,N,2400,,City,N
LP2961,Y,N,3400,174.0,Suburb,Y
LP2983,Y,N,8072,253.4,City,Y
LP2984,Y,N,7583,187.6,City,Y


In [9]:
# Negated membership test: keep rows whose `property_area` is neither
# 'City' nor 'Suburb'.
df.query("property_area not in ['City', 'Suburb']")

Unnamed: 0_level_0,married_or_not,self_employed_or_not,income,amount,property_area,loan_status
loan_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
LP1003,Y,N,4583,128.4,Rural,N
LP1029,N,N,1853,114.1,Rural,N
LP1038,Y,N,4887,133.5,Rural,N
LP1050,Y,N,3365,112.4,Rural,N
LP1097,N,Y,4692,106.2,Rural,N
...,...,...,...,...,...,...
LP2950,Y,,2894,155.5,Rural,Y
LP2958,N,N,3676,172.3,Rural,Y
LP2964,Y,N,3987,157.3,Rural,Y
LP2974,Y,N,3232,108.6,Rural,Y


In [11]:
# Read the Netflix CSV with pandas' default row index.
data = pd.read_csv(
    "data/netflix.csv",
)
# Bare last expression so the notebook renders the frame as a table.
data

Unnamed: 0,title,genre,release,runtime,score,language
0,Dark Forces,Thriller,2020-08-21,81,2.6,Spanish
1,Leyla Everlasting,Comedy,2020-12-04,112,3.7,Turkish
2,Sardar Ka Grandson,Comedy,2021-05-18,139,4.1,Hindi
3,Searching for Sheela,Documentary,2021-04-22,58,4.1,English
4,The Call,Drama,2020-11-27,112,4.1,Korean
...,...,...,...,...,...,...
225,Seaspiracy,Documentary,2021-03-24,89,8.2,English
226,The Three Deaths of Marisela Escobedo,Documentary,2020-10-14,109,8.2,Spanish
227,Dancing with the Birds,Documentary,2019-10-23,51,8.3,English
228,Emicida: AmarElo - It's All For Yesterday,Documentary,2020-12-08,89,8.6,Portuguese


In [14]:
# Convert the `release` column to datetimes and store only the calendar year
# in a new `year` column on the same frame.
data['year'] = (
    pd.to_datetime(data['release'])
    .dt.year
)

In [15]:
# Display the frame to check the newly added `year` column.
data

Unnamed: 0,title,genre,release,runtime,score,language,year
0,Dark Forces,Thriller,2020-08-21,81,2.6,Spanish,2020
1,Leyla Everlasting,Comedy,2020-12-04,112,3.7,Turkish,2020
2,Sardar Ka Grandson,Comedy,2021-05-18,139,4.1,Hindi,2021
3,Searching for Sheela,Documentary,2021-04-22,58,4.1,English,2021
4,The Call,Drama,2020-11-27,112,4.1,Korean,2020
...,...,...,...,...,...,...,...
225,Seaspiracy,Documentary,2021-03-24,89,8.2,English,2021
226,The Three Deaths of Marisela Escobedo,Documentary,2020-10-14,109,8.2,Spanish,2020
227,Dancing with the Birds,Documentary,2019-10-23,51,8.3,English,2019
228,Emicida: AmarElo - It's All For Yesterday,Documentary,2020-12-08,89,8.6,Portuguese,2020


In [16]:
# Print a summary of the frame: index range, column names, non-null counts,
# dtypes and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 230 entries, 0 to 229
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   title     230 non-null    object 
 1   genre     230 non-null    object 
 2   release   230 non-null    object 
 3   runtime   230 non-null    int64  
 4   score     230 non-null    float64
 5   language  230 non-null    object 
 6   year      230 non-null    int32  
dtypes: float64(1), int32(1), int64(1), object(4)
memory usage: 11.8+ KB


In [17]:
# Average `score` for each (year, genre) combination; the result is a Series
# indexed by year and genre.
(
    data
    .groupby(['year', 'genre'])['score']
    .mean()
)

year  genre          
2019  Comedy             5.625000
      Documentary        6.920000
      Drama              6.335714
      Romantic comedy    5.871429
      Thriller           5.842857
2020  Comedy             5.571429
      Documentary        7.177778
      Drama              6.055556
      Romantic comedy    5.687500
      Thriller           5.666667
2021  Comedy             5.228571
      Documentary        6.378571
      Drama              6.533333
      Romantic comedy    5.600000
      Thriller           5.566667
Name: score, dtype: float64

In [19]:
# Per-year maximum of `runtime` and `score`, spread into one column per genre
# under each value name.
data.pivot_table(
    values=['score', 'runtime'],
    index='year',
    columns='genre',
    aggfunc='max',
)

Unnamed: 0_level_0,runtime,runtime,runtime,runtime,runtime,score,score,score,score,score
genre,Comedy,Documentary,Drama,Romantic comedy,Thriller,Comedy,Documentary,Drama,Romantic comedy,Thriller
year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
2019,104,144,151,102,130,6.9,8.3,7.9,6.8,6.6
2020,124,109,151,131,149,7.2,9.0,7.8,6.4,7.3
2021,139,112,142,109,142,6.3,8.2,7.2,6.6,6.5
