<a href="https://colab.research.google.com/github/DoItSon/playdata/blob/main/%EB%A8%B8%EC%8B%A0%EB%9F%AC%EB%8B%9D/10_data_leakage.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data leakage
- 데이터 유출
- 예측 시점에는 사용할 수 없는 정보가 학습 데이터에 포함되어 사용되는 것 (모델 성능이 실제보다 과대평가된다.)

In [None]:
# Directory that holds the churn CSV files. File names are appended directly
# to this string, so the trailing slash must be kept.
DATA_PATH = "/content/drive/MyDrive/data/"
# Random seed constant. It is not referenced in the cells shown here —
# presumably intended for later modelling steps (confirm).
SEED = 42

In [None]:
import pandas as pd
import numpy as np

# 변수 설명
- customerID : 고객ID
- gender : 성별(1:여성, 0:남성)
- SeniorCitizen : 고령자여부(1:Yes, 0:No)
- Partner : 기혼여부(1:Yes, 0: No)
- Dependents : 부양가족여부(1:Yes, 0:No)
- tenure : 가입기간(월단위)
- PhoneService : 집전화이용여부(1:Yes, 0:No)
- MultipleLines : 다중회선여부(1:Yes, 0:No)
- InternetService : 인터넷이용방식
- OnlineSecurity : 인터넷보안서비스사용여부(1:Yes, 0:No)
- OnlineBackup : 인터넷백업서비스사용여부(1:Yes, 0:No)
- DeviceProtection : 기기방화벽서비스사용여부(1:Yes, 0:No)
- TechSupport : 인터넷기술지원서비스사용여부(1:Yes, 0:No)
- StreamingTV : 스트리밍TV여부(1:Yes, 0:No)
- StreamingMovies : 스트리밍영화여부(1:Yes, 0:No)
- Contract : 약정기간(연단위, 0은 약정없음)
- PaperlessBilling : 전자청구서(종이 없는 청구서) 사용여부(1:Yes, 0:No)
- PaymentMethod : 지불방법
- MonthlyCharges : 한달요금
- TotalCharges : 총지불요금
- Churn : 이탈여부(1:Yes, 0:No)

In [None]:
# Load the training and test churn tables from the shared data directory.
train_churn = pd.read_csv(DATA_PATH + "train_churn.csv")
test_churn = pd.read_csv(DATA_PATH + "test_churn.csv")

# Show (rows, columns) of both tables as the cell output.
(train_churn.shape, test_churn.shape)

In [None]:
# Total number of missing cells in each table, shown as (train, test).
(train_churn.isna().sum().sum(), test_churn.isna().sum().sum())

(0, 0)

In [None]:
# Print column names, non-null counts and dtypes of the training table.
train_churn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5282 entries, 0 to 5281
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        5282 non-null   object 
 1   gender            5282 non-null   int64  
 2   SeniorCitizen     5282 non-null   int64  
 3   Partner           5282 non-null   int64  
 4   Dependents        5282 non-null   int64  
 5   tenure            5282 non-null   int64  
 6   PhoneService      5282 non-null   int64  
 7   MultipleLines     5282 non-null   int64  
 8   InternetService   5282 non-null   object 
 9   OnlineSecurity    5282 non-null   int64  
 10  OnlineBackup      5282 non-null   int64  
 11  DeviceProtection  5282 non-null   int64  
 12  TechSupport       5282 non-null   int64  
 13  StreamingTV       5282 non-null   int64  
 14  StreamingMovies   5282 non-null   int64  
 15  Contract          5282 non-null   int64  
 16  PaperlessBilling  5282 non-null   int64  


In [None]:
# Keep only the numeric columns as the initial feature set.
# The target column is removed by NAME instead of by position: slicing off
# "the last column" silently drops a real feature (and keeps the label) as
# soon as the column order of the CSV changes.
train = train_churn.select_dtypes("number").drop(columns="Churn")
# The test table normally has no target column; errors="ignore" makes this a
# no-op in that case while still keeping train/test columns consistent if it does.
test = test_churn.select_dtypes("number").drop(columns="Churn", errors="ignore")

# Show (rows, columns) of both feature tables as the cell output.
train.shape, test.shape

((5282, 17), (1761, 17))

- 결측치 확인

In [None]:
# Re-check before encoding: total missing cells, shown as (train, test).
(train_churn.isna().sum().sum(), test_churn.isna().sum().sum())

(0, 0)

- 범주형 인코딩 하기 전에 범주의 고유값 개수 확인하기

In [None]:
# Number of distinct values in every object-typed (categorical) column.
train_churn.select_dtypes(include="object").nunique()

customerID         5282
InternetService       3
PaymentMethod         4
dtype: int64

- 학습데이터 먼저 원핫인코딩

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Categorical columns to expand into indicator columns.
cols = ["InternetService", "PaymentMethod"]

# handle_unknown="ignore": a category that was not seen during fitting is
# encoded as all zeros at transform time instead of raising an error.
# The encoder learns its categories from the TRAINING data only.
enc = OneHotEncoder(handle_unknown="ignore").fit(train_churn[cols])

# Densify the sparse result and label the columns with the generated names.
tmp = pd.DataFrame(
    data=enc.transform(train_churn[cols]).toarray(),
    columns=enc.get_feature_names_out(),
)

# Append the indicator columns to the numeric training features.
train = pd.concat([train, tmp], axis="columns")
train.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,...,PaperlessBilling,MonthlyCharges,TotalCharges,InternetService_DSL,InternetService_Fiber optic,InternetService_No,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,0,0,1,1,0,0,0,0,0,...,0,25.3,25.3,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1,0,0,0,7,1,0,0,0,1,...,0,75.15,525.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,1,0,0,1,4,1,0,0,0,0,...,0,20.05,85.5,0.0,0.0,1.0,1.0,0.0,0.0,0.0
3,1,0,0,0,29,1,1,0,0,0,...,0,76.0,2215.25,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,1,0,0,0,3,1,1,0,0,0,...,0,75.1,270.7,0.0,1.0,0.0,0.0,1.0,0.0,0.0


- 테스트 데이터에 대해서는 transform 만 해야한다. 
- test 데이터는 절대 fit_transform 금지!
- fit_transform을 하면 데이터 유출이 발생한다.

In [None]:
# Encode the test set with the encoder that was already fitted on the
# training data: transform only, never fit, so nothing is learned from test.
tmp = pd.DataFrame(
    data=enc.transform(test_churn[cols]).toarray(),
    columns=enc.get_feature_names_out(),
)

# Append the indicator columns to the numeric test features.
test = pd.concat([test, tmp], axis="columns")
test.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,...,PaperlessBilling,MonthlyCharges,TotalCharges,InternetService_DSL,InternetService_Fiber optic,InternetService_No,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,1,0,1,0,1,0,0,0,0,0,...,0,24.8,24.8,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0,0,0,0,41,1,1,0,0,0,...,0,25.25,996.45,0.0,0.0,1.0,1.0,0.0,0.0,0.0
2,1,0,1,1,52,1,0,0,0,0,...,1,19.35,1031.7,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,1,0,0,0,1,1,0,0,0,1,...,1,76.35,76.35,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,0,0,0,0,67,1,0,0,0,0,...,1,50.55,3260.1,1.0,0.0,0.0,1.0,0.0,0.0,0.0


# 추가 피쳐 만들어 보기
- 가입기간별 여성의 비율을 피쳐로 추가하기

In [None]:
# Mean of `gender` per tenure value, computed on the training data and
# stored as a two-column lookup table (tenure, gender_rate).
train_tmp = (
    train_churn.groupby("tenure")["gender"]
    .mean()
    .rename("gender_rate")
    .reset_index()
)
train_tmp.head()

Unnamed: 0,tenure,gender_rate
0,0,0.375
1,1,0.451327
2,2,0.53012
3,3,0.48366
4,4,0.518519


In [None]:
# Attach the per-tenure gender_rate to every training row (left join on tenure).
train = pd.merge(train, train_tmp, how="left", on="tenure")

## 테스트 데이터에 대해서 학습데이터에 추가한 동일한 피쳐 추가 시 데이터 유출 사례
- 실제 서비스에서는 샘플 하나만 예측해야 할 때도 있고, 여러 개의 샘플을 한꺼번에 예측해야 할 때도 있다.
- 아래처럼 테스트 데이터 자체의 통계로 피쳐를 만들면, 샘플 하나만 예측해야 하는 경우에는 그룹 통계를 계산할 수 없어 피쳐를 만들 수 없다.


In [None]:
# ANTI-PATTERN, shown for illustration only: the per-tenure statistic is
# computed from the test data itself, which is the leakage this section warns about.
test_tmp = (
    test_churn.groupby("tenure")["gender"]
    .mean()
    .rename("gender_rate")
    .reset_index()
)
# The merged frame is only displayed; `test` itself is not modified here.
pd.merge(test, test_tmp, how="left", on="tenure")

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,...,MonthlyCharges,TotalCharges,InternetService_DSL,InternetService_Fiber optic,InternetService_No,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,gender_rate
0,1,0,1,0,1,0,0,0,0,0,...,24.80,24.80,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.496894
1,0,0,0,0,41,1,1,0,0,0,...,25.25,996.45,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.562500
2,1,0,1,1,52,1,0,0,0,0,...,19.35,1031.70,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.413793
3,1,0,0,0,1,1,0,0,0,1,...,76.35,76.35,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.496894
4,0,0,0,0,67,1,0,0,0,0,...,50.55,3260.10,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.454545
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1756,1,0,0,0,3,1,1,0,0,0,...,75.80,246.30,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.574468
1757,1,0,1,0,8,1,1,0,0,0,...,90.25,743.75,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.541667
1758,0,0,0,0,29,1,1,0,1,1,...,70.90,1964.60,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.388889
1759,1,0,0,0,2,0,0,0,0,0,...,34.70,62.25,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.583333


- 위와 같은 피쳐의 경우, 다음과 같이 학습 데이터에서 추출한 통계치를 테스트 데이터에 적용해야 데이터 유출을 방지할 수 있다.

In [None]:
# Never derive features or statistics from the data being predicted;
# merge the per-tenure statistics computed on the TRAINING set instead.
test = test.merge(train_tmp, how="left", on="tenure")
# A tenure value that never occurs in the training set has no row in
# train_tmp, so the left merge leaves NaN there. Fill those gaps with a
# statistic that also comes from the training set: its overall gender mean.
test["gender_rate"] = test["gender_rate"].fillna(train_churn["gender"].mean())