## 초기 세팅

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# 그래프 기본 테마 설정
# https://coldbrown.co.kr/2023/07/%ED%8C%8C%EC%9D%B4%EC%8D%AC-%EC%8B%A4%EC%A0%84%ED%8E%B8-08-seaborn-sns-set%EC%9D%84-%ED%86%B5%ED%95%B4-%EC%8A%A4%ED%83%80%EC%9D%BC-%EC%84%A4%EC%A0%95%ED%95%98%EA%B8%B0/
sns.set()

# 그래프 기본 설정
plt.rcParams['font.family'] = 'Malgun Gothic'
plt.rcParams['figure.figsize'] = 12, 6
plt.rcParams['font.size'] = 14
plt.rcParams['axes.unicode_minus'] = False

# 복잡한 통계 처리를 위한 라이브러리
from scipy import stats

In [2]:
## 데이터프레임을 넣고 column별 특성 및 결측값, 고유값들을 확인하는 함수를 작성해본다.
## 필수는 아니지만 전체적인 흐름을 파악하기 쉬워진다.

def resumetable(df):
  print(f'데이터셋 크기: {df.shape}')                                # 데이터프레임의 전체 크기(행, 열) 출력

  summary = pd.DataFrame(df.dtypes, columns=['데이터 타입'])         # 각 피처의 데이터 타입을 가져와 데이터프레임으로 생성
  summary = summary.reset_index()                                    # 인덱스를 초기화하여 컬럼으로 변환
  summary = summary.rename(columns={'index':'피처'})                 # 'index' 컬럼명을 '피처'로 변경

  summary['결측값 개수'] = df.isnull().sum().values                 # 각 피처의 결측값(null) 개수 계산
  summary['고유값 개수'] = df.nunique().values                      # 각 피처의 고유값 개수 계산

  summary['첫 번째 값'] = df.loc[0].values                          # 각 피처의 첫 번째 샘플 값
  summary['두 번째 값'] = df.loc[1].values                          # 각 피처의 두 번째 샘플 값
  summary['세 번째 값'] = df.loc[2].values                          # 각 피처의 세 번째 샘플 값

  return summary                                                     # 요약 테이블 반환

In [4]:
tel = pd.read_csv('data/tel_data_onehotenco.csv')
churn = pd.read_excel('data/Telco_customer_churn.xlsx')

---

## 데이터셋 확인

### 원래 데이터셋

In [5]:
tel

Unnamed: 0.1,Unnamed: 0,고객ID,성별,고령자여부,배우자여부,부양가족여부,가입개월수,전화서비스가입여부,복수회선여부,인터넷서비스유형,...,TV스트리밍이용여부,영화스트리밍이용여부,계약기간유형,전자청구서이용여부,결제방법,월요금,총요금,이탈여부,인터넷서비스가입여부,개월수*월요금
0,0,7590-VHVEG,False,False,True,False,1,False,False,DSL,...,False,False,Month-to-month,True,Electronic check,29.85,29.85,False,True,29.85
1,1,5575-GNVDE,True,False,False,False,34,True,False,DSL,...,False,False,One year,False,Mailed check,56.95,1889.50,False,True,1936.30
2,2,3668-QPYBK,True,False,False,False,2,True,False,DSL,...,False,False,Month-to-month,True,Mailed check,53.85,108.15,True,True,107.70
3,3,7795-CFOCW,True,False,False,False,45,False,False,DSL,...,False,False,One year,False,Bank transfer (automatic),42.30,1840.75,False,True,1903.50
4,4,9237-HQITU,False,False,False,False,2,True,False,Fiber optic,...,False,False,Month-to-month,True,Electronic check,70.70,151.65,True,True,141.40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,7038,6840-RESVB,True,False,True,True,24,True,True,DSL,...,True,True,One year,True,Mailed check,84.80,1990.50,False,True,2035.20
7039,7039,2234-XADUH,False,False,True,True,72,True,True,Fiber optic,...,True,True,One year,True,Credit card (automatic),103.20,7362.90,False,True,7430.40
7040,7040,4801-JZAZL,False,False,True,True,11,False,False,DSL,...,False,False,Month-to-month,True,Electronic check,29.60,346.45,False,True,325.60
7041,7041,8361-LTMKD,True,True,True,False,4,True,True,Fiber optic,...,False,False,Month-to-month,True,Mailed check,74.40,306.60,True,True,297.60


In [6]:
resumetable(tel)

데이터셋 크기: (7043, 24)


Unnamed: 0,피처,데이터 타입,결측값 개수,고유값 개수,첫 번째 값,두 번째 값,세 번째 값
0,Unnamed: 0,int64,0,7043,0,1,2
1,고객ID,object,0,7043,7590-VHVEG,5575-GNVDE,3668-QPYBK
2,성별,bool,0,2,False,True,True
3,고령자여부,bool,0,2,False,False,False
4,배우자여부,bool,0,2,True,False,False
5,부양가족여부,bool,0,2,False,False,False
6,가입개월수,int64,0,73,1,34,2
7,전화서비스가입여부,bool,0,2,False,True,True
8,복수회선여부,bool,0,2,False,False,False
9,인터넷서비스유형,object,0,3,DSL,DSL,DSL


### 추가된 데이터셋

In [7]:
churn

Unnamed: 0,CustomerID,Count,Country,State,City,Zip Code,Lat Long,Latitude,Longitude,Gender,...,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges,Churn Label,Churn Value,Churn Score,CLTV,Churn Reason
0,3668-QPYBK,1,United States,California,Los Angeles,90003,"33.964131, -118.272783",33.964131,-118.272783,Male,...,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,1,86,3239,Competitor made better offer
1,9237-HQITU,1,United States,California,Los Angeles,90005,"34.059281, -118.30742",34.059281,-118.307420,Female,...,Month-to-month,Yes,Electronic check,70.70,151.65,Yes,1,67,2701,Moved
2,9305-CDSKC,1,United States,California,Los Angeles,90006,"34.048013, -118.293953",34.048013,-118.293953,Female,...,Month-to-month,Yes,Electronic check,99.65,820.5,Yes,1,86,5372,Moved
3,7892-POOKP,1,United States,California,Los Angeles,90010,"34.062125, -118.315709",34.062125,-118.315709,Female,...,Month-to-month,Yes,Electronic check,104.80,3046.05,Yes,1,84,5003,Moved
4,0280-XJGEX,1,United States,California,Los Angeles,90015,"34.039224, -118.266293",34.039224,-118.266293,Male,...,Month-to-month,Yes,Bank transfer (automatic),103.70,5036.3,Yes,1,89,5340,Competitor had better devices
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,2569-WGERO,1,United States,California,Landers,92285,"34.341737, -116.539416",34.341737,-116.539416,Female,...,Two year,Yes,Bank transfer (automatic),21.15,1419.4,No,0,45,5306,
7039,6840-RESVB,1,United States,California,Adelanto,92301,"34.667815, -117.536183",34.667815,-117.536183,Male,...,One year,Yes,Mailed check,84.80,1990.5,No,0,59,2140,
7040,2234-XADUH,1,United States,California,Amboy,92304,"34.559882, -115.637164",34.559882,-115.637164,Female,...,One year,Yes,Credit card (automatic),103.20,7362.9,No,0,71,5560,
7041,4801-JZAZL,1,United States,California,Angelus Oaks,92305,"34.1678, -116.86433",34.167800,-116.864330,Female,...,Month-to-month,Yes,Electronic check,29.60,346.45,No,0,59,2793,


In [8]:
resumetable(churn)

데이터셋 크기: (7043, 33)


Unnamed: 0,피처,데이터 타입,결측값 개수,고유값 개수,첫 번째 값,두 번째 값,세 번째 값
0,CustomerID,object,0,7043,3668-QPYBK,9237-HQITU,9305-CDSKC
1,Count,int64,0,1,1,1,1
2,Country,object,0,1,United States,United States,United States
3,State,object,0,1,California,California,California
4,City,object,0,1129,Los Angeles,Los Angeles,Los Angeles
5,Zip Code,int64,0,1652,90003,90005,90006
6,Lat Long,object,0,1652,"33.964131, -118.272783","34.059281, -118.30742","34.048013, -118.293953"
7,Latitude,float64,0,1652,33.964131,34.059281,34.048013
8,Longitude,float64,0,1651,-118.272783,-118.30742,-118.293953
9,Gender,object,0,2,Male,Female,Female


---

## EDA
- 컬럼명 변경

In [13]:
new_columns_dict = {
    'CutomerID': '고객ID',
    'Count': '카운트',
    'Country': '나라',
    'State': '주',
    'City': '도시',
    'Zip Code': '우편번호',
    'Lat Long': '위경도',
    'Latitude': '위도',
    'Longitude': '경도',
    'Gender': '성별',
    'Senior Citizen': '고령자여부',
    'Partner': '배우자여부',
    'Dependents': '부양가족여부',
    'Tenure Months': '가입개월수2',
    'Phone Service': '전화서비스가입여부',
    'Multiple Lines': '복수회선여부',
    'Internet Service': '인터넷서비스유형',
    'Online Security': '온라인보안서비스여부',
    'Online Backup': '온라인백업서비스여부',
    'Device Protection': '기기보호서비스여부',
    'Tech Support': '기술지원서비스여부',
    'Streaming TV': 'TV스트리밍이용여부',
    'Streaming Movies': '영화스트리밍이용여부',
    'Contract': '계약기간유형',
    'Paperless Billing': '전자청구서이용여부',
    'Payment Method': '결제방법',
    'Monthly Charges': '월요금',
    'Total Charges': '총요금',
    'Churn Label': '이탈여부',
    'Churn Value': '이탈여부(bool)',
    'Churn Score': '이탈점수',
    'CLTV': '고객생애가치',
    'Churn Reason': '이탈이유'
}

churn = churn.rename(columns=new_columns_dict)
resumetable(churn)

데이터셋 크기: (7043, 33)


Unnamed: 0,피처,데이터 타입,결측값 개수,고유값 개수,첫 번째 값,두 번째 값,세 번째 값
0,CustomerID,object,0,7043,3668-QPYBK,9237-HQITU,9305-CDSKC
1,카운트,int64,0,1,1,1,1
2,나라,object,0,1,United States,United States,United States
3,주,object,0,1,California,California,California
4,도시,object,0,1129,Los Angeles,Los Angeles,Los Angeles
5,우편번호,int64,0,1652,90003,90005,90006
6,위경도,object,0,1652,"33.964131, -118.272783","34.059281, -118.30742","34.048013, -118.293953"
7,위도,float64,0,1652,33.964131,34.059281,34.048013
8,경도,float64,0,1651,-118.272783,-118.30742,-118.293953
9,성별,object,0,2,Male,Female,Female
