## 초기 세팅

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's default theme first; the matplotlib overrides below are
# set afterwards so they are not replaced by the theme's own settings.
# Reference: https://coldbrown.co.kr/2023/07/%ED%8C%8C%EC%9D%B4%EC%8D%AC-%EC%8B%A4%EC%A0%84%ED%8E%B8-08-seaborn-sns-set%EC%9D%84-%ED%86%B5%ED%95%B4-%EC%8A%A4%ED%83%80%EC%9D%BC-%EC%84%A4%EC%A0%95%ED%95%98%EA%B8%B0/
sns.set()

# Notebook-wide matplotlib defaults.
plt.rcParams.update({
    'font.family': 'Malgun Gothic',  # Korean-capable font (NOTE: assumes it is installed on this machine)
    'figure.figsize': (12, 6),       # default figure size in inches (width, height)
    'font.size': 14,
    'axes.unicode_minus': False,     # draw minus signs with the ASCII hyphen instead of U+2212
})

# 복잡한 통계 처리를 위한 라이브러리
from scipy import stats

In [14]:
## 데이터프레임을 넣고 column별 특성 및 결측값, 고유값들을 확인하는 함수를 작성해본다.
## 필수는 아니지만 전체적인 흐름을 파악하기 쉬워진다.

def resumetable(df):
    """Print the shape of *df* and return a per-column summary table.

    The returned DataFrame has one row per column of *df* and these columns:

    - '피처': column name
    - '데이터 타입': column dtype
    - '결측값 개수': number of null values
    - '고유값 개수': number of distinct non-null values
    - '첫 번째 값', '두 번째 값', '세 번째 값': values of the first three rows

    The sample rows are taken by position, so the function works for any
    index (filtered frames, string or date indexes, ...). When *df* has
    fewer than three rows, the sample columns without a matching row are
    filled with None instead of raising.

    Args:
        df: The pandas DataFrame to summarise.

    Returns:
        A new pandas DataFrame holding the summary; *df* is not modified.
    """
    print(f'데이터셋 크기: {df.shape}')

    # Build the table from explicit arrays rather than reset_index()/rename(),
    # so the feature column is always called '피처' even when df.columns
    # carries its own name.
    summary = pd.DataFrame({
        '피처': list(df.columns),
        '데이터 타입': df.dtypes.values,
        '결측값 개수': df.isnull().sum().values,
        '고유값 개수': df.nunique().values,
    })

    # Positional lookup (iloc), not label lookup (loc): the labels 0, 1, 2
    # only exist on a default RangeIndex.
    row_count = len(df)
    for position, label in enumerate(('첫 번째 값', '두 번째 값', '세 번째 값')):
        if position < row_count:
            summary[label] = df.iloc[position].values
        else:
            summary[label] = None

    return summary

In [None]:
# Original dataset (already one-hot encoded), stored as CSV.
tel = pd.read_csv('data/tel_data_onehotenco.csv')

# Additional Telco churn workbooks, read in the order listed and bound to
# one DataFrame each.
new_churn, demographics, location, population, services, status = map(
    pd.read_excel,
    (
        'data/Telco_customer_churn.xlsx',
        'data/Telco_customer_churn_demographics.xlsx',
        'data/Telco_customer_churn_location.xlsx',
        'data/Telco_customer_churn_population.xlsx',
        'data/Telco_customer_churn_services.xlsx',
        'data/Telco_customer_churn_status.xlsx',
    ),
)

---

# 데이터셋 확인

### 원래 데이터셋

In [7]:
tel

Unnamed: 0.1,Unnamed: 0,고객ID,성별,고령자여부,배우자여부,부양가족여부,가입개월수,전화서비스가입여부,복수회선여부,인터넷서비스유형,...,TV스트리밍이용여부,영화스트리밍이용여부,계약기간유형,전자청구서이용여부,결제방법,월요금,총요금,이탈여부,인터넷서비스가입여부,개월수*월요금
0,0,7590-VHVEG,False,False,True,False,1,False,False,DSL,...,False,False,Month-to-month,True,Electronic check,29.85,29.85,False,True,29.85
1,1,5575-GNVDE,True,False,False,False,34,True,False,DSL,...,False,False,One year,False,Mailed check,56.95,1889.50,False,True,1936.30
2,2,3668-QPYBK,True,False,False,False,2,True,False,DSL,...,False,False,Month-to-month,True,Mailed check,53.85,108.15,True,True,107.70
3,3,7795-CFOCW,True,False,False,False,45,False,False,DSL,...,False,False,One year,False,Bank transfer (automatic),42.30,1840.75,False,True,1903.50
4,4,9237-HQITU,False,False,False,False,2,True,False,Fiber optic,...,False,False,Month-to-month,True,Electronic check,70.70,151.65,True,True,141.40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,7038,6840-RESVB,True,False,True,True,24,True,True,DSL,...,True,True,One year,True,Mailed check,84.80,1990.50,False,True,2035.20
7039,7039,2234-XADUH,False,False,True,True,72,True,True,Fiber optic,...,True,True,One year,True,Credit card (automatic),103.20,7362.90,False,True,7430.40
7040,7040,4801-JZAZL,False,False,True,True,11,False,False,DSL,...,False,False,Month-to-month,True,Electronic check,29.60,346.45,False,True,325.60
7041,7041,8361-LTMKD,True,True,True,False,4,True,True,Fiber optic,...,False,False,Month-to-month,True,Mailed check,74.40,306.60,True,True,297.60


---

### 이탈 여부(Churn) 정보가 포함된 추가 데이터셋

In [5]:
new_churn

Unnamed: 0,CustomerID,Count,Country,State,City,Zip Code,Lat Long,Latitude,Longitude,Gender,...,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges,Churn Label,Churn Value,Churn Score,CLTV,Churn Reason
0,3668-QPYBK,1,United States,California,Los Angeles,90003,"33.964131, -118.272783",33.964131,-118.272783,Male,...,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,1,86,3239,Competitor made better offer
1,9237-HQITU,1,United States,California,Los Angeles,90005,"34.059281, -118.30742",34.059281,-118.307420,Female,...,Month-to-month,Yes,Electronic check,70.70,151.65,Yes,1,67,2701,Moved
2,9305-CDSKC,1,United States,California,Los Angeles,90006,"34.048013, -118.293953",34.048013,-118.293953,Female,...,Month-to-month,Yes,Electronic check,99.65,820.5,Yes,1,86,5372,Moved
3,7892-POOKP,1,United States,California,Los Angeles,90010,"34.062125, -118.315709",34.062125,-118.315709,Female,...,Month-to-month,Yes,Electronic check,104.80,3046.05,Yes,1,84,5003,Moved
4,0280-XJGEX,1,United States,California,Los Angeles,90015,"34.039224, -118.266293",34.039224,-118.266293,Male,...,Month-to-month,Yes,Bank transfer (automatic),103.70,5036.3,Yes,1,89,5340,Competitor had better devices
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,2569-WGERO,1,United States,California,Landers,92285,"34.341737, -116.539416",34.341737,-116.539416,Female,...,Two year,Yes,Bank transfer (automatic),21.15,1419.4,No,0,45,5306,
7039,6840-RESVB,1,United States,California,Adelanto,92301,"34.667815, -117.536183",34.667815,-117.536183,Male,...,One year,Yes,Mailed check,84.80,1990.5,No,0,59,2140,
7040,2234-XADUH,1,United States,California,Amboy,92304,"34.559882, -115.637164",34.559882,-115.637164,Female,...,One year,Yes,Credit card (automatic),103.20,7362.9,No,0,71,5560,
7041,4801-JZAZL,1,United States,California,Angelus Oaks,92305,"34.1678, -116.86433",34.167800,-116.864330,Female,...,Month-to-month,Yes,Electronic check,29.60,346.45,No,0,59,2793,


In [16]:
resumetable(new_churn)

데이터셋 크기: (7043, 33)


Unnamed: 0,피처,데이터 타입,결측값 개수,고유값 개수,첫 번째 값,두 번째 값,세 번째 값
0,CustomerID,object,0,7043,3668-QPYBK,9237-HQITU,9305-CDSKC
1,Count,int64,0,1,1,1,1
2,Country,object,0,1,United States,United States,United States
3,State,object,0,1,California,California,California
4,City,object,0,1129,Los Angeles,Los Angeles,Los Angeles
5,Zip Code,int64,0,1652,90003,90005,90006
6,Lat Long,object,0,1652,"33.964131, -118.272783","34.059281, -118.30742","34.048013, -118.293953"
7,Latitude,float64,0,1652,33.964131,34.059281,34.048013
8,Longitude,float64,0,1651,-118.272783,-118.30742,-118.293953
9,Gender,object,0,2,Male,Female,Female


---

In [9]:
demographics

Unnamed: 0,Customer ID,Count,Gender,Age,Under 30,Senior Citizen,Married,Dependents,Number of Dependents
0,8779-QRDMV,1,Male,78,No,Yes,No,No,0
1,7495-OOKFY,1,Female,74,No,Yes,Yes,Yes,1
2,1658-BYGOY,1,Male,71,No,Yes,No,Yes,3
3,4598-XLKNJ,1,Female,78,No,Yes,Yes,Yes,1
4,4846-WHAFZ,1,Female,80,No,Yes,Yes,Yes,1
...,...,...,...,...,...,...,...,...,...
7038,2569-WGERO,1,Female,30,No,No,No,No,0
7039,6840-RESVB,1,Male,38,No,No,Yes,Yes,2
7040,2234-XADUH,1,Female,30,No,No,Yes,Yes,2
7041,4801-JZAZL,1,Female,32,No,No,Yes,Yes,2


In [17]:
resumetable(demographics)

데이터셋 크기: (7043, 9)


Unnamed: 0,피처,데이터 타입,결측값 개수,고유값 개수,첫 번째 값,두 번째 값,세 번째 값
0,Customer ID,object,0,7043,8779-QRDMV,7495-OOKFY,1658-BYGOY
1,Count,int64,0,1,1,1,1
2,Gender,object,0,2,Male,Female,Male
3,Age,int64,0,62,78,74,71
4,Under 30,object,0,2,No,No,No
5,Senior Citizen,object,0,2,Yes,Yes,Yes
6,Married,object,0,2,No,Yes,No
7,Dependents,object,0,2,No,Yes,Yes
8,Number of Dependents,int64,0,10,0,1,3


---

In [10]:
location

Unnamed: 0,Customer ID,Count,Country,State,City,Zip Code,Lat Long,Latitude,Longitude
0,8779-QRDMV,1,United States,California,Los Angeles,90022,"34.02381, -118.156582",34.023810,-118.156582
1,7495-OOKFY,1,United States,California,Los Angeles,90063,"34.044271, -118.185237",34.044271,-118.185237
2,1658-BYGOY,1,United States,California,Los Angeles,90065,"34.108833, -118.229715",34.108833,-118.229715
3,4598-XLKNJ,1,United States,California,Inglewood,90303,"33.936291, -118.332639",33.936291,-118.332639
4,4846-WHAFZ,1,United States,California,Whittier,90602,"33.972119, -118.020188",33.972119,-118.020188
...,...,...,...,...,...,...,...,...,...
7038,2569-WGERO,1,United States,California,Landers,92285,"34.341737, -116.539416",34.341737,-116.539416
7039,6840-RESVB,1,United States,California,Adelanto,92301,"34.667815, -117.536183",34.667815,-117.536183
7040,2234-XADUH,1,United States,California,Amboy,92304,"34.559882, -115.637164",34.559882,-115.637164
7041,4801-JZAZL,1,United States,California,Angelus Oaks,92305,"34.1678, -116.86433",34.167800,-116.864330


In [22]:
resumetable(location)

데이터셋 크기: (7043, 9)


Unnamed: 0,피처,데이터 타입,결측값 개수,고유값 개수,첫 번째 값,두 번째 값,세 번째 값
0,Customer ID,object,0,7043,8779-QRDMV,7495-OOKFY,1658-BYGOY
1,Count,int64,0,1,1,1,1
2,Country,object,0,1,United States,United States,United States
3,State,object,0,1,California,California,California
4,City,object,0,1106,Los Angeles,Los Angeles,Los Angeles
5,Zip Code,int64,0,1626,90022,90063,90065
6,Lat Long,object,0,1679,"34.02381, -118.156582","34.044271, -118.185237","34.108833, -118.229715"
7,Latitude,float64,0,1626,34.02381,34.044271,34.108833
8,Longitude,float64,0,1625,-118.156582,-118.185237,-118.229715


---

In [11]:
population

Unnamed: 0,ID,Zip Code,Population
0,1,90001,54492
1,2,90002,44586
2,3,90003,58198
3,4,90004,67852
4,5,90005,43019
...,...,...,...
1666,1667,96145,4002
1667,1668,96146,942
1668,1669,96148,678
1669,1670,96150,33038


In [18]:
resumetable(population)

데이터셋 크기: (1671, 3)


Unnamed: 0,피처,데이터 타입,결측값 개수,고유값 개수,첫 번째 값,두 번째 값,세 번째 값
0,ID,int64,0,1671,1,2,3
1,Zip Code,int64,0,1671,90001,90002,90003
2,Population,int64,0,1607,54492,44586,58198


---

In [12]:
services

Unnamed: 0,Customer ID,Count,Quarter,Referred a Friend,Number of Referrals,Tenure in Months,Offer,Phone Service,Avg Monthly Long Distance Charges,Multiple Lines,...,Unlimited Data,Contract,Paperless Billing,Payment Method,Monthly Charge,Total Charges,Total Refunds,Total Extra Data Charges,Total Long Distance Charges,Total Revenue
0,8779-QRDMV,1,Q3,No,0,1,,No,0.00,No,...,No,Month-to-Month,Yes,Bank Withdrawal,39.65,39.65,0.00,20,0.00,59.65
1,7495-OOKFY,1,Q3,Yes,1,8,Offer E,Yes,48.85,Yes,...,Yes,Month-to-Month,Yes,Credit Card,80.65,633.30,0.00,0,390.80,1024.10
2,1658-BYGOY,1,Q3,No,0,18,Offer D,Yes,11.33,Yes,...,Yes,Month-to-Month,Yes,Bank Withdrawal,95.45,1752.55,45.61,0,203.94,1910.88
3,4598-XLKNJ,1,Q3,Yes,1,25,Offer C,Yes,19.76,No,...,Yes,Month-to-Month,Yes,Bank Withdrawal,98.50,2514.50,13.43,0,494.00,2995.07
4,4846-WHAFZ,1,Q3,Yes,1,37,Offer C,Yes,6.33,Yes,...,Yes,Month-to-Month,Yes,Bank Withdrawal,76.50,2868.15,0.00,0,234.21,3102.36
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,2569-WGERO,1,Q3,No,0,72,,Yes,22.77,No,...,No,Two Year,Yes,Bank Withdrawal,21.15,1419.40,19.31,0,1639.44,3039.53
7039,6840-RESVB,1,Q3,Yes,1,24,Offer C,Yes,36.05,Yes,...,Yes,One Year,Yes,Mailed Check,84.80,1990.50,48.23,0,865.20,2807.47
7040,2234-XADUH,1,Q3,Yes,4,72,,Yes,29.66,Yes,...,Yes,One Year,Yes,Credit Card,103.20,7362.90,45.38,0,2135.52,9453.04
7041,4801-JZAZL,1,Q3,Yes,1,11,,No,0.00,No,...,Yes,Month-to-Month,Yes,Bank Withdrawal,29.60,346.45,27.24,0,0.00,319.21


In [19]:
resumetable(services)

데이터셋 크기: (7043, 30)


Unnamed: 0,피처,데이터 타입,결측값 개수,고유값 개수,첫 번째 값,두 번째 값,세 번째 값
0,Customer ID,object,0,7043,8779-QRDMV,7495-OOKFY,1658-BYGOY
1,Count,int64,0,1,1,1,1
2,Quarter,object,0,1,Q3,Q3,Q3
3,Referred a Friend,object,0,2,No,Yes,No
4,Number of Referrals,int64,0,12,0,1,0
5,Tenure in Months,int64,0,72,1,8,18
6,Offer,object,3877,5,,Offer E,Offer D
7,Phone Service,object,0,2,No,Yes,Yes
8,Avg Monthly Long Distance Charges,float64,0,3584,0.0,48.85,11.33
9,Multiple Lines,object,0,2,No,Yes,Yes


---

In [13]:
status

Unnamed: 0,Customer ID,Count,Quarter,Satisfaction Score,Customer Status,Churn Label,Churn Value,Churn Score,CLTV,Churn Category,Churn Reason
0,8779-QRDMV,1,Q3,3,Churned,Yes,1,91,5433,Competitor,Competitor offered more data
1,7495-OOKFY,1,Q3,3,Churned,Yes,1,69,5302,Competitor,Competitor made better offer
2,1658-BYGOY,1,Q3,2,Churned,Yes,1,81,3179,Competitor,Competitor made better offer
3,4598-XLKNJ,1,Q3,2,Churned,Yes,1,88,5337,Dissatisfaction,Limited range of services
4,4846-WHAFZ,1,Q3,2,Churned,Yes,1,67,2793,Price,Extra data charges
...,...,...,...,...,...,...,...,...,...,...,...
7038,2569-WGERO,1,Q3,5,Stayed,No,0,45,5306,,
7039,6840-RESVB,1,Q3,3,Stayed,No,0,59,2140,,
7040,2234-XADUH,1,Q3,4,Stayed,No,0,71,5560,,
7041,4801-JZAZL,1,Q3,4,Stayed,No,0,59,2793,,


In [20]:
resumetable(status)

데이터셋 크기: (7043, 11)


Unnamed: 0,피처,데이터 타입,결측값 개수,고유값 개수,첫 번째 값,두 번째 값,세 번째 값
0,Customer ID,object,0,7043,8779-QRDMV,7495-OOKFY,1658-BYGOY
1,Count,int64,0,1,1,1,1
2,Quarter,object,0,1,Q3,Q3,Q3
3,Satisfaction Score,int64,0,5,3,3,2
4,Customer Status,object,0,3,Churned,Churned,Churned
5,Churn Label,object,0,2,Yes,Yes,Yes
6,Churn Value,int64,0,2,1,1,1
7,Churn Score,int64,0,81,91,69,81
8,CLTV,int64,0,3438,5433,5302,3179
9,Churn Category,object,5174,5,Competitor,Competitor,Competitor


In [26]:
resumetable(new_churn)

데이터셋 크기: (7043, 33)


Unnamed: 0,피처,데이터 타입,결측값 개수,고유값 개수,첫 번째 값,두 번째 값,세 번째 값
0,CustomerID,object,0,7043,3668-QPYBK,9237-HQITU,9305-CDSKC
1,Count,int64,0,1,1,1,1
2,Country,object,0,1,United States,United States,United States
3,State,object,0,1,California,California,California
4,City,object,0,1129,Los Angeles,Los Angeles,Los Angeles
5,Zip Code,int64,0,1652,90003,90005,90006
6,Lat Long,object,0,1652,"33.964131, -118.272783","34.059281, -118.30742","34.048013, -118.293953"
7,Latitude,float64,0,1652,33.964131,34.059281,34.048013
8,Longitude,float64,0,1651,-118.272783,-118.30742,-118.293953
9,Gender,object,0,2,Male,Female,Female
