## 초기 세팅

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# 그래프 기본 테마 설정
# https://coldbrown.co.kr/2023/07/%ED%8C%8C%EC%9D%B4%EC%8D%AC-%EC%8B%A4%EC%A0%84%ED%8E%B8-08-seaborn-sns-set%EC%9D%84-%ED%86%B5%ED%95%B4-%EC%8A%A4%ED%83%80%EC%9D%BC-%EC%84%A4%EC%A0%95%ED%95%98%EA%B8%B0/
sns.set()

# 그래프 기본 설정
plt.rcParams['font.family'] = 'Malgun Gothic'
plt.rcParams['figure.figsize'] = 12, 6
plt.rcParams['font.size'] = 14
plt.rcParams['axes.unicode_minus'] = False

# 복잡한 통계 처리를 위한 라이브러리
from scipy import stats

In [2]:
## 데이터프레임을 넣고 column별 특성 및 결측값, 고유값들을 확인하는 함수를 작성해본다.
## 필수는 아니지만 전체적인 흐름을 파악하기 쉬워진다.

def resumetable(df):
  print(f'데이터셋 크기: {df.shape}')                                # 데이터프레임의 전체 크기(행, 열) 출력

  summary = pd.DataFrame(df.dtypes, columns=['데이터 타입'])         # 각 피처의 데이터 타입을 가져와 데이터프레임으로 생성
  summary = summary.reset_index()                                    # 인덱스를 초기화하여 컬럼으로 변환
  summary = summary.rename(columns={'index':'피처'})                 # 'index' 컬럼명을 '피처'로 변경

  summary['결측값 개수'] = df.isnull().sum().values                 # 각 피처의 결측값(null) 개수 계산
  summary['고유값 개수'] = df.nunique().values                      # 각 피처의 고유값 개수 계산

  summary['첫 번째 값'] = df.iloc[0].values                          # 각 피처의 첫 번째 샘플 값
  summary['두 번째 값'] = df.iloc[1].values                          # 각 피처의 두 번째 샘플 값
  summary['세 번째 값'] = df.iloc[2].values                          # 각 피처의 세 번째 샘플 값

  return summary                                                     # 요약 테이블 반환

In [3]:
tel = pd.read_csv('data/tel_data.csv')
churn = pd.read_excel('data/Telco_customer_churn.xlsx')

---

## 데이터셋 확인

### 원래 데이터셋

In [4]:
tel

Unnamed: 0.1,Unnamed: 0,고객ID,성별,고령자여부,배우자여부,부양가족여부,가입개월수,전화서비스가입여부,복수회선여부,인터넷서비스유형,...,TV스트리밍이용여부,영화스트리밍이용여부,계약기간유형,전자청구서이용여부,결제방법,월요금,총요금,이탈여부,인터넷서비스가입여부,개월수*월요금
0,0,7590-VHVEG,False,False,True,False,1,False,False,DSL,...,False,False,Month-to-month,True,Electronic check,29.85,29.85,False,True,29.85
1,1,5575-GNVDE,True,False,False,False,34,True,False,DSL,...,False,False,One year,False,Mailed check,56.95,1889.50,False,True,1936.30
2,2,3668-QPYBK,True,False,False,False,2,True,False,DSL,...,False,False,Month-to-month,True,Mailed check,53.85,108.15,True,True,107.70
3,3,7795-CFOCW,True,False,False,False,45,False,False,DSL,...,False,False,One year,False,Bank transfer (automatic),42.30,1840.75,False,True,1903.50
4,4,9237-HQITU,False,False,False,False,2,True,False,Fiber optic,...,False,False,Month-to-month,True,Electronic check,70.70,151.65,True,True,141.40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,7038,6840-RESVB,True,False,True,True,24,True,True,DSL,...,True,True,One year,True,Mailed check,84.80,1990.50,False,True,2035.20
7039,7039,2234-XADUH,False,False,True,True,72,True,True,Fiber optic,...,True,True,One year,True,Credit card (automatic),103.20,7362.90,False,True,7430.40
7040,7040,4801-JZAZL,False,False,True,True,11,False,False,DSL,...,False,False,Month-to-month,True,Electronic check,29.60,346.45,False,True,325.60
7041,7041,8361-LTMKD,True,True,True,False,4,True,True,Fiber optic,...,False,False,Month-to-month,True,Mailed check,74.40,306.60,True,True,297.60


In [5]:
resumetable(tel)

데이터셋 크기: (7043, 24)


Unnamed: 0,피처,데이터 타입,결측값 개수,고유값 개수,첫 번째 값,두 번째 값,세 번째 값
0,Unnamed: 0,int64,0,7043,0,1,2
1,고객ID,object,0,7043,7590-VHVEG,5575-GNVDE,3668-QPYBK
2,성별,bool,0,2,False,True,True
3,고령자여부,bool,0,2,False,False,False
4,배우자여부,bool,0,2,True,False,False
5,부양가족여부,bool,0,2,False,False,False
6,가입개월수,int64,0,73,1,34,2
7,전화서비스가입여부,bool,0,2,False,True,True
8,복수회선여부,bool,0,2,False,False,False
9,인터넷서비스유형,object,0,3,DSL,DSL,DSL


### 추가된 데이터셋

In [6]:
churn

Unnamed: 0,CustomerID,Count,Country,State,City,Zip Code,Lat Long,Latitude,Longitude,Gender,...,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges,Churn Label,Churn Value,Churn Score,CLTV,Churn Reason
0,3668-QPYBK,1,United States,California,Los Angeles,90003,"33.964131, -118.272783",33.964131,-118.272783,Male,...,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,1,86,3239,Competitor made better offer
1,9237-HQITU,1,United States,California,Los Angeles,90005,"34.059281, -118.30742",34.059281,-118.307420,Female,...,Month-to-month,Yes,Electronic check,70.70,151.65,Yes,1,67,2701,Moved
2,9305-CDSKC,1,United States,California,Los Angeles,90006,"34.048013, -118.293953",34.048013,-118.293953,Female,...,Month-to-month,Yes,Electronic check,99.65,820.5,Yes,1,86,5372,Moved
3,7892-POOKP,1,United States,California,Los Angeles,90010,"34.062125, -118.315709",34.062125,-118.315709,Female,...,Month-to-month,Yes,Electronic check,104.80,3046.05,Yes,1,84,5003,Moved
4,0280-XJGEX,1,United States,California,Los Angeles,90015,"34.039224, -118.266293",34.039224,-118.266293,Male,...,Month-to-month,Yes,Bank transfer (automatic),103.70,5036.3,Yes,1,89,5340,Competitor had better devices
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,2569-WGERO,1,United States,California,Landers,92285,"34.341737, -116.539416",34.341737,-116.539416,Female,...,Two year,Yes,Bank transfer (automatic),21.15,1419.4,No,0,45,5306,
7039,6840-RESVB,1,United States,California,Adelanto,92301,"34.667815, -117.536183",34.667815,-117.536183,Male,...,One year,Yes,Mailed check,84.80,1990.5,No,0,59,2140,
7040,2234-XADUH,1,United States,California,Amboy,92304,"34.559882, -115.637164",34.559882,-115.637164,Female,...,One year,Yes,Credit card (automatic),103.20,7362.9,No,0,71,5560,
7041,4801-JZAZL,1,United States,California,Angelus Oaks,92305,"34.1678, -116.86433",34.167800,-116.864330,Female,...,Month-to-month,Yes,Electronic check,29.60,346.45,No,0,59,2793,


In [7]:
resumetable(churn)

데이터셋 크기: (7043, 33)


Unnamed: 0,피처,데이터 타입,결측값 개수,고유값 개수,첫 번째 값,두 번째 값,세 번째 값
0,CustomerID,object,0,7043,3668-QPYBK,9237-HQITU,9305-CDSKC
1,Count,int64,0,1,1,1,1
2,Country,object,0,1,United States,United States,United States
3,State,object,0,1,California,California,California
4,City,object,0,1129,Los Angeles,Los Angeles,Los Angeles
5,Zip Code,int64,0,1652,90003,90005,90006
6,Lat Long,object,0,1652,"33.964131, -118.272783","34.059281, -118.30742","34.048013, -118.293953"
7,Latitude,float64,0,1652,33.964131,34.059281,34.048013
8,Longitude,float64,0,1651,-118.272783,-118.30742,-118.293953
9,Gender,object,0,2,Male,Female,Female


---

## EDA
- 컬럼명 변경

In [8]:
new_columns_dict = {
    'CustomerID': '고객ID',
    'Count': '카운트',
    'Country': '나라',
    'State': '주',
    'City': '도시',
    'Zip Code': '우편번호',
    'Lat Long': '위경도',
    'Latitude': '위도',
    'Longitude': '경도',
    'Gender': '성별',
    'Senior Citizen': '고령자여부',
    'Partner': '배우자여부',
    'Dependents': '부양가족여부',
    'Tenure Months': '가입개월수2',
    'Phone Service': '전화서비스가입여부',
    'Multiple Lines': '복수회선여부',
    'Internet Service': '인터넷서비스유형',
    'Online Security': '온라인보안서비스여부',
    'Online Backup': '온라인백업서비스여부',
    'Device Protection': '기기보호서비스여부',
    'Tech Support': '기술지원서비스여부',
    'Streaming TV': 'TV스트리밍이용여부',
    'Streaming Movies': '영화스트리밍이용여부',
    'Contract': '계약기간유형',
    'Paperless Billing': '전자청구서이용여부',
    'Payment Method': '결제방법',
    'Monthly Charges': '월요금',
    'Total Charges': '총요금',
    'Churn Label': '이탈여부',
    'Churn Value': '이탈여부(bool)',
    'Churn Score': '이탈점수',
    'CLTV': '고객생애가치',
    'Churn Reason': '이탈이유'
}

churn = churn.rename(columns=new_columns_dict)
resumetable(churn)

데이터셋 크기: (7043, 33)


Unnamed: 0,피처,데이터 타입,결측값 개수,고유값 개수,첫 번째 값,두 번째 값,세 번째 값
0,고객ID,object,0,7043,3668-QPYBK,9237-HQITU,9305-CDSKC
1,카운트,int64,0,1,1,1,1
2,나라,object,0,1,United States,United States,United States
3,주,object,0,1,California,California,California
4,도시,object,0,1129,Los Angeles,Los Angeles,Los Angeles
5,우편번호,int64,0,1652,90003,90005,90006
6,위경도,object,0,1652,"33.964131, -118.272783","34.059281, -118.30742","34.048013, -118.293953"
7,위도,float64,0,1652,33.964131,34.059281,34.048013
8,경도,float64,0,1651,-118.272783,-118.30742,-118.293953
9,성별,object,0,2,Male,Female,Female


### 의문점
- 가입개월수의 원래 변수명이 다름 (tenure, Tenure Months)
- 둘은 같은 데이터인가?

- tel에 churn의 고객ID, 가입개월수2 joined

In [9]:
df_joined = tel.merge(churn[['고객ID', '가입개월수2']], on='고객ID', how='left')

- 고객ID, 가입개월수, 가입개월수2만 추출

In [10]:
df_months = df_joined[['고객ID', '가입개월수', '가입개월수2']]

- '일치여부' 컬럼 추가 (boolean)

In [11]:
df_joined['일치여부'] = df_joined['가입개월수'] == df_joined['가입개월수2']
final_df = df_joined[['고객ID', '가입개월수', '가입개월수2', '일치여부']]
final_df

Unnamed: 0,고객ID,가입개월수,가입개월수2,일치여부
0,7590-VHVEG,1,1,True
1,5575-GNVDE,34,34,True
2,3668-QPYBK,2,2,True
3,7795-CFOCW,45,45,True
4,9237-HQITU,2,2,True
...,...,...,...,...
7038,6840-RESVB,24,24,True
7039,2234-XADUH,72,72,True
7040,4801-JZAZL,11,11,True
7041,8361-LTMKD,4,4,True


In [12]:
total_count = df_joined['일치여부'].count()
false_count = total_count - df_joined['일치여부'].sum()
print(false_count)

0


**→ 전부 일치**

→ '가입개월수2'의 컬럼명 '가입개월수'로 변경

In [13]:
churn = churn.rename(columns={'가입개월수2': '가입개월수'})
churn.columns

Index(['고객ID', '카운트', '나라', '주', '도시', '우편번호', '위경도', '위도', '경도', '성별',
       '고령자여부', '배우자여부', '부양가족여부', '가입개월수', '전화서비스가입여부', '복수회선여부', '인터넷서비스유형',
       '온라인보안서비스여부', '온라인백업서비스여부', '기기보호서비스여부', '기술지원서비스여부', 'TV스트리밍이용여부',
       '영화스트리밍이용여부', '계약기간유형', '전자청구서이용여부', '결제방법', '월요금', '총요금', '이탈여부',
       '이탈여부(bool)', '이탈점수', '고객생애가치', '이탈이유'],
      dtype='object')

In [14]:
print(churn['결제방법'].unique())

['Mailed check' 'Electronic check' 'Bank transfer (automatic)'
 'Credit card (automatic)']


In [15]:
churn.drop('카운트', axis=1, inplace=True)

In [16]:
# tel과 마찬가지로 ' ' 처리
churn['총요금'] = churn['총요금'].replace(' ', 0)
churn['총요금'] = churn['총요금'].astype(float)

  churn['총요금'] = churn['총요금'].replace(' ', 0)


In [17]:
resumetable(churn)

데이터셋 크기: (7043, 32)


Unnamed: 0,피처,데이터 타입,결측값 개수,고유값 개수,첫 번째 값,두 번째 값,세 번째 값
0,고객ID,object,0,7043,3668-QPYBK,9237-HQITU,9305-CDSKC
1,나라,object,0,1,United States,United States,United States
2,주,object,0,1,California,California,California
3,도시,object,0,1129,Los Angeles,Los Angeles,Los Angeles
4,우편번호,int64,0,1652,90003,90005,90006
5,위경도,object,0,1652,"33.964131, -118.272783","34.059281, -118.30742","34.048013, -118.293953"
6,위도,float64,0,1652,33.964131,34.059281,34.048013
7,경도,float64,0,1651,-118.272783,-118.30742,-118.293953
8,성별,object,0,2,Male,Female,Female
9,고령자여부,object,0,2,No,No,No


---

### Boolean으로 변환
- **성별**
  - Male : True
  - Female : False

In [18]:
churn = churn.replace({'No internet service': 'No'})
churn['복수회선여부'] = churn['복수회선여부'].replace({'No phone service': 'No'})
churn = churn.replace({'Male' : True, 'Female': False})
churn = churn.replace({'Yes': True, 'No': False})
resumetable(churn)

데이터셋 크기: (7043, 32)


  churn = churn.replace({'Male' : True, 'Female': False})
  churn = churn.replace({'Yes': True, 'No': False})


Unnamed: 0,피처,데이터 타입,결측값 개수,고유값 개수,첫 번째 값,두 번째 값,세 번째 값
0,고객ID,object,0,7043,3668-QPYBK,9237-HQITU,9305-CDSKC
1,나라,object,0,1,United States,United States,United States
2,주,object,0,1,California,California,California
3,도시,object,0,1129,Los Angeles,Los Angeles,Los Angeles
4,우편번호,int64,0,1652,90003,90005,90006
5,위경도,object,0,1652,"33.964131, -118.272783","34.059281, -118.30742","34.048013, -118.293953"
6,위도,float64,0,1652,33.964131,34.059281,34.048013
7,경도,float64,0,1651,-118.272783,-118.30742,-118.293953
8,성별,bool,0,2,True,False,False
9,고령자여부,bool,0,2,False,False,False


In [19]:
churn.dtypes

고객ID           object
나라             object
주              object
도시             object
우편번호            int64
위경도            object
위도            float64
경도            float64
성별               bool
고령자여부            bool
배우자여부            bool
부양가족여부           bool
가입개월수           int64
전화서비스가입여부        bool
복수회선여부           bool
인터넷서비스유형       object
온라인보안서비스여부       bool
온라인백업서비스여부       bool
기기보호서비스여부        bool
기술지원서비스여부        bool
TV스트리밍이용여부       bool
영화스트리밍이용여부       bool
계약기간유형         object
전자청구서이용여부        bool
결제방법           object
월요금           float64
총요금           float64
이탈여부             bool
이탈여부(bool)      int64
이탈점수            int64
고객생애가치          int64
이탈이유           object
dtype: object

In [20]:
churn['인터넷서비스유형'].value_counts()

인터넷서비스유형
Fiber optic    3096
DSL            2421
False          1526
Name: count, dtype: int64

In [21]:
# 인터넷서비스유형은 'False' 대신 'No Service'로 고유값 표기
churn['인터넷서비스유형'] = churn['인터넷서비스유형'].replace(False, 'No Service')
churn['인터넷서비스유형'].value_counts()

인터넷서비스유형
Fiber optic    3096
DSL            2421
No Service     1526
Name: count, dtype: int64

In [22]:
# churn.to_csv('data/Telco_customer_churn_sehee.csv', index=False, encoding='utf-8-sig')

<span style="color:blue"> **data 폴더에 ~~ _sehee 라는 이름으로 들어가요**</span>

<span style="color:blue"> 제가 제작중 실수해서 오류가 계속 납니다. 만약 불러왔는데 Unnamed: 0 컬럼 있으면 아래 코드로 불러와 주세요.</span>

```df = pd.read_csv('data/Telco_customer_churn_sehee.csv', encoding='utf-8-sig', index_col=0)```

---

## 이탈이유 분석

In [23]:
print(churn['이탈이유'].unique())

['Competitor made better offer' 'Moved' 'Competitor had better devices'
 'Competitor offered higher download speeds'
 'Competitor offered more data' 'Price too high' 'Product dissatisfaction'
 'Service dissatisfaction' 'Lack of self-service on Website'
 'Network reliability' 'Limited range of services'
 'Lack of affordable download/upload speed' 'Long distance charges'
 'Extra data charges' "Don't know" 'Poor expertise of online support'
 'Poor expertise of phone support' 'Attitude of service provider'
 'Attitude of support person' 'Deceased' nan]


In [24]:
# 컬럼 전체 보기
churn.columns

Index(['고객ID', '나라', '주', '도시', '우편번호', '위경도', '위도', '경도', '성별', '고령자여부',
       '배우자여부', '부양가족여부', '가입개월수', '전화서비스가입여부', '복수회선여부', '인터넷서비스유형',
       '온라인보안서비스여부', '온라인백업서비스여부', '기기보호서비스여부', '기술지원서비스여부', 'TV스트리밍이용여부',
       '영화스트리밍이용여부', '계약기간유형', '전자청구서이용여부', '결제방법', '월요금', '총요금', '이탈여부',
       '이탈여부(bool)', '이탈점수', '고객생애가치', '이탈이유'],
      dtype='object')

In [25]:
# 카피 만들어서 필요없는 컬럼 탈락
churn_copy = churn.copy()
churn_copy.drop(['나라', '주', '도시', '우편번호', '위경도', '이탈점수'], axis=1, inplace=True)

In [26]:
churn_copy

Unnamed: 0,고객ID,위도,경도,성별,고령자여부,배우자여부,부양가족여부,가입개월수,전화서비스가입여부,복수회선여부,...,영화스트리밍이용여부,계약기간유형,전자청구서이용여부,결제방법,월요금,총요금,이탈여부,이탈여부(bool),고객생애가치,이탈이유
0,3668-QPYBK,33.964131,-118.272783,True,False,False,False,2,True,False,...,False,Month-to-month,True,Mailed check,53.85,108.15,True,1,3239,Competitor made better offer
1,9237-HQITU,34.059281,-118.307420,False,False,False,True,2,True,False,...,False,Month-to-month,True,Electronic check,70.70,151.65,True,1,2701,Moved
2,9305-CDSKC,34.048013,-118.293953,False,False,False,True,8,True,True,...,True,Month-to-month,True,Electronic check,99.65,820.50,True,1,5372,Moved
3,7892-POOKP,34.062125,-118.315709,False,False,True,True,28,True,True,...,True,Month-to-month,True,Electronic check,104.80,3046.05,True,1,5003,Moved
4,0280-XJGEX,34.039224,-118.266293,True,False,False,True,49,True,True,...,True,Month-to-month,True,Bank transfer (automatic),103.70,5036.30,True,1,5340,Competitor had better devices
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,2569-WGERO,34.341737,-116.539416,False,False,False,False,72,True,False,...,False,Two year,True,Bank transfer (automatic),21.15,1419.40,False,0,5306,
7039,6840-RESVB,34.667815,-117.536183,True,False,True,True,24,True,True,...,True,One year,True,Mailed check,84.80,1990.50,False,0,2140,
7040,2234-XADUH,34.559882,-115.637164,False,False,True,True,72,True,True,...,True,One year,True,Credit card (automatic),103.20,7362.90,False,0,5560,
7041,4801-JZAZL,34.167800,-116.864330,False,False,True,True,11,False,False,...,False,Month-to-month,True,Electronic check,29.60,346.45,False,0,2793,


In [27]:
demo_stat = pd.read_csv('data/demo_stat.csv')
services = pd.read_csv('data/services.csv')

In [28]:
demo_stat

Unnamed: 0,고객ID,성별,나이,30세미만여부,고령자여부,결혼여부,부양가족여부,부양가족수,고객만족도점수,현재고객상태,이탈여부,고객생애가치,가입개월수
0,0002-ORFBO,False,37,False,False,True,False,0,3,Stayed,False,2205,9
1,0003-MKNFE,True,46,False,False,False,False,0,5,Stayed,False,5414,9
2,0004-TLHLJ,True,50,False,False,False,False,0,1,Churned,True,4479,4
3,0011-IGKFF,True,78,False,True,True,False,0,1,Churned,True,3714,13
4,0013-EXCHZ,False,75,False,True,True,False,0,1,Churned,True,3464,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,9987-LUTYD,False,20,True,False,False,False,0,4,Stayed,False,3161,13
7039,9992-RRAMN,True,40,False,False,True,False,0,1,Churned,True,5248,22
7040,9992-UJOEL,True,22,True,False,False,False,0,5,Joined,False,5870,2
7041,9993-LHIEB,True,21,True,False,True,False,0,3,Stayed,False,4792,67


In [29]:
# churn_copy와 겹치는 컬럼 제거
demo_stat.drop(['성별', '고령자여부', '결혼여부', '부양가족여부','이탈여부','고객생애가치','가입개월수'], axis=1, inplace=True)

In [30]:
demo_stat

Unnamed: 0,고객ID,나이,30세미만여부,부양가족수,고객만족도점수,현재고객상태
0,0002-ORFBO,37,False,0,3,Stayed
1,0003-MKNFE,46,False,0,5,Stayed
2,0004-TLHLJ,50,False,0,1,Churned
3,0011-IGKFF,78,False,0,1,Churned
4,0013-EXCHZ,75,False,0,1,Churned
...,...,...,...,...,...,...
7038,9987-LUTYD,20,True,0,4,Stayed
7039,9992-RRAMN,40,False,0,1,Churned
7040,9992-UJOEL,22,True,0,5,Joined
7041,9993-LHIEB,21,True,0,3,Stayed


In [31]:
services

Unnamed: 0,고객ID,친구추천여부,친구추천횟수,가입혜택,전화서비스가입여부,장거리통화요금,월평균다운로드용량(GB),프리미엄기술지원여부,음악스트리밍이용여부,무제한데이터이용여부,전자청구서이용여부,총환불액,총초과데이터요금,총장거리통화요금,총납부금,가입개월수
0,8779-QRDMV,False,0,No,False,0.00,8,False,False,False,True,0.00,20,0.00,59.65,1
1,7495-OOKFY,True,1,Offer E,True,48.85,17,False,False,True,True,0.00,0,390.80,1024.10,8
2,1658-BYGOY,False,0,Offer D,True,11.33,52,False,True,True,True,45.61,0,203.94,1910.88,18
3,4598-XLKNJ,True,1,Offer C,True,19.76,12,False,False,True,True,13.43,0,494.00,2995.07,25
4,4846-WHAFZ,True,1,Offer C,True,6.33,14,False,False,True,True,0.00,0,234.21,3102.36,37
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,2569-WGERO,False,0,No,True,22.77,0,False,False,False,True,19.31,0,1639.44,3039.53,72
7039,6840-RESVB,True,1,Offer C,True,36.05,24,True,True,True,True,48.23,0,865.20,2807.47,24
7040,2234-XADUH,True,4,No,True,29.66,59,False,True,True,True,45.38,0,2135.52,9453.04,72
7041,4801-JZAZL,True,1,No,False,0.00,17,False,False,True,True,27.24,0,0.00,319.21,11


In [32]:
# churn_copy와 겹치는 컬럼 제거
services.drop(['가입개월수', '전화서비스가입여부', '전자청구서이용여부'], axis=1, inplace=True)

In [33]:
# 병합
merged_df = pd.merge(churn_copy, demo_stat, on='고객ID', how='outer')

# 결과 확인
merged_df

Unnamed: 0,고객ID,위도,경도,성별,고령자여부,배우자여부,부양가족여부,가입개월수,전화서비스가입여부,복수회선여부,...,총요금,이탈여부,이탈여부(bool),고객생애가치,이탈이유,나이,30세미만여부,부양가족수,고객만족도점수,현재고객상태
0,0002-ORFBO,34.827662,-118.999073,False,False,True,False,9,True,False,...,593.30,False,0,2205,,37,False,0,3,Stayed
1,0003-MKNFE,34.162515,-118.203869,True,False,False,False,9,True,True,...,542.40,False,0,5414,,46,False,0,5,Stayed
2,0004-TLHLJ,33.645672,-117.922613,True,False,False,False,4,True,False,...,280.85,True,1,4479,Price too high,50,False,0,1,Churned
3,0011-IGKFF,38.014457,-122.115432,True,True,True,False,13,True,False,...,1237.85,True,1,3714,Product dissatisfaction,78,False,0,1,Churned
4,0013-EXCHZ,34.227846,-119.079903,False,True,True,False,3,True,False,...,267.40,True,1,3464,Network reliability,75,False,0,1,Churned
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,9987-LUTYD,32.759327,-116.997260,False,False,False,False,13,True,False,...,742.90,False,0,3161,,20,True,0,4,Stayed
7039,9992-RRAMN,37.734971,-120.954271,True,False,True,False,22,True,True,...,1873.70,True,1,5248,Product dissatisfaction,40,False,0,1,Churned
7040,9992-UJOEL,39.108252,-123.645121,True,False,False,False,2,True,False,...,92.75,False,0,5870,,22,True,0,5,Joined
7041,9993-LHIEB,33.001813,-117.263628,True,False,True,False,67,True,False,...,4627.65,False,0,4792,,21,True,0,3,Stayed


In [34]:
reason_merge = pd.merge(merged_df, services, on='고객ID', how='outer')
reason_merge

Unnamed: 0,고객ID,위도,경도,성별,고령자여부,배우자여부,부양가족여부,가입개월수,전화서비스가입여부,복수회선여부,...,가입혜택,장거리통화요금,월평균다운로드용량(GB),프리미엄기술지원여부,음악스트리밍이용여부,무제한데이터이용여부,총환불액,총초과데이터요금,총장거리통화요금,총납부금
0,0002-ORFBO,34.827662,-118.999073,False,False,True,False,9,True,False,...,No,42.39,16,True,False,True,0.00,0,381.51,974.81
1,0003-MKNFE,34.162515,-118.203869,True,False,False,False,9,True,True,...,No,10.69,10,False,True,False,38.33,10,96.21,610.28
2,0004-TLHLJ,33.645672,-117.922613,True,False,False,False,4,True,False,...,Offer E,33.65,30,False,False,True,0.00,0,134.60,415.45
3,0011-IGKFF,38.014457,-122.115432,True,True,True,False,13,True,False,...,Offer D,27.82,4,False,False,True,0.00,0,361.66,1599.51
4,0013-EXCHZ,34.227846,-119.079903,False,True,True,False,3,True,False,...,No,7.38,11,True,False,True,0.00,0,22.14,289.54
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,9987-LUTYD,32.759327,-116.997260,False,False,False,False,13,True,False,...,Offer D,46.68,59,True,True,True,0.00,0,606.84,1349.74
7039,9992-RRAMN,37.734971,-120.954271,True,False,True,False,22,True,True,...,Offer D,16.20,17,False,True,True,0.00,0,356.40,2230.10
7040,9992-UJOEL,39.108252,-123.645121,True,False,False,False,2,True,False,...,Offer E,18.62,51,False,False,True,0.00,0,37.24,129.99
7041,9993-LHIEB,33.001813,-117.263628,True,False,True,False,67,True,False,...,Offer A,2.12,58,True,True,True,0.00,0,142.04,4769.69


In [40]:
resumetable(reason_merge)

데이터셋 크기: (7043, 43)


Unnamed: 0,피처,데이터 타입,결측값 개수,고유값 개수,첫 번째 값,두 번째 값,세 번째 값
0,고객ID,object,0,7043,0002-ORFBO,0003-MKNFE,0004-TLHLJ
1,위도,float64,0,1652,34.827662,34.162515,33.645672
2,경도,float64,0,1651,-118.999073,-118.203869,-117.922613
3,성별,bool,0,2,False,True,True
4,고령자여부,bool,0,2,False,False,False
5,배우자여부,bool,0,2,True,False,False
6,부양가족여부,bool,0,2,False,False,False
7,가입개월수,int64,0,73,9,9,4
8,전화서비스가입여부,bool,0,2,True,True,True
9,복수회선여부,bool,0,2,False,True,False


In [41]:
# reason_merge.to_csv('data/total총정리.csv', index=False, encoding='utf-8-sig')

In [35]:
reason_notnull = reason_merge.dropna(subset=['이탈이유'])
reason_notnull

Unnamed: 0,고객ID,위도,경도,성별,고령자여부,배우자여부,부양가족여부,가입개월수,전화서비스가입여부,복수회선여부,...,가입혜택,장거리통화요금,월평균다운로드용량(GB),프리미엄기술지원여부,음악스트리밍이용여부,무제한데이터이용여부,총환불액,총초과데이터요금,총장거리통화요금,총납부금
2,0004-TLHLJ,33.645672,-117.922613,True,False,False,False,4,True,False,...,Offer E,33.65,30,False,False,True,0.0,0,134.60,415.45
3,0011-IGKFF,38.014457,-122.115432,True,True,True,False,13,True,False,...,Offer D,27.82,4,False,False,True,0.0,0,361.66,1599.51
4,0013-EXCHZ,34.227846,-119.079903,False,True,True,False,3,True,False,...,No,7.38,11,True,False,True,0.0,0,22.14,289.54
18,0022-TCJCI,37.680844,-122.481310,True,True,False,False,45,True,False,...,No,10.67,17,False,False,True,0.0,0,480.15,3271.65
19,0023-HGHWL,37.161544,-121.649371,True,True,False,False,1,False,False,...,No,0.00,9,False,False,True,0.0,0,0.00,25.10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7020,9961-JBNMK,33.889605,-115.257009,True,True,False,False,21,True,False,...,No,42.27,7,False,False,True,10.2,0,887.67,2907.77
7023,9965-YOKZB,38.636102,-120.522149,True,True,False,False,9,True,True,...,No,10.04,13,False,False,True,0.0,0,90.36,880.51
7036,9985-MWVIX,36.807595,-118.901544,False,False,False,False,1,True,False,...,Offer E,42.09,9,False,False,True,0.0,0,42.09,112.24
7037,9986-BONCE,33.362575,-117.299644,False,False,False,False,4,True,False,...,No,2.01,0,False,False,False,0.0,0,8.04,93.54


In [36]:
reason_mapping = {
    'Competitor made better offer': '경쟁사가 더 나은 제안을 했습니다',
    'Moved': '이동했습니다',
    'Competitor had better devices': '경쟁사가 더 나은 기기를 보유했습니다',
    'Competitor offered higher download speeds': '경쟁사가 더 빠른 다운로드 속도를 제공함',
    'Competitor offered more data': '경쟁사가 더 많은 데이터를 제공함',
    'Price too high': '가격이 너무 높음',
    'Product dissatisfaction': '제품 불만족',
    'Service dissatisfaction': '서비스 불만족',
    'Lack of self-service on Website': '웹사이트의 셀프 서비스 부족',
    'Network reliability': '네트워크 안정성',
    'Limited range of services': '제한된 서비스 범위',
    'Lack of affordable download/upload speed': '저렴한 다운로드/업로드 속도 부족',
    'Long distance charges': '장거리 요금',
    'Extra data charges': '추가 데이터 요금',
    "Don't know": '모르겠다',
    'Poor expertise of online support': '온라인 지원의 전문성 부족',
    'Poor expertise of phone support': '전화 지원의 전문성 부족',
    'Attitude of service provider': '서비스 제공업체의 태도',
    'Attitude of support person': '지원 담당자의 태도',
    'Deceased': '사망'
}

In [37]:
reason_notnull['한글이탈이유'] = reason_notnull['이탈이유'].map(reason_mapping)
reason_notnull

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reason_notnull['한글이탈이유'] = reason_notnull['이탈이유'].map(reason_mapping)


Unnamed: 0,고객ID,위도,경도,성별,고령자여부,배우자여부,부양가족여부,가입개월수,전화서비스가입여부,복수회선여부,...,장거리통화요금,월평균다운로드용량(GB),프리미엄기술지원여부,음악스트리밍이용여부,무제한데이터이용여부,총환불액,총초과데이터요금,총장거리통화요금,총납부금,한글이탈이유
2,0004-TLHLJ,33.645672,-117.922613,True,False,False,False,4,True,False,...,33.65,30,False,False,True,0.0,0,134.60,415.45,가격이 너무 높음
3,0011-IGKFF,38.014457,-122.115432,True,True,True,False,13,True,False,...,27.82,4,False,False,True,0.0,0,361.66,1599.51,제품 불만족
4,0013-EXCHZ,34.227846,-119.079903,False,True,True,False,3,True,False,...,7.38,11,True,False,True,0.0,0,22.14,289.54,네트워크 안정성
18,0022-TCJCI,37.680844,-122.481310,True,True,False,False,45,True,False,...,10.67,17,False,False,True,0.0,0,480.15,3271.65,제한된 서비스 범위
19,0023-HGHWL,37.161544,-121.649371,True,True,False,False,1,False,False,...,0.00,9,False,False,True,0.0,0,0.00,25.10,경쟁사가 더 나은 제안을 했습니다
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7020,9961-JBNMK,33.889605,-115.257009,True,True,False,False,21,True,False,...,42.27,7,False,False,True,10.2,0,887.67,2907.77,제품 불만족
7023,9965-YOKZB,38.636102,-120.522149,True,True,False,False,9,True,True,...,10.04,13,False,False,True,0.0,0,90.36,880.51,네트워크 안정성
7036,9985-MWVIX,36.807595,-118.901544,False,False,False,False,1,True,False,...,42.09,9,False,False,True,0.0,0,42.09,112.24,지원 담당자의 태도
7037,9986-BONCE,33.362575,-117.299644,False,False,False,False,4,True,False,...,2.01,0,False,False,False,0.0,0,8.04,93.54,네트워크 안정성


In [39]:
# reason_notnull.to_csv('data/reason_notnull.csv', index=False, encoding='utf-8-sig')