## 작업형 제 2 유형

### ✏️ 고객의 개인 정보와 행동 패턴을 기반으로 고객의 세분화(Segmentation) 결과를 예측하시오
- 제공된 데이터 목록 : train.csv, test.csv
- 예측할 컬럼 : Segmentation(고객 세분화 결과값 : 1, 2, 3, 4)
    - 학습용 데이터(train.csv)를 이용하여 고객의 세분화 결과를 예측하는 모델을 만든 후 이를 평가용 데이터(test.csv)에 적용하여 얻은 예측값을 다음과 같은 형식의 csv 파일로 생성하시오

        - 제출 파일은 다음 2개의 컬럼을 포함해야 한다

            - ID : 고객 식별자

            - Segmentation : 예측된 세분화 결과

            - 제출 파일명 : result.csv

        - 제출한 모델의 성능은 Macro F1 Score 평가지표에 따라 채점한다 

In [1]:
# 1. 문제 정의
## 고객 정의(Segmentation) 데이터, 고객의 개인 정보와 행동 패턴을 기반으로 고객 세분화를 예측

# 2. 라이브러리 및 데이터 불러오기
import pandas as pd
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

In [2]:
# 3. 탐색적 데이터 분석(EDA)
## 데이터 크기
print(train.shape, test.shape, '\n')

## 데이터 샘플 확인
print(train.head(), '\n')
print(test.head(), '\n')

## 데이터 자료형(타입)
print(train.info(), '\n')

## 기초 통계량
print(train.describe(), '\n')

## object 의 unique 개수
print(train.describe(include='object'), '\n')
print(test.describe(include='object'), '\n')

## 결측치 여부
print(train.isnull().sum().sum(), '\n')
print(test.isnull().sum().sum(), '\n')

## Segmentation 컬럼 종류에 따른 개수
print(train['Segmentation'].value_counts())

(6665, 11) (2154, 10) 

       ID  Gender Ever_Married  Age Graduated  Profession  Work_Experience  \
0  462809    Male           No   22        No  Healthcare              1.0   
1  466315  Female          Yes   67       Yes    Engineer              1.0   
2  461735    Male          Yes   67       Yes      Lawyer              0.0   
3  461319    Male          Yes   56        No      Artist              0.0   
4  460156    Male           No   32       Yes  Healthcare              1.0   

  Spending_Score  Family_Size  Var_1  Segmentation  
0            Low          4.0  Cat_4             4  
1            Low          1.0  Cat_6             2  
2           High          2.0  Cat_6             2  
3        Average          2.0  Cat_6             3  
4            Low          3.0  Cat_6             3   

       ID  Gender Ever_Married  Age Graduated  Profession  Work_Experience  \
0  458989  Female          Yes   36       Yes    Engineer              0.0   
1  458994    Male          Yes 

In [3]:
# 4. 데이터 전처리
## 자료형이 object 인 컬럼을 원-핫 인코딩
target = train.pop('Segmentation')
print(train.shape, test.shape, '\n')

train = pd.get_dummies(train)
test = pd.get_dummies(test)

print(train.shape, test.shape)

(6665, 10) (2154, 10) 

(6665, 29) (2154, 29)


In [4]:
# 5. 검증 데이터 분할
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(train, target, test_size=0.2, random_state=0)
print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)

(5332, 29) (1333, 29) (5332,) (1333,)


In [5]:
# 머신러닝 학습 및 평가
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# !pip install xgboost
# import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import f1_score

# 로지스틱 회귀
lr = LogisticRegression(random_state=0)
lr.fit(X_train, y_train)
pred = lr.predict(X_val)
print('로지스틱 회귀 :', f1_score(y_val, pred, average='macro'))

# 의사결정 나무
dt = DecisionTreeClassifier(random_state=0)
dt.fit(X_train, y_train)
pred = dt.predict(X_val)
print('의사결정나무 :', f1_score(y_val, pred, average='macro'))

# 랜덤 포레스트
rf = RandomForestClassifier(random_state=0)
rf.fit(X_train, y_train)
pred = rf.predict(X_val)
print('랜덤 포레스트 :', f1_score(y_val, pred, average='macro'))

# xgboost -> 에러 발생 ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0 1 2 3], got [1 2 3 4]
## Xgboost 분류 모델은 예측 클래스가 0 부터 시작해야 함(0, 1, 2, 3 으로 있어야 정상작동)
## 예측 전후에 값 변경이 필요하므로 생략
# xg = xgb.XGBClassifier(random_state=0)
# xg.fit(X_train, y_train)
# pred = xg.predict(X_val)
# print('xgboost :', f1_score(y_val, pred, average='macro'))

# lightGBM -> 채택
lg = lgb.LGBMClassifier(random_state=0)
lg.fit(X_train, y_train)
pred = lg.predict(X_val)
print('lightGBM :', f1_score(y_val, pred, average='macro'))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


로지스틱 회귀 : 0.33131634079352795
의사결정나무 : 0.42783783376258655
랜덤 포레스트 : 0.5020137672414862
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000176 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 398
[LightGBM] [Info] Number of data points in the train set: 5332, number of used features: 29
[LightGBM] [Info] Start training from score -1.414444
[LightGBM] [Info] Start training from score -1.434710
[LightGBM] [Info] Start training from score -1.372881
[LightGBM] [Info] Start training from score -1.326597
lightGBM : 0.5392434827587952


In [6]:
# 예측 및 결과 파일 생성
pred = lg.predict(test)
submit = pd.DataFrame({'ID':test['ID'], 'Segmentation':pred})
submit.to_csv('result.csv', index=False)

print(pd.read_csv('result.csv').head())

       ID  Segmentation
0  458989             2
1  458994             3
2  459000             3
3  459003             2
4  459005             2


In [7]:
# 8-1. 성능 개선(ID 삭제)

# 데이터 불러오기
import pandas as pd
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

target = train.pop('Segmentation')

# ID 제외 -> 삭제 전에는 0.5392434827587952, 삭제 후에는 0.5277491575057244 이므로 채택하지 않음
train.drop('ID', axis=1, inplace=True)
test_id = test.pop('ID')

# 원-핫 인코딩
train = pd.get_dummies(train)
test = pd.get_dummies(test)

# 검증 데이터 분할
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(train, target, test_size=0.2, random_state=0)
X_train, X_val, y_train, y_val = train_test_split(train, target, test_size=0.2, random_state=0)
print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)

# LightGBM
import lightgbm as lgb
from sklearn.metrics import f1_score
lg = lgb.LGBMClassifier(random_state=0, verbose=-1)
lg.fit(X_train, y_train)
pred = lg.predict(X_val)
print(f1_score(y_val, pred, average='macro'))

(5332, 28) (1333, 28) (5332,) (1333,)
0.5277491575057244


In [8]:
# 8-2. 성능 개선(인코딩)

# 데이터 불러오기
import pandas as pd
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

target = train.pop('Segmentation')

# 레이블 인코딩 -> 원-핫 : 0.5392434827587952, 레이블 : 0.5253104251107019 이므로 채택 X
from sklearn.preprocessing import LabelEncoder
cols = train.select_dtypes(include='object').columns
for col in cols:
    le = LabelEncoder()
    train[col] = le.fit_transform(train[col])
    test[col] = le.transform(test[col])

# 검증 데이터 분할
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(train, target, test_size=0.2, random_state=0)
X_train, X_val, y_train, y_val = train_test_split(train, target, test_size=0.2, random_state=0)
print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)

# LightGBM
import lightgbm as lgb
from sklearn.metrics import f1_score
lg = lgb.LGBMClassifier(random_state=0, verbose=-1)
lg.fit(X_train, y_train)
pred = lg.predict(X_val)
print(f1_score(y_val, pred, average='macro'))

(5332, 10) (1333, 10) (5332,) (1333,)
0.5253104251107019


In [13]:
# 8-3. 성능 개선(스케일링)

# 데이터 불러오기
import pandas as pd
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

target = train.pop('Segmentation')

# 스케일링 -> 전 : 0.5392434827587952
from sklearn.preprocessing import StandardScaler    # 0.5412871352631483 채택
from sklearn.preprocessing import MinMaxScaler      # 0.5346995165417529
from sklearn.preprocessing import RobustScaler      # 0.536902942390099
scaler = StandardScaler()
cols = train.select_dtypes(exclude='object').columns
train[cols] = scaler.fit_transform(train[cols])
test[cols] = scaler.transform(test[cols])

# 원-핫 인코딩
train = pd.get_dummies(train)
test = pd.get_dummies(test)

# 검증 데이터 분할
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(train, target, test_size=0.2, random_state=0)
X_train, X_val, y_train, y_val = train_test_split(train, target, test_size=0.2, random_state=0)
print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)

# LightGBM
import lightgbm as lgb
from sklearn.metrics import f1_score
lg = lgb.LGBMClassifier(random_state=0, verbose=-1)
lg.fit(X_train, y_train)
pred = lg.predict(X_val)
print(f1_score(y_val, pred, average='macro'))

(5332, 29) (1333, 29) (5332,) (1333,)
0.5412871352631483


In [14]:
# 9. 최종 파일 제출
pred = lg.predict(test)
submit = pd.DataFrame({'ID':test['ID'], 'Segmentation':pred})
submit.to_csv('result.csv', index=False)

print(pd.read_csv('result.csv').head())

         ID  Segmentation
0 -1.765557             2
1 -1.763608             3
2 -1.761270             3
3 -1.760101             2
4 -1.759322             2
