In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd /content/drive/MyDrive/Cakd3_Project/1.ldata_현정

/content/drive/MyDrive/Cakd3_Project/1.ldata_현정


In [3]:
import warnings
warnings.filterwarnings('ignore')

from matplotlib import rc
import matplotlib.pyplot as plt 
rc('font',family='Malgun Gothic') # 한글
plt.rcParams['axes.unicode_minus'] = False # 마이너스 부호

import os
import pandas as pd
import seaborn as sns
import numpy as np
from IPython.core.display import HTML
import time
import re

# <font color=red>__데이터 불러오기__</font>

In [4]:
# 파일 불러와서 변수에 저장
path = './dataset'
file_list = os.listdir(path)

data_li = []
for file in file_list:
    data = pd.read_csv(f'dataset/{file}',encoding='cp949')
    file_name = file.replace('.txt','')
    globals()[file_name] = data
    data_li.append(file_name)

In [5]:
# purprd 구매일자 type 변경 및 년, 반기, 분기, 요일 추가
purprd['구매일자'] = pd.to_datetime(purprd['구매일자'], format='%Y%m%d')
purprd['year'] = purprd['구매일자'].dt.year
purprd['quarter'] = purprd['구매일자'].dt.quarter
purprd['weekday'] = purprd['구매일자'].dt.weekday # 월 0 ~ 일 6

def to_half(year,quarter):
    if (year==2014) & (quarter in [1,2]):
        return 1
    elif (year==2014) & (quarter in [3,4]):
        return 2
    elif (year==2015) & (quarter in [1,2]):
        return 3
    else:
        return 4

purprd['half'] = purprd.apply(lambda x: to_half(x['year'], x['quarter']), axis=1)

In [6]:
# 기존고객 => 매 분기 1회 이상 구매한 고객으로 한정
# 기존고객만 남긴 dataframe 생성
all_cust = pd.pivot_table(purprd,
                         index='고객번호',
                         columns='half',
                         values='구매금액',
                         aggfunc='sum')

existing_cust_idx = all_cust.dropna().index.tolist() # 기존고객 고객번호

for data in data_li:
    try:
        globals()[data] = globals()[data].query(f'고객번호 == {existing_cust_idx}')
    except:
        pass

In [7]:
# 연령대 묶어줌
def cat_age(age):
    if age == '19세이하':
        return 10
    elif age in ['20세~24세', '25세~29세']:
        return 20
    elif age in ['30세~34세', '35세~39세']:
        return 30
    elif age in ['40세~44세', '45세~49세']:
        return 40
    elif age in ['50세~54세', '55세~59세']:
        return 50
    else:
        return 60
    
cust['연령대'] = cust['연령대'].apply(lambda x: cat_age(x))

# <font color=red>__종속변수__</font>

In [8]:
# 종속변수
def get_label(p1, p2):
    """
    전체 매출 증감율을 고려한 고객별 매출 증감율(반기 기준)
    -> 감소고객 : 1
    -> 증가고객 : 0
    """
    sales = pd.pivot_table(purprd,index='고객번호', # 고객별 반기 매출
                              columns = 'half',
                              values = '구매금액',
                              aggfunc= 'sum')
    rate_variation = (sum(sales[int(p2)])-sum(sales[int(p1)]))/sum(sales[int(p1)]) # 전체 매출 증감율
    sales[f'y'] = (sales[int(p2)] - sales[int(p1)])/sales[int(p1)]/rate_variation # 고객별 매출 증감율
    
    def to_label(sales_variation): # 매출 감소 고객 : 1
        if sales_variation >= 0:
            return 0
        else:
            return 1
    
    sales[f'y'] = sales[f'y'].apply(lambda x: to_label(x))
    sales = sales[[f'y']]
    return sales

# <font color=red>__독립변수__</font>

# 1) membership

In [9]:
# membership 가입 개수
def membership_count():
    membership_cust = pd.pivot_table(membership,
                                      index='고객번호',
                                      columns='멤버십명',
                                      values='가입년월',
                                      aggfunc='count').fillna(0)
    membership_cust['가입개수'] = membership_cust.sum(axis=1)
    return membership_cust[['가입개수']]

In [10]:
# 최초 membership 가입년도
def membership_date():
    membership['가입년월'] = pd.to_datetime(membership['가입년월'], format='%Y%m')
    membership['가입년도'] = membership['가입년월'].dt.year

    first_membership_date = pd.pivot_table(membership,
                                    index='고객번호',
                                    values='가입년도',
                                    aggfunc='min')
    return first_membership_date

# 2) channel

In [11]:
# app login 횟수
def app_count():
    channel_count = pd.pivot_table(channel,
                  index='고객번호',
                  columns='제휴사',
                  values='이용횟수')

    channel_count['APP로그인횟수'] = channel_count[channel_count.columns[channel_count.columns.str.contains('APP')]].sum(axis=1)
    return channel_count[['APP로그인횟수']]

# 3) compuse

In [12]:
# B제휴사 경쟁사 이용률
def B_compuse_rate():
    compuse_count = pd.pivot_table(compuse,
                    index='고객번호',
                    columns='경쟁사',
                    values='제휴사',
                    aggfunc='count').fillna(0)
    compuse_count['c_B'] = compuse_count['B01'] + compuse_count['B02']

    purprd_count = pd.pivot_table(purprd.drop_duplicates(subset='영수증번호'),
                                  index='고객번호',
                                  columns='제휴사',
                                  values='영수증번호',
                                  aggfunc='count').fillna(0)

    compuse_count = compuse_count.join(purprd_count)

    compuse_count[f'c_B_rate'] = round(compuse_count['c_B']/(compuse_count['c_B']+compuse_count['B'])*100, 2)
    return compuse_count[['c_B_rate']]

#### prodcat

In [13]:
# 상품분류 - 대분류, 구매목적분류 추가(수작업)
cat_name = pd.read_excel('상품분류.xlsx', index_col=0)[['소분류코드','대분류','구매목적분류']]
prodcat = pd.merge(prodcat, cat_name, on=['소분류코드'])
purprd = pd.merge(purprd, prodcat[['소분류코드','중분류명','소분류명','대분류','구매목적분류']])

# 5) purprd

In [14]:
### 비율 계산
def to_rate(df, name):
    total = df.sum(axis=1)
    for col in df.columns:
        df[f'{col}_{name}_rate'] = round(df[col]/total*100, 2)
        df.drop(col, axis=1, inplace=True)
    return df

In [15]:
# 선매품, 편의품 구매금액 비중
def purpose_cat_amount(p1, p2):
    purpose_cat = pd.pivot_table(purprd.query(f'half==[{p1},{p2}]'),
                                      index='고객번호',
                                      columns='구매목적분류',
                                      values='구매금액',
                                      aggfunc='sum').fillna(0)
    to_rate(purpose_cat, 'amount')

    purpose_cat.drop('전문품_amount_rate',axis=1,inplace=True)
    return purpose_cat

In [16]:
# 대분류별 구매횟수 비중
def major_cat_count(p1, p2):
    major_cat_count = pd.pivot_table(purprd.query(f'half==[{p1},{p2}]'),
                                      index='고객번호',
                                      columns='대분류',
                                      values='구매금액',
                                      aggfunc='count').fillna(0)

    to_rate(major_cat_count, 'count')
    return major_cat_count[['미용품_count_rate','스포츠레저_count_rate','패션잡화_count_rate',\
                    '의류_count_rate','인테리어_count_rate']]

In [17]:
# 대분류별 구매금액 비중
def major_cat_amount(p1, p2):
    major_cat_amount = pd.pivot_table(purprd.query(f'half==[{p1},{p2}]'),
                                      index='고객번호',
                                      columns='대분류',
                                      values='구매금액',
                                      aggfunc='sum').fillna(0)

    to_rate(major_cat_amount, 'amount')
    return major_cat_amount[['가공식품_amount_rate','교육문화_amount_rate','기타_amount_rate',\
                     '신선식품_amount_rate','일상용품_amount_rate']]

In [18]:
# 제휴사별 구매횟수 비중
def affiliate_count(p1, p2):
    affiliate_count = pd.pivot_table(purprd.query(f'half==[{p1},{p2}]'),
                                      index='고객번호',
                                      columns='제휴사',
                                      values='구매금액',
                                      aggfunc='count').fillna(0)
    to_rate(affiliate_count, 'count')
    return affiliate_count

In [19]:
# 제휴사별 구매금액 비중
def affiliate_mount(p1, p2):
    affiliate_mount = pd.pivot_table(purprd.query(f'half==[{p1},{p2}]'),
                                      index='고객번호',
                                      columns='제휴사',
                                      values='구매금액',
                                      aggfunc='sum').fillna(0)
    to_rate(affiliate_mount, 'mount')
    return affiliate_mount

In [20]:
### 증감율 계산(purprd, 구매금액 기준)
def purprd_amount_pv(col, period1, period2):
    for i in [period1, period2]:
        globals()[f'p{i}'] = pd.pivot_table(purprd.query(f'half=={i}'),
                                           index='고객번호',
                                           columns=col,
                                           values='구매금액',
                                           aggfunc='sum').fillna(0)
        
    variation = (globals()[f'p{period2}'] - globals()[f'p{period1}'])/globals()[f'p{period1}']*100
    return variation.replace({np.inf:100, np.nan:0})

# <font color=red>__dataset__</font>

# full_dataset

In [21]:
def make_dataset(p1, p2, p3):
    dataset = cust.set_index('고객번호')[['연령대','성별']].join([ # 연령대, 성별
        membership_count(), # membership 가입 개수                           
        membership_date(), # 최초 membership 가입년도                       
        app_count(), # app login 횟수                                        
        B_compuse_rate(), # B제휴사 경쟁사 이용률                            
        purpose_cat_amount(p1, p2), # 선매품, 편의품 구매금액 비중           
        major_cat_count(p1, p2), # 대분류별 구매횟수 비중(미용품,스포츠레저,패션잡화,의류,인테리어) 
        major_cat_amount(p1, p2), # 대분류별 구매금액 비중(가공식품,교육문화,기타,신선식품,일상용품)
        affiliate_count(p1, p2), # 제휴사별 구매횟수 비중
        affiliate_mount(p1, p2), # 제휴사별 구매금액 비중
        purprd_amount_pv('구매목적분류', p1, p2), # 편의품 구매금액 증감율
        purprd_amount_pv('대분류', p1, p2), # 대분류별 구매금액 증감율
        purprd_amount_pv('제휴사', p1, p2), # 제휴사별 구매금액 증감율
        get_label(p1, p3)]) # 종속변수
    dataset.fillna(0, inplace=True)
    return dataset

dataset1 = make_dataset(1,2,3) # train(train / vaild)
dataset2 = make_dataset(2,3,4) # test

In [22]:
# Labelencoder
from sklearn.preprocessing import LabelEncoder

le_cols = ['연령대','성별','가입년도']

for col in le_cols:
    globals()[f'le_{col}'] = LabelEncoder()
    dataset1[col] = globals()[f'le_{col}'].fit_transform(dataset1[col])
    dataset2[col] = globals()[f'le_{col}'].transform(dataset2[col])

In [None]:
# Category
# cat_cols = dataset1.columns[4:-7].tolist()

# def to_cat(df, col, n=6):
#     data = df[col].astype(float)
#     cat_data = pd.cut(data, n, labels=list(range(1, n+1)))
#     return cat_data

# for col in cat_cols:
#     dataset1[col] = to_cat(dataset1, col)
#     dataset2[col] = to_cat(dataset2, col)

In [None]:
# # Category(증감율 => 마이너스 / 플러스 나누고 카테고리)
# cat_plus_cols = ['편의품', '가공식품', '미용품', '스포츠레저', '의류', 'A']

# def to_cat_plus(df, col, n1=3, n2=3):
#     data = df[col]
    
#     data_minus = data[data<=0]
#     data_minus_cut = pd.cut(data_minus,n1,labels=list(range(1,n1+1)))

#     data_plus = data[0<data]
#     data_plus_cut = pd.cut(data_plus,n2,labels=list(range(n1+1,n1+n2+1)))
    
#     return pd.concat([data_minus_cut,data_plus_cut])

# for col in cat_plus_cols:
#     dataset1[col] = to_cat_plus(dataset1, col)
#     dataset2[col] = to_cat_plus(dataset2, col)

In [None]:
# 데이터 표준화 - Standardscaler
# from sklearn.preprocessing import StandardScaler

# for col in dataset1.drop('y',axis=1).columns:
#   globals()[f'sc_{col}'] = StandardScaler()
#   dataset1[col] = globals()[f'sc_{col}'].fit_transform(dataset1[[col]])
#   dataset2[col] = globals()[f'sc_{col}'].transform(dataset2[[col]])

In [23]:
# 데이터 표준화 - MinMaxScaler
from sklearn.preprocessing import MinMaxScaler

for col in dataset1.drop('y',axis=1).columns:
  globals()[f'mm_{col}'] = MinMaxScaler()
  dataset1[col] = globals()[f'mm_{col}'].fit_transform(dataset1[[col]])
  dataset2[col] = globals()[f'mm_{col}'].transform(dataset2[[col]])

In [None]:
# for col in dataset1.dtypes[dataset1.dtypes=='category'].index:
#     dataset1[col] = dataset1[col].astype(int)
#     dataset2[col] = dataset2[col].astype(int)

In [24]:
# 저장
dataset1.to_csv('./full_dataset1.csv')
dataset2.to_csv('./full_dataset2.csv')

# small_dataset

In [25]:
# 대분류별 구매횟수 비중
def major_cat_count(p1, p2):
    major_cat_count = pd.pivot_table(purprd.query(f'half==[{p1},{p2}]'),
                                      index='고객번호',
                                      columns='대분류',
                                      values='구매금액',
                                      aggfunc='count').fillna(0)

    to_rate(major_cat_count, 'count')
    return major_cat_count[['스포츠레저_count_rate','패션잡화_count_rate',\
                    '의류_count_rate','인테리어_count_rate']]

In [26]:
# 대분류별 구매금액 비중
def major_cat_amount(p1, p2):
    major_cat_amount = pd.pivot_table(purprd.query(f'half==[{p1},{p2}]'),
                                      index='고객번호',
                                      columns='대분류',
                                      values='구매금액',
                                      aggfunc='sum').fillna(0)

    to_rate(major_cat_amount, 'amount')
    return major_cat_amount[['가공식품_amount_rate','일상용품_amount_rate']]

In [27]:
# 제휴사별 구매횟수 비중
def affiliate_count(p1, p2):
    affiliate_count = pd.pivot_table(purprd.query(f'half==[{p1},{p2}]'),
                                      index='고객번호',
                                      columns='제휴사',
                                      values='구매금액',
                                      aggfunc='count').fillna(0)
    to_rate(affiliate_count, 'count')
#     return affiliate_count
    return affiliate_count['A_count_rate']

In [28]:
# 제휴사별 구매금액 비중
def affiliate_mount(p1, p2):
    affiliate_mount = pd.pivot_table(purprd.query(f'half==[{p1},{p2}]'),
                                      index='고객번호',
                                      columns='제휴사',
                                      values='구매금액',
                                      aggfunc='sum').fillna(0)
    to_rate(affiliate_mount, 'mount')
#     return affiliate_mount
    return affiliate_mount['A_mount_rate']

In [29]:
def make_dataset(p1, p2, p3):
    dataset = pd.DataFrame(get_label(p1, p3)).join([ # 종속변수
        B_compuse_rate(), # B제휴사 경쟁사 이용률                            
        purpose_cat_amount(p1, p2), # 선매품, 편의품 구매금액 비중           
        major_cat_count(p1, p2), # 대분류별 구매횟수 비중(스포츠레저,패션잡화,의류,인테리어) 
        major_cat_amount(p1, p2), # 대분류별 구매금액 비중(가공식품,일상용품)
        affiliate_count(p1, p2), # 제휴사별 구매횟수 비중(A)
        affiliate_mount(p1, p2), # 제휴사별 구매금액 비중(A)
        purprd_amount_pv('구매목적분류', p1, p2)[['편의품']], # 편의품 구매금액 증감율
        purprd_amount_pv('대분류', p1, p2)[['가공식품','미용품','스포츠레저','의류']], # 대분류별 구매금액 증감율
        purprd_amount_pv('제휴사', p1, p2)[['A']], # 제휴사별 구매금액 증감율
        purprd_amount_pv(None,p1, p2) # 구매금액 증감율
        ])
    dataset.fillna(0, inplace=True)
    return dataset

s_dataset1 = make_dataset(1,2,3)
s_dataset2 = make_dataset(2,3,4)

In [30]:
# 데이터 표준화 - MinMaxScaler
from sklearn.preprocessing import MinMaxScaler

for col in s_dataset1.drop('y',axis=1).columns:
  globals()[f'mm_{col}'] = MinMaxScaler()
  s_dataset1[col] = globals()[f'mm_{col}'].fit_transform(s_dataset1[[col]])
  s_dataset2[col] = globals()[f'mm_{col}'].transform(s_dataset2[[col]])

In [31]:
# 저장
s_dataset1.to_csv('./small_dataset1.csv')
s_dataset2.to_csv('./small_dataset2.csv')

# RandomForest기반 변수 선택 dataset

In [32]:
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

X_train, X_test, y_train, y_test = train_test_split(dataset1.drop('y',axis=1),
                                                    dataset1.y,
                                                    test_size=0.2,
                                                    random_state=1004)

f_select = SelectFromModel(RandomForestClassifier(n_estimators=300, random_state=0), threshold='median')
f_select.fit(X_train, y_train)
f_scores = pd.DataFrame()

f_scores['attribute'] = X_train.columns
f_scores['support'] = f_select.get_support()
f_scores[f_scores['support'] == True]

Unnamed: 0,attribute,support
6,선매품_amount_rate,True
7,편의품_amount_rate,True
8,미용품_count_rate,True
9,스포츠레저_count_rate,True
10,패션잡화_count_rate,True
11,의류_count_rate,True
13,가공식품_amount_rate,True
16,신선식품_amount_rate,True
17,일상용품_amount_rate,True
18,A_count_rate,True


In [39]:
selected_cols = f_scores[f_scores['support'] == True]['attribute'].values.tolist()
selected_cols.append('y')

sfm_dataset1 = dataset1[selected_cols]
sfm_dataset2 = dataset2[selected_cols]

In [43]:
sfm_dataset1.to_csv('./sfm_dataset1.csv')
sfm_dataset2.to_csv('./sfm_dataset2.csv')

# 전진선택법

출처 : https://todayisbetterthanyesterday.tistory.com/10

In [46]:
import statsmodels.api as sm

def processSubset(X,y,feature_set):
    model = sm.OLS(y,X[list(feature_set)]) # Modeling
    regr = model.fit() # model fitting
    AIC = regr.aic # model's AIC
    return {"model" : regr, "AIC" : AIC}


In [51]:
def forward(X,y,predictors):
	
    # predictor - 현재 선택되어있는 변수
    # 데이터 변수들이 미리정의된 predictors에 있는지 없는지 확인 및 분류
    
    remaining_predictors = [p for p in X.columns if p not in predictors]
    tic = time.time()
    results = []
    for p in remaining_predictors :
    	results.append(processSubset(X=X,y=y,feature_set=predictors+[p]))
    
    # 데이터프레임으로 변환
    models = pd.DataFrame(results)
    
    # AIC가 가장 낮은 것을 선택
    best_model = models.loc[models['AIC'].argmin()]
    toc = time.time()
    print("Processed ",models.shape[0], "models on", len(predictors)+1, "predictors in", (toc-tic))
    print("Selected predictors:",best_model["model"].model.exog_names,"AIC: ",best_model[0])
    return best_model
    
### 전진선택법 모델

def forward_model(X,y):

  Fmodels = pd.DataFrame(columns=["AIC","model"])
  tic = time.time()
  
  # 미리 정의된 데이터 변수
  predictors = []
  Fmodel_before = Fmodels["AIC"]
  # 변수 1~10개 : 0-9 -> 1-10
  for i in range(1,len(X.columns)+1):
    
    Forward_result = forward(X=X,y=y,predictors=predictors)
    if i > 1 :
      if Forward_result["AIC"] > Fmodel_before:
        break
    Fmodels.loc[i] = Forward_result
    predictors = Fmodels.loc[i]["model"].model.exog_names
    Fmodel_before = Fmodels.loc[i]["AIC"]
    predictors = [k for k in predictors if k != 'const']
  toc = time.time()
  print("Total elapsed time:",(toc-tic), "seconds.")
  
  return (Fmodels['model'][len(Fmodels['model'])])

In [52]:
X_train, X_test, y_train, y_test = train_test_split(dataset1.drop('y',axis=1),
                                                    dataset1.y,
                                                    test_size=0.2,
                                                    random_state=1004)

Forward_best_model = forward_model(X=X_train, y=y_train)

Processed  45 models on 1 predictors in 0.10240578651428223
Selected predictors: ['연령대'] AIC:  <statsmodels.regression.linear_model.RegressionResultsWrapper object at 0x7f814c9a01d0>
Processed  44 models on 2 predictors in 0.17083263397216797
Selected predictors: ['연령대', '선매품_amount_rate'] AIC:  <statsmodels.regression.linear_model.RegressionResultsWrapper object at 0x7f814cf2a050>
Processed  43 models on 3 predictors in 0.2057046890258789
Selected predictors: ['연령대', '선매품_amount_rate', '편의품_amount_rate'] AIC:  <statsmodels.regression.linear_model.RegressionResultsWrapper object at 0x7f814cf1f710>
Processed  42 models on 4 predictors in 0.24411892890930176
Selected predictors: ['연령대', '선매품_amount_rate', '편의품_amount_rate', '신선식품'] AIC:  <statsmodels.regression.linear_model.RegressionResultsWrapper object at 0x7f814cf4f510>
Processed  41 models on 5 predictors in 0.2626817226409912
Selected predictors: ['연령대', '선매품_amount_rate', '편의품_amount_rate', '신선식품', '편의품'] AIC:  <statsmodels.regres

In [53]:
Forward_best_model.aic

21521.474700989827

In [76]:
forward_summary = Forward_best_model.summary().tables[1].as_html()
forward_summary = pd.read_html(forward_summary, header=0, index_col=0)[0]
f_cols = forward_summary.index.tolist()
f_cols.append('y')

In [77]:
f_dataset1 = dataset1[f_cols]
f_dataset2 = dataset2[f_cols]

In [79]:
f_dataset1.to_csv('./f_dataset1.csv')
f_dataset2.to_csv('./f_dataset2.csv')

# 후진소거법

In [62]:
import itertools 

def backward(X,y,predictors):
    tic = time.time()
    results = []
    
    # 데이터 변수들이 미리 정의된 predictors 조합 확인
    
    for combo in itertools.combinations(predictors, len(predictors) - 1):
    	results.append(processSubset(X=X,y=y,feature_set=list(combo)))
    models = pd.DataFrame(results)
    
    # 가장 낮은 AIC를 가진 모델을 선택
    best_model = models.loc[models['AIC'].argmin()]
    toc = time.time()
    
    print("Processed ",models.shape[0], "models on", len(predictors) - 1, "predictors in",(toc-tic))
    print("Selected predictors:",best_model['model'].model.exog_names,' AIC:',best_model[0])
    return best_model
    
def backward_model(X,y) :
    Bmodels = pd.DataFrame(columns=["AIC","model"], index = range(1,len(X.columns)))
    tic = time.time()
    predictors = X.columns
    Bmodel_before = processSubset(X,y,predictors)['AIC']
    while (len(predictors) > 1):
      Backward_result = backward(X=X_train, y=y_train, predictors=predictors)
      if Backward_result['AIC'] > Bmodel_before:
        break
      Bmodels.loc[len(predictors) -1] = Backward_result
      predictors = Bmodels.loc[len(predictors) - 1]['model'].model.exog_names
      Bmodel_before = Backward_result["AIC"]
      predictors = [k for k in predictors if k != 'const']
    
    toc = time.time()
    print("Total elapsed time:",(toc-tic),"seconds.")
    return (Bmodels["model"].dropna().iloc[0])

In [63]:
Backward_best_model = backward_model(X=X_train, y=y_train)

Processed  45 models on 44 predictors in 3.48599910736084
Selected predictors: ['연령대', '성별', '가입개수', '가입년도', 'APP로그인횟수', 'c_B_rate', '선매품_amount_rate', '편의품_amount_rate', '미용품_count_rate', '패션잡화_count_rate', '의류_count_rate', '인테리어_count_rate', '가공식품_amount_rate', '교육문화_amount_rate', '기타_amount_rate', '신선식품_amount_rate', '일상용품_amount_rate', 'A_count_rate', 'B_count_rate', 'C_count_rate', 'D_count_rate', 'A_mount_rate', 'B_mount_rate', 'C_mount_rate', 'D_mount_rate', '선매품', '전문품', '편의품', '가공식품', '교육문화', '기타', '디지털', '미용품', '스포츠레저', '신선식품', '의류', '의약', '인테리어', '일상용품', '패션잡화', 'A', 'B', 'C', 'D']  AIC: <statsmodels.regression.linear_model.RegressionResultsWrapper object at 0x7f8164a537d0>
Processed  44 models on 43 predictors in 2.49800705909729
Selected predictors: ['연령대', '성별', '가입개수', '가입년도', 'APP로그인횟수', 'c_B_rate', '선매품_amount_rate', '편의품_amount_rate', '미용품_count_rate', '패션잡화_count_rate', '의류_count_rate', '가공식품_amount_rate', '교육문화_amount_rate', '기타_amount_rate', '신선식품_amount_rate', '일상

In [64]:
Backward_best_model.aic

21517.60228823617

In [80]:
backward_summary = Backward_best_model.summary().tables[1].as_html()
backward_summary = pd.read_html(backward_summary, header=0, index_col=0)[0]
b_cols = backward_summary.index.tolist()
b_cols.append('y')

In [81]:
b_dataset1 = dataset1[b_cols]
b_dataset2 = dataset2[b_cols]

In [82]:
b_dataset1.to_csv('./b_dataset1.csv')
b_dataset2.to_csv('./b_dataset2.csv')

# <font color=red>__아노바분석🤖__</font>

In [None]:
# # anova pvalue 함수👻
# from scipy import stats

# def anova_test(dataset):
#     num = 1
#     data = dataset.drop('y',axis=1)
#     target = dataset.y
#     data = data.join(target)
    
#     for n in range(len(data.columns[:-1])):
#         grps = [data[data.columns[-1]].tolist() for _, data in data.groupby(data.columns[n])]        
#         F, p = stats.f_oneway(*grps)
#         if p >= 0.05:
#             print(num, data.columns[n],':', round(p,3),'무의미')
#         elif p < 0.05:
#             print(num, data.columns[n],':', round(p,3),'😊')
#         num += 1
            
# anova_test(dataset1)