In [None]:
import gc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns               # 数据可视化包

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Normalizer
from sklearn.base import TransformerMixin
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

plt.style.use('fivethirtyeight')    # matplotlib style sheet used for the plots

# Seed NumPy's global random generator so NumPy-based randomness repeats across runs
np.random.seed(7)

# Binning
class CustomCutter(TransformerMixin):
    """Stateless transformer that bins DataFrame columns with ``pd.cut``."""

    def __init__(self):
        pass

    def transform(self, X, cols, bins=5, labels=False):
        """Return a copy of ``X`` whose ``cols`` are replaced by integer bin values.

        Args:
            X: DataFrame holding the columns to bin. It is not modified.
            cols: Iterable of column names to bin.
            bins: Forwarded to ``pd.cut`` (default 5).
            labels: Forwarded to ``pd.cut``. With the default ``False``,
                ``pd.cut`` yields integer bin codes.

        Returns:
            A new DataFrame; every column in ``cols`` holds the ``pd.cut``
            result cast with ``astype(int)``.

        Note:
            The ``astype(int)`` cast is applied to whatever ``pd.cut`` returns,
            so a result that cannot be converted to int makes the call raise.
        """
        # Work on a copy: assigning into the caller's frame would silently
        # overwrite the raw values the caller may still need, and it is
        # inconsistent with CustomDummifier, which returns a new frame.
        binned = X.copy()
        for col in cols:
            binned[col] = pd.cut(binned[col], bins=bins, labels=labels).astype(int)
        return binned

    def fit(self, *_):
        """Nothing to learn; accepts and ignores any arguments and returns ``self``."""
        return self

# Dummy (one-hot) encoding
class CustomDummifier(TransformerMixin):
    """Stateless transformer that one-hot encodes DataFrame columns via ``pd.get_dummies``."""

    def __init__(self):
        """No configuration is stored; the columns are supplied at transform time."""

    def fit(self, *_):
        """Nothing to learn; accepts and ignores any arguments and returns ``self``."""
        return self

    def transform(self, X, cols=None):
        """Return ``pd.get_dummies(X, columns=cols)``.

        Args:
            X: DataFrame to encode.
            cols: Column names to encode, passed as ``columns=`` to
                ``pd.get_dummies``; ``None`` leaves the choice to pandas.
        """
        encoded = pd.get_dummies(X, columns=cols)
        return encoded

# Module-level transformer instances.
# `dumifier` and `min_max_scaler` are used by the feature-engineering cells below;
# `cutter` and `normalizer` are not used by any active code in the visible cells,
# and `poly2` is referenced only from commented-out code.
dumifier = CustomDummifier()
cutter = CustomCutter()
min_max_scaler = MinMaxScaler()
normalizer = Normalizer(copy=True, norm='l2')
# Degree-2 interaction terms only (no squared terms, no bias column)
poly2 = PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)

In [None]:
# Load the training set.
# `header=0` together with `names=` replaces the file's header line with our own
# column names and keeps every data row. Using `skiprows=1` with pandas' default
# header inference instead would skip the header line and then consume the FIRST
# DATA ROW as the header, which is silently lost once the names are overwritten.
# NOTE(review): assumes case2_training.csv has exactly one header line — confirm.
x_y = pd.read_csv(
    'case2_training.csv',
    delimiter=',',
    header=0,
    names=['ID', 'Region', 'Date', 'Weekday', 'Apartment', 'Beds', 'Review', 'Pic Quality', 'Price', 'Accept'],
)
# The row identifier carries no predictive information
x_y = x_y.drop(columns=['ID'])
x_y.head()

#### 1. 探索性数据分析
|变量类型|字段|
|:----:|:------:|
|无序分类变量|Region、Date、Weekday、Apartment|
|整型|Beds|
|浮点|Review、Pic Quality、Price|

In [None]:
# Column names, dtypes and non-null counts
x_y.info()

In [None]:
# Number of missing values in each column
x_y.isnull().sum()

In [None]:
# Summary statistics (pandas describes the numeric columns by default)
x_y.describe()

#### 2. 数据处理(特征构造)

##### 2.1 时间特征

In [None]:
# Weekend flag: 1 when Weekday is 6 or 7, otherwise 0.
# NOTE(review): presumes Weekday is coded 1-7 with 6/7 = Saturday/Sunday — confirm against the data.
x_y['is_weekend'] = x_y['Weekday'].isin([6, 7]).astype('int64')

# Week of the year, 1-based.
# Date is treated as a 1-based day of the year, the same convention that
# get_month_no / get_season_no use (days 1..31 -> month 1). Subtracting 1 before
# the floor division makes days 1-7 week 1; `Date // 7 + 1` would put day 7 into
# week 2 and leave week 1 with only six days.
x_y['week_no'] = (x_y['Date'] - 1) // 7 + 1

# 一年中的第几个月
# Cumulative day-of-year at the end of each month. The leading 0 is a sentinel so
# that the index of the first boundary >= day equals the 1-based month number.
_MONTH_END_DAYS_LEAP = (0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366)
_MONTH_END_DAYS_COMMON = (0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365)

def get_month_no(day, leap_year=True):
    """Map a 1-based day of the year to its 1-based month number.

    Args:
        day: Day of the year (1 = January 1st).
        leap_year: When True (the default, and the calendar this function has
            always used) February has 29 days; when False it has 28.

    Returns:
        The month number 1-12; 0 when ``day`` is <= 0; ``None`` when ``day``
        is beyond the last day of the year or compares False against every
        boundary (e.g. NaN).
    """
    month_ends = _MONTH_END_DAYS_LEAP if leap_year else _MONTH_END_DAYS_COMMON
    for month_no, last_day in enumerate(month_ends):
        if day <= last_day:
            return month_no
    # Out of range: kept as an explicit None for backward compatibility.
    return None
# Element-wise month lookup for every Date value
x_y['month_no'] = x_y['Date'].apply(get_month_no)

# 一年中的第几个季度
# Cumulative day-of-year at the end of each quarter. The leading 0 is a sentinel so
# that the index of the first boundary >= day equals the 1-based quarter number.
_QUARTER_END_DAYS_LEAP = (0, 91, 182, 274, 366)
_QUARTER_END_DAYS_COMMON = (0, 90, 181, 273, 365)

def get_season_no(day, leap_year=True):
    """Map a 1-based day of the year to its 1-based quarter number.

    Args:
        day: Day of the year (1 = January 1st).
        leap_year: When True (the default, and the calendar this function has
            always used) the year has 366 days; when False it has 365.

    Returns:
        The quarter number 1-4; 0 when ``day`` is <= 0; ``None`` when ``day``
        is beyond the last day of the year or compares False against every
        boundary (e.g. NaN).
    """
    quarter_ends = _QUARTER_END_DAYS_LEAP if leap_year else _QUARTER_END_DAYS_COMMON
    for season_no, last_day in enumerate(quarter_ends):
        if day <= last_day:
            return season_no
    # Out of range: kept as an explicit None for backward compatibility.
    return None
x_y['season_no'] = x_y['Date'].apply(get_season_no)

# Year-start flag: 1 for months 1 and 2, otherwise 0
x_y['is_year_start'] = x_y['month_no'].isin([1, 2]).astype('int64')

# Year-end flag: 1 for months 11 and 12, otherwise 0
x_y['is_year_end'] = x_y['month_no'].isin([11, 12]).astype('int64')

# One-hot encode the calendar categoricals, then discard the raw Date column
x_y = pd.get_dummies(
    x_y,
    columns=['Weekday', 'week_no', 'month_no', 'season_no'],
).drop(columns=['Date'])

##### 2.2 统计聚合特征


##### 2.2 无序分类变量

In [None]:
# One-hot encode the categorical columns.
# NOTE(review): the variable table above lists Apartment as an unordered categorical
# and Review as a float, yet Apartment is not encoded here while Review is — confirm
# that this column choice is intended.
x_y = dumifier.transform(
    x_y,
    cols=['Region', 'Beds', 'Review'],
)


##### 2.3 数值型变量

In [None]:
# Min-max scale the numeric columns, printing each column's range before and after.
# NOTE(review): the scaler is fitted on the whole data set, i.e. before the
# train/test split in the modelling cell, so held-out rows influence the scaling —
# consider fitting it inside the Pipeline instead.
for col in ['Price']:
    column_2d = x_y[[col]].to_numpy()          # shape (n_rows, 1), as the scaler expects
    print(col, min(x_y[col]), max(x_y[col]))
    x_y[col] = min_max_scaler.fit_transform(column_2d)
    print(f'{col}-01', min(x_y[col]), max(x_y[col]))
    print()

# Interaction features (disabled)
# y = x_y['Accept']
# x = poly2.fit_transform(x_y.drop(columns=['Accept']))
# x = pd.DataFrame(x, columns=poly2.get_feature_names())
# x_y = pd.concat([x, y], axis=1)

x_y

#### 3. 特征选择

In [None]:
# 用 Seaborn 生成热图
# sns.heatmap(x_y.corr())

In [None]:
# 只有特征和响应的相关性
# x_y.corr()['Accept']

In [None]:
# MASK: False:-0.2～0.2、True:超过正负 0.2（注意：下一行代码实际使用的阈值是 0.0）。
# mask = x_y.corr()['Accept'].abs() > 0.0
# mask

In [None]:
# x_y = x_y[x_y.columns[mask]]
# The mask-based column filter above is commented out, so x_y is displayed unfiltered
x_y

#### 4. 模型训练

In [None]:
# Split features from the target and show the class balance
x, y = x_y.drop(columns=['Accept']), x_y['Accept']
print(x.shape, y.shape)
print(x_y['Accept'].value_counts(normalize=True))

# Hold out 25% of the rows for the final evaluation
train_x, test_x, train_y, test_y = \
    train_test_split(x, y, test_size=0.25, random_state=7)

# Hyper-parameter grid; keys follow the '<pipeline step>__<parameter>' convention
pipe_params = {
    'classify__C': [2],
    # 'classify__C': [1,2,3,4],
    # 'classify__penalty': ['l1', 'l2']
}

pipe = Pipeline([
    # Classifier
    ('classify', LogisticRegression(solver='liblinear'))
])

# Scoring metric and cross-validation setup (4 folds, selected on accuracy)
# scoring = {'AUC': 'roc_auc', 'Accuracy': 'accuracy'}
# grid = GridSearchCV(pipe, pipe_params, cv=4, scoring=scoring, refit='AUC')
grid = GridSearchCV(pipe, pipe_params, cv=4, scoring='accuracy')

grid.fit(train_x, train_y)
print(grid.best_score_, grid.best_params_)
print(grid.cv_results_)

# baseline
# grid = LogisticRegression(C=2)
# grid.fit(train_x, train_y)

# Accuracy on the held-out rows, from the hard class predictions
pred_y = grid.predict(test_x)
# NOTE(review): val_y is not read anywhere in the visible cells; kept in case a
# later cell depends on it.
if isinstance(test_y, pd.Series):
    val_y = test_y.to_numpy()
print(f'精确度：{metrics.accuracy_score(test_y, pred_y)}')

# ROC/AUC needs continuous scores: feeding the hard 0/1 predictions collapses the
# curve to a single operating point and reports that threshold's AUC instead of
# the model's ranking quality. Column 1 of predict_proba is the probability of the
# second class in `classes_`.
# NOTE(review): presumes Accept is coded 0/1 so that this is the positive class — confirm.
score_y = grid.predict_proba(test_x)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(test_y, score_y)
print(f'auc：{metrics.auc(fpr, tpr)}')
