In [1]:
from sklearn.model_selection import KFold,StratifiedKFold
import pandas as pd
import numpy as np
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

In [2]:
train = pd.read_csv('./titanic/train.csv')
test = pd.read_csv('./titanic/test.csv')

In [3]:
train['Age'] = train['Age'].fillna(train['Age'].mode())
test['Age'] = test['Age'].fillna(test['Age'].mode())

train['Embarked'] = train['Embarked'].fillna(train['Embarked'].mode())

train.rename(columns={'Survived':'label'},inplace=True)
test.rename(columns={'Survived':'label'},inplace=True)

train = train.drop(['Name','Ticket'],axis=1)
test = test.drop(['Name','Ticket'],axis=1)

In [4]:
train.head()

Unnamed: 0,PassengerId,label,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22.0,1,0,7.25,S
1,2,1,1,female,38.0,1,0,71.2833,C
2,3,1,3,female,26.0,0,0,7.925,S
3,4,1,1,female,35.0,1,0,53.1,S
4,5,0,3,male,35.0,0,0,8.05,S


In [5]:
train.columns

Index(['PassengerId', 'label', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch',
       'Fare', 'Embarked'],
      dtype='object')

In [6]:
#疑问：分组统计特征，要在train和test中单独求吗？

In [7]:
# 目标编码
# https://mp.weixin.qq.com/s/taMj-x-qLz5sw-7zld5BmA
# target encoding，可以理解为转化率，就是聚合id后对label求mean、sum、count(lable=1)/count(all)，
# 这里需要考虑到数据穿越造成线下严重过拟合的情况，一般需要通过交叉平滑处理，如划分为5折，用其中4折去统计剩下的一折，
# 然后对过大或过小的进行平滑，对测试集直接用训练集来做统计，若数据集有时间因素存在，则需要根据时间顺序来处理；
def kfold_mean(df_train, df_test, target, target_mean_list):
    folds = StratifiedKFold(n_splits=5)

    mean_of_target = df_train[target].mean()

    for fold_, (trn_idx, val_idx) in tqdm(enumerate(folds.split(df_train, y=df_train[target]))):
        tr_x = df_train.iloc[trn_idx, :]
        vl_x = df_train.iloc[val_idx, :]

        for col in target_mean_list:
            df_train.loc[vl_x.index, f'{col}_target_enc'] = vl_x[col].map(tr_x.groupby(col)[target].mean())

    for col in target_mean_list:
        df_train[f'{col}_target_enc'].fillna(mean_of_target, inplace=True)

        df_test[f'{col}_target_enc'] = df_test[col].map(df_train.groupby(col)[f'{col}_target_enc'].mean())

        df_test[f'{col}_target_enc'].fillna(mean_of_target, inplace=True)
    return pd.concat([df_train, df_test], ignore_index=True)

feature_list =  ['Sex','Embarked']
data = pd.concat([train, test], ignore_index=True)
data = kfold_mean(data[~data['label'].isna()], data[data['label'].isna()],'label',feature_list)

data.head()

5it [00:00, 333.45it/s]


Unnamed: 0,PassengerId,label,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Sex_target_enc,Embarked_target_enc
0,1,0.0,3,male,22.0,1,0,7.25,S,0.195279,0.335271
1,2,1.0,1,female,38.0,1,0,71.2833,C,0.739837,0.589552
2,3,1.0,3,female,26.0,0,0,7.925,S,0.739837,0.335271
3,4,1.0,1,female,35.0,1,0,53.1,S,0.739837,0.335271
4,5,0.0,3,male,35.0,0,0,8.05,S,0.195279,0.335271


In [8]:
#数值交叉特征
countfea = ['Fare','Pclass','Age']
#数值特征与数值特征之间做加减乘除
for f1 in tqdm(countfea, desc="count_feas 基本交叉特征"):
    for f2 in countfea:
        if f1 != f2:
            data['{}_{}_ratio'.format(f1, f2)] = data[f1].values / data[f2].values
            data['{}_{}_multi'.format(f1, f2)] = data[f1].values * data[f2].values
            data['{}_{}_add'.format(f1, f2)] = data[f1].values + data[f2].values
            data['{}_{}_diff'.format(f1, f2)] = data[f1].values - data[f2].values
data.head()

count_feas 基本交叉特征: 100%|██████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 499.26it/s]


Unnamed: 0,PassengerId,label,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Sex_target_enc,...,Pclass_Age_add,Pclass_Age_diff,Age_Fare_ratio,Age_Fare_multi,Age_Fare_add,Age_Fare_diff,Age_Pclass_ratio,Age_Pclass_multi,Age_Pclass_add,Age_Pclass_diff
0,1,0.0,3,male,22.0,1,0,7.25,S,0.195279,...,25.0,-19.0,3.034483,159.5,29.25,14.75,7.333333,66.0,25.0,19.0
1,2,1.0,1,female,38.0,1,0,71.2833,C,0.739837,...,39.0,-37.0,0.533084,2708.7654,109.2833,-33.2833,38.0,38.0,39.0,37.0
2,3,1.0,3,female,26.0,0,0,7.925,S,0.739837,...,29.0,-23.0,3.280757,206.05,33.925,18.075,8.666667,78.0,29.0,23.0
3,4,1.0,1,female,35.0,1,0,53.1,S,0.739837,...,36.0,-34.0,0.659134,1858.5,88.1,-18.1,35.0,35.0,36.0,34.0
4,5,0.0,3,male,35.0,0,0,8.05,S,0.195279,...,38.0,-32.0,4.347826,281.75,43.05,26.95,11.666667,105.0,38.0,32.0


In [9]:
#类别特征与数值特征之间
catefea = ['Sex','Embarked']
countfea = ['Fare','Age']
#例如男性购买的船票票价均值、最大值、最小值中位数。
for i in tqdm(countfea,desc="count_feas 基本聚合特征"):
    for j in catefea:
        data['{}_{}_mean'.format(i,j)] = data.groupby(j)[i].transform('mean')
        data['{}_{}_median'.format(i,j)] = data.groupby(j)[i].transform('median')
        data['{}_{}_max'.format(i,j)] = data.groupby(j)[i].transform('max')
        data['{}_{}_min'.format(i,j)] = data.groupby(j)[i].transform('min')
        data['{}_{}_std'.format(i,j)] = data.groupby(j)[i].transform('std')
        
data.head()

count_feas 基本聚合特征: 100%|██████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 111.11it/s]


Unnamed: 0,PassengerId,label,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Sex_target_enc,...,Age_Sex_mean,Age_Sex_median,Age_Sex_max,Age_Sex_min,Age_Sex_std,Age_Embarked_mean,Age_Embarked_median,Age_Embarked_max,Age_Embarked_min,Age_Embarked_std
0,1,0.0,3,male,22.0,1,0,7.25,S,0.195279,...,30.585228,28.0,80.0,0.33,14.280581,29.245205,28.0,80.0,0.17,14.047507
1,2,1.0,1,female,38.0,1,0,71.2833,C,0.739837,...,28.687088,27.0,76.0,0.17,14.576962,32.33217,30.0,71.0,0.42,15.258092
2,3,1.0,3,female,26.0,0,0,7.925,S,0.739837,...,28.687088,27.0,76.0,0.17,14.576962,29.245205,28.0,80.0,0.17,14.047507
3,4,1.0,1,female,35.0,1,0,53.1,S,0.739837,...,28.687088,27.0,76.0,0.17,14.576962,29.245205,28.0,80.0,0.17,14.047507
4,5,0.0,3,male,35.0,0,0,8.05,S,0.195279,...,30.585228,28.0,80.0,0.33,14.280581,29.245205,28.0,80.0,0.17,14.047507


In [10]:
#偏离值特征,顾名思义，偏离均值、最大值、最小值的误差值
catefea = ['Sex','Embarked']
countfea = ['Fare','Age']                 
for group in catefea:
    for feature in countfea:
        tmp = data.groupby(group)[feature].agg([sum, min, max, np.mean]).reset_index()
        tmp = pd.merge(data, tmp, on=group, how='left')
        data['{}-mean_gb_{}'.format(feature, group)] = data[feature] - tmp['mean']
        data['{}-min_gb_{}'.format(feature, group)] = data[feature] - tmp['min']
        data['{}-max_gb_{}'.format(feature, group)] = data[feature] - tmp['max']
        data['{}/sum_gb_{}'.format(feature, group)] = data[feature] / tmp['sum']
data.head()

Unnamed: 0,PassengerId,label,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Sex_target_enc,...,Age-max_gb_Sex,Age/sum_gb_Sex,Fare-mean_gb_Embarked,Fare-min_gb_Embarked,Fare-max_gb_Embarked,Fare/sum_gb_Embarked,Age-mean_gb_Embarked,Age-min_gb_Embarked,Age-max_gb_Embarked,Age/sum_gb_Embarked
0,1,0.0,3,male,22.0,1,0,7.25,S,0.195279,...,-58.0,0.001093,-20.168824,7.25,-255.75,0.00029,-7.245205,21.83,-58.0,0.000962
1,2,1.0,1,female,38.0,1,0,71.2833,C,0.739837,...,-38.0,0.003414,8.947033,67.2708,-441.0459,0.004235,5.66783,37.58,-33.0,0.005544
2,3,1.0,3,female,26.0,0,0,7.925,S,0.739837,...,-50.0,0.002336,-19.493824,7.925,-255.075,0.000317,-3.245205,25.83,-54.0,0.001137
3,4,1.0,1,female,35.0,1,0,53.1,S,0.739837,...,-41.0,0.003144,25.681176,53.1,-209.9,0.002121,5.754795,34.83,-45.0,0.00153
4,5,0.0,3,male,35.0,0,0,8.05,S,0.195279,...,-45.0,0.001739,-19.368824,8.05,-254.95,0.000322,5.754795,34.83,-45.0,0.00153


In [11]:
# 特征unique count特征，针对类别型特征
# for index, col1 in enumerate(['age', 'province', 'city', 'model']):
#     for col2 in ['age', 'province', 'city', 'model'][index:]:
#         data['{}_in_{}_count'.format(col1, col2)] = data.groupby(col1)[col2].transform('count')
#         data['{}_in_{}_nunique'.format(col1, col2)] = data.groupby(col1)[col2].transform('nunique')
#         data['{}_in_{}_nunique/{}_in_{}_count'.format(col1, col2, col1, col2)] = data['{}_in_{}_nunique'.format(col1,col2)] /data['{}_in_{}_count'.format(col1,col2)]

#         data['{}_in_{}_count'.format(col2, col1)] = data.groupby(col2)[col1].transform('count')
#         data['{}_in_{}_nunique'.format(col2, col1)] = data.groupby(col2)[col1].transform('nunique')
#         data['{}_in_{}_nunique/{}_in_{}_count'.format(col2, col1, col2, col1)] = data['{}_in_{}_nunique'.format(col2,col1)] / data['{}_in_{}_count'.format(col2, col1)]


In [12]:
#类别与类别特征逻辑上的交叉
#例如：即是男性，pclass又是1的样本；值是布尔型，非0即1。
def cross_two(base_info,name_1,name_2):
    new_col=[]
    encode=0
    dic={}
    val_1=base_info[name_1]
    val_2=base_info[name_2]
    for i in tqdm(range(len(val_1))):
        tmp=str(val_1[i])+'_'+str(val_2[i])
        if tmp in dic:
            new_col.append(dic[tmp])
        else:
            dic[tmp]=encode
            new_col.append(encode)
            encode+=1
    return new_col

new_col=cross_two(data,'Pclass','Sex')#作企业类型-小类的交叉特征
data['Pclass_Sex']=new_col
data.head()

100%|██████████████████████████████████████████████████████████████████████████| 1309/1309 [00:00<00:00, 163549.12it/s]


Unnamed: 0,PassengerId,label,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Sex_target_enc,...,Age/sum_gb_Sex,Fare-mean_gb_Embarked,Fare-min_gb_Embarked,Fare-max_gb_Embarked,Fare/sum_gb_Embarked,Age-mean_gb_Embarked,Age-min_gb_Embarked,Age-max_gb_Embarked,Age/sum_gb_Embarked,Pclass_Sex
0,1,0.0,3,male,22.0,1,0,7.25,S,0.195279,...,0.001093,-20.168824,7.25,-255.75,0.00029,-7.245205,21.83,-58.0,0.000962,0
1,2,1.0,1,female,38.0,1,0,71.2833,C,0.739837,...,0.003414,8.947033,67.2708,-441.0459,0.004235,5.66783,37.58,-33.0,0.005544,1
2,3,1.0,3,female,26.0,0,0,7.925,S,0.739837,...,0.002336,-19.493824,7.925,-255.075,0.000317,-3.245205,25.83,-54.0,0.001137,2
3,4,1.0,1,female,35.0,1,0,53.1,S,0.739837,...,0.003144,25.681176,53.1,-209.9,0.002121,5.754795,34.83,-45.0,0.00153,1
4,5,0.0,3,male,35.0,0,0,8.05,S,0.195279,...,0.001739,-19.368824,8.05,-254.95,0.000322,5.754795,34.83,-45.0,0.00153,0


In [13]:
# 频数统计,计算类别特征每一类的个数，例如 男性的count是843，那男性对应的衍生特征值是843
cat_col = ['Sex','Embarked','Pclass']
for col in cat_col:
    data[col + '_COUNT'] = data[col].map(data[col].value_counts())
    col_idx = data[col].value_counts()

data.head()

Unnamed: 0,PassengerId,label,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Sex_target_enc,...,Fare-max_gb_Embarked,Fare/sum_gb_Embarked,Age-mean_gb_Embarked,Age-min_gb_Embarked,Age-max_gb_Embarked,Age/sum_gb_Embarked,Pclass_Sex,Sex_COUNT,Embarked_COUNT,Pclass_COUNT
0,1,0.0,3,male,22.0,1,0,7.25,S,0.195279,...,-255.75,0.00029,-7.245205,21.83,-58.0,0.000962,0,843,914.0,709
1,2,1.0,1,female,38.0,1,0,71.2833,C,0.739837,...,-441.0459,0.004235,5.66783,37.58,-33.0,0.005544,1,466,270.0,323
2,3,1.0,3,female,26.0,0,0,7.925,S,0.739837,...,-255.075,0.000317,-3.245205,25.83,-54.0,0.001137,2,466,914.0,709
3,4,1.0,1,female,35.0,1,0,53.1,S,0.739837,...,-209.9,0.002121,5.754795,34.83,-45.0,0.00153,1,466,914.0,323
4,5,0.0,3,male,35.0,0,0,8.05,S,0.195279,...,-254.95,0.000322,5.754795,34.83,-45.0,0.00153,0,843,914.0,709


In [14]:
#分箱，等频、等宽
def bucket(data,count_feas,bucket_len):
    #通过除法映射到间隔均匀的分箱中，每个分箱的取值范围都是Age/10,相当于宽度10等宽分箱
    for fea in tqdm(count_feas,desc='分箱特征'):
        data['{}_bin'.format(fea)] = np.floor_divide(data[fea], bucket_len)

count_feas = ['Age']
bucket(data,count_feas,5)
data.head()

分箱特征: 100%|█████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 996.75it/s]


Unnamed: 0,PassengerId,label,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Sex_target_enc,...,Fare/sum_gb_Embarked,Age-mean_gb_Embarked,Age-min_gb_Embarked,Age-max_gb_Embarked,Age/sum_gb_Embarked,Pclass_Sex,Sex_COUNT,Embarked_COUNT,Pclass_COUNT,Age_bin
0,1,0.0,3,male,22.0,1,0,7.25,S,0.195279,...,0.00029,-7.245205,21.83,-58.0,0.000962,0,843,914.0,709,4.0
1,2,1.0,1,female,38.0,1,0,71.2833,C,0.739837,...,0.004235,5.66783,37.58,-33.0,0.005544,1,466,270.0,323,7.0
2,3,1.0,3,female,26.0,0,0,7.925,S,0.739837,...,0.000317,-3.245205,25.83,-54.0,0.001137,2,466,914.0,709,5.0
3,4,1.0,1,female,35.0,1,0,53.1,S,0.739837,...,0.002121,5.754795,34.83,-45.0,0.00153,1,466,914.0,323,7.0
4,5,0.0,3,male,35.0,0,0,8.05,S,0.195279,...,0.000322,5.754795,34.83,-45.0,0.00153,0,843,914.0,709,7.0


In [15]:
# 基本的类别特征转换方法label_encode
from sklearn.preprocessing import LabelEncoder
for col in tqdm(['Sex']):
    le = LabelEncoder()
    le.fit(data[col])
    data[col+'_label_encoder'] = le.transform(data[col])
    # test[col] = le.transform(test[col])
data.head()


100%|██████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 1001.27it/s]


Unnamed: 0,PassengerId,label,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Sex_target_enc,...,Age-mean_gb_Embarked,Age-min_gb_Embarked,Age-max_gb_Embarked,Age/sum_gb_Embarked,Pclass_Sex,Sex_COUNT,Embarked_COUNT,Pclass_COUNT,Age_bin,Sex_label_encoder
0,1,0.0,3,male,22.0,1,0,7.25,S,0.195279,...,-7.245205,21.83,-58.0,0.000962,0,843,914.0,709,4.0,1
1,2,1.0,1,female,38.0,1,0,71.2833,C,0.739837,...,5.66783,37.58,-33.0,0.005544,1,466,270.0,323,7.0,0
2,3,1.0,3,female,26.0,0,0,7.925,S,0.739837,...,-3.245205,25.83,-54.0,0.001137,2,466,914.0,709,5.0,0
3,4,1.0,1,female,35.0,1,0,53.1,S,0.739837,...,5.754795,34.83,-45.0,0.00153,1,466,914.0,323,7.0,0
4,5,0.0,3,male,35.0,0,0,8.05,S,0.195279,...,5.754795,34.83,-45.0,0.00153,0,843,914.0,709,7.0,1


In [None]:
#构建序列特征，例如构建每个用户的登录行为序列
#例如一个用户会存在多个样本，对用户groupby获取登录序列，拼起来作为新特征
launch_grp = pd.DataFrame()

user_id = []
launch_date_str = []
for i in launch.groupby('user_id'):
    launch_date = []
    user_id.append(i[0])
    for j in i[1]['date']:
        launch_date.append(j)
    launch_date_str.append(str(launch_date))
launch_grp['user_id'] = list(user_id)
launch_grp['launch_date_str'] = list(launch_date_str)
launch_grp.head()

In [None]:
#对上述序列或文本特征进行w2v，构建embedding特征
from gensim.models.word2vec import Word2Vec

data['tagid'] = data['tagid'].apply(lambda x: eval(x))
sentences = data['tagid'].values.tolist()
for i in range(len(sentences)):
    sentences[i] = [str(x) for x in sentences[i]]   #将每个tagid转化成str格式

#训练数据格式如下
#sentences=[['外形', '外观', '好看', '屏幕', '特别'], ['手机', '好看', '段时间'], ['手机', '很漂亮', '评价']]

emb_size = 32
#model = Word2Vec(sentences,vector_size=emb_size, window=6, min_count=5, sg=0, hs=0, seed=1,epochs=5)
model = Word2Vec.load('./w2vmodel/w2vmodel.model')
emb_matrix = []
for seq in sentences:
    vec = []
    for w in seq:
#         if w in model.wv.vocab:
#             vec.append(model.wv[w])
        try:
            vec.append(model.wv[w])
        except KeyError:
                continue
                
    if len(vec) > 0:
        emb_matrix.append(np.mean(vec, axis=0))
    else:
        emb_matrix.append([0] * emb_size)
emb_matrix = np.array(emb_matrix)
for i in range(emb_size):
    data['tag_emb_{}'.format(i)] = emb_matrix[:, i]

In [None]:
#想象力特征