# 特征工程集合

In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from datetime import datetime
import seaborn as sns
import missingno as msno
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import learning_curve,validation_curve
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold,KFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error,mean_squared_error
from sklearn.base import clone

from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from xgboost.sklearn import XGBRegressor
import lightgbm as lgb
from lightgbm import LGBMRegressor,LGBMClassifier,callback
from catboost import CatBoostRegressor,CatBoostClassifier

In [4]:
# Load the sample submission file and print its column/dtype summary.
sample_submit = pd.read_csv('./data/提交示例.csv',encoding='utf-8')
sample_submit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 490530 entries, 0 to 490529
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   pid     490530 non-null  object
 1   label   490530 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 7.5+ MB


# 读取数据

In [3]:
def reduce_mem_usage(data):
    """Downcast the columns of ``data`` in place to cut memory usage.

    Column handling:

    * ``object`` columns are converted to ``category``.
    * Signed / unsigned integer columns are cast to the narrowest integer
      type of the same signedness whose range contains the column's min
      and max (bounds inclusive).
    * Float columns are cast to ``float16`` or ``float32`` when min and max
      lie strictly inside that type's finite range, otherwise to
      ``float64``.  Only the range is checked, so the narrower float types
      can lose precision.
    * Every other dtype (bool, datetime, timedelta, category, pandas
      extension dtypes) is left untouched instead of being pushed through
      the float branch, where it would either crash or be turned into a
      float column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to compress; its columns are replaced in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object.
    """
    int_targets = {
        'i': (np.int8, np.int16, np.int32, np.int64),
        'u': (np.uint8, np.uint16, np.uint32, np.uint64),
    }
    float_targets = (np.float16, np.float32)

    for col in data.columns:
        col_type = data[col].dtype

        if col_type == object:
            data[col] = data[col].astype('category')
            continue

        # Only plain numpy integer/float columns are safe to downcast.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = data[col].min()
        c_max = data[col].max()

        if col_type.kind in int_targets:
            # An empty column yields NaN here, every comparison is False and
            # the column is left as it is.
            for target in int_targets[col_type.kind]:
                limits = np.iinfo(target)
                if c_min >= limits.min and c_max <= limits.max:
                    data[col] = data[col].astype(target)
                    break
        else:
            for target in float_targets:
                limits = np.finfo(target)
                if c_min > limits.min and c_max < limits.max:
                    data[col] = data[col].astype(target)
                    break
            else:
                # Out of float32 range, infinite, or all-NaN.
                data[col] = data[col].astype(np.float64)
    return data

In [4]:
# Read the train and test CSV files and pass each frame through reduce_mem_usage.
train_data = reduce_mem_usage(pd.read_csv('./data/train.csv',encoding='utf-8'))
test_data = reduce_mem_usage(pd.read_csv('./data/test.csv',encoding='utf-8'))

In [5]:
# Print the column/dtype summary of the compressed training frame.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209470 entries, 0 to 209469
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype   
---  ------       --------------   -----   
 0   pid          209470 non-null  category
 1   label        209470 non-null  int8    
 2   brand        209470 non-null  category
 3   model        209470 non-null  category
 4   province     209470 non-null  category
 5   city         209470 non-null  category
 6   enum_tag     209470 non-null  category
 7   numeric_tag  189629 non-null  category
dtypes: category(7), int8(1)
memory usage: 24.8 MB


# 数据预处理

In [None]:
def data_preprocessing(train_data,test_data):
    """Merge train and test data and expand the tag strings into session rows.

    Steps:

    1. Concatenate ``train_data`` and ``test_data`` and expose the new row
       index as the ``pid_id`` column.
    2. Encode ``brand``, ``model``, ``province`` and ``city`` as integer
       codes numbered in order of first appearance.
    3. Replace the ``'unkown'`` placeholder in ``enum_tag`` / ``numeric_tag``
       with NaN, fill missing ``numeric_tag`` entries from ``enum_tag`` and
       drop ``enum_tag``.
    4. Split ``numeric_tag`` on ``'|'`` into one row per entry and parse each
       entry into ``session_tagid``, ``session_time`` and ``value``.
    5. Cast the encoded columns to ``int16`` and ``value`` to ``float16``.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        Frames holding the columns used above (``brand``, ``model``,
        ``province``, ``city``, ``enum_tag``, ``numeric_tag``).

    Returns
    -------
    pandas.DataFrame
        A new frame; after the split its index repeats once per tag entry
        of the original row.
    """
    # Merge the data so both parts share the same encoding and features
    data = pd.concat([train_data,test_data],ignore_index=True)

    # Turn the index into the 'pid_id' column
    data.reset_index(inplace=True)
    data.rename(columns={'index':'pid_id'},inplace=True)

    # Categorical features: number the values in order of first appearance
    for col in ['brand','model','province','city']:
        code_of = {val:ind for ind,val in enumerate(list(data[col].unique()))}
        data[col] = data[col].map(code_of).astype('float')

    # Tag features: 'enum_tag', 'numeric_tag'
    # Replace the 'unkown' placeholder with NaN
    for col in ['enum_tag','numeric_tag']:
        data[col] = data[col].apply(lambda x:x if x!='unkown' else np.nan)

    # Fill missing 'numeric_tag' values from 'enum_tag' in one vectorized
    # step.  The previous per-row chained assignment was slow and is not
    # guaranteed to write through to `data`.  Casting to object first lets
    # the fill work even if the columns are still categorical.
    numeric_tag = data['numeric_tag'].astype(object)
    enum_tag = data['enum_tag'].astype(object)
    data['numeric_tag'] = numeric_tag.where(numeric_tag.notnull(),enum_tag)
    del data['enum_tag']

    # Split 'numeric_tag'
    # 1) one row -> many rows: one row per '|'-separated entry
    tag_rows = (data['numeric_tag'].str.split('|',expand=True)
                .stack()
                .reset_index(level=1,drop=True)
                .rename('numeric_tag'))
    data = data.drop('numeric_tag',axis=1).join(tag_rows)
    data['numeric_tag'] = [i.replace("'",'') if "'" in str(i) else i for i in data['numeric_tag']]

    # 2) one column -> many columns: tag id, time and value
    data['session_tagid'] = data['numeric_tag'].apply(lambda x:str(x).split(';')[0])
    data['session_tagid'] = data['session_tagid'].apply(lambda x:str(x)[str(x).find(':')+1:])
    data['session_time'] = data['numeric_tag'].apply(lambda x:str(x)[str(x).find('time'):])
    data['session_time'] = data['session_time'].apply(lambda x:str(x)[str(x).find(':')+1:])
    # Unparseable times become NaT
    data['session_time'] = pd.to_datetime(data['session_time'],format='%Y-%m-%d', errors='coerce')
    data['value'] = data['numeric_tag'].apply(session_value)
    data['value'] = data['value'].apply(lambda x:str(x)[str(x).find(':')+1:])
    del data['numeric_tag']

    # Shrink dtypes.  Only columns that actually exist are cast: 'train' is
    # not produced anywhere in this function, and selecting a missing column
    # raised KeyError.
    reduce_list = [col for col in ['brand','model','province','city','train'] if col in data.columns]
    data[reduce_list] = data[reduce_list].astype(np.int16)
    data['value'] = data['value'].astype(np.float16)

    return data


def session_value(x):
    """Return the ``value...`` segment of one tag entry, or NaN if absent.

    ``x`` is converted with ``str``.  The segment starts at the first
    occurrence of ``'value'`` and stops one character before the following
    ``'time'`` (dropping the separator between them).  When no ``'time'``
    follows, the rest of the string is returned with any trailing ``';'``
    removed, instead of silently cutting characters off the end.
    """
    text = str(x)
    start = text.find('value')
    if start == -1:
        return np.nan

    # Search only after 'value': a missing or earlier 'time' used to produce
    # a negative / too-small end index and truncate the result.
    end = text.find('time', start)
    if end == -1:
        return text[start:].rstrip(';')
    return text[start:end-1]

In [None]:
# Build the session-level frame from the train and test data.
data = data_preprocessing(train_data,test_data)

In [None]:
# Save the preprocessed frame to disk.
data.to_csv('./feature/feature_0.csv')

# 特征工程 1

In [28]:
def feature_engineering_1(data):
    """Build feature set 1: per-pid activity statistics plus 2023-only ones.

    Expects the session-level frame with ``pid``, ``pid_id``,
    ``session_tagid`` and a datetime ``session_time`` column.  Relies on
    ``feature_merge``, ``feature_merge_2023`` and ``reduce_mem_usage``
    defined elsewhere in this notebook.

    Added columns: ``first_year`` / ``last_year`` (encoded, -1 when the year
    is missing or not in the table), ``session_count``, ``first_time``,
    ``last_time``, ``active_days``, ``dif_days_total``, ``dif_days_avg``,
    ``one_day_sessions_avg`` and the ``2023_*`` statistics computed from the
    sessions dated 2023.  The per-session columns ``session_tagid``,
    ``session_time``, ``year``, ``month`` and ``dayofweek`` are removed
    before de-duplicating.  Remaining NaN values are filled with -1.

    The input frame is not modified.
    """
    # New frame with a unique index, so the caller's frame (reused by the
    # other feature sets in this notebook) is not modified.
    data = data.reset_index(drop=True)

    # Calendar features
    data['year'] = data['session_time'].dt.year
    data['month'] = data['session_time'].dt.month
    data['dayofweek'] = data['session_time'].dt.dayofweek

    # Sessions dated 1970 are treated as missing
    is_1970 = data['year']==1970
    data['year'] = data['year'].where(~is_1970)
    data['session_time'] = data['session_time'].where(~is_1970)

    data['first_year'] = data.groupby(by=['pid'])['year'].transform('min')
    data['last_year'] = data.groupby(by=['pid'])['year'].transform('max')

    # Encode 'first_year' and 'last_year'
    year_dict = {
            2023.0: 0,
            2022.0: 1,
            2021.0: 2,
            2020.0: 3,
            2019.0: 4,
            2018.0: 5,
            2017.0: 6,
            2015.0: 7,
            2010.0: 8,
            2009.0: 9}
    # Missing or unlisted years map to NaN, which cannot be cast to int8:
    # encode them as -1 first.
    data['first_year'] = data['first_year'].map(year_dict).fillna(-1).astype('int8')
    data['last_year'] = data['last_year'].map(year_dict).fillna(-1).astype('int8')

    # Per-pid time statistics
    data = feature_merge(data,'pid')

    # feature_merge returns object-dtype columns; make the counters numeric
    # so the arithmetic below produces numeric results.
    data['session_count'] = pd.to_numeric(data['session_count'])
    data['active_days'] = pd.to_numeric(data['active_days'])
    data['active_days'] = data['active_days'].apply(lambda x:x if x!=0 else -1)
    # pd.to_datetime replaces the unit-less astype('datetime64'), which
    # pandas 2 rejects.
    data['dif_days_total'] = (pd.to_datetime(data['last_time']) - pd.to_datetime(data['first_time'])).dt.days
    data['dif_days_avg'] = data['dif_days_total']/data['active_days']

    data['one_day_sessions'] = data.groupby(by=['pid','session_time'])['session_tagid'].transform('count')
    data['one_day_sessions_avg'] = data.groupby(by=['pid'])['one_day_sessions'].transform('mean').round(2)
    del data['one_day_sessions']

    # Statistics over the sessions dated 2023 only
    data_2023 = data[data['year']==2023][['pid_id','session_time']]
    data_2023 = feature_merge_2023(data_2023,'pid_id')

    data_2023['2023_session_count'] = pd.to_numeric(data_2023['2023_session_count'])
    data_2023['2023_active_days'] = pd.to_numeric(data_2023['2023_active_days'])
    data_2023['2023_active_days'] = data_2023['2023_active_days'].apply(lambda x:x if x!=0 else -1)
    last_2023 = pd.to_datetime(data_2023['2023_last_time'])
    first_2023 = pd.to_datetime(data_2023['2023_first_time'])
    # The reference date is written as a Timestamp; the former string
    # '20230805' did not match its '%Y-%m-%d' format.
    data_2023['2023_recency'] = (pd.Timestamp('2023-08-05') - last_2023).dt.days
    data_2023['2023_dif_days_total'] = (last_2023 - first_2023).dt.days
    data_2023['2023_dif_days_avg'] = (data_2023['2023_dif_days_total']/data_2023['2023_active_days']).round(2)
    # data_2023 only carries 'pid_id' (not 'pid' / 'last_year'), so group on it.
    data_2023['2023_day_sessions'] = data_2023.groupby(by=['pid_id','session_time'])['session_time'].transform('count')
    data_2023['2023_day_sessions_avg'] = data_2023.groupby(by=['pid_id'])['2023_day_sessions'].transform('mean').round(2)

    # Drop per-session columns and de-duplicate
    del data_2023['session_time']
    del data_2023['2023_day_sessions']
    data_2023.drop_duplicates(inplace=True)

    del data['session_tagid']
    del data['session_time']
    del data['year']
    del data['month']
    del data['dayofweek']
    data.drop_duplicates(inplace=True)

    # Join data with data_2023
    data = data.merge(data_2023,how='left',on='pid_id')

    # Fill every missing value with -1
    data = data.fillna(-1)

    # Shrink dtypes to reduce memory usage
    data = reduce_mem_usage(data)

    return data


def feature_merge(data,feature):
    """Attach per-group session statistics to every row of ``data``.

    For each group of ``data.groupby(feature)`` the rows whose
    ``session_time`` is missing are ignored and four values are computed:

    * ``session_count`` - number of remaining rows
    * ``first_time`` / ``last_time`` - earliest / latest ``session_time``
    * ``active_days`` - number of distinct ``session_time`` values

    The statistics are left-merged back on ``feature`` and the merged frame
    is returned; the statistic columns have object dtype.
    """
    stat_names = ['session_count','first_time','last_time','active_days']
    all_infos = {}
    for key,value in data.groupby(feature):
        # `x != np.nan` is True for every x (NaN included), so it never
        # filtered anything; notna() really drops the missing times.
        times = value['session_time']
        times = times[times.notna()]
        all_infos[key] = {
            'session_count': len(times),
            'first_time': times.min(),
            'last_time': times.max(),
            'active_days': times.nunique(),
        }
    # Passing the statistic names as the index keeps those columns present
    # even when no group was produced.
    df = pd.DataFrame(all_infos,index=stat_names).T.reset_index().rename(columns={'index':str(feature)})
    data = data.merge(df,how='left',on=str(feature))
    return data


def feature_merge_2023(data,feature):
    """Attach per-group session statistics, named with a ``2023_`` prefix.

    For each group of ``data.groupby(feature)`` the rows whose
    ``session_time`` is missing are ignored and four values are computed:

    * ``2023_session_count`` - number of remaining rows
    * ``2023_first_time`` / ``2023_last_time`` - earliest / latest time
    * ``2023_active_days`` - number of distinct ``session_time`` values

    The function does not filter by year itself; the caller passes the rows
    to summarise.  The statistics are left-merged back on ``feature`` and
    the merged frame is returned; the statistic columns have object dtype.
    """
    stat_names = ['2023_session_count','2023_first_time','2023_last_time','2023_active_days']
    all_infos = {}
    for key,value in data.groupby(feature):
        # `x != np.nan` is True for every x (NaN included), so it never
        # filtered anything; notna() really drops the missing times.
        times = value['session_time']
        times = times[times.notna()]
        all_infos[key] = {
            '2023_session_count': len(times),
            '2023_first_time': times.min(),
            '2023_last_time': times.max(),
            '2023_active_days': times.nunique(),
        }
    # Passing the statistic names as the index keeps those columns present
    # even when no group was produced.
    df = pd.DataFrame(all_infos,index=stat_names).T.reset_index().rename(columns={'index':str(feature)})
    data = data.merge(df,how='left',on=str(feature))
    return data


def reduce_mem_usage(data):
    """Downcast the columns of ``data`` in place to cut memory usage.

    Column handling:

    * ``object`` columns are converted to ``category``.
    * Signed / unsigned integer columns are cast to the narrowest integer
      type of the same signedness whose range contains the column's min
      and max (bounds inclusive).
    * Float columns are cast to ``float16`` or ``float32`` when min and max
      lie strictly inside that type's finite range, otherwise to
      ``float64``.  Only the range is checked, so the narrower float types
      can lose precision.
    * Every other dtype (bool, datetime, timedelta, category, pandas
      extension dtypes) is left untouched instead of being pushed through
      the float branch, where it would either crash or be turned into a
      float column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to compress; its columns are replaced in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object.
    """
    int_targets = {
        'i': (np.int8, np.int16, np.int32, np.int64),
        'u': (np.uint8, np.uint16, np.uint32, np.uint64),
    }
    float_targets = (np.float16, np.float32)

    for col in data.columns:
        col_type = data[col].dtype

        if col_type == object:
            data[col] = data[col].astype('category')
            continue

        # Only plain numpy integer/float columns are safe to downcast.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = data[col].min()
        c_max = data[col].max()

        if col_type.kind in int_targets:
            # An empty column yields NaN here, every comparison is False and
            # the column is left as it is.
            for target in int_targets[col_type.kind]:
                limits = np.iinfo(target)
                if c_min >= limits.min and c_max <= limits.max:
                    data[col] = data[col].astype(target)
                    break
        else:
            for target in float_targets:
                limits = np.finfo(target)
                if c_min > limits.min and c_max < limits.max:
                    data[col] = data[col].astype(target)
                    break
            else:
                # Out of float32 range, infinite, or all-NaN.
                data[col] = data[col].astype(np.float64)
    return data

In [None]:
# Build feature set 1 from the preprocessed frame.
data_1 = feature_engineering_1(data)

In [None]:
# Save feature set 1 to disk.
data_1.to_csv('./feature/feature_1.csv')

# 特征工程 2

In [None]:
def feature_engineering_2(data):
    """Build feature set 2: per-``pid_id`` statistics plus calendar counts.

    Expects the session-level frame with ``pid``, ``pid_id``,
    ``session_tagid`` and a datetime ``session_time`` column.  Relies on
    ``LabelEncoder`` and ``reduce_mem_usage`` from the enclosing notebook.

    Added columns: ``session_count``, label-encoded ``first_year`` /
    ``last_year``, ``dif_days``, ``dif_days_avg``, ``2023_recency``,
    ``2023_frequency``, ``2023_dif_days``, ``2023_dif_days_avg`` and one
    ``<dummy>_count`` column per year / month / day-of-week dummy holding the
    per-``pid_id`` number of sessions.  ``pid``, ``pid_id`` and the
    per-session columns are removed and remaining NaN values are filled
    with -1.

    The input frame is not modified.
    """
    # New frame with a unique index: protects the caller's frame from the
    # deletions below and lets the subset-based assignments (2023_* columns)
    # align even when the incoming index contains repeated labels.
    data = data.reset_index(drop=True)

    # Calendar features
    data['year'] = data['session_time'].dt.year
    data['month'] = data['session_time'].dt.month
    data['dayofweek'] = data['session_time'].dt.dayofweek

    # Derived time features: 'session_count', 'last_time', 'first_time'
    data['session_count'] = data.groupby(by=['pid_id'])['session_tagid'].transform('count')
    data['first_time'] = data.groupby(by=['pid_id'])['session_time'].transform('min')
    data['last_time'] = data.groupby(by=['pid_id'])['session_time'].transform('max')

    data['first_year'] = data.groupby(by=['pid_id'])['year'].transform('min')
    data['last_year'] = data.groupby(by=['pid_id'])['year'].transform('max')

    # pd.to_datetime replaces the unit-less astype('datetime64'), which
    # pandas 2 rejects.
    data['dif_days'] = (pd.to_datetime(data['last_time']) - pd.to_datetime(data['first_time'])).dt.days
    # The counter column is 'session_count'; 'session_amount' never existed.
    data['dif_days_avg'] = data['dif_days']/data['session_count']

    # The reference date is written as a Timestamp; the former string
    # '20230805' did not match its '%Y-%m-%d' format.
    data['2023_recency'] = (pd.Timestamp('2023-08-05') - data['last_time']).dt.days
    data['2023_frequency'] = data[data['year']==2023].groupby('pid_id')['session_time'].transform('count')
    data['2023_frequency'] = data['2023_frequency'].apply(lambda x:x if x>-1 else -1)
    data['2023_frequency'] = data.groupby('pid_id')['2023_frequency'].transform('max')

    data['2023_last_time'] = data[data['year']==2023].groupby('pid_id')['session_time'].transform('max')
    data['2023_last_time'] = data.groupby('pid_id')['2023_last_time'].transform('max')
    data['2023_first_time'] = data[data['year']==2023].groupby('pid_id')['session_time'].transform('min')
    data['2023_first_time'] = data.groupby('pid_id')['2023_first_time'].transform('min')

    data['2023_dif_days'] = (pd.to_datetime(data['2023_last_time']) - pd.to_datetime(data['2023_first_time'])).dt.days
    data['2023_dif_days_avg'] = data['2023_dif_days']/data['2023_frequency']

    del data['session_tagid']
    del data['session_time']
    del data['last_time']
    del data['first_time']
    del data['2023_last_time']
    del data['2023_first_time']

    # Encode the categorical year features
    class_cols = ['first_year','last_year']
    for col in class_cols:
        lbl = LabelEncoder().fit(data[col])
        data[col] = lbl.transform(data[col])

    # One-hot the calendar features and count them per pid_id
    onehot_list = ['pid_id','year','month','dayofweek']
    onehot_data = data[onehot_list]
    onehot_data = pd.get_dummies(onehot_data,columns=['year','month','dayofweek'])
    feature_list = onehot_data.drop('pid_id',axis=1).columns
    for feat in feature_list:
        onehot_data[str(feat) +'_count'] = onehot_data.groupby('pid_id')[feat].transform('sum')
    # Drop exactly the dummy columns that were created.  The former
    # hard-coded list raised KeyError when a listed year/month was absent
    # and left raw dummies behind when an unlisted one was present.
    onehot_data = onehot_data.drop(columns=list(feature_list))
    onehot_data.drop_duplicates(inplace=True)

    # Drop per-session columns and de-duplicate
    del data['pid']
    del data['year']
    del data['month']
    del data['dayofweek']
    data.drop_duplicates(inplace=True)

    # Join data with onehot_data
    data = data.merge(onehot_data,how='left',on='pid_id')

    # Fill every missing value with -1
    data = data.fillna(-1)

    # Drop 'pid_id'
    del data['pid_id']

    # Shrink dtypes to reduce memory usage
    data = reduce_mem_usage(data)

    return data


def reduce_mem_usage(data):
    """Downcast the columns of ``data`` in place to cut memory usage.

    Column handling:

    * ``object`` columns are converted to ``category``.
    * Signed / unsigned integer columns are cast to the narrowest integer
      type of the same signedness whose range contains the column's min
      and max (bounds inclusive).
    * Float columns are cast to ``float16`` or ``float32`` when min and max
      lie strictly inside that type's finite range, otherwise to
      ``float64``.  Only the range is checked, so the narrower float types
      can lose precision.
    * Every other dtype (bool, datetime, timedelta, category, pandas
      extension dtypes) is left untouched instead of being pushed through
      the float branch, where it would either crash or be turned into a
      float column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to compress; its columns are replaced in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object.
    """
    int_targets = {
        'i': (np.int8, np.int16, np.int32, np.int64),
        'u': (np.uint8, np.uint16, np.uint32, np.uint64),
    }
    float_targets = (np.float16, np.float32)

    for col in data.columns:
        col_type = data[col].dtype

        if col_type == object:
            data[col] = data[col].astype('category')
            continue

        # Only plain numpy integer/float columns are safe to downcast.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = data[col].min()
        c_max = data[col].max()

        if col_type.kind in int_targets:
            # An empty column yields NaN here, every comparison is False and
            # the column is left as it is.
            for target in int_targets[col_type.kind]:
                limits = np.iinfo(target)
                if c_min >= limits.min and c_max <= limits.max:
                    data[col] = data[col].astype(target)
                    break
        else:
            for target in float_targets:
                limits = np.finfo(target)
                if c_min > limits.min and c_max < limits.max:
                    data[col] = data[col].astype(target)
                    break
            else:
                # Out of float32 range, infinite, or all-NaN.
                data[col] = data[col].astype(np.float64)
    return data

In [None]:
# Build feature set 2 from the preprocessed frame.
data_2 = feature_engineering_2(data)

In [None]:
# Save feature set 2 to disk.
data_2.to_csv('./feature/feature_2.csv')

# 特征工程 3

In [None]:
def feature_engineering_3(data):
    """Build feature set 3: set-1 statistics plus per-pid calendar counts.

    Expects the session-level frame with ``pid``, ``pid_id``, ``label``,
    ``session_tagid`` and a datetime ``session_time`` column.  Relies on
    ``feature_merge``, ``feature_merge_2023`` and ``reduce_mem_usage``
    defined elsewhere in this notebook.

    Added columns: encoded ``first_year`` / ``last_year`` (-1 when the year
    is missing or not in the table), ``session_count``, ``first_time``,
    ``last_time``, ``active_days``, ``dif_days_total``, ``dif_days_avg``,
    ``one_day_sessions_avg``, the ``2023_*`` statistics of the sessions dated
    2023 and one ``<dummy>_count`` column per year / month / day-of-week
    dummy.  ``pid``, ``label`` and the per-session columns are removed and
    remaining NaN values are filled with -1.

    The input frame is not modified.
    """
    # New frame with a unique index, so the caller's frame (reused by the
    # other feature sets in this notebook) is not modified.
    data = data.reset_index(drop=True)

    # Calendar features
    data['year'] = data['session_time'].dt.year
    data['month'] = data['session_time'].dt.month
    data['dayofweek'] = data['session_time'].dt.dayofweek

    # Sessions dated 1970 are treated as missing
    is_1970 = data['year']==1970
    data['year'] = data['year'].where(~is_1970)
    data['session_time'] = data['session_time'].where(~is_1970)

    data['first_year'] = data.groupby(by=['pid'])['year'].transform('min')
    data['last_year'] = data.groupby(by=['pid'])['year'].transform('max')

    # Encode 'first_year' and 'last_year'
    year_dict = {
            -1.0: -1,
            2023.0: 0,
            2022.0: 1,
            2021.0: 2,
            2020.0: 3,
            2019.0: 4,
            2018.0: 5,
            2017.0: 6,
            2015.0: 7,
            2010.0: 8,
            2009.0: 9}
    # Missing or unlisted years map to NaN, which cannot be cast to int8:
    # encode them as -1 first.
    data['first_year'] = data['first_year'].map(year_dict).fillna(-1).astype('int8')
    data['last_year'] = data['last_year'].map(year_dict).fillna(-1).astype('int8')

    # Per-pid time statistics
    data = feature_merge(data,'pid')
    # feature_merge returns object-dtype columns; make the counters numeric
    # so the arithmetic below produces numeric results.
    data['session_count'] = pd.to_numeric(data['session_count'])
    data['active_days'] = pd.to_numeric(data['active_days'])
    data['active_days'] = data['active_days'].apply(lambda x:x if x!=0 else -1)
    # pd.to_datetime replaces the unit-less astype('datetime64'), which
    # pandas 2 rejects.
    data['dif_days_total'] = (pd.to_datetime(data['last_time']) - pd.to_datetime(data['first_time'])).dt.days
    data['dif_days_avg'] = data['dif_days_total']/data['active_days']

    data['one_day_sessions'] = data.groupby(by=['pid','session_time'])['session_tagid'].transform('count')
    data['one_day_sessions_avg'] = data.groupby(by=['pid'])['one_day_sessions'].transform('mean').round(2)
    del data['one_day_sessions']

    # Statistics over the sessions dated 2023 only
    data_2023 = data[data['year']==2023][['pid_id','session_time']]
    data_2023 = feature_merge_2023(data_2023,'pid_id')

    data_2023['2023_session_count'] = pd.to_numeric(data_2023['2023_session_count'])
    data_2023['2023_active_days'] = pd.to_numeric(data_2023['2023_active_days'])
    data_2023['2023_active_days'] = data_2023['2023_active_days'].apply(lambda x:x if x!=0 else -1)
    last_2023 = pd.to_datetime(data_2023['2023_last_time'])
    first_2023 = pd.to_datetime(data_2023['2023_first_time'])
    # The reference date is written as a Timestamp; the former string
    # '20230805' did not match its '%Y-%m-%d' format.
    data_2023['2023_recency'] = (pd.Timestamp('2023-08-05') - last_2023).dt.days
    data_2023['2023_dif_days_total'] = (last_2023 - first_2023).dt.days
    data_2023['2023_dif_days_avg'] = (data_2023['2023_dif_days_total']/data_2023['2023_active_days']).round(2)
    # data_2023 only carries 'pid_id' (not 'pid' / 'last_year'), so group on it.
    data_2023['2023_day_sessions'] = data_2023.groupby(by=['pid_id','session_time'])['session_time'].transform('count')
    data_2023['2023_day_sessions_avg'] = data_2023.groupby(by=['pid_id'])['2023_day_sessions'].transform('mean').round(2)

    # data_2023: drop per-session columns and de-duplicate
    del data_2023['session_time']
    del data_2023['2023_day_sessions']
    data_2023.drop_duplicates(inplace=True)

    # One-hot the calendar features and count them per pid_id
    onehot_list = ['pid_id','year','month','dayofweek']
    onehot_data = data[onehot_list]
    onehot_data = pd.get_dummies(onehot_data,columns=['year','month','dayofweek'])
    feature_list = onehot_data.drop('pid_id',axis=1).columns
    for feat in feature_list:
        onehot_data[str(feat) +'_count'] = onehot_data.groupby('pid_id')[feat].transform('sum')
    # Drop exactly the dummy columns that were created.  The former
    # hard-coded list raised KeyError when a listed year/month was absent
    # and left raw dummies behind when an unlisted one was present.
    onehot_data = onehot_data.drop(columns=list(feature_list))
    onehot_data.drop_duplicates(inplace=True)

    # Drop per-session columns and de-duplicate
    del data['pid']
    del data['label']
    del data['session_tagid']
    del data['session_time']
    del data['year']
    del data['month']
    del data['dayofweek']
    data.drop_duplicates(inplace=True)

    # Join data with data_2023
    data = data.merge(data_2023,how='left',on='pid_id')

    # Join data with onehot_data
    data = data.merge(onehot_data,how='left',on='pid_id')

    # Fill every missing value with -1
    data = data.fillna(-1)

    # Shrink dtypes to reduce memory usage
    data = reduce_mem_usage(data)

    return data


def feature_merge(data,feature):
    """Attach per-group session statistics to every row of ``data``.

    For each group of ``data.groupby(feature)`` the rows whose
    ``session_time`` is missing are ignored and four values are computed:

    * ``session_count`` - number of remaining rows
    * ``first_time`` / ``last_time`` - earliest / latest ``session_time``
    * ``active_days`` - number of distinct ``session_time`` values

    The statistics are left-merged back on ``feature`` and the merged frame
    is returned; the statistic columns have object dtype.
    """
    stat_names = ['session_count','first_time','last_time','active_days']
    all_infos = {}
    for key,value in data.groupby(feature):
        # `x != np.nan` is True for every x (NaN included), so it never
        # filtered anything; notna() really drops the missing times.
        times = value['session_time']
        times = times[times.notna()]
        all_infos[key] = {
            'session_count': len(times),
            'first_time': times.min(),
            'last_time': times.max(),
            'active_days': times.nunique(),
        }
    # Passing the statistic names as the index keeps those columns present
    # even when no group was produced.
    df = pd.DataFrame(all_infos,index=stat_names).T.reset_index().rename(columns={'index':str(feature)})
    data = data.merge(df,how='left',on=str(feature))
    return data

def feature_merge_2023(data,feature):
    """Attach per-group session statistics, named with a ``2023_`` prefix.

    For each group of ``data.groupby(feature)`` the rows whose
    ``session_time`` is missing are ignored and four values are computed:

    * ``2023_session_count`` - number of remaining rows
    * ``2023_first_time`` / ``2023_last_time`` - earliest / latest time
    * ``2023_active_days`` - number of distinct ``session_time`` values

    The function does not filter by year itself; the caller passes the rows
    to summarise.  The statistics are left-merged back on ``feature`` and
    the merged frame is returned; the statistic columns have object dtype.
    """
    stat_names = ['2023_session_count','2023_first_time','2023_last_time','2023_active_days']
    all_infos = {}
    for key,value in data.groupby(feature):
        # `x != np.nan` is True for every x (NaN included), so it never
        # filtered anything; notna() really drops the missing times.
        times = value['session_time']
        times = times[times.notna()]
        all_infos[key] = {
            '2023_session_count': len(times),
            '2023_first_time': times.min(),
            '2023_last_time': times.max(),
            '2023_active_days': times.nunique(),
        }
    # Passing the statistic names as the index keeps those columns present
    # even when no group was produced.
    df = pd.DataFrame(all_infos,index=stat_names).T.reset_index().rename(columns={'index':str(feature)})
    data = data.merge(df,how='left',on=str(feature))
    return data


def is_year_2023(x):
    """Flag a year value: 1 for 2023, ``x`` itself when it equals -1, else 0."""
    if x == 2023.0:
        return 1
    # The -1 placeholder is passed through unchanged; anything else is "not 2023".
    return x if x == -1 else 0

    
def reduce_mem_usage(data):
    """Downcast the columns of ``data`` in place to cut memory usage.

    Column handling:

    * ``object`` columns are converted to ``category``.
    * Signed / unsigned integer columns are cast to the narrowest integer
      type of the same signedness whose range contains the column's min
      and max (bounds inclusive).
    * Float columns are cast to ``float16`` or ``float32`` when min and max
      lie strictly inside that type's finite range, otherwise to
      ``float64``.  Only the range is checked, so the narrower float types
      can lose precision.
    * Every other dtype (bool, datetime, timedelta, category, pandas
      extension dtypes) is left untouched instead of being pushed through
      the float branch, where it would either crash or be turned into a
      float column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to compress; its columns are replaced in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object.
    """
    int_targets = {
        'i': (np.int8, np.int16, np.int32, np.int64),
        'u': (np.uint8, np.uint16, np.uint32, np.uint64),
    }
    float_targets = (np.float16, np.float32)

    for col in data.columns:
        col_type = data[col].dtype

        if col_type == object:
            data[col] = data[col].astype('category')
            continue

        # Only plain numpy integer/float columns are safe to downcast.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = data[col].min()
        c_max = data[col].max()

        if col_type.kind in int_targets:
            # An empty column yields NaN here, every comparison is False and
            # the column is left as it is.
            for target in int_targets[col_type.kind]:
                limits = np.iinfo(target)
                if c_min >= limits.min and c_max <= limits.max:
                    data[col] = data[col].astype(target)
                    break
        else:
            for target in float_targets:
                limits = np.finfo(target)
                if c_min > limits.min and c_max < limits.max:
                    data[col] = data[col].astype(target)
                    break
            else:
                # Out of float32 range, infinite, or all-NaN.
                data[col] = data[col].astype(np.float64)
    return data

In [None]:
# Build feature set 3 from the preprocessed frame.
data_3 = feature_engineering_3(data)

In [None]:
# Save feature set 3 to disk.
data_3.to_csv('./feature/feature_3.csv')

# 特征工程 4

In [None]:
def feature_engineering_4(data):
    """Build feature set 4: per-pid statistics, calendar counts and bins.

    Expects the session-level frame with ``pid``, ``pid_id``,
    ``session_tagid`` and a datetime ``session_time`` column.  Relies on
    ``is_year_2023`` and ``reduce_mem_usage`` defined elsewhere in this
    notebook.

    Added columns: encoded ``first_year`` / ``last_year`` (-1 when the year
    is missing or not in the table), the ``is_*_year_2023`` flags, the
    per-pid counters and day differences, their ``2023_*`` counterparts
    (computed over the rows of pids whose ``last_year`` is 2023), one
    ``<dummy>_count`` column per year / month / day-of-week dummy and a
    ``*_bin`` column (``pd.cut`` bin index) for each numeric statistic.
    ``pid`` and the per-session columns are removed and remaining NaN values
    are filled with -1.

    The input frame is not modified.
    """
    # New frame with a unique index: protects the caller's frame from the
    # deletions below and lets the subset-based assignments (2023_* columns)
    # align even when the incoming index contains repeated labels.
    data = data.reset_index(drop=True)

    # Calendar features
    data['year'] = data['session_time'].dt.year
    data['month'] = data['session_time'].dt.month
    data['dayofweek'] = data['session_time'].dt.dayofweek

    # Sessions dated 1970 are treated as missing
    is_1970 = data['year']==1970
    data['year'] = data['year'].where(~is_1970)
    data['session_time'] = data['session_time'].where(~is_1970)

    # Derived time features
    data['first_year'] = data.groupby(by=['pid'])['year'].transform('min')
    data['last_year'] = data.groupby(by=['pid'])['year'].transform('max')
    data['is_first_year_2023'] = data['first_year'].apply(is_year_2023)
    data['is_last_year_2023'] = data['last_year'].apply(is_year_2023)

    data['session_count'] = data.groupby(by=['pid'])['session_tagid'].transform('count')
    data['first_time'] = data.groupby(by=['pid'])['session_time'].transform('min')
    data['last_time'] = data.groupby(by=['pid'])['session_time'].transform('max')

    # pd.to_datetime replaces the unit-less astype('datetime64'), which
    # pandas 2 rejects.
    data['dif_days_total'] = (pd.to_datetime(data['last_time']) - pd.to_datetime(data['first_time'])).dt.days
    data['dif_days_avg'] = (data['dif_days_total']/data['session_count']).round(2)

    # NOTE(review): 'count' gives the number of non-missing session times,
    # not the number of distinct days - confirm which one is intended.
    data['active_days'] = data.groupby(by=['pid'])['session_time'].transform('count')
    data['one_day_sessions'] = data.groupby(by=['pid','session_time'])['session_tagid'].transform('count')
    data['one_day_sessions_avg'] = data.groupby(by=['pid'])['one_day_sessions'].transform('mean').round(2)

    # The reference date is written as a Timestamp; the former string
    # '20230805' did not match its '%Y-%m-%d' format.
    data['2023_recency'] = (pd.Timestamp('2023-08-05') - data['last_time']).dt.days
    last_in_2023 = data['last_year']==2023
    data['2023_session_count'] = data[last_in_2023].groupby('pid')['session_tagid'].transform('count')
    data['2023_last_time'] = data[last_in_2023].groupby('pid')['session_time'].transform('max')
    data['2023_first_time'] = data[last_in_2023].groupby('pid')['session_time'].transform('min')
    data['2023_dif_days_total'] = (data['2023_last_time'] - data['2023_first_time']).dt.days
    data['2023_dif_days_avg'] = (data['2023_dif_days_total']/data['2023_session_count']).round(2)
    data['2023_active_days'] = data[last_in_2023].groupby(by=['pid'])['session_time'].transform('count')
    data['2023_day_sessions'] = data[last_in_2023].groupby(by=['pid','session_time'])['session_tagid'].transform('count')
    data['2023_day_sessions_avg'] = data.groupby(by=['pid'])['2023_day_sessions'].transform('mean').round(2)

    # Encode 'first_year' and 'last_year'
    year_dict = {
            -1.0: -1,
            2023.0: 0,
            2022.0: 1,
            2021.0: 2,
            2020.0: 3,
            2019.0: 4,
            2018.0: 5,
            2017.0: 6,
            2015.0: 7,
            2010.0: 8,
            2009.0: 9}
    # Missing or unlisted years map to NaN, which cannot be cast to int8:
    # encode them as -1 first.
    data['first_year'] = data['first_year'].map(year_dict).fillna(-1).astype('int8')
    data['last_year'] = data['last_year'].map(year_dict).fillna(-1).astype('int8')

    del data['pid']
    del data['session_tagid']
    del data['session_time']
    del data['last_time']
    del data['first_time']
    del data['2023_last_time']
    del data['2023_first_time']
    del data['one_day_sessions']
    del data['2023_day_sessions']

    # One-hot the calendar features and count them per pid_id
    onehot_list = ['pid_id','year','month','dayofweek']
    onehot_data = data[onehot_list]
    onehot_data = pd.get_dummies(onehot_data,columns=['year','month','dayofweek'])
    feature_list = onehot_data.drop('pid_id',axis=1).columns
    for feat in feature_list:
        onehot_data[str(feat) +'_count'] = onehot_data.groupby('pid_id')[feat].transform('sum')
    # Drop exactly the dummy columns that were created.  The former
    # hard-coded list raised KeyError when a listed year/month was absent
    # and left raw dummies behind when an unlisted one was present.
    onehot_data = onehot_data.drop(columns=list(feature_list))
    onehot_data.drop_duplicates(inplace=True)

    # Drop per-session columns and de-duplicate
    del data['year']
    del data['month']
    del data['dayofweek']

    data.drop_duplicates(inplace=True)

    # Join data with onehot_data
    data = data.merge(onehot_data,how='left',on='pid_id')

    # Bucket the numeric features; values outside the bin edges become NaN
    bins_c = [0,1,3,5,10,24,35,48,56,72,95,500]
    data['session_count_bin'] = pd.cut(data['session_count'],bins_c,labels=False)

    bins_act = [-1,1,2,3,7,10,15,20,32,90]
    data['active_days_bin'] = pd.cut(data['active_days'],bins_act,labels=False)

    bins_dif = [-1,0,7,14,30,60,90,120,150,180,240,365,730,1095,20000]
    data['dif_days_bin'] = pd.cut(data['dif_days_total'],bins_dif,labels=False)

    bins_dif_avg = [-1,0,7,14,21,30,60,90,120,180,365,10000]
    data['dif_days_avg_bin'] = pd.cut(data['dif_days_avg'],bins_dif_avg,labels=False)

    # The source column is 'one_day_sessions_avg'; 'day_sessions_avg' does
    # not exist and raised KeyError.
    bins_se = [-1,1,2,3,5,12,24,35,65,500]
    data['day_sessions_avg_bin'] = pd.cut(data['one_day_sessions_avg'],bins_se,labels=False)

    bins_rec = [-1,60,90,120,150,180,220]
    data['2023_recency_bin'] = pd.cut(data['2023_recency'],bins_rec,labels=False)

    bins_sec = [0,1,3,5,10,21,27,32,43,65,120,320]
    data['2023_session_count_bin'] = pd.cut(data['2023_session_count'],bins_sec,labels=False)

    bins_acd = [0,1,4,9,12,20,25]
    data['2023_active_days_bin'] = pd.cut(data['2023_active_days'],bins_acd,labels=False)

    bins_2023_dif = [-1,0,3,7,14,21,30,60,90,120,180,200]
    data['2023_dif_days_bin'] = pd.cut(data['2023_dif_days_total'],bins_2023_dif,labels=False)

    # Use the list defined on the line above; the former name 'bins_dda'
    # was never defined and raised NameError.
    bins_2023_dda= [-1,0,3,7,15,23,31,40,63,100]
    data['2023_dif_days_avg_bin'] = pd.cut(data['2023_dif_days_avg'],bins_2023_dda,labels=False)

    bins_dse = [0,1,3,5,9,15,300]
    data['2023_day_sessions_avg_bin'] = pd.cut(data['2023_day_sessions_avg'],bins_dse,labels=False)

    # Fill every missing value with -1
    data = data.fillna(-1)

    # Shrink dtypes to reduce memory usage
    data = reduce_mem_usage(data)

    return data


def is_year_2023(x):
    """Flag a year value: 1 for 2023, ``x`` itself when it equals -1, else 0."""
    if x == 2023.0:
        return 1
    # The -1 placeholder is passed through unchanged; anything else is "not 2023".
    return x if x == -1 else 0

def reduce_mem_usage(data):
    """Downcast the columns of a DataFrame to smaller dtypes to save memory.

    The frame is modified in place and also returned.

    * ``object`` columns are converted to ``category``.
    * Signed integer columns get the smallest of int8/int16/int32/int64
      whose range contains the column's min and max (bounds inclusive).
    * Other numeric columns (floats, unsigned ints) get float16 or float32
      when their min and max fit, otherwise float64.  Note that float16
      stores only an 11-bit significand, so this downcast can round values.
    * Boolean and non-numeric columns (datetime, timedelta, category, ...)
      are left untouched: they cannot be compared against numeric limits.

    Columns whose min/max are NaN (empty or all-NaN) fail every range check:
    integer columns are then left as they are and the rest become float64.
    """
    for col in data.columns:
        col_type = data[col].dtype

        if col_type == object:
            data[col] = data[col].astype('category')
            continue

        # Skip dtypes for which a numeric range check makes no sense.
        if pd.api.types.is_bool_dtype(col_type) or not pd.api.types.is_numeric_dtype(col_type):
            continue

        c_min = data[col].min()
        c_max = data[col].max()

        if str(col_type)[:3] == 'int':
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the limit still fits.
                if limits.min <= c_min and c_max <= limits.max:
                    data[col] = data[col].astype(candidate)
                    break
        else:
            target = np.float64
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
            data[col] = data[col].astype(target)
    return data

## 特征最终完整版

In [11]:
def tree_feature(train_data, test_data):
    """Build the feature table from the train and test frames.

    The two frames are stacked so that preprocessing and feature
    construction are applied to both at once; the positional index of the
    stacked frame is exposed as the ``pid_id`` column.  The result of
    ``feature_engineering(data_preprocessing(...))`` is returned.
    """
    # Stack train and test, then turn the fresh 0..n-1 index into 'pid_id'.
    combined = pd.concat([train_data, test_data], ignore_index=True)
    combined = combined.reset_index().rename(columns={'index': 'pid_id'})

    # Preprocess, then derive features.  A feature-selection step
    # (select_feature) is currently disabled.
    preprocessed = data_preprocessing(combined)
    return feature_engineering(preprocessed)



def data_preprocessing(data):
    """Expand the tag columns into one row per session entry.

    Steps:

    1. The placeholder string ``'unkown'`` (spelled as in this code) in
       ``enum_tag`` / ``numeric_tag`` becomes NaN.
    2. Missing ``numeric_tag`` values are filled from ``enum_tag``, which is
       then dropped.
    3. ``numeric_tag`` is split on ``'|'`` and each piece becomes its own
       row (the original index label is repeated for the pieces of a row).
    4. Each piece is parsed into ``session_tagid``, ``session_time`` and
       ``value``.  Presumably a piece looks like
       ``tagid:<id>;value:<v>;time:<YYYY-MM-DD>`` -- inferred from the
       parsing below, confirm against the raw data.
    5. ``brand``/``model``/``province``/``city`` are cast to int16 and
       ``value`` to float16.

    Returns the expanded DataFrame.
    """

    def _after_colon(text):
        # Keep what follows the first ':'; the whole text if there is none.
        text = str(text)
        return text[text.find(':') + 1:]

    # Replace the 'unkown' placeholder with NaN.
    for col in ('enum_tag', 'numeric_tag'):
        data[col] = data[col].mask(data[col] == 'unkown')

    # Fill missing 'numeric_tag' from 'enum_tag' in one index-aligned
    # operation.  The previous per-row `data['numeric_tag'][i] = ...` was
    # chained assignment: slow, and not guaranteed to write to the frame.
    data['numeric_tag'] = data['numeric_tag'].fillna(data['enum_tag'])
    del data['enum_tag']

    # 1) One row -> many rows: split on '|' and stack the pieces.
    pieces = (
        data['numeric_tag']
        .str.split('|', expand=True)
        .stack()
        .reset_index(level=1, drop=True)
        .rename('numeric_tag')
    )
    data = data.drop('numeric_tag', axis=1).join(pieces)
    # Strip stray single quotes from the pieces.
    data['numeric_tag'] = [i.replace("'", '') if "'" in str(i) else i for i in data['numeric_tag']]

    # 2) One column -> many columns: tag id, session date and value.
    data['session_tagid'] = data['numeric_tag'].apply(lambda x: _after_colon(str(x).split(';')[0]))
    data['session_time'] = data['numeric_tag'].apply(lambda x: _after_colon(str(x)[str(x).find('time'):]))
    # Anything that is not a valid YYYY-MM-DD date becomes NaT.
    data['session_time'] = pd.to_datetime(data['session_time'], format='%Y-%m-%d', errors='coerce')
    data['value'] = data['numeric_tag'].apply(session_value).apply(_after_colon)
    del data['numeric_tag']

    # Shrink dtypes to reduce the memory footprint.
    reduce_list = ['brand', 'model', 'province', 'city']
    data[reduce_list] = data[reduce_list].astype(np.int16)
    data['value'] = data['value'].astype(np.float16)

    return data



def feature_engineering(data):
    """Aggregate the per-session rows into one feature row per ``pid_id``.

    Expects the frame produced by ``data_preprocessing`` (columns used here:
    ``pid``, ``pid_id``, ``session_time``, ``session_tagid``, ``value``,
    ``brand``, ``model``, ``province``, ``city``).

    The function
    * derives year/month/day-of-week and treats year 1970 as missing,
    * computes per-``pid`` statistics (mean value, first/last year, session
      count, active days, day spans, average sessions per day),
    * computes the same kind of statistics restricted to 2023 sessions,
    * counts sessions per year / month / weekday for every ``pid_id``,
    * label-encodes the categorical columns, caps outliers, derives ratio
      features, bins the numeric features, imputes missing values and
      finally downcasts dtypes through ``reduce_mem_usage``.

    Returns the resulting DataFrame.
    """

    def _cap(series, threshold, cap):
        # Replace values >= threshold by `cap`.  NaN is kept as NaN so that
        # the imputation step below can handle it (a plain `x < threshold`
        # test would silently turn every NaN into the cap value).
        return series.where((series < threshold) | series.isna(), cap)

    # ---- Calendar features -------------------------------------------
    data['year'] = data['session_time'].dt.year
    data['month'] = data['session_time'].dt.month
    data['dayofweek'] = data['session_time'].dt.dayofweek

    # Year 1970 is treated as invalid: blank the year and the timestamp.
    data['year'] = data['year'].where(data['year'] != 1970)
    data['session_time'] = data['session_time'].where(data['session_time'].dt.year != 1970)

    # Mean of 'value' per pid.
    data['value_mean'] = data.groupby(by=['pid'])['value'].transform('mean')
    del data['value']

    # First / last active year per pid.
    data['first_year'] = data.groupby(by=['pid'])['year'].transform('min')
    data['last_year'] = data.groupby(by=['pid'])['year'].transform('max')

    # ---- Per-pid session statistics ------------------------------------
    # Adds 'session_count', 'first_time', 'last_time', 'active_days'.
    data = feature_merge(data, 'pid')
    # The helper may hand these back as object columns; normalise dtypes.
    data['session_count'] = pd.to_numeric(data['session_count'])
    data['active_days'] = pd.to_numeric(data['active_days'])

    data['dif_days_total'] = (pd.to_datetime(data['last_time']) - pd.to_datetime(data['first_time'])).dt.days
    data['dif_days_avg'] = data['dif_days_total'] / data['active_days']

    # Average number of sessions per active day.  The column is named
    # 'day_sessions_avg' because that is the name every later step reads.
    data['one_day_sessions'] = data.groupby(by=['pid', 'session_time'])['session_tagid'].transform('count')
    data['day_sessions_avg'] = data.groupby(by=['pid'])['one_day_sessions'].transform('mean').round(2)
    del data['one_day_sessions']

    # ---- 2023-only statistics ------------------------------------------
    data_2023 = data[data['year'] == 2023][['pid_id', 'session_time']]
    # Adds '2023_session_count', '2023_first_time', '2023_last_time',
    # '2023_active_days'.
    data_2023 = feature_merge_2023(data_2023, 'pid_id')
    data_2023['2023_session_count'] = pd.to_numeric(data_2023['2023_session_count'])
    data_2023['2023_active_days'] = pd.to_numeric(data_2023['2023_active_days'])
    last_2023 = pd.to_datetime(data_2023['2023_last_time'])
    first_2023 = pd.to_datetime(data_2023['2023_first_time'])

    # Days between the last 2023 session and the fixed date 2023-08-05.
    data_2023['2023_recency'] = (pd.Timestamp('2023-08-05') - last_2023).dt.days
    data_2023['2023_dif_days_total'] = (last_2023 - first_2023).dt.days
    data_2023['2023_dif_days_avg'] = (data_2023['2023_dif_days_total'] / data_2023['2023_active_days']).round(2)
    # Sessions per (pid_id, day): count one explicit column so the result
    # is a single Series.
    data_2023['2023_day_sessions'] = data_2023.groupby(by=['pid_id', 'session_time'])['pid_id'].transform('count')
    data_2023['2023_day_sessions_avg'] = data_2023.groupby(by=['pid_id'])['2023_day_sessions'].transform('mean').round(2)

    # Keep only per-pid_id aggregates, one row per pid_id.
    data_2023 = data_2023.drop(
        columns=['session_time', '2023_day_sessions', '2023_last_time', '2023_first_time']
    )
    data_2023 = data_2023.drop_duplicates()

    # ---- Session counts per year / month / weekday ----------------------
    onehot_data = pd.get_dummies(
        data[['pid_id', 'year', 'month', 'dayofweek']],
        columns=['year', 'month', 'dayofweek'],
    )
    # Summing the indicator columns per pid_id yields '<dummy>_count'
    # columns and one row per pid_id, whatever years occur in the data.
    onehot_data = onehot_data.groupby('pid_id').sum().add_suffix('_count').reset_index()

    # ---- Collapse the session rows to one row per pid_id ----------------
    data = data.drop(columns=[
        'pid', 'session_tagid', 'session_time', 'last_time', 'first_time',
        'year', 'month', 'dayofweek',
    ])
    data.drop_duplicates(inplace=True)

    data = data.merge(data_2023, how='left', on='pid_id')
    data = data.merge(onehot_data, how='left', on='pid_id')

    # ---- Categorical features: code = order of first appearance ---------
    for col in ('brand', 'model', 'province', 'city'):
        codes = {val: ind for ind, val in enumerate(data[col].unique())}
        data[col] = data[col].map(codes).astype('float')

    # Ordinal encoding of 'first_year' / 'last_year' (most recent = 0).
    year_dict = {
        2023.0: 0,
        2022.0: 1,
        2021.0: 2,
        2020.0: 3,
        2019.0: 4,
        2018.0: 5,
        2017.0: 6,
        2015.0: 7,
        2010.0: 8,
        2009.0: 9}
    # Missing or unmapped years stay NaN here; the int8 cast happens after
    # imputation because NaN cannot be stored in an integer column.
    data['first_year'] = data['first_year'].map(year_dict)
    data['last_year'] = data['last_year'].map(year_dict)

    # ---- Cap outliers in the numeric features ---------------------------
    data['session_count'] = _cap(data['session_count'], 96, 100)
    data['active_days'] = _cap(data['active_days'], 32, 35)
    data['dif_days_total'] = _cap(data['dif_days_total'], 1000, 1000)
    data['dif_days_avg'] = _cap(data['dif_days_avg'], 140, 140)
    data['day_sessions_avg'] = _cap(data['day_sessions_avg'], 20, 20)
    data['2023_session_count'] = _cap(data['2023_session_count'], 50, 50)
    data['2023_day_sessions_avg'] = _cap(data['2023_day_sessions_avg'], 20, 20)

    # ---- Derived ratio / difference features ----------------------------
    data['2023_session_rate'] = (data['2023_session_count'] / data['session_count']).round(2)
    data['2023_active_rate'] = (data['2023_active_days'] / data['active_days']).round(2)
    data['2023_difdays_reduce'] = (data['dif_days_avg'] - data['2023_dif_days_avg']).round(2)
    data['2023_day_sessions_pro'] = (data['2023_day_sessions_avg'] - data['day_sessions_avg']).round(2)

    # ---- Binning of the numeric features --------------------------------
    bins_c = [0, 1, 3, 5, 10, 24, 35, 48, 56, 72, 95, 500]
    data['session_count_bin'] = pd.cut(data['session_count'], bins_c, labels=False)

    bins_act = [-1, 1, 2, 3, 7, 10, 15, 20, 32, 90]
    data['active_days_bin'] = pd.cut(data['active_days'], bins_act, labels=False)

    bins_dif = [-1, 0, 7, 14, 30, 60, 90, 120, 150, 180, 240, 365, 730, 1095, 20000]
    data['dif_days_bin'] = pd.cut(data['dif_days_total'], bins_dif, labels=False)

    bins_dif_avg = [-1, 0, 7, 14, 21, 30, 60, 90, 120, 180, 365, 10000]
    data['dif_days_avg_bin'] = pd.cut(data['dif_days_avg'], bins_dif_avg, labels=False)

    bins_se = [-1, 1, 2, 3, 5, 12, 24, 35, 65, 500]
    data['day_sessions_avg_bin'] = pd.cut(data['day_sessions_avg'], bins_se, labels=False)

    bins_rec = [-1, 60, 90, 120, 150, 180, 220]
    data['2023_recency_bin'] = pd.cut(data['2023_recency'], bins_rec, labels=False)

    bins_sec = [0, 1, 3, 5, 10, 21, 27, 32, 43, 65, 120, 320]
    data['2023_session_count_bin'] = pd.cut(data['2023_session_count'], bins_sec, labels=False)

    bins_acd = [0, 1, 4, 9, 12, 20, 25]
    data['2023_active_days_bin'] = pd.cut(data['2023_active_days'], bins_acd, labels=False)

    bins_2023_dif = [-1, 0, 3, 7, 14, 21, 30, 60, 90, 120, 180, 200]
    data['2023_dif_days_bin'] = pd.cut(data['2023_dif_days_total'], bins_2023_dif, labels=False)

    bins_2023_dda = [-1, 0, 3, 7, 15, 23, 31, 40, 63, 100]
    data['2023_dif_days_avg_bin'] = pd.cut(data['2023_dif_days_avg'], bins_2023_dda, labels=False)

    bins_dse = [0, 1, 3, 5, 9, 15, 300]
    data['2023_day_sessions_avg_bin'] = pd.cut(data['2023_day_sessions_avg'], bins_dse, labels=False)

    # ---- Missing-value imputation ---------------------------------------
    # Year codes: most frequent value (smallest one on ties).
    year_col = ['first_year', 'last_year']
    year_modes = data[year_col].mode()
    if not year_modes.empty:
        data[year_col] = data[year_col].fillna(year_modes.iloc[0])
    for col in year_col:
        # Only cast when nothing is missing any more (all-NaN column).
        if data[col].notna().all():
            data[col] = data[col].astype('int8')

    # Numeric features: column median.
    num_col = ['session_count', 'active_days', 'dif_days_total',
               'dif_days_avg', 'day_sessions_avg', '2023_session_count',
               '2023_active_days', '2023_recency', '2023_dif_days_total',
               '2023_dif_days_avg', '2023_day_sessions_avg', 'value_mean']
    data[num_col] = data[num_col].fillna(data[num_col].median())

    # Shrink dtypes to reduce the memory footprint.
    data = reduce_mem_usage(data)

    return data


def session_value(x):
    """Extract the ``value...`` part of a tag entry.

    ``x`` is converted to a string.  If it does not contain ``'value'``,
    NaN is returned; otherwise the substring starting at ``'value'`` and
    ending one character before the position of ``'time'`` is returned.
    """
    text = str(x)
    start = text.find('value')
    if start == -1:
        return np.nan
    # Stop one character before 'time' (drops the separator in front of it).
    stop = text.find('time') - 1
    return text[start:stop]
    
# def is_year_2023(x):
#     if x==2023.0:
#         return 1
#     elif x==-1:
#         return x
#     else:
#         return 0

def feature_merge(data, feature):
    """Attach per-group session statistics to every row of ``data``.

    Rows are grouped by the column ``feature`` and the following columns,
    computed from the non-missing ``session_time`` values of each group,
    are left-merged back onto ``data``:

    * ``session_count`` -- number of rows with a session time,
    * ``first_time`` / ``last_time`` -- earliest / latest session time,
    * ``active_days`` -- number of distinct session times.

    A group without any valid session time gets a count of 0 and missing
    first/last times.  Returns the merged DataFrame.
    """
    # count/min/max/nunique all ignore missing values.  The previous filter
    # `session_time != np.nan` was always True and therefore kept them.
    stats = data.groupby(feature)['session_time'].agg(['count', 'min', 'max', 'nunique'])
    stats = stats.rename(columns={
        'count': 'session_count',
        'min': 'first_time',
        'max': 'last_time',
        'nunique': 'active_days',
    })
    return data.merge(stats.reset_index(), how='left', on=feature)

def feature_merge_2023(data, feature):
    """Attach per-group session statistics, using ``2023_``-prefixed names.

    Rows are grouped by the column ``feature`` and the following columns,
    computed from the non-missing ``session_time`` values of each group,
    are left-merged back onto ``data``:

    * ``2023_session_count`` -- number of rows with a session time,
    * ``2023_first_time`` / ``2023_last_time`` -- earliest / latest time,
    * ``2023_active_days`` -- number of distinct session times.

    The function itself does not filter by year; the caller is expected to
    pass only the rows it wants summarised.  Returns the merged DataFrame.
    """
    # count/min/max/nunique all ignore missing values.  The previous filter
    # `session_time != np.nan` was always True and therefore kept them.
    stats = data.groupby(feature)['session_time'].agg(['count', 'min', 'max', 'nunique'])
    stats = stats.rename(columns={
        'count': '2023_session_count',
        'min': '2023_first_time',
        'max': '2023_last_time',
        'nunique': '2023_active_days',
    })
    return data.merge(stats.reset_index(), how='left', on=feature)



def reduce_mem_usage(data):
    """Downcast the columns of a DataFrame to smaller dtypes to save memory.

    The frame is modified in place and also returned.

    * ``object`` columns are converted to ``category``.
    * Signed integer columns get the smallest of int8/int16/int32/int64
      whose range contains the column's min and max (bounds inclusive).
    * Other numeric columns (floats, unsigned ints) get float16 or float32
      when their min and max fit, otherwise float64.  Note that float16
      stores only an 11-bit significand, so this downcast can round values.
    * Boolean and non-numeric columns (datetime, timedelta, category, ...)
      are left untouched: they cannot be compared against numeric limits.

    Columns whose min/max are NaN (empty or all-NaN) fail every range check:
    integer columns are then left as they are and the rest become float64.
    """
    for col in data.columns:
        col_type = data[col].dtype

        if col_type == object:
            data[col] = data[col].astype('category')
            continue

        # Skip dtypes for which a numeric range check makes no sense.
        if pd.api.types.is_bool_dtype(col_type) or not pd.api.types.is_numeric_dtype(col_type):
            continue

        c_min = data[col].min()
        c_max = data[col].max()

        if str(col_type)[:3] == 'int':
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the limit still fits.
                if limits.min <= c_min and c_max <= limits.max:
                    data[col] = data[col].astype(candidate)
                    break
        else:
            target = np.float64
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
            data[col] = data[col].astype(target)
    return data