# 该notebook为天池预测幸福感比赛中本人建模过程

导入模块

In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

导入数据集

In [29]:
train = pd.read_csv('happiness_train_complete.csv', encoding='ISO-8859-1')
test = pd.read_csv('happiness_test_complete.csv', encoding='ISO-8859-1')
data = pd.concat([train, test], ignore_index=True)

先定义几个数据预处理中需要用到的函数

In [30]:
def reduce_mem_usage(df): #防止内存过大导致计算过程中出现内存错误，将数据集压缩至最优格式到达减少内存的目的
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() 
    print('Memory usage of dataframe is {:.2f} B'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() 
    print('Memory usage after optimization is: {:.2f} B'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

In [31]:
reduce_mem_usage(data)

Memory usage of dataframe is 12284288.00 B
Memory usage after optimization is: 2426928.00 B
Decreased by 80.2%


Unnamed: 0,id,happiness,survey_type,province,city,county,survey_time,gender,birth,nationality,...,neighbor_familiarity,public_service_1,public_service_2,public_service_3,public_service_4,public_service_5,public_service_6,public_service_7,public_service_8,public_service_9
0,1,4.0,1,12,32,59,2015/8/4 14:18,1,1959,1,...,4,50,60.0,50,50,30.0,30,50,50,50
1,2,4.0,2,18,52,85,2015/7/21 15:04,1,1992,1,...,3,90,70.0,70,80,85.0,70,90,60,60
2,3,4.0,2,29,83,126,2015/7/21 13:24,2,1967,1,...,4,90,80.0,75,79,80.0,90,90,90,75
3,4,5.0,2,10,28,51,2015/7/25 17:33,2,1943,1,...,3,100,90.0,70,80,80.0,90,90,80,80
4,5,4.0,1,7,18,36,2015/8/10 9:50,2,1994,1,...,2,50,50.0,50,50,50.0,50,50,50,50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10963,10964,,1,27,77,117,2015/7/19 20:01,2,1946,1,...,5,50,60.0,60,70,50.0,60,40,60,50
10964,10965,,2,26,74,114,2015/8/8 13:09,2,1977,1,...,3,60,50.0,70,50,50.0,50,50,50,50
10965,10966,,2,29,84,127,2015/7/22 9:29,2,1968,1,...,4,60,60.0,60,60,60.0,60,60,60,60
10966,10967,,1,11,31,54,2015/7/20 16:06,1,1950,1,...,3,84,60.0,70,87,90.0,80,80,80,80


In [32]:
def drop_morenan(data): #当数据集的缺失值数量超过一定数量我们就将其删除
    for i in data.columns:
        print('---------------------')
        if i == 'happiness': #测试集没有标签，忽略掉
            continue
        print(i + '缺失值为{}'.format(data[i].isnull().sum()))
        #print('缺失数量大于20%，将该特征删除')
        if data[i].isnull().sum() > len(data) * 0.2:
            print('缺失数量大于20%，将该特征删除')
            data = data.drop(columns=i)
    return data

In [33]:
def search_negative(data): #因为调查问卷中不会有负值，所以要查看数据中的异常值
    columns = data.columns
    for column in columns:
        if data[column].dtype != 'object':
            print('%s 的异常值个数为：%d' % (column,len(data.loc[data[column] < 0,:][column])))

In [34]:
def negative_to_nan(data): #蒋异常值转化为缺失值方便处理
    for i in data.columns:
        data[i] = data[i].apply(lambda x: x if x >= 0 else np.nan)
    return data

数据预处理与特征工程

In [35]:
data = data.drop(columns='survey_time') #很明显调查时间不是影响结果的特征，我们只需要对结果有影响的特征

In [36]:
data.info(max_cols=150)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10968 entries, 0 to 10967
Data columns (total 139 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   id                    10968 non-null  int16   
 1   happiness             8000 non-null   float16 
 2   survey_type           10968 non-null  int8    
 3   province              10968 non-null  int8    
 4   city                  10968 non-null  int8    
 5   county                10968 non-null  int16   
 6   gender                10968 non-null  int8    
 7   birth                 10968 non-null  int16   
 8   nationality           10968 non-null  int8    
 9   religion              10968 non-null  int8    
 10  religion_freq         10968 non-null  int8    
 11  edu                   10968 non-null  int8    
 12  edu_other             6 non-null      category
 13  edu_status            9399 non-null   float16 
 14  edu_yr                8212 non-null   float16 
 15  i

In [37]:
data = data.drop(columns='edu_other') #category类型的数据异常值很多先删除
data = data.drop(columns='property_other')
data = data.drop(columns='invest_other')

In [38]:
search_negative(data)

id 的异常值个数为：0
happiness 的异常值个数为：12
survey_type 的异常值个数为：0
province 的异常值个数为：0
city 的异常值个数为：0
county 的异常值个数为：0
gender 的异常值个数为：0
birth 的异常值个数为：0
nationality 的异常值个数为：20
religion 的异常值个数为：146
religion_freq 的异常值个数为：22
edu 的异常值个数为：19
edu_status 的异常值个数为：38
edu_yr 的异常值个数为：1679
income 的异常值个数为：605
political 的异常值个数为：47
join_party 的异常值个数为：120
floor_area 的异常值个数为：0
property_0 的异常值个数为：0
property_1 的异常值个数为：0
property_2 的异常值个数为：0
property_3 的异常值个数为：0
property_4 的异常值个数为：0
property_5 的异常值个数为：0
property_6 的异常值个数为：0
property_7 的异常值个数为：0
property_8 的异常值个数为：0
height_cm 的异常值个数为：0
weight_jin 的异常值个数为：0
health 的异常值个数为：7
health_problem 的异常值个数为：52
depression 的异常值个数为：26
hukou 的异常值个数为：0
hukou_loc 的异常值个数为：0
media_1 的异常值个数为：2
media_2 的异常值个数为：5
media_3 的异常值个数为：5
media_4 的异常值个数为：4
media_5 的异常值个数为：17
media_6 的异常值个数为：21
leisure_1 的异常值个数为：7
leisure_2 的异常值个数为：22
leisure_3 的异常值个数为：21
leisure_4 的异常值个数为：16
leisure_5 的异常值个数为：92
leisure_6 的异常值个数为：21
leisure_7 的异常值个数为：44
leisure_8 的异常值个数为：24
leisure_9 的异常值个数为：27
leisure_10 的异常值个数为：47

In [39]:
negative_to_nan(data) #将异常值都转为缺失值

Unnamed: 0,id,happiness,survey_type,province,city,county,gender,birth,nationality,religion,...,neighbor_familiarity,public_service_1,public_service_2,public_service_3,public_service_4,public_service_5,public_service_6,public_service_7,public_service_8,public_service_9
0,1,4.0,1,12,32,59,1,1959,1.0,1.0,...,4.0,50.0,60.0,50.0,50.0,30.0,30.0,50.0,50.0,50.0
1,2,4.0,2,18,52,85,1,1992,1.0,1.0,...,3.0,90.0,70.0,70.0,80.0,85.0,70.0,90.0,60.0,60.0
2,3,4.0,2,29,83,126,2,1967,1.0,0.0,...,4.0,90.0,80.0,75.0,79.0,80.0,90.0,90.0,90.0,75.0
3,4,5.0,2,10,28,51,2,1943,1.0,1.0,...,3.0,100.0,90.0,70.0,80.0,80.0,90.0,90.0,80.0,80.0
4,5,4.0,1,7,18,36,2,1994,1.0,1.0,...,2.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10963,10964,,1,27,77,117,2,1946,1.0,1.0,...,5.0,50.0,60.0,60.0,70.0,50.0,60.0,40.0,60.0,50.0
10964,10965,,2,26,74,114,2,1977,1.0,1.0,...,3.0,60.0,50.0,70.0,50.0,50.0,50.0,50.0,50.0,50.0
10965,10966,,2,29,84,127,2,1968,1.0,1.0,...,4.0,60.0,60.0,60.0,60.0,60.0,60.0,60.0,60.0,60.0
10966,10967,,1,11,31,54,1,1950,1.0,1.0,...,3.0,84.0,60.0,70.0,87.0,90.0,80.0,80.0,80.0,80.0


In [40]:
search_negative(data)

id 的异常值个数为：0
happiness 的异常值个数为：0
survey_type 的异常值个数为：0
province 的异常值个数为：0
city 的异常值个数为：0
county 的异常值个数为：0
gender 的异常值个数为：0
birth 的异常值个数为：0
nationality 的异常值个数为：0
religion 的异常值个数为：0
religion_freq 的异常值个数为：0
edu 的异常值个数为：0
edu_status 的异常值个数为：0
edu_yr 的异常值个数为：0
income 的异常值个数为：0
political 的异常值个数为：0
join_party 的异常值个数为：0
floor_area 的异常值个数为：0
property_0 的异常值个数为：0
property_1 的异常值个数为：0
property_2 的异常值个数为：0
property_3 的异常值个数为：0
property_4 的异常值个数为：0
property_5 的异常值个数为：0
property_6 的异常值个数为：0
property_7 的异常值个数为：0
property_8 的异常值个数为：0
height_cm 的异常值个数为：0
weight_jin 的异常值个数为：0
health 的异常值个数为：0
health_problem 的异常值个数为：0
depression 的异常值个数为：0
hukou 的异常值个数为：0
hukou_loc 的异常值个数为：0
media_1 的异常值个数为：0
media_2 的异常值个数为：0
media_3 的异常值个数为：0
media_4 的异常值个数为：0
media_5 的异常值个数为：0
media_6 的异常值个数为：0
leisure_1 的异常值个数为：0
leisure_2 的异常值个数为：0
leisure_3 的异常值个数为：0
leisure_4 的异常值个数为：0
leisure_5 的异常值个数为：0
leisure_6 的异常值个数为：0
leisure_7 的异常值个数为：0
leisure_8 的异常值个数为：0
leisure_9 的异常值个数为：0
leisure_10 的异常值个数为：0
leisure_11 的异常值个数为：0
leisur

In [41]:
drop_morenan(data)

---------------------
id缺失值为0
---------------------
---------------------
survey_type缺失值为0
---------------------
province缺失值为0
---------------------
city缺失值为0
---------------------
county缺失值为0
---------------------
gender缺失值为0
---------------------
birth缺失值为0
---------------------
nationality缺失值为20
---------------------
religion缺失值为146
---------------------
religion_freq缺失值为22
---------------------
edu缺失值为19
---------------------
edu_status缺失值为1607
---------------------
edu_yr缺失值为4435
缺失数量大于20%，将该特征删除
---------------------
income缺失值为605
---------------------
political缺失值为47
---------------------
join_party缺失值为9962
缺失数量大于20%，将该特征删除
---------------------
floor_area缺失值为0
---------------------
property_0缺失值为0
---------------------
property_1缺失值为0
---------------------
property_2缺失值为0
---------------------
property_3缺失值为0
---------------------
property_4缺失值为0
---------------------
property_5缺失值为0
---------------------
property_6缺失值为0
---------------------
property_7缺失值为0
-------------------

Unnamed: 0,id,happiness,survey_type,province,city,county,gender,birth,nationality,religion,...,neighbor_familiarity,public_service_1,public_service_2,public_service_3,public_service_4,public_service_5,public_service_6,public_service_7,public_service_8,public_service_9
0,1,4.0,1,12,32,59,1,1959,1.0,1.0,...,4.0,50.0,60.0,50.0,50.0,30.0,30.0,50.0,50.0,50.0
1,2,4.0,2,18,52,85,1,1992,1.0,1.0,...,3.0,90.0,70.0,70.0,80.0,85.0,70.0,90.0,60.0,60.0
2,3,4.0,2,29,83,126,2,1967,1.0,0.0,...,4.0,90.0,80.0,75.0,79.0,80.0,90.0,90.0,90.0,75.0
3,4,5.0,2,10,28,51,2,1943,1.0,1.0,...,3.0,100.0,90.0,70.0,80.0,80.0,90.0,90.0,80.0,80.0
4,5,4.0,1,7,18,36,2,1994,1.0,1.0,...,2.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10963,10964,,1,27,77,117,2,1946,1.0,1.0,...,5.0,50.0,60.0,60.0,70.0,50.0,60.0,40.0,60.0,50.0
10964,10965,,2,26,74,114,2,1977,1.0,1.0,...,3.0,60.0,50.0,70.0,50.0,50.0,50.0,50.0,50.0,50.0
10965,10966,,2,29,84,127,2,1968,1.0,1.0,...,4.0,60.0,60.0,60.0,60.0,60.0,60.0,60.0,60.0,60.0
10966,10967,,1,11,31,54,1,1950,1.0,1.0,...,3.0,84.0,60.0,70.0,87.0,90.0,80.0,80.0,80.0,80.0


In [42]:
data.info(max_cols=150)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10968 entries, 0 to 10967
Data columns (total 136 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    10968 non-null  int64  
 1   happiness             7988 non-null   float64
 2   survey_type           10968 non-null  int64  
 3   province              10968 non-null  int64  
 4   city                  10968 non-null  int64  
 5   county                10968 non-null  int64  
 6   gender                10968 non-null  int64  
 7   birth                 10968 non-null  int64  
 8   nationality           10948 non-null  float64
 9   religion              10822 non-null  float64
 10  religion_freq         10946 non-null  float64
 11  edu                   10949 non-null  float64
 12  edu_status            9361 non-null   float64
 13  edu_yr                6533 non-null   float64
 14  income                10363 non-null  float64
 15  political         

In [43]:
data= reduce_mem_usage(data)

Memory usage of dataframe is 11933312.00 B
Memory usage after optimization is: 2796968.00 B
Decreased by 76.6%


In [44]:
def fillna(data): #缺失值填充，这里安照均值填充
    for i in data.columns:
        if i =='happiness':
            continue
        data[i] = data[i].fillna(data[i].mean())

In [45]:
fillna(data)

In [46]:
data.info(max_cols=150)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10968 entries, 0 to 10967
Data columns (total 136 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    10968 non-null  int16  
 1   happiness             7988 non-null   float16
 2   survey_type           10968 non-null  int8   
 3   province              10968 non-null  int8   
 4   city                  10968 non-null  int8   
 5   county                10968 non-null  int16  
 6   gender                10968 non-null  int8   
 7   birth                 10968 non-null  int16  
 8   nationality           10968 non-null  float16
 9   religion              10968 non-null  float16
 10  religion_freq         10968 non-null  float16
 11  edu                   10968 non-null  float16
 12  edu_status            10968 non-null  float16
 13  edu_yr                10968 non-null  float16
 14  income                10968 non-null  float32
 15  political         

In [50]:
data.corr()['happiness'] 

id                  0.004041
happiness           1.000000
survey_type        -0.033844
province           -0.018568
city               -0.021038
                      ...   
public_service_5    0.156715
public_service_6    0.182439
public_service_7    0.186180
public_service_8    0.139224
public_service_9    0.147039
Name: happiness, Length: 136, dtype: float64

In [52]:
ls  = list(data.corr()['happiness'].apply(lambda x: abs(x)).sort_values()[:20].index) #看出来低关系系数非常低，将很低的特征筛选出来

In [53]:
data = data.drop(ls, axis=1)

In [54]:
data.info(max_cols=150)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10968 entries, 0 to 10967
Data columns (total 116 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   happiness             7988 non-null   float16
 1   survey_type           10968 non-null  int8   
 2   province              10968 non-null  int8   
 3   city                  10968 non-null  int8   
 4   county                10968 non-null  int16  
 5   gender                10968 non-null  int8   
 6   nationality           10968 non-null  float16
 7   religion              10968 non-null  float16
 8   edu                   10968 non-null  float16
 9   edu_status            10968 non-null  float16
 10  edu_yr                10968 non-null  float16
 11  income                10968 non-null  float32
 12  political             10968 non-null  float16
 13  join_party            10968 non-null  float16
 14  floor_area            10968 non-null  float16
 15  property_0        

In [56]:
train = data[data.index <= 7999] #特征工程完成后将训练集和测试集分开，然后开始建模
test = data[data.index > 7999]

In [57]:
test

Unnamed: 0,happiness,survey_type,province,city,county,gender,nationality,religion,edu,edu_status,...,neighbor_familiarity,public_service_1,public_service_2,public_service_3,public_service_4,public_service_5,public_service_6,public_service_7,public_service_8,public_service_9
8000,,1,2,2,9,2,8.0,0.0,4.0,2.000000,...,4.000000,80.0,80.0,60.0,80.0,80.0,80.0,80.0,80.0,80.0
8001,,1,22,66,106,2,1.0,1.0,4.0,4.000000,...,5.000000,90.0,80.0,80.0,80.0,80.0,80.0,70.0,80.0,80.0
8002,,2,9,22,44,2,1.0,1.0,1.0,3.544922,...,5.000000,95.0,95.0,80.0,90.0,80.0,95.0,95.0,80.0,90.0
8003,,2,18,52,86,2,1.0,1.0,4.0,2.000000,...,4.000000,80.0,80.0,70.0,90.0,80.0,80.0,70.0,60.0,50.0
8004,,2,24,70,110,1,1.0,1.0,1.0,3.544922,...,3.734375,60.0,50.0,0.0,30.0,40.0,50.0,60.0,inf,60.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10963,,1,27,77,117,2,1.0,1.0,3.0,2.000000,...,5.000000,50.0,60.0,60.0,70.0,50.0,60.0,40.0,60.0,50.0
10964,,2,26,74,114,2,1.0,1.0,4.0,2.000000,...,3.000000,60.0,50.0,70.0,50.0,50.0,50.0,50.0,50.0,50.0
10965,,2,29,84,127,2,1.0,1.0,1.0,3.544922,...,4.000000,60.0,60.0,60.0,60.0,60.0,60.0,60.0,60.0,60.0
10966,,1,11,31,54,1,1.0,1.0,7.0,4.000000,...,3.000000,84.0,60.0,70.0,87.0,90.0,80.0,80.0,80.0,80.0


In [58]:
train['happiness'].isnull().sum()

12

In [59]:
train  = train.dropna()

In [61]:
train['invest_6'].value_counts() #发现有一个只有一个类型的特征，删除掉

0    7988
Name: invest_6, dtype: int64

In [62]:
train = train.drop(columns='invest_6')
test = test.drop(columns='invest_6')

建立模型与调参

In [63]:
#我使用了xgboost模型进行预测，优点是该算法得到的模型非常强力，缺点是调参的计算过程非常慢，所以之前对数据集的压缩很有必要

In [None]:
#  默认随机挑选参数
train_d = xgb.DMatrix(x_train,y_train)
xgb1 = XGBRegressor(max_depth=3,
                     learning_rate=0.1,
                     n_estimators=5000,
                     silent=False,
                     booster='gbtree',
                     objective='reg:squarederror',
                     n_jobs=4,
                     gamma=0,
                     min_child_weight=1,
                     subsample=0.8,
                     colsample_bytree=0.8,
                     seed=7)

#  利用cv函数选择最佳的树的数量。
xgb.cv(xgb1.get_params(),train_d,xgb1.get_params()['n_estimators'],nfold=5,
      metrics='rmse',early_stopping_rounds=70)

In [None]:
param = {
    'max_depth':[1,2,3,4,5,6],
    'min_child_weight':[1,2,3,4,5,6]
        
#寻找最优的max_depth和min_child_weight参数
grid = GridSearchCV(xgb1,param_grid=param,cv=5)
grid.fit(Xtrain,ytrain) 
print('The Best Params:',grid.best_params_)
print('The Best Score:',grid.best_score_)

In [None]:
xgb1 = XGBRegressor(max_depth=2,
                     learning_rate=0.1,
                     n_estimators=218,
                     silent=False,
                     objective='reg:squarederror',
                     booster='gbtree',
                     n_jobs=4,
                     gamma=0,
                     min_child_weight=6,
                     subsample=0.8,
                     colsample_bytree=0.8,
                     seed=7)


param_grid = {'gamma':[0,1,2,3,4,5,6,7,8,9]} #查找最优的gamma参数
grid_search = GridSearchCV(xgb1,param_grid,cv=5, scoring='neg_mean_squared_error')

In [None]:
grid_search.fit(x_train,y_train)
print('best_params:',grid_search.best_params_)
print('best_score:',grid_search.best_score_)

In [None]:
xgb1 = XGBRegressor(max_depth=2,
                     learning_rate=0.1,
                     n_estimators=218,
                     silent=False,
                     objective='reg:squarederror',
                     booster='gbtree',
                     n_jobs=4,
                     gamma=3,
                     min_child_weight=6,
                     subsample=0.8,
                     colsample_bytree=0.8,
                     seed=7)
param_grid = {'subsample':[i/10.0 for i in range(5,11)],
             'colsample_bytree':[i/10.0 for i in range(1,11)]}
grid_search = GridSearchCV(xgb1,param_grid,iid=False,cv=5)

grid_search.fit(x_train,y_train)
print('best_params:',grid_search.best_params_)
print('best_score:',grid_search.best_score_)

In [None]:
xgb1 = XGBRegressor(max_depth=2,
                     learning_rate=0.1,
                     n_estimators=218,
                     silent=False,
                     objective='reg:squarederror',
                     booster='gbtree',
                     n_jobs=4,
                     gamma=3,
                     min_child_weight=6,
                     subsample=0.9,
                     colsample_bytree=0.3,
                     seed=7)
xgb1.fit(X=x_train, y=y_train)
xgb1.predict(test)
test_predict = pd.DataFrame(xgb1.predict(test))
test_predict['happiness'] = test_predict[0]
test_predict  = test_predict.drop(columns=0)
test_predict['id'] = test_predict.index + 8001

模型调参完毕，将输出结果上传至比赛网站

In [None]:
test_predict.to_csv('resultss1.csv')