In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from category_encoders import TargetEncoder
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.neural_network import MLPRegressor

## function

In [2]:
def nanfill(data,row, column):
    tag = data.iloc[row,1]
    # print(data[data.iloc[:,1] == tag].iloc[:,column],'\n')
    temp = data[data.iloc[:,1] == tag].iloc[:,column]
    temp = np.average(temp[temp.notna()])
    if (pd.isna(temp)):
        print(row,column)
    data.iloc[row,column] = temp

def normalization(data):
    for i in range(data.shape[1]):
        m1 = min(data.iloc[:,i])
        m2 = max(data.iloc[:,i])
        data = (data.iloc[:,i] - m1) / (m2 - m1)

# 计算交叉验证结果的均值和标准差， 默认为五折交叉验证，结果为相关系数R2      
def cross_validate(models, cv = 5, scoring = 'r2'):
    means = []
    stds = []
    for model in models:
        score = cross_val_score(model, X_train, y_train, cv = cv, scoring = scoring)
        means.append(score.mean())
        stds.append(score.std(ddof = 1))
    return means, stds

## Data import

In [3]:
raw_data = pd.read_csv('Data.csv')

for i in range(3):
    for j in range(raw_data.shape[0]):
        raw_data.iloc[j,i+2] = np.float64(raw_data.iloc[j,i+2].replace(',',''))
        
for i in range(raw_data.shape[0]):
    for j in range(raw_data.shape[1]):
        if (pd.isna(raw_data.iloc[i,j])):
            nanfill(raw_data,i,j)

## Analysis of Correlation

In [None]:
group = [['LACCESS_POP15','LACCESS_LOWI15','LACCESS_HHNV15','LACCESS_CHILD15','LACCESS_SENIORS15'],
        ['GROCPTH16', 'SUPERCPTH16', 'CONVSPTH16', 'SPECSPTH16', 'WICSPTH16'],
        ['FFRPTH16', 'FSRPTH16'],
        ['FOODINSEC_15_17', 'VLFOODSEC_15_17'],
        ['FMRKT_WIC18', 'FMRKT_WICCASH18'],
        ['POVRATE15', 'CHILDPOVRATE15']]

In [None]:
for i in range(len(group)):
    corr = raw_data[group[i]].corr()
    plt.figure(figsize = (12,8), dpi = 300)
    sns.heatmap(corr,linewidths=0.1,vmax=1.0, square=True,linecolor='white', annot=True)
    plt.savefig('corr_heatmap_{}.jpg'.format(i))

## Data preprocessing

In [4]:
preserve_columns = ['County','State','Population_Estimate_2016','LACCESS_POP15','GROCPTH16',
                    'SUPERCPTH16','CONVSPTH16','SPECSPTH16','WICSPTH16','FFRPTH16','FSRPTH16',
                    'FOODINSEC_15_17','FMRKT_WIC18','POVRATE15','PCT_WIC17']
raw_data = raw_data[preserve_columns]
raw_data['PCT_WIC17'] = raw_data['PCT_WIC17'] * raw_data['Population_Estimate_2016']
raw_data['FOODINSEC_15_17'] = raw_data['FOODINSEC_15_17'] * raw_data['Population_Estimate_2016']
raw_data['POVRATE15'] = raw_data['POVRATE15'] * raw_data['Population_Estimate_2016']

# for i in range(raw_data.shape[1]-2):
#     plt.figure(figsize = (12,8), dpi = 300)
#     plt.plot(range(raw_data.shape[0]), raw_data.iloc[:,i+2])
#     print(preserve_columns[i+2])



In [5]:
enc = TargetEncoder(cols = ['State']).fit(raw_data['State'],raw_data['PCT_WIC17'])
raw_data['State'] = enc.transform(raw_data['State'])



In [6]:
data = raw_data.sample(frac = 1).values
X_train = np.float64(data[:2512,1:-1])
y_train = np.float64(data[:2512,-1])
X_test = np.float64(data[2512:,1:-1])
y_test = np.float64(data[2512:,-1])

## Ridge

In [None]:
Ridge_clfs = []
for i in range(5):
    clf = KernelRidge(alpha = 10 ** (i - 2), kernel = 'linear')
    Ridge_clfs.append(clf)
means1, stds1 = cross_validate(Ridge_clfs)

In [10]:
Ridge_clfs = []
for i in range(5):
    clf = KernelRidge(alpha = 10 ** (i - 2), kernel = 'sigmoid')
    Ridge_clfs.append(clf)
means2, stds2 = cross_validate(Ridge_clfs)

In [11]:
Ridge_clfs = []
for i in range(5):
    clf = KernelRidge(alpha = 10 ** (i - 2), kernel = 'rbf')
    Ridge_clfs.append(clf)
means3, stds3 = cross_validate(Ridge_clfs)

## Lasso

In [9]:
Lasso_clfs = []
for i in range(5):
    clf = Lasso(alpha = 10 ** (i - 2), max_iter = 5000, random_state = 913)
    Lasso_clfs.append(clf)
mean, std = cross_validate(Lasso_clfs)

## Elastic Net

In [15]:
ElasticNet_clfs = []
l1_ratio = [0.25,0.5,0.75]
for i in range(3):
    same_ratio = []
    for j in range(5):
        clf = ElasticNet(alpha = 10 ** (i - 2), l1_ratio = l1_ratio[i], random_state = 913, max_iter = 5000)
        same_ratio.append(clf)
    ElasticNet_clfs.append(clf)
    
mean, std = cross_validate(ElasticNet_clfs)

## Decision tree

In [12]:
DecisionTrees = []
clf = DecisionTreeRegressor(criterion = 'squared_error', random_state = 913)
DecisionTrees.append(clf)
clf = DecisionTreeRegressor(criterion = 'friedman_mse', random_state = 913)
DecisionTrees.append(clf)
clf = DecisionTreeRegressor(criterion = 'absolute_error', random_state = 913)
DecisionTrees.append(clf)
clf = DecisionTreeRegressor(criterion = 'poisson', random_state = 913)
DecisionTrees.append(clf)
cross_validate(DecisionTrees)

([0.8479539675280255,
  0.8503961441360299,
  0.8387932505589386,
  0.8335042699150778],
 [0.15318355977226067,
  0.15331643407752396,
  0.15052858738464922,
  0.14559628438148853])

## NN

In [9]:
%%time
NNs = []
size_list = [(64,8),(64,16,4),(128,16),(128,32,8)]
for i in range(len(size_list)):
    NN_same_size = []
    for j in range(3):
        clf = MLPRegressor(hidden_layer_sizes = size_list[i], learning_rate_init = 10 ** (-i - 2),
                           random_state = 913, max_iter = 2000)
        NN_same_size.append(clf)
    NNs.append(NN_same_size)

means = []
stds = []
for i in range(len(NNs)):
    mean, std = cross_validate(NNs[i])
    means.append(mean)
    std.append(std)

CPU times: total: 4min 57s
Wall time: 3min 6s
