# 507 Project
### Author: Diana Liang
### Date: 4/15/2020

Load Libraries

In [None]:
import pandas as pd
import numpy as np
import random
import itertools
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_iterative_imputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from keras.models import Sequential
from keras.layers import Dense
import keras.utils
import seaborn as sns
%matplotlib inline

Data Cleaning

In [None]:
# Data cleaning: build one table per survey cycle (2015, 2013, 2011), recode
# sentinel answer codes to NaN, stack the three cycles and write
# food_health.csv.

# Sentinel codes that appear in the raw files in place of a real answer.
# The previous version called Series.replace() on these without keeping the
# result, so the codes were never actually removed; they are now written back.
# NOTE(review): presumably "refused" / "don't know" codes -- confirm against
# the survey codebook.  77777 has one digit fewer than 999999; verify that it
# is not a typo.
MONEY_MISSING = [77777, 999999]
HEALTH_MISSING = [7, 9]
MEALS_MISSING = [7777, 9999]

# Output column name -> variable name in the raw file.
# The 2015 consumer file uses different variable names from 2013 and 2011.
CONSUMER_COLUMNS_2015 = {'money_grocery': 'CBD071', 'money_nonfood': 'CBD091',
                         'money_food': 'CBD111', 'money_eatout': 'CBD121',
                         'money_deliver': 'CBD131'}
CONSUMER_COLUMNS_PRE2015 = {'money_grocery': 'CBD070', 'money_nonfood': 'CBD090',
                            'money_food': 'CBD110', 'money_eatout': 'CBD120',
                            'money_deliver': 'CBD130'}
HEALTH_COLUMNS = {'health': 'HSD010'}
DIET_COLUMNS = {'meals_nothome': 'DBD895', 'meals_fastfood': 'DBD900',
                'meals_ready': 'DBD905', 'meals_frozen': 'DBD910'}


def _load_table(path, columns, missing_codes):
    """Read one XPT file and return SEQN plus the renamed, recoded columns.

    path: file name passed to pd.read_sas.
    columns: dict mapping output column name -> raw variable name.
    missing_codes: values that are replaced by NaN in the output columns.
    """
    raw = pd.read_sas(path)
    table = pd.DataFrame({'SEQN': raw['SEQN'],
                          **{name: raw[code] for name, code in columns.items()}})
    names = list(columns)
    # replace() returns a new object, so the result has to be assigned back.
    table[names] = table[names].replace(missing_codes, np.nan)
    return table


def _combine_year(health, consumer, diet):
    """Inner-join the three tables of one cycle on SEQN, then drop SEQN."""
    combined = pd.merge(health, consumer, on='SEQN')
    combined = pd.merge(combined, diet, on='SEQN')
    return combined.drop(columns='SEQN')


# 2015 Data --------------------------------------------------------------------------------------
consumer2015 = _load_table('consumer_behavior_2015.XPT', CONSUMER_COLUMNS_2015, MONEY_MISSING)
health2015 = _load_table('health_2015.XPT', HEALTH_COLUMNS, HEALTH_MISSING)
diet2015 = _load_table('diet_behavior_2015.XPT', DIET_COLUMNS, MEALS_MISSING)
data2015 = _combine_year(health2015, consumer2015, diet2015)

# 2013 Data -----------------------------------------------------------------------------------
consumer2013 = _load_table('consumer_behavior_2013.XPT', CONSUMER_COLUMNS_PRE2015, MONEY_MISSING)
health2013 = _load_table('health_2013.XPT', HEALTH_COLUMNS, HEALTH_MISSING)
diet2013 = _load_table('diet_behavior_2013.XPT', DIET_COLUMNS, MEALS_MISSING)
data2013 = _combine_year(health2013, consumer2013, diet2013)

# 2011 Data --------------------------------------------------------------------------
consumer2011 = _load_table('consumer_behavior_2011.XPT', CONSUMER_COLUMNS_PRE2015, MONEY_MISSING)
health2011 = _load_table('health_2011.XPT', HEALTH_COLUMNS, HEALTH_MISSING)
diet2011 = _load_table('diet_behavior_2011.XPT', DIET_COLUMNS, MEALS_MISSING)
data2011 = _combine_year(health2011, consumer2011, diet2011)

# Merge 3 years ---------------------------------------------------------------------

# pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x.
data = pd.concat([data2015, data2013, data2011], ignore_index=True)
# Drop rows without a usable health rating.  Comparing against the string
# "NaN" (as the old query did) never matches a missing float value.
data = data.dropna(subset=['health'])

data.to_csv('food_health.csv', index=False)

Load Data

In [None]:
# Read the combined data set (food_health.csv, written by the cleaning cell) from disk.
food_health = pd.read_csv('food_health.csv')

Create training and testing data

In [None]:
# Split the rows of food_health 80/20 at random and save both parts to CSV.

# Figure out how many to include
num_obs = len(food_health.health)
num_train = int(num_obs * 0.8)  # 80% of the rows, rounded down
# NOTE(review): the draw is unseeded, so the split differs on every run.
train_idx = random.sample(range(num_obs), num_train)
# Look up membership in a set: testing against the list rescanned it for
# every row, which made building test_idx quadratic in num_obs.
train_lookup = set(train_idx)
test_idx = [row for row in range(num_obs) if row not in train_lookup]
print(len(train_idx), len(test_idx))

# Create data sets
train = food_health.loc[train_idx]
test = food_health.loc[test_idx]
train.to_csv('food_train.csv')
test.to_csv('food_test.csv')

Data Exploration

In [None]:
# Data exploration: distribution of the health rating and number of missing
# values per predictor.

# Look at distribution of health
data = pd.read_csv('food_health.csv')
health_labels = {1: 'Excellent', 2: 'Very good', 3: 'Good', 4: 'Fair', 5: 'Poor'}
health_count = {label: int(np.sum(data['health'] == code))
                for code, label in health_labels.items()}
plt.bar(list(health_count.keys()), list(health_count.values()))
plt.title("Distribution of perceived health")
plt.xlabel("Perceived health")
plt.ylabel("# Respondents")
plt.show()
# A bare expression in the middle of a cell is not displayed, so print it.
print(health_count)

# Check how many missing data
# Work on a copy without the target so that `data` itself keeps its `health`
# column; later code indexes `data` by column position.
predictors = data.drop(columns=['health'])
num_missing = predictors.isna().sum().to_dict()
plt.bar(list(num_missing.keys()), list(num_missing.values()))
plt.title("Number of missing data values per predictor")
plt.xlabel("Predictor Variables")
plt.xticks(rotation=90)
plt.ylabel("# missing values")
plt.show()
num_missing['meals_fastfood']

Random Forest Model

In [None]:
# Random forest on the five-level health rating: parameter sweep scored by
# out-of-bag accuracy, then a final model evaluated on a 10% hold-out.

# Reload so that column 0 is `health` regardless of what earlier cells did to
# the shared `data` frame.
data = pd.read_csv('food_health.csv')

# Create Training and Validation data sets
X = data.iloc[:, 1:]
Y = data.iloc[:, 0]

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1,
                                                    random_state=0)
# Fit the imputer on the training rows only and reuse it for the hold-out
# rows; fitting a second imputer on the test set fills the two sets in with
# different models.
miss_train = IterativeImputer(max_iter=100, random_state=0)
new_X_train = miss_train.fit_transform(X_train)
new_X_test = miss_train.transform(X_test)

# Run random forests
tree_counts = [1000, 5000, 7500]
feature_counts = [2, 3, 4, 5]
oob = []
for each_num in tree_counts:
    for each_max in feature_counts:
        tree = RandomForestClassifier(n_estimators=each_num, max_features=each_max,
                                      oob_score=True, random_state=0)
        tree.fit(new_X_train, Y_train)
        oob.append(tree.oob_score_)
        print('tree:', each_num, 'feat:', each_max)

# Visualize
# Each figure is closed with plt.show(); otherwise every plot in this cell is
# drawn on top of the previous one.

## Accuracy vs Parameters
plt.plot(oob)
plt.title("Average Accuracy for Different Parameters")
plt.xlabel("Different Parameters")
plt.ylabel("OOB Score")
#plt.savefig('oob.png')
plt.show()


## Accuracy vs num Predictors
# Scores are stored tree-count-major, so the 5000-tree run is the second
# group of len(feature_counts) entries (oob[4:8]).
first = tree_counts.index(5000) * len(feature_counts)
plt.plot(feature_counts, oob[first:first + len(feature_counts)])
plt.title("Average Accuracy vs. # of Predictors for 5000 Estimators")
plt.xlabel("# of Predictors")
#plt.ylabel("OOB Score")
plt.show()

## Accuracy
model = RandomForestClassifier(n_estimators=5000, max_features=3, oob_score=True, random_state=0)
model.fit(new_X_train, Y_train)
Y_pred = model.predict(new_X_test)
print('Test accuracy:', accuracy_score(Y_test, Y_pred))

## Prediction Heatmap
# confusion_matrix puts the actual class on rows (y axis) and the predicted
# class on columns (x axis).  Tick labels come from the fitted classes so
# they always line up with the matrix.
class_labels = [int(c) for c in model.classes_]
sns.heatmap(confusion_matrix(Y_test, Y_pred, labels=model.classes_),
            xticklabels=class_labels, yticklabels=class_labels, annot=True)
plt.title("Predicted vs Actual Health Confusion Matrix")
plt.xlabel("Predicted Health")
plt.ylabel("Actual Health")
#plt.savefig("con_mat.png")
plt.show()

## Variable Importance
feat_import = pd.DataFrame({'Features': list(X.columns),
                            'Score': list(model.feature_importances_)}).sort_values(by='Score', ascending=False)
sns.barplot(x=feat_import.Features, y=feat_import.Score)
plt.title("Feature Importance")
plt.xticks(rotation=75)
#plt.savefig("var_imp.png")

Simplified random forests

In [None]:
# Random forest on a binary target: healthy (rating 1-3) vs not healthy.

# Simplify health categories to healthy or not healthy
data = pd.read_csv('food_health.csv')
data['is_healthy'] = data.health.isin([1, 2, 3])
del data['health']

# Create Training and Validation data sets
X = data.drop(columns='is_healthy')
Y = data['is_healthy']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1,
                                                    random_state=0)
# Fit the imputer on the training rows only and reuse it for the hold-out
# rows instead of fitting a second imputer on the test set.
miss_train = IterativeImputer(max_iter=100, random_state=0)
new_X_train = miss_train.fit_transform(X_train)
new_X_test = miss_train.transform(X_test)

# Run random forests
tree_counts = [500, 1000, 5000]
feature_counts = [2, 3, 4, 5]
s_oob = []
for each_num in tree_counts:
    for each_max in feature_counts:
        tree = RandomForestClassifier(n_estimators=each_num,
                                      max_features=each_max,
                                      oob_score=True,
                                      random_state=0)
        tree.fit(new_X_train, Y_train)
        s_oob.append(tree.oob_score_)
        print('feat:', each_max, 'num:', each_num)

# Visualization
# Each figure is closed with plt.show(); otherwise every plot in this cell is
# drawn on top of the previous one.

## Accuracy vs Parameters
plt.plot(s_oob)
plt.title("Average Accuracy for Different Parameters (Simple Model)")
plt.xlabel("Different Parameters")
plt.ylabel("OOB Score")
#plt.savefig('simple_oob.png')
plt.show()

# The last len(feature_counts) scores belong to the largest forest (5000 trees).
largest_forest_oob = s_oob[-len(feature_counts):]
plt.plot(feature_counts, largest_forest_oob)
plt.show()
# Print the plotted values; a bare expression mid-cell is not displayed.
print(largest_forest_oob)

## Accuracy
model = RandomForestClassifier(n_estimators=5000,
                               max_features=2,
                               oob_score=True,
                               random_state=0)
model.fit(new_X_train, Y_train)
Y_pred = model.predict(new_X_test)
print('Test accuracy:', accuracy_score(Y_test, Y_pred))

## Prediction Heatmap
# confusion_matrix puts the actual class on rows (y axis) and the predicted
# class on columns (x axis).
sns.heatmap(confusion_matrix(Y_test, Y_pred), annot=True)
plt.title("Predicted vs Actual Health Confusion Matrix (Simple Model)")
plt.xlabel("Predicted Health")
plt.ylabel("Actual Health")
#plt.savefig("simple_con_mat.png")
plt.show()

## Variable Importance
feat_import = pd.DataFrame({'Features': list(X.columns),
                            'Score': list(model.feature_importances_)}).sort_values(by='Score', ascending=False)
sns.barplot(x=feat_import.Features, y=feat_import.Score)
plt.title("Feature Importance (Simple Model)")
plt.xticks(rotation=90)
#plt.savefig("simple_var_imp.png")

Logistic Regression

In [None]:
# Multinomial logistic regression on the five-level health rating:
# cross-validated sweep over C, then a final model scored on a 10% hold-out.

# Set up Training and Validation data sets
data = pd.read_csv('food_health.csv')
X = data.iloc[:, 1:]
Y = data.iloc[:, 0]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1,
                                                    random_state=0)
# Fit the imputer on the training rows only and reuse it for the hold-out
# rows instead of fitting a second imputer on the test set.
miss_train = IterativeImputer(max_iter=100, random_state=0)
new_X_train = miss_train.fit_transform(X_train)
new_X_test = miss_train.transform(X_test)

# Run logistic regression
c_values = [0.01, 0.1, 1, 10]
train_acc = []
for c in c_values:
    model = LogisticRegression(C=c, multi_class='multinomial', max_iter=5000, random_state=0)
    scores = cross_val_score(model, new_X_train, Y_train, scoring='accuracy',
                             cv=8)
    train_acc.append(scores.mean())


# Visualize
# Each figure is closed with plt.show() so the heatmap is not drawn on top of
# the accuracy curve.

## Accuracy vs Parameters
positions = range(1, len(c_values) + 1)
plt.plot(positions, train_acc)
plt.xticks(positions, c_values)  # label each point with the C value it used
# This is the full five-class model, so the titles no longer say "Simple".
plt.title("Average Accuracy for Different Parameters (LR)")
plt.xlabel("Different Parameters")
plt.ylabel("Accuracy")
#plt.savefig('lr_acc.png')
plt.show()

## Accuracy
model = LogisticRegression(C=1, multi_class='multinomial',
                           max_iter=5000, random_state=0)
model.fit(new_X_train, Y_train)
Y_pred = model.predict(new_X_test)
print('Test accuracy:', accuracy_score(Y_test, Y_pred))

## Prediction Heatmap
# confusion_matrix puts the actual class on rows (y axis) and the predicted
# class on columns (x axis).
sns.heatmap(confusion_matrix(Y_test, Y_pred), annot=True)
plt.title("Predicted vs Actual Health Confusion Matrix (LR)")
plt.xlabel("Predicted Health")
plt.ylabel("Actual Health")
#plt.savefig("lr_con_mat.png")

Simplified Logistic Regression

In [None]:
# Logistic regression on a binary target: healthy (rating 1-3) vs not healthy.

# Recode and set up Training and Validation data sets
data = pd.read_csv('food_health.csv')
data['is_healthy'] = data.health.isin([1, 2, 3])
del data['health']
X = data.drop(columns='is_healthy')
Y = data['is_healthy']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1,
                                                    random_state=0)
# Fit the imputer on the training rows only and reuse it for the hold-out
# rows instead of fitting a second imputer on the test set.
miss_train = IterativeImputer(max_iter=100, random_state=0)
new_X_train = miss_train.fit_transform(X_train)
new_X_test = miss_train.transform(X_test)

# Run logistic regression
c_values = [0.01, 0.1, 1, 10]
train_acc = []
for c in c_values:
    model = LogisticRegression(C=c, multi_class='ovr', max_iter=5000, random_state=0)
    scores = cross_val_score(model, new_X_train, Y_train, scoring='accuracy',
                             cv=8)
    train_acc.append(scores.mean())

# Visualization
# Each figure is closed with plt.show() so the heatmap is not drawn on top of
# the accuracy curve.

## Accuracy vs Parameters
positions = range(1, len(c_values) + 1)
plt.plot(positions, train_acc)
plt.xticks(positions, c_values)  # label each point with the C value it used
plt.title("Average Accuracy for Different Parameters (Simple LR)")
plt.xlabel("Different Parameters")
plt.ylabel("Accuracy")
#plt.savefig('lr_acc.png')
plt.show()

## Accuracy
model = LogisticRegression(C=0.1, multi_class='ovr',
                           max_iter=5000, random_state=0)
model.fit(new_X_train, Y_train)
Y_pred = model.predict(new_X_test)
print('Test accuracy:', accuracy_score(Y_test, Y_pred))

## Prediction Heatmap
sns.heatmap(confusion_matrix(Y_test, Y_pred), annot=True)
plt.title("Predicted vs Actual Health Confusion Matrix (Simple LR)")
plt.xlabel("Predicted Health")
plt.ylabel("Actual Health")
#plt.savefig("lr_con_mat.png")

Simplified MLP (Multi-Layer Perceptron)

In [None]:
# Small feed-forward network on a binary target: healthy (rating 1-3) vs not.

# Recode and set up Training and Validation data sets
data = pd.read_csv('food_health.csv')
# Store the target as 0/1 integers so the network and the metrics see the
# same label type.
data['is_healthy'] = data.health.isin([1, 2, 3]).astype(int)
del data['health']
X = data.drop(columns='is_healthy')
Y = data['is_healthy']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1,
                                                    random_state=0)
# Fit the imputer on the training rows only and reuse it for the hold-out
# rows instead of fitting a second imputer on the test set.
miss_train = IterativeImputer(max_iter=100, random_state=0)
new_X_train = miss_train.fit_transform(X_train)
new_X_test = miss_train.transform(X_test)


def _build_mlp(units, activation, n_features):
    """Return a compiled network: Dense(units) -> Dense(10) -> Dense(1).

    units: width of the first hidden layer.
    activation: activation name used by both hidden layers.
    n_features: number of input columns.
    """
    net = Sequential()
    net.add(Dense(units, input_dim=n_features, activation=activation))
    net.add(Dense(10, activation=activation))
    # Softmax over a single unit always outputs 1.0, so the old output layer
    # could only ever predict the positive class; sigmoid yields a probability.
    net.add(Dense(1, activation='sigmoid'))
    net.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return net


n_features = new_X_train.shape[1]

# Run MLP
# NOTE(review): accuracy here is measured on the training rows themselves,
# not on held-out data.
train_acc = []
for unit in [50, 100, 1000]:
    for act in ['relu', 'elu', 'tanh', 'sigmoid']:
        model = _build_mlp(unit, act, n_features)
        model.fit(new_X_train, Y_train, epochs=10, batch_size=32)
        _, acc = model.evaluate(new_X_train, Y_train)
        train_acc.append(acc)

# Visualization
# Each figure is closed with plt.show() so the heatmap is not drawn on top of
# the accuracy curve.

## Accuracy vs Parameters
plt.title("Accuracy for Different Parameters (Simple MLP)")
plt.xlabel("Different Parameters")
plt.ylabel("Accuracy")
plt.plot(train_acc)
#plt.savefig('mlp_acc.png')
plt.show()

## Run with best parameters and accuracy
model = _build_mlp(50, 'relu', n_features)
model.fit(new_X_train, Y_train, epochs=10, batch_size=32)
# Sequential.predict_classes no longer exists in current Keras; threshold the
# predicted probability at 0.5 instead.
Y_pred = (model.predict(new_X_test) > 0.5).astype(int).ravel()
print('Test accuracy:', accuracy_score(Y_test, Y_pred))

## Prediction Heatmap
sns.heatmap(confusion_matrix(Y_test, Y_pred), annot=True)
plt.title("Predicted vs Actual Health Confusion Matrix (Simple MLP)")
plt.xlabel("Predicted Health")
plt.ylabel("Actual Health")
#plt.savefig("mlp_con_mat.png")