# Team: Kitty
# Members: Minjie Yang (Leader), Yu Ren, Peter Xie

# Helpful Functions

The original author of this part is Professor Jeffrey Heaton.

Remark: to_xy has been tailored to the Kaggle dataset.

In [None]:
from sklearn import preprocessing
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shutil
import os
import requests
import base64


# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    """Replace column ``name`` with one indicator column per distinct value.

    Every distinct value found in ``df[name]`` becomes a new column called
    ``"<name>-<value>"``; the original column is then removed.  ``df`` is
    modified in place and nothing is returned.
    """
    indicators = pd.get_dummies(df[name])
    for value, column in indicators.items():
        df["{}-{}".format(name, value)] = column
    df.drop(name, axis=1, inplace=True)


# Encode text values to a single dummy variable.  The new columns (which do not replace the old) will have a 1
# at every location where the original column (name) matches each of the target_values.  One column is added for
# each target value.
def encode_text_single_dummy(df, name, target_values):
    """Add a 0/1 indicator column for each value in ``target_values``.

    For every target value a column ``"<name>-<target>"`` is added whose
    entries are 1 where the string form of ``df[name]`` equals the string
    form of the target and 0 elsewhere.  The original column is kept and
    ``df`` is modified in place.
    """
    for tv in target_values:
        wanted = str(tv)
        flags = [int(str(cell) == wanted) for cell in df[name].astype(str)]
        df["{}-{}".format(name, tv)] = flags


# Encode text values to indexes(i.e. [1],[2],[3] for red,green,blue).
def encode_text_index(df, name):
    """Replace column ``name`` with integer codes from a ``LabelEncoder``.

    The column is overwritten in place with the codes produced by
    ``preprocessing.LabelEncoder.fit_transform``.  Returns the encoder's
    ``classes_`` array so callers can map codes back to the original values.
    """
    encoder = preprocessing.LabelEncoder()
    codes = encoder.fit_transform(df[name])
    df[name] = codes
    return encoder.classes_


# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    """Standardise ``df[name]`` in place as ``(value - mean) / sd``.

    ``mean`` and ``sd`` default to the mean and standard deviation of the
    selected data itself (``.mean()`` / ``.std()``); pass them explicitly to
    reuse statistics computed elsewhere.  Nothing is returned.
    """
    selected = df[name]
    center = selected.mean() if mean is None else mean
    spread = selected.std() if sd is None else sd
    df[name] = (selected - center) / spread


# Convert all missing values in the specified column to the median
def missing_median(df, name):
    """Fill the missing entries of ``df[name]`` with that column's median.

    The column is overwritten in place; nothing is returned.
    """
    df[name] = df[name].fillna(df[name].median())


# Convert all missing values in the specified column to the default
def missing_default(df, name, default_value):
    """Fill the missing entries of ``df[name]`` with ``default_value``.

    The column is overwritten in place; nothing is returned.
    """
    patched = df[name].fillna(default_value)
    df[name] = patched


# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    """Split a DataFrame into float32 feature and target arrays (regression).

    Args:
        df: DataFrame whose columns are all convertible to float.
        target: Name of the column to predict.

    Returns:
        A tuple ``(x, y)``: ``x`` holds every column except ``target`` in
        their original order with shape ``(rows, n_features)``; ``y`` holds
        the target column with shape ``(rows, 1)``.  Both are ``np.float32``.
    """
    feature_columns = [col for col in df.columns if col != target]
    # DataFrame.as_matrix() was removed in pandas 1.0; selecting the columns
    # and reading .values gives the same array on old and new pandas alike.
    x = df[feature_columns].values.astype(np.float32)
    y = df[[target]].values.astype(np.float32)
    return x, y

# Nicely formatted time string
def hms_string(sec_elapsed):
    """Format a duration given in seconds as ``H:MM:SS.ss``.

    Hours are unpadded, minutes are zero-padded to two digits and seconds are
    shown with two decimals, zero-padded to a width of five characters.
    """
    seconds_per_hour = 60 * 60
    hours = int(sec_elapsed / seconds_per_hour)
    minutes = int((sec_elapsed % seconds_per_hour) / 60)
    seconds = sec_elapsed % 60
    template = "{}:{:>02}:{:>05.2f}"
    return template.format(hours, minutes, seconds)


# Regression chart.
def chart_regression(pred,y,sort=True):
    """Plot predicted against expected values for a regression model.

    Args:
        pred: Predicted values; any shape, flattened before plotting.
        y: Expected values; any shape, flattened before plotting.
        sort: When true, order the points by expected value so the two
            curves can be compared visually.

    The chart is shown with ``plt.show()``; nothing is returned.
    """
    # DataFrame columns must be 1-D.  Flatten both inputs (not only y) so a
    # 2-D prediction array such as the (n, 1) output of a Dense(1) network
    # can be charted directly.
    t = pd.DataFrame({'pred': np.ravel(pred), 'y': np.ravel(y)})
    if sort:
        t.sort_values(by=['y'], inplace=True)
    plt.plot(t['y'].tolist(), label='expected')
    plt.plot(t['pred'].tolist(), label='prediction')
    plt.ylabel('output')
    plt.legend()
    plt.show()

# Remove all rows where the specified column is +/- sd standard deviations
def remove_outliers(df, name, sd):
    """Drop, in place, the rows whose ``name`` value is an outlier.

    A row is removed when its value lies at least ``sd`` standard deviations
    away from the column mean.  Nothing is returned.
    """
    values = df[name]
    distance = np.abs(values - values.mean())
    limit = sd * values.std()
    outlier_rows = df.index[distance >= limit]
    df.drop(outlier_rows, axis=0, inplace=True)


# Encode a column to a range between normalized_low and normalized_high.
def encode_numeric_range(df, name, normalized_low=-1, normalized_high=1,
                         data_low=None, data_high=None):
    """Linearly rescale ``df[name]`` in place to a target range.

    Args:
        df: DataFrame to modify.
        name: Column to rescale.
        normalized_low: Value that ``data_low`` is mapped to.
        normalized_high: Value that ``data_high`` is mapped to.
        data_low: Lower bound of the input data; defaults to the column
            minimum.
        data_high: Upper bound of the input data; defaults to the column
            maximum.

    Note:
        If ``data_high`` equals ``data_low`` the division by zero is not
        guarded against.
    """
    # Resolve each bound on its own, so that supplying only one of them
    # works: a lone data_low no longer leaves data_high as None, and a lone
    # data_high is no longer overwritten by the column maximum.
    if data_low is None:
        data_low = df[name].min()
    if data_high is None:
        data_high = df[name].max()

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
               * (normalized_high - normalized_low) + normalized_low
        
# This function submits an assignment.  You can submit an assignment as many times as you like; only the final
# submission counts.  The parameters are as follows:
# data - Pandas dataframe output.
# key - Your student key that was emailed to you.
# no - The assignment class number, should be 1 through 1.
# source_file - The full path to your Python or IPYNB file.  This must have "_class1" as part of its name.  
# .             The number must match your assignment number.  For example "_class2" for class assignment #2.
def submit(data,key,no,source_file=None):
    """Upload an assignment result together with its source file.

    Args:
        data: Pandas dataframe holding the assignment output; sent as CSV.
        key: Student API key, sent in the ``x-api-key`` header.
        no: Assignment number; ``"_class<no>"`` must occur in the file name.
        source_file: Path of the ``.py`` / ``.ipynb`` source.  Defaults to
            ``__file__`` when that global exists.

    Raises:
        Exception: If no source file can be determined, the file name lacks
            the ``_class<no>`` suffix, or the extension is not ``.py`` or
            ``.ipynb``.

    The server's reply is printed, prefixed with "Success" for HTTP 200 and
    "Failure" otherwise.
    """
    if source_file is None:
        if '__file__' not in globals():
            raise Exception('Must specify a filename when a Jupyter notebook.')
        source_file = __file__

    suffix = '_class{}'.format(no)
    if suffix not in source_file:
        raise Exception('{} must be part of the filename.'.format(suffix))

    # The source is read before the extension is checked.
    with open(source_file, "rb") as source_handle:
        encoded_python = base64.b64encode(source_handle.read()).decode('ascii')

    ext = os.path.splitext(source_file)[-1].lower()
    if ext not in ['.ipynb','.py']:
        raise Exception("Source file is {} must be .py or .ipynb".format(ext))

    encoded_csv = base64.b64encode(data.to_csv(index=False).encode('ascii')).decode("ascii")
    payload = {'csv': encoded_csv, 'assignment': no, 'ext': ext, 'py': encoded_python}
    r = requests.post("https://api.heatonresearch.com/assignment-submit",
                      headers={'x-api-key': key}, json=payload)

    outcome = "Success: {}" if r.status_code == 200 else "Failure: {}"
    print(outcome.format(r.text))

In [None]:
# function from 'datasets_toy1.ipynb' to calculate weight from the toy's metal, shape, height, length and width
import math
def calculate_weight(metal,shape,h,l,w):
    """Return the weight of a solid toy as volume times metal density.

    Args:
        metal: One of 'gold', 'silver', 'bronze', 'tin', 'platinum'.
        shape: One of 'sphere', 'box', 'cylinder'.
        h: Height (used by box and cylinder).
        l: Length (the sphere's diameter; a box edge).
        w: Width (the cylinder's diameter; a box edge).

    Raises:
        ValueError: If ``metal`` or ``shape`` is not one of the known names.
    """
    known_metals = ['gold','silver','bronze','tin','platinum']
    densities = [19.32,10.49, 9.29,7.31, 21.09]
    known_shapes = ['sphere','box','cylinder']

    # list.index raises ValueError for an unknown name; metal is checked first.
    density = densities[known_metals.index(metal)]
    shape_id = known_shapes.index(shape)

    # One volume formula per entry of known_shapes, evaluated lazily.
    volume_formulas = (
        lambda: (4.0/3.0)  * math.pi * ((l/2.0)**3),   # sphere
        lambda: l * w * h,                              # box
        lambda: math.pi * ((w/2.0)**2.0) * h,           # cylinder
    )
    return volume_formulas[shape_id]() * density

# Kaggle Project Code

The author of the rest (main part) is Minjie Yang.

**Description:**

* Inspired by the midterm exam, **one attempt is to encode metal into its corresponding density** (from 'datasets_toy1.ipynb') in order to give it both a numeric value and an order. Doing this could reach a score around 290-300 on Kaggle.

* **Another attempt is to split up the dataset according to the cost column**, motivated as follows. Doing this could reach a score around 200-210 on Kaggle.

The toy's **weight** can be evaluated in the following two ways:

**1.weight = cost/metal_cost**

**2.weight = density\*volume** (can be realized by the helpful function calculate_weight)

After analyzing the dataset and running some experiments, we found that the first way predicts **weight** with high accuracy (RMSE on the holdout set is around 5), while the second way has low accuracy (RMSE around 400, and this result is about the same for all three shapes). Unfortunately, many values are missing from the **cost** column. Simply replacing the missing values of the **cost** column with its median leads to bad performance, because it destroys this useful relationship. So, in order to make full use of **cost**, we separate the dataset into two parts:

**If the cost value is complete: com_train and com_test**

**If the cost value is missing: mis_train and mis_test**

On the **com_train** and **com_test**, we add an approximation column predicting the **weight** by the first way to enhance the performance. On the **mis_train** and **mis_test**, we add an approximation column predicting the **weight** by the second way.

Eventually, we choose the two predictions on **com_test** and **mis_test** from the two models with the best holdout score respectively. Then we concatenate them together into one final prediction **df_submit**.

So the code of the model will be organized by:

**0.com_train, com_test, mis_train and mis_test preparing**

**1.com_train and com_test feature encoding > com_train training > com_test predicting : df_com**

**2.mis_train and mis_test feature encoding > mis_train training > mis_test predicting : df_mis**

**3.concatenate df_com and df_mis together into df_submit**

If you want to try some changes to the neural network, just run the training and predicting parts (of step 1 or 2), then run step 3 to update **df_submit**.

The final two submissions are chosen by prediction with epoch=60 and epoch=200 to prevent overfitting.

Remark: 

Other hyperparameter choices: batch_size = 200, neural network size = (20, 40, 80), applying z-score normalization before training, etc. In addition, led_vol, motor_vol, gear_vol and volume_parts are dropped because nearly all of their values are missing and they have low relevance to **weight**. An inverse pyramid structure doesn't help, a wider/deeper network doesn't help much, a dropout layer doesn't help much, and early stopping doesn't help much. The score on the holdout set is important.

In [None]:
# importing packages and dataset
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
import numpy as np
from sklearn import metrics
from scipy.stats import zscore
from sklearn.model_selection import KFold
from keras.models import Sequential
from keras.layers import Dropout
from keras.layers.core import Dense, Activation
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint

path = "./all/"

# Read both CSV files, treating 'NA' and '?' as missing values.
filename_train, filename_test = (
    os.path.join(path, csv_name) for csv_name in ("train.csv", "test.csv")
)
missing_markers = ['NA','?']
df_train = pd.read_csv(filename_train,na_values=missing_markers)
df_test = pd.read_csv(filename_test,na_values=missing_markers)

# preparing dataset
# 'id' is irrelevant to the prediction; it is removed from the training
# data only (the test 'id' is still needed to label the predictions)
df_train.drop('id',axis=1,inplace=True)
# 'led_vol','motor_vol','gear_vol','volume_parts' are hardly relevant to the prediction
sparse_columns = ['led_vol','motor_vol','gear_vol','volume_parts']
for frame in (df_train, df_test):
    frame.drop(sparse_columns,axis=1,inplace=True)

In [None]:
# 0. com_train, com_test, mis_train and mis_test preparing
# Split each dataset on whether 'cost' is present.  Every part is an explicit
# copy, so the in-place edits applied to these frames later act on independent
# data instead of on slices of df_train / df_test (which triggers
# SettingWithCopyWarning).
train_cost_missing = df_train['cost'].isna()
test_cost_missing = df_test['cost'].isna()
mis_train = df_train[train_cost_missing].copy() # missing data
mis_test = df_test[test_cost_missing].copy()
com_train = df_train[~train_cost_missing].copy() # complete data
com_test = df_test[~test_cost_missing].copy()

In [None]:
# 1. com_train and com_test feature encoding
# saving 'id' to match the prediction
id_com = com_test['id']
com_test.drop('id',axis=1,inplace=True)

# encode 'metal' into its corresponding density
# The result is assigned back to the column: an in-place replace on
# frame['metal'] is chained assignment and is not guaranteed to update the
# frame.  The explicit float cast keeps the column numeric on every pandas
# version.
metal_names = ['gold','silver','bronze','tin','platinum']
metal_densities = [19.32,10.49, 9.29,7.31, 21.09]
com_train['metal'] = com_train['metal'].replace(metal_names,metal_densities).astype(float)
com_test['metal'] = com_test['metal'].replace(metal_names,metal_densities).astype(float)

# approximation 1: weight = cost/metal_cost (plain column division instead
# of a row-by-row apply)
com_train['approx'] = com_train['cost'] / com_train['metal_cost']
com_test['approx'] = com_test['cost'] / com_test['metal_cost']

# z-score both frames with the statistics of the TRAINING data, so that train
# and test features share one scale; the statistics must be taken before
# com_train itself is transformed.
zscore_columns = ['metal_cost','height','width','length','led','gears','motors','metal','cost','approx']
train_mean = com_train[zscore_columns].mean()
train_sd = com_train[zscore_columns].std()
encode_numeric_zscore(com_train,zscore_columns,mean=train_mean,sd=train_sd)
encode_numeric_zscore(com_test,zscore_columns,mean=train_mean,sd=train_sd)

encode_text_dummy(com_train,'shape')
encode_text_dummy(com_test,'shape')

x,y = to_xy(com_train,'weight')

In [None]:
# com_train training
# Trains the network for the rows whose 'cost' is known: a 10% holdout is set
# aside, the remaining 90% is used for 5-fold cross-validation, and one fresh
# network is trained and saved per fold.
batch_size = 200
# Cross validation & Holdout & Training

# Keep a 10% holdout
# (no random_state is passed, so the split differs from run to run)
x_main, x_holdout, y_main, y_holdout = train_test_split(    
    x, y, test_size=0.10) 

# Cross-validate
kf = KFold(5)
    
oos_y = []      # expected values of every fold's validation part
oos_pred = []   # matching out-of-sample predictions
fold = 0
for train, test in kf.split(x_main):        
    fold+=1
    print("Fold #{}".format(fold))
        
    # train / test are index arrays into x_main for this fold
    x_train = x_main[train]
    y_train = y_main[train]
    x_test = x_main[test]
    y_test = y_main[test]
    
    # A new 20-40-80-1 network is built for every fold.
    model_com = Sequential()
    model_com.add(Dense(20, input_dim=x.shape[1], activation='relu'))
    # model_com.add(Dropout(0.01))
    model_com.add(Dense(40, activation='relu'))
    model_com.add(Dense(80, activation='relu'))
    model_com.add(Dense(1))
    model_com.compile(loss='mean_squared_error', optimizer='adam')
    
    # monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=1, mode='auto')
    # model_com.fit(x_train,y_train,batch_size=batch_size,validation_data=(x_test,y_test),callbacks=[monitor],verbose=0,epochs=1000)
    # Fixed 60 epochs; the validation data is passed to fit() but no callback
    # acts on it.
    model_com.fit(x_train,y_train,batch_size=batch_size,validation_data=(x_test,y_test),verbose=0,epochs=60)
    # Each fold's network is saved as com<fold>.hdf5.
    model_com.save("com%d.hdf5" % fold)
    
    pred = model_com.predict(x_test)
    
    oos_y.append(y_test)
    oos_pred.append(pred) 

    # Measure accuracy
    score = np.sqrt(metrics.mean_squared_error(pred,y_test))
    print("Fold score (RMSE): {}".format(score))

# Build the oos prediction list and calculate the error.
oos_y = np.concatenate(oos_y)
oos_pred = np.concatenate(oos_pred)
score = np.sqrt(metrics.mean_squared_error(oos_pred,oos_y))
print()
print("Cross-validated score (RMSE): {}".format(score))    
    
# Score the holdout set with the network from the last fold
# (model_com still refers to the model trained in the final loop iteration)
holdout_pred = model_com.predict(x_holdout)

score = np.sqrt(metrics.mean_squared_error(holdout_pred,y_holdout))
print("Holdout score (RMSE): {}".format(score))

In [None]:
# com_test predicting : df_com

# # to choose the best model
# x,y = to_xy(com_train,'weight')
# from keras.models import load_model
# model_com = load_model("com2.hdf5") # change the number
# # Keep a 10% holdout 
# x_main, x_holdout, y_main, y_holdout = train_test_split(x, y, test_size=0.10) 
# # Write the cross-validated prediction
# holdout_pred = model_com.predict(x_holdout)
# score = np.sqrt(metrics.mean_squared_error(holdout_pred,y_holdout))
# print("Holdout score (RMSE): {}".format(score))

# default is the last model
# DataFrame.as_matrix() was removed in pandas 1.0; .values is the drop-in
# replacement.
# NOTE(review): this relies on com_test having the same columns, in the same
# order, as the features of com_train — confirm for new data.
x = com_test.values.astype(np.float32)

pred = model_com.predict(x)

# Pair every prediction with the test 'id' saved before encoding.
df_com = pd.DataFrame(pred)
df_com.insert(0,'id',list(id_com))
df_com.columns = ['id','weight']

In [None]:
# 2.mis_train and mis_test feature encoding
# 'cost' is missing, drop it
# keep 'metal_cost' as a potential feature
mis_train.drop(['cost'],axis=1,inplace=True)
mis_test.drop(['cost'],axis=1,inplace=True)

# saving 'id' to match the prediction
id_mis = mis_test['id']
mis_test.drop('id',axis=1,inplace=True)

# approximation 2: weight = density*volume(can be realized by the helpful function calculate_weight)
# This must run before 'metal' is turned into a density, because
# calculate_weight looks the metal up by name.
mis_train['approx'] = mis_train.apply(lambda x: calculate_weight(x['metal'],x['shape'],x['height'],x['length'],x['width']), axis=1)
mis_test['approx'] = mis_test.apply(lambda x: calculate_weight(x['metal'],x['shape'],x['height'],x['length'],x['width']), axis=1)

# encode 'metal' into its corresponding density
# The result is assigned back to the column: an in-place replace on
# frame['metal'] is chained assignment and is not guaranteed to update the
# frame.  The explicit float cast keeps the column numeric on every pandas
# version.
metal_names = ['gold','silver','bronze','tin','platinum']
metal_densities = [19.32,10.49, 9.29,7.31, 21.09]
mis_train['metal'] = mis_train['metal'].replace(metal_names,metal_densities).astype(float)
mis_test['metal'] = mis_test['metal'].replace(metal_names,metal_densities).astype(float)

# z-score both frames with the statistics of the TRAINING data, so that train
# and test features share one scale; the statistics must be taken before
# mis_train itself is transformed.
zscore_columns = ['metal','metal_cost','height','width','length','led','gears','motors','approx']
train_mean = mis_train[zscore_columns].mean()
train_sd = mis_train[zscore_columns].std()
encode_numeric_zscore(mis_train,zscore_columns,mean=train_mean,sd=train_sd)
encode_numeric_zscore(mis_test,zscore_columns,mean=train_mean,sd=train_sd)

encode_text_dummy(mis_train,'shape')
encode_text_dummy(mis_test,'shape')

x,y = to_xy(mis_train,'weight')

In [None]:
# mis_train training
# Trains the network for the rows whose 'cost' is missing: a 10% holdout is
# set aside, the remaining 90% is used for 5-fold cross-validation, and one
# fresh network is trained and saved per fold.
batch_size = 200
# Cross validation & Holdout & Training

# Keep a 10% holdout
# (no random_state is passed, so the split differs from run to run)
x_main, x_holdout, y_main, y_holdout = train_test_split(    
    x, y, test_size=0.10) 

# Cross-validate
kf = KFold(5)
    
oos_y = []      # expected values of every fold's validation part
oos_pred = []   # matching out-of-sample predictions
fold = 0
for train, test in kf.split(x_main):        
    fold+=1
    print("Fold #{}".format(fold))
        
    # train / test are index arrays into x_main for this fold
    x_train = x_main[train]
    y_train = y_main[train]
    x_test = x_main[test]
    y_test = y_main[test]
    
    # A new 20-40-80-1 network is built for every fold.
    model_mis = Sequential()
    model_mis.add(Dense(20, input_dim=x.shape[1], activation='relu'))
    # model_mis.add(Dropout(0.01))
    model_mis.add(Dense(40, activation='relu'))
    model_mis.add(Dense(80, activation='relu'))
    model_mis.add(Dense(1))
    model_mis.compile(loss='mean_squared_error', optimizer='adam')
    
    # monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=1, mode='auto')
    # model_mis.fit(x_train,y_train,batch_size=batch_size,validation_data=(x_test,y_test),callbacks=[monitor],verbose=0,epochs=1000)
    # Fixed 60 epochs; the validation data is passed to fit() but no callback
    # acts on it.
    model_mis.fit(x_train,y_train,batch_size=batch_size,validation_data=(x_test,y_test),verbose=0,epochs=60)
    # Each fold's network is saved as mis<fold>.hdf5.
    model_mis.save("mis%d.hdf5" % fold)
    
    pred = model_mis.predict(x_test)
    
    oos_y.append(y_test)
    oos_pred.append(pred) 

    # Measure accuracy
    score = np.sqrt(metrics.mean_squared_error(pred,y_test))
    print("Fold score (RMSE): {}".format(score))

# Build the oos prediction list and calculate the error.
oos_y = np.concatenate(oos_y)
oos_pred = np.concatenate(oos_pred)
score = np.sqrt(metrics.mean_squared_error(oos_pred,oos_y))
print()
print("Cross-validated score (RMSE): {}".format(score))    
    
# Score the holdout set with the network from the last fold
# (model_mis still refers to the model trained in the final loop iteration)
holdout_pred = model_mis.predict(x_holdout)

score = np.sqrt(metrics.mean_squared_error(holdout_pred,y_holdout))
print("Holdout score (RMSE): {}".format(score))

In [None]:
# mis_test predicting : df_mis

# # to choose the best model
# x,y = to_xy(mis_train,'weight')
# from keras.models import load_model
# model_mis = load_model("mis5.hdf5") # change the number
# # Keep a 10% holdout
# x_main, x_holdout, y_main, y_holdout = train_test_split(x, y, test_size=0.10) 
# # Write the cross-validated prediction
# holdout_pred = model_mis.predict(x_holdout)
# score = np.sqrt(metrics.mean_squared_error(holdout_pred,y_holdout))
# print("Holdout score (RMSE): {}".format(score))

# default is the last model
# DataFrame.as_matrix() was removed in pandas 1.0; .values is the drop-in
# replacement.
# NOTE(review): this relies on mis_test having the same columns, in the same
# order, as the features of mis_train — confirm for new data.
x = mis_test.values.astype(np.float32)

pred = model_mis.predict(x)

# Pair every prediction with the test 'id' saved before encoding.
df_mis = pd.DataFrame(pred)
df_mis.insert(0,'id',list(id_mis))
df_mis.columns = ['id','weight']

In [None]:
# 3.concatenate df_com and df_mis together into df_submit
# Stack the two partial predictions, then order the rows by ascending 'id'.
partial_predictions = [df_com,df_mis]
df_submit = pd.concat(partial_predictions).sort_values(by="id",ascending=True)

# Save csv (the row index is not written)
df_submit.to_csv('kaggle.csv',index=False)
# Bare expression: shown as the cell output when run in a notebook.
df_submit