In [3]:
# Programming Assignment #2 
# Hanyu Feng 
# Student ID:452106
# T81-558: Application of Deep Learning
import os
import sklearn
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow.contrib.learn as skflow
from sklearn.cross_validation import KFold
from scipy.stats import zscore
from sklearn import metrics
from sklearn import preprocessing
from sklearn.cross_validation import KFold
from sklearn.cross_validation import train_test_split
#from sklearn.model_selection import KFold
import shutil

path = "./data/"

# These four functions will help you, they were covered in class.
# Encode a text field to dummy variables
def encode_text_dummy(df,name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = "{}-{}".format(name,x)
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)

# Encode a text field to a single index value
def encode_text_index(df,name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_

# Encode a numeric field to Z-Scores
def encode_numeric_zscore(df,name,mean=None,sd=None):
    if mean is None:
        mean = df[name].mean()
    if sd is None:
        sd = df[name].std()
    df[name] = (df[name]-mean)/sd

# Encode a numeric field to fill missing values with the median.
def missing_median(df, name):
    med = df[name].median()
    df[name] = df[name].fillna(med)

# Convert a dataframe to x/y suitable for training.
def to_xy(df,target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    return df.as_matrix(result),df[target]

#encode_toy_dataset definition
def encode_toy_dataset(df):
    encode_numeric_zscore(df,'height')
    encode_numeric_zscore(df,'width')
    encode_numeric_zscore(df,'length')
    encode_text_dummy(df,'shape')
    encode_text_dummy(df,'metal')
    return df

def get_model_dir(name,erase):
    base_path = os.path.join(".","dnn")
    model_dir = os.path.join(base_path,name)
    os.makedirs(model_dir,exist_ok=True)
    if erase and len(model_dir)>4 and os.path.isdir(model_dir):
        shutil.rmtree(model_dir,ignore_errors=True) # be careful, this deletes everything below the specified path
    return model_dir


# Encode the toy dataset
def question1():
    print()
    print("***Question 1***")
    
    tf.logging.set_verbosity(tf.logging.ERROR)
    filename_read = os.path.join(path,"toy1.csv")
    filename_write = os.path.join(path,"submit-hanyu-prog2q1.csv")
    df = pd.read_csv(filename_read,na_values=['NA','?'])
    df = encode_toy_dataset(df)
    df.to_csv(filename_write,index=False)
    print("Wrote {} lines.".format(len(df)))


# Model the toy dataset, no cross validation
def question2():
    print()
    print("***Question 2***")
    
 
    filename_read = os.path.join(path,"submit-hanyu-prog2q1.csv")
    df = pd.read_csv(filename_read,na_values=['NA','?'])
    
    #shuffle the data
    np.random.seed(42)
    df = df.reindex(np.random.permutation(df.index))
    df.reset_index(inplace=True, drop=True)

    # Encode to a 2D matrix for training
    x,y = to_xy(df,'weight')
    
    #Split into train/test 
    x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=42)
    
    # Get/clear a directory to store the neural network to
    model_dir = get_model_dir('weight',True)
    
    # Create a deep neural network with 3 hidden layers of 50, 25, 10
    feature_columns = [tf.contrib.layers.real_valued_column("", dimension=x.shape[0])]
    regressor = skflow.DNNRegressor(
    model_dir= model_dir,
    feature_columns=feature_columns,
    hidden_units=[50, 25, 10])
    
    # Early stopping
    validation_monitor = tf.contrib.learn.monitors.ValidationMonitor(
    x_test,
    y_test,
    every_n_steps=50,
    early_stopping_metric="loss",
    early_stopping_metric_minimize=True,
    early_stopping_rounds=50)
    
    # Fit/train neural network
    regressor.fit(x_train, y_train,monitors=[validation_monitor],steps=1000)

    # Measure RMSE error.  RMSE is common for regression.
    pred = list(regressor.predict(x_test, as_iterable=True))
    score = np.sqrt(metrics.mean_squared_error(pred,y_test))
    print("Out of sample (RMSE): {}".format(score))

def question3():
    print()
    print("***Question 3***")
    
    filename_read = os.path.join(path,"toy1.csv")
    filename_write = os.path.join(path,"submit-hanyu-prog2q3.csv")
    df = pd.read_csv(filename_read,na_values=['NA','?'])
    lmean = df['length'].mean()
    lstdv = df['length'].std()
    print("length:({},{})".format(lmean,lstdv))
    wmean = df['width'].mean()
    wstdv = df['width'].std()
    print("width:({},{})".format(wmean,wstdv))
    hmean = df['height'].mean()
    hstdv = df['height'].std()
    print("height:({},{})".format(hmean,hstdv))
    
    # Z-Score encode these using the mean/sd from the dataset (you got this in question 2)
    testDF = pd.DataFrame([
            {'length':1, 'width':2, 'height': 3},
            {'length':3, 'width':2, 'height': 5},
            {'length':4, 'width':1, 'height': 3}
        ])
    
    testDF['height'] = (testDF['height'] - hmean)/hstdv
    testDF['length'] = (testDF['length'] - lmean)/lstdv
    testDF['width'] = (testDF['width'] - wmean)/wstdv
    print(testDF)
    testDF.to_csv(filename_write,index=False)

def question4():
    print()
    print("***Question 4***")
    filename_read = os.path.join(path,"iris.csv")
    filename_write = os.path.join(path,"submit-hanyu-prog2q4.csv")
    df = pd.read_csv(filename_read,na_values=['NA','?'])
    
    encode_numeric_zscore(df,'sepal_l')
    encode_numeric_zscore(df,'sepal_w')
    encode_numeric_zscore(df,'petal_l')
    encode_text_dummy(df,'species')
    
    #shuffle the data
    np.random.seed(42)
    df = df.reindex(np.random.permutation(df.index))
    df.reset_index(inplace=True, drop=True)
    
    # Encode to a 2D matrix for training
    x,y=to_xy(df,'petal_w')
    
    # Cross validate
    kf = KFold(len(df), n_folds=5)

    oos_y = []
    oos_pred = []
    fold = 0
    for train, test in kf:
        fold+=1
        print("Fold #{}".format(fold))
        
        x_train = x[train]
        y_train = y[train]
        x_test = x[test]
        y_test = y[test]
    
        # Get/clear a directory to store the neural network to
        model_dir = get_model_dir('petal_w-{}'.format(fold),True) # Each fold has its own folder

        # Create a deep neural network with 3 hidden layers of 50, 25, 10
        feature_columns = [tf.contrib.layers.real_valued_column("", dimension=x.shape[0])]
        regressor = skflow.DNNRegressor(
            model_dir= model_dir,
            config=tf.contrib.learn.RunConfig(save_checkpoints_secs=1),
            feature_columns=feature_columns,
            hidden_units=[50, 25, 10])

        # Early stopping
        validation_monitor = tf.contrib.learn.monitors.ValidationMonitor(
            x_test,
            y_test,
            every_n_steps=500,
            early_stopping_metric="loss",
            early_stopping_metric_minimize=True,
            early_stopping_rounds=50)
    
        # Fit/train neural network
        regressor.fit(x_train, y_train,monitors=[validation_monitor],steps=10000)

        # Add the predictions to the oos prediction list
        pred = list(regressor.predict(x_test, as_iterable=True))
    
        oos_y.append(y_test)
        oos_pred.append(pred)        

        # Measure accuracy
        score = np.sqrt(metrics.mean_squared_error(pred,y_test))
        print("Fold score (RMSE): {}".format(score))


    #Build the oos prediction list and calculate the error.
    oos_y = np.concatenate(oos_y)
    oos_pred = np.concatenate(oos_pred)
    score = np.sqrt(metrics.mean_squared_error(oos_pred,oos_y))
    print("Final, out of sample score (RMSE): {}".format(score))    
    
    oos_y = pd.DataFrame(oos_y)
    oos_pred = pd.DataFrame(oos_pred)
    oosDF = pd.concat( [df, oos_y, oos_pred],axis=1 )
    oosDF.to_csv(filename_write,index=False)
    
    
def question5():
    print()
    print("***Question 5***")
    
    filename_read = os.path.join(path,"auto-mpg.csv")
    filename_write = os.path.join(path,"submit-hanyu-prog2q5.csv")
    df = pd.read_csv(filename_read,na_values=['NA','?'])
    
    med = df['horsepower'].median()
    df['horsepower'] = df['horsepower'].fillna(med) 
    encode_numeric_zscore(df, 'mpg')
    encode_numeric_zscore(df, 'horsepower')
    encode_numeric_zscore(df, 'weight')
    encode_numeric_zscore(df, 'displacement')
    encode_numeric_zscore(df, 'acceleration')
    encode_numeric_zscore(df, 'origin')
    temp = df['name']
    df.drop('name',1,inplace=True)
    cylinders = encode_text_index(df,"cylinders")
    num_classes = len(cylinders)
    
    # Shuffle
    np.random.seed(42)
    df = df.reindex(np.random.permutation(df.index))
    df.reset_index(inplace=True, drop=True)

    # Encode to a 2D matrix for training
    x,y = to_xy(df,'cylinders')
    
    # Cross validate
    kf = KFold(len(df), n_folds=5)

    oos_y = []
    oos_pred = []
    fold = 0
    for train, test in kf:
        fold+=1
        print("Fold #{}".format(fold))
        
        x_train = x[train]
        y_train = y[train]
        x_test = x[test]
        y_test = y[test]
    
        # Get/clear a directory to store the neural network to
        model_dir = get_model_dir('cylinders-{}'.format(fold),True) # Each fold has its own folder

        # Create a deep neural network with 3 hidden layers of 10, 20, 5
        feature_columns = [tf.contrib.layers.real_valued_column("", dimension=x.shape[0])]
        classifier = skflow.DNNClassifier(
        model_dir= model_dir,
        hidden_units=[10, 20, 5], n_classes=num_classes, feature_columns=feature_columns)

        # Early stopping
        validation_monitor = tf.contrib.learn.monitors.ValidationMonitor(
        x_test,
        y_test,
        every_n_steps=50,
        early_stopping_metric="loss",
        early_stopping_metric_minimize=True,
        early_stopping_rounds=250)
    
        # Fit/train neural network
        classifier.fit(x_train, y_train,monitors=[validation_monitor],steps=500)

        # Add the predictions to the oos prediction list
        pred = list(classifier.predict(x_test, as_iterable=True))
    
        oos_y.append(y_test)
        oos_pred.append(pred)        

        # Measure accuracy
        score = metrics.accuracy_score(y_test, pred)
        print("Fold score: {}".format(score))


    #Build the oos prediction list and calculate the error.
    oos_y = np.concatenate(oos_y)
    oos_pred = np.concatenate(oos_pred)
    score = metrics.accuracy_score(oos_y, oos_pred)
    print("Final, out of sample score: {}".format(score)) 
      
    # Write the cross-validated prediction
    oos_y = pd.DataFrame(oos_y)
    oos_pred = pd.DataFrame(oos_pred)
    oos_y.columns = ['ideal']
    oos_pred.columns = ['predict']
    oosDF = pd.concat( [df, temp, oos_y, oos_pred],axis=1 )
    oosDF.to_csv(filename_write,index=False)
    

question1()
question2()
question3()
question4()
question5()


***Question 1***
Wrote 10000 lines.

***Question 2***
Out of sample (RMSE): 261.55503023152124

***Question 3***
length:(5.4895,2.8474024162474727)
width:(5.4783,2.8547055968006094)
height:(5.52,2.872981970451614)
     height    length     width
0 -0.877137 -1.576700 -1.218444
1 -0.180997 -0.874306 -1.218444
2 -0.877137 -0.523108 -1.568743

***Question 4***
Fold #1
Fold score (RMSE): 0.20131494355764176
Fold #2
Fold score (RMSE): 0.15472213077299496
Fold #3
Fold score (RMSE): 0.1806365862097857
Fold #4
Fold score (RMSE): 0.21541695913948652
Fold #5
Fold score (RMSE): 0.20215257929374797
Final, out of sample score (RMSE): 0.19202414021580508

***Question 5***
Fold #1
Fold score: 0.8625
Fold #2
Fold score: 0.55
Fold #3
Fold score: 0.5
Fold #4
Fold score: 0.46835443037974683
Fold #5
Fold score: 0.8987341772151899
Final, out of sample score: 0.6557788944723618
