In [1]:
import os
import sklearn
import pandas as pd
import numpy as np
import tensorflow.contrib.learn as skflow
from tensorflow.contrib import learn
from sklearn.cross_validation import KFold
from scipy.stats import zscore
from sklearn import metrics
from sklearn import preprocessing
from sklearn.model_selection import KFold
from sklearn.cross_validation import train_test_split
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.callbacks import EarlyStopping


path = "./data/"

# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for - red,green,blue)
def encode_text_dummy(df,name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = "{}-{}".format(name,x)
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)


# Encode text values to a single dummy variable. The new columns (which - do not replace the old) will have a 1
# at every location where the original column (name) matches each of the - target_values. One column is added for
#eachtargetvalue.
def defencode_text_single_dummy(df,name,target_values):
    for tv in target_values:
        l = list(df[name].astype(str))
        l = [1 if str(x) == str(tv) else 0 for x in l]
        name2 = "{}-{}".format(name, tv)
        df[name2] = l


#Encodetextvaluestoindexes(i.e.[1],[2],[3]forred,green,blue).
def encode_text_index(df,name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_


#Encode a numeric column as zscores
def encode_numeric_zscore(df,name,mean=None,sd=None):
    if mean is None:
        mean = df[name].mean()
    if sd is None:
        sd = df[name].std()
    df[name] = (df[name]-mean)/sd
    
    
#Convert all missing values in thes pecified column to the median
def missing_median(df, name):
    med = df[name].median()
    df[name] = df[name].fillna(med)

# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    # find out the type of the target column.  Is it really this hard? :(
    target_type = df[target].dtypes
    target_type = target_type[0] if hasattr(target_type, '__iter__') else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        dummies = pd.get_dummies(df[target])
        return df.as_matrix(result).astype(np.float32), dummies.as_matrix().astype(np.float32)
    else:
        # Regression
        return df.as_matrix(result).astype(np.float32), df.as_matrix([target]).astype(np.float32)

# Encode the toy dataset

# Question 1
print()
print("***Question 1***")
path = "./data/"
filename_read = os.path.join(path,"toy1.csv")
df = pd.read_csv(filename_read,na_values=['NA','?'])
filename_write = os.path.join(path,"submit-FeiyangYang-prog2q1.csv")

#Solution
encode_numeric_zscore(df, 'length') 
encode_numeric_zscore(df, 'width') 
encode_numeric_zscore(df, 'height') 
encode_text_dummy(df, 'metal') 
encode_text_dummy(df, 'shape')
    
df.to_csv(filename_write,index=False)
print("Wrote {} lines.".format(len(df)))


#Question 2
print()
print("***Question 2***")
path = "./data/"
filename_read = os.path.join(path,"toy1.csv")
df = pd.read_csv(filename_read,na_values=['NA','?'])

# JTH - The following are from the skeleton code
encode_numeric_zscore(df, 'length')
encode_numeric_zscore(df, 'width')
encode_numeric_zscore(df, 'height')
encode_text_dummy(df, 'metal')
encode_text_dummy(df, 'shape')

#weight = encode_text_index(df,"weight") #JTH - do not encode weight (see hint on Schoology)
x,y = to_xy(df,'weight') # Create x(predictors) and y (expected outcome)

path = "./data/"

# Create the x/y
x, y = to_xy(df, 'weight')

# Split into train/test
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=42)

# Create a deep neural network with 3 hidden layers
# of 50, 25, 10
regressor = Sequential()
regressor.add(Dense(100, input_dim=x.shape[1],activation='relu'))
regressor.add(Dense(50, activation='relu'))
regressor.add(Dense(25, activation='relu'))
regressor.add(Dense(1, kernel_initializer='normal'))
regressor.compile(loss='mean_squared_error', optimizer='adam')
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=50, verbose=1, mode='auto')
regressor.fit(x,y,validation_data=(x_test,y_test),callbacks=[monitor],verbose=0,epochs=100000)

# Measure RMSE error, for out of sample.
pred = regressor.predict(x_test)
score = metrics.mean_squared_error(pred,y_test)
score = np.sqrt(score)
print("Final score (RMSE): {}".format(score))




#Question 3
print()
print("***Question 3***")
path = "./data/"
    
filename_read = os.path.join(path,"toy1.csv")
df = pd.read_csv(filename_read,na_values=['NA','?'])
filename_write = os.path.join(path,"submit-FeiyangYang-prog2q3.csv")

length_std=df['length'].std()
width_std=df['width'].std()
height_std=df['height'].std()
length_mean=df['length'].mean()
width_mean=df['width'].mean()
height_mean=df['height'].mean()

testDF = pd.DataFrame([
            {'length':1, 'width':2, 'height': 3},
            {'length':3, 'width':2, 'height': 5},
            {'length':4, 'width':1, 'height': 3}
         ])
    
encode_numeric_zscore(testDF,'length',mean=length_mean,sd=length_std)
encode_numeric_zscore(testDF,'width',mean=width_mean,sd=width_std)
encode_numeric_zscore(testDF,'height',mean=height_mean,sd=height_std)

print("length: ({}, {})".format(length_mean,length_std))
print("width:({}, {})".format(width_mean,width_std))
print("height:({}, {})".format(height_mean,height_std))
    
print(testDF)
    
testDF.to_csv(filename_write,index=False)    

#Question 4
print()
print("***Question 4***")
path = "./data/"

filename_read = os.path.join(path,"iris.csv")
filename_write = os.path.join(path,"submit-FeiyangYang-prog2q4.csv")
df = pd.read_csv(filename_read,na_values=['NA','?'])

name = ['species', 'sepal_l', 'sepal_w',  'petal_l','petal_w']
df = pd.DataFrame(df[name])
 
np.random.seed(42)

encode_numeric_zscore(df,'petal_l')
encode_numeric_zscore(df,'sepal_w')
encode_numeric_zscore(df,'sepal_l')
encode_text_dummy(df,"species")

df = df.reindex(np.random.permutation(df.index))
df.reset_index(inplace=True, drop=True)
x, y = to_xy(df,'petal_w')

#Cross validate
kf = KFold(5)

oos_y = []
oos_pred = []
oos_x = []
fold = 1

for train, test in kf.split(x):        
    print("Fold #{}".format(fold))
    fold+=1

    x_train = x[train]
    y_train = y[train]
    x_test = x[test]
    y_test = y[test]
    
    regressor = Sequential()
    regressor.add(Dense(10, input_dim=x.shape[1],activation='relu'))
    regressor.add(Dense(20, activation='relu'))
    regressor.add(Dense(10, activation='relu'))
    regressor.add(Dense(1, kernel_initializer='normal'))
    regressor.compile(loss='mean_squared_error', optimizer='adam')
    
    monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=1)
    regressor.fit(x_train,y_train,validation_data=(x_test,y_test),callbacks=[monitor],verbose=0,epochs=500)
    
    pred = regressor.predict(x_test)

    oos_y.append(y_test)
    oos_pred.append(pred)  
    oos_x.append(x_test)

    # Get accuracy
    score = np.sqrt(metrics.mean_squared_error(pred,y_test))
    print("Fold score (RMSE): {}".format(score))


# Build the oos prediction list and calculate the error.
oos_y = np.concatenate(oos_y)
oos_pred = np.concatenate(oos_pred)
oos_x = np.concatenate(oos_x)
   
score = np.sqrt(metrics.mean_squared_error(oos_pred,oos_y))
print("Final, out of sample score (RMSE): {}".format(score))    

# Cross-validated prediction
oos_y = pd.DataFrame(oos_y)
oos_pred = pd.DataFrame(oos_pred)
oos_x = pd.DataFrame(oos_x)
oos_x.insert(3,'petal_w',oos_y[:])
oosDF = pd.concat([oos_x,oos_y, oos_pred],axis=1 )
oosDF.columns = ['sepal_l','sepal_w','petal_l','petal_w','species-Iris-setosa','species-Iris-versicolor','species-Iris-virginica',0,0]

oosDF.to_csv(filename_write,index=False)

#Question 5
print()
print("***Question 5***")

filename_read = os.path.join(path, "auto-mpg.csv")
filename_write = os.path.join(path, "submit-FeiyangYang-prog2q5.csv")
df = pd.read_csv(filename_read)
df = pd.read_csv(filename_read, na_values=['NA', '?'])

# Handle missing values in horsepower
missing_median(df, 'horsepower')

# Convert to zscores
encode_numeric_zscore(df, 'horsepower')
encode_numeric_zscore(df, 'weight')
encode_numeric_zscore(df, 'displacement')
encode_numeric_zscore(df, 'acceleration')
encode_numeric_zscore(df, 'mpg')
encode_numeric_zscore(df, 'origin')
df.drop('name',1,inplace=True)

cyl = encode_text_index(df,'cylinders')
num_classes = len(cyl)

# Create the x/y
x, y = to_xy(df, 'cylinders')

# Cross validate
kf = KFold(5)

oos_y = []
oos_pred = []
fold = 1
for train, test in kf.split(x):
    print("Fold #{}".format(fold))
    fold += 1

    x_train = x[train]
    y_train = y[train]
    x_test = x[test]
    y_test = y[test]

    # layers of 10, 20, 5
    model = Sequential()
    model.add(Dense(10, input_dim=x.shape[1], activation='relu'))
    model.add(Dense(20, activation='relu'))
    model.add(Dense(5, activation='relu'))
    model.add(Dense(y.shape[1],activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=1, mode='auto')

    model.fit(x,y,validation_data=(x_test,y_test),callbacks=[monitor],verbose=0,epochs=1000)

    # Add the predictions to the oos prediction list
    pred = model.predict(x_test)
    pred = np.argmax(pred,axis=1)

    y_compare = np.argmax(y_test,axis=1) 
    oos_y.append(y_compare)
    oos_pred.append(pred)

    # Measure accuracy
    score = metrics.accuracy_score(y_compare, pred)
    print("Fold score: {}".format(score))

# Build the oos prediction list and calculate the error.
oos_pred = np.concatenate(oos_pred)
oos_y = np.concatenate(oos_y)

score = metrics.accuracy_score(oos_y, oos_pred)
print("Final, out of sample score: {}".format(score))

# Write the cross-validated prediction
oos_y = pd.DataFrame(oos_y)
oos_pred = pd.DataFrame(oos_pred)
col_actual = pd.DataFrame(cyl[oos_y])
col_predict = pd.DataFrame(cyl[oos_pred])
oosDF = pd.concat([df, col_actual, col_predict], axis=1)
oosDF.columns = list(df.columns) + ['ideal', 'predict']
oosDF.to_csv(filename_write, index=False)


Using TensorFlow backend.



***Question 1***
Wrote 10000 lines.

***Question 2***
Epoch 00253: early stopping
Final score (RMSE): 28.79257583618164

***Question 3***
length: (5.4895, 2.8474024162474727)
width:(5.4783, 2.8547055968006094)
height:(5.52, 2.872981970451614)
     height    length     width
0 -0.877137 -1.576700 -1.218444
1 -0.180997 -0.874306 -1.218444
2 -0.877137 -0.523108 -1.568743

***Question 4***
Fold #1
Epoch 00110: early stopping
Fold score (RMSE): 0.18533986806869507
Fold #2
Epoch 00039: early stopping
Fold score (RMSE): 0.1815733164548874
Fold #3
Epoch 00112: early stopping
Fold score (RMSE): 0.1905587762594223
Fold #4
Epoch 00085: early stopping
Fold score (RMSE): 0.2145203948020935
Fold #5
Epoch 00094: early stopping
Fold score (RMSE): 0.2075815051794052
Final, out of sample score (RMSE): 0.19633719325065613

***Question 5***
Fold #1
Epoch 00250: early stopping
Fold score: 0.975
Fold #2
Epoch 00124: early stopping
Fold score: 0.975
Fold #3
Epoch 00121: early stopping
Fold score: 0.975
Fold