In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
import numpy as np
from tensorflow.contrib.learn.python.learn.metric_spec import MetricSpec
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.callbacks import EarlyStopping
from sklearn import preprocessing
import matplotlib.pyplot as plt
import shutil
from sklearn import metrics


# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = "{}-{}".format(name, x)
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)


# Encode text values to a single dummy variable.  The new columns (which do not replace the old) will have a 1
# at every location where the original column (name) matches each of the target_values.  One column is added for
# each target value.
def encode_text_single_dummy(df, name, target_values):
    for tv in target_values:
        l = list(df[name].astype(str))
        l = [1 if str(x) == str(tv) else 0 for x in l]
        name2 = "{}-{}".format(name, tv)
        df[name2] = l


# Encode text values to indexes(i.e. [1],[2],[3] for red,green,blue).
def encode_text_index(df, name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_


# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    if mean is None:
        mean = df[name].mean()

    if sd is None:
        sd = df[name].std()

    df[name] = (df[name] - mean) / sd


# Convert all missing values in the specified column to the median
def missing_median(df, name):
    med = df[name].median()
    df[name] = df[name].fillna(med)


# Convert all missing values in the specified column to the default
def missing_default(df, name, default_value):
    df[name] = df[name].fillna(default_value)


# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    # find out the type of the target column.  Is it really this hard? :(
    target_type = df[target].dtypes
    target_type = target_type[0] if hasattr(target_type, '__iter__') else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        dummies = pd.get_dummies(df[target])
        return df.as_matrix(result).astype(np.float32), dummies.as_matrix().astype(np.float32)
    else:
        # Regression
        return df.as_matrix(result).astype(np.float64), df.as_matrix([target]).astype(np.float64)

# Nicely formatted time string
def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(h, m, s)


# Regression chart.
def chart_regression(pred,y,sort=True):
    t = pd.DataFrame({'pred' : pred, 'y' : y.flatten()})
    if sort:
        t.sort_values(by=['y'],inplace=True)
    a = plt.plot(t['y'].tolist(),label='expected')
    b = plt.plot(t['pred'].tolist(),label='prediction')
    plt.ylabel('output')
    plt.legend()
    plt.show()

# Remove all rows where the specified column is +/- sd standard deviations
def remove_outliers(df, name, sd):
    drop_rows = df.index[(np.abs(df[name] - df[name].mean()) >= (sd * df[name].std()))]
    df.drop(drop_rows, axis=0, inplace=True)


# Encode a column to a range between normalized_low and normalized_high.
def encode_numeric_range(df, name, normalized_low=-1, normalized_high=1,
                         data_low=None, data_high=None):
    if data_low is None:
        data_low = min(df[name])
        data_high = max(df[name])

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
               * (normalized_high - normalized_low) + normalized_low


path = "./Downloads/"
    
filename_train = os.path.join(path,"train.csv")
filename_test = os.path.join(path,"test.csv")
filename_submit = os.path.join(path,"sampleSubmission.csv")

df_train = pd.read_csv(filename_train,na_values=['NA','?'])
print(df_train['sales'].dtypes)



# Encode feature vector
encode_text_dummy(df_train, 'type_name')
encode_numeric_zscore(df_train,'0_type_count')
encode_numeric_zscore(df_train,'1_type_count')
encode_numeric_zscore(df_train,'2_type_count')
encode_numeric_zscore(df_train,'3_type_count')
encode_numeric_zscore(df_train,'4_type_count')
encode_numeric_zscore(df_train,'age')
encode_numeric_zscore(df_train,'sqft')
encode_numeric_zscore(df_train,'income')
encode_numeric_zscore(df_train,'pets')
encode_numeric_zscore(df_train,'population')
encode_numeric_zscore(df_train,'sqmiles')
encode_numeric_zscore(df_train,'urban')


df_train.drop('id', axis=1, inplace=True)


# Create x & y for training

# Create the x-side (feature vectors) of the training

x, y = to_xy(df_train, 'sales')
    
# Split into train/test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=45)

regressor = Sequential()
#regressor.add(Dense(100, input_dim=x.shape[1],activation='relu'))
#regressor.add(Dense(50, activation='relu'))
regressor.add(Dense(10, input_dim=x.shape[1], activation='relu'))
regressor.add(Dense(1))
regressor.add(Dense(y.shape[1],activation='softmax'))
regressor.compile(loss='mean_squared_error', optimizer='adam')
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=50, verbose=1, mode='auto')
regressor.fit(x,y,validation_data=(x_test,y_test),callbacks=[monitor],verbose=0,epochs=100000)

# Measure RMSE error, for out of sample.
pred = regressor.predict(x_test)
score = metrics.mean_squared_error(pred,y_test)
score = np.sqrt(score)
pred = regressor.predict(x_test)

oos_y.append(y_test)
oos_pred.append(pred)  
oos_x.append(x_test)

# Measure accuracy
score = np.sqrt(metrics.mean_squared_error(pred,y_test))
print("RMSE Score:".format(score))
# Generate Kaggle submit file

# Encode feature vector
df_test = pd.read_csv(filename_test,na_values=['NA','?'])
encode_text_dummy(df_test, 'type_name')
encode_numeric_zscore(df_test,'0_type_count')
encode_numeric_zscore(df_test,'1_type_count')
encode_numeric_zscore(df_test,'2_type_count')
encode_numeric_zscore(df_test,'3_type_count')
encode_numeric_zscore(df_test,'4_type_count')
encode_numeric_zscore(df_test,'age')
encode_numeric_zscore(df_test,'sqft')
encode_numeric_zscore(df_test,'income')
encode_numeric_zscore(df_test,'pets')
encode_numeric_zscore(df_test,'population')
encode_numeric_zscore(df_test,'sqmiles')
encode_numeric_zscore(df_test,'urban')

ids = df_test['id']
df_test.drop('id', axis=1, inplace=True)

x = df_test.as_matrix().astype(np.float32)

# Generate predictions
pred = model.predict(x)
#pred

# Create submission data set

df_submit = pd.DataFrame(pred)
df_submit.insert(0,'id',ids)
df_submit.columns = ['id','sales']

df_submit.to_csv(filename_submit, index=False)

print(df_submit)

Using TensorFlow backend.


float64
Epoch 00051: early stopping


NameError: name 'oos_y' is not defined