In [9]:
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import copy
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import category_encoders as ce

from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
# from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import roc_auc_score, roc_curve, auc, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import precision_recall_fscore_support as score

import tensorflow as tf

import seaborn as sns
# Shared preprocessing objects used by the helper functions defined below:
#   le    - label encoder that distmodclean() calls to turn the boolean
#           `galactic` flag into integer codes
#   scale - standard scaler that scalefeatures() re-fits for each numeric column
le = preprocessing.LabelEncoder()
scale = StandardScaler()

In [2]:
# Standard-scale all continuous numeric columns so the model sees them on a common scale.
def scalefeatures(df, cols=None):
    """Return a copy of ``df`` whose numeric columns are replaced by scaled versions.

    For every column in ``cols`` a new ``<name>_scale`` column (spaces removed
    from the name) is created with the output of the module-level ``scale``
    object's ``fit_transform``; the unscaled source columns are then dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an ``object_id`` column and every column in ``cols``.
        It is not modified.
    cols : list of str, optional
        Columns to scale. Defaults to the notebook-level ``numCols`` list.

    Returns
    -------
    pandas.DataFrame
        New frame with a fresh 0..n-1 index and columns ordered as
        ``object_id``, the scaled columns (last processed first), then the
        remaining original columns.

    Notes
    -----
    NOTE(review): the scaler is re-fit for each column on every call, so two
    frames passed in separate calls (e.g. train and test) are each scaled with
    their own statistics.
    """
    if cols is None:
        cols = numCols  # resolved at call time from the notebook namespace
    cols = list(cols)

    # Work on a copy so the caller's frame never picks up stray `_scale` columns.
    out = df.copy()

    scaledNames = []
    for col in cols:
        colName = col.replace(' ', '') + '_scale'  # remove spaces in column names
        # Flatten the scaler output so exactly one 1-D column is assigned.
        out[colName] = np.ravel(scale.fit_transform(out[[col]]))
        scaledNames.append(colName)

    # Build the leading column order without duplicates, keeping first occurrence.
    front = []
    for name in ['object_id'] + scaledNames[::-1]:
        if name not in front:
            front.append(name)
    rest = [name for name in out.columns if name not in front]

    out = out[front + rest].reset_index(drop=True)
    return out.drop(cols, axis=1)

# Fills missing distmod with 0 and adds a `galactic` flag column (1 = distmod was missing).
def distmodclean(df):
    """Fill missing ``distmod`` values with 0 and flag those rows, in place.

    Rows whose ``distmod`` is missing get ``distmod = 0`` and ``galactic = 1``;
    every other row gets ``galactic = 0``. The flag column is inserted two
    positions before the end of the existing columns.

    The flag is derived with a plain integer cast rather than a label encoder,
    so a frame in which *every* ``distmod`` is missing is still flagged with 1
    (a label encoder fitted on a single class would code it as 0), and the
    function does not depend on any notebook-level encoder object.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``distmod`` column. Modified in place.

    Returns
    -------
    None

    Notes
    -----
    Calling this twice on the same frame recomputes the flag from the
    already-filled ``distmod`` column, so the second call sets every flag to 0.
    """
    missing = df['distmod'].isnull()

    # A previous run may already have added the flag; remove it so the insert
    # position is computed from the same column count each time.
    if 'galactic' in df.columns:
        df.drop(labels=['galactic'], axis=1, inplace=True)

    df.loc[missing, 'distmod'] = 0

    # Clamp at 0 so very narrow frames still get the column instead of an error.
    position = max(len(df.columns) - 2, 0)
    df.insert(position, 'galactic', missing.astype('int64'))

In [4]:
# DATA_DIR = '../input/'
# DATA_DIR = '' # For Azure Notebook

# Load the object-level metadata for the training and test sets.
# hostgal_specz is mostly missing from the test data, so it is removed from
# both frames straight after reading.
trainMdDf = pd.read_csv('training_set_metadata.csv').drop('hostgal_specz', axis=1)
testdf = pd.read_csv('test_set_metadata.csv').drop('hostgal_specz', axis=1)

In [5]:
# --------------------------------------------------------------------------------------------
#  Data Clean up
# --------------------------------------------------------------------------------------------
# Continuous columns that scalefeatures() standardises.
# (hostgal_specz is left out: that column is dropped when the metadata is loaded.)
numCols = ['hostgal_photoz', 'hostgal_photoz_err', 'distmod', 'mwebv']

# Clean a deep copy so the raw training metadata stays untouched.
cleanTrain = trainMdDf.copy(deep=True)
distmodclean(cleanTrain)                  # edits cleanTrain in place
cleanTrain = scalefeatures(cleanTrain)    # returns the scaled frame

# Create the train/validation split (30% held out, fixed random_state).
train, valid = train_test_split(cleanTrain, test_size=0.3, shuffle=True, random_state=42)

# Identifier columns that are kept out of the feature matrices.
# (ra, decl, gal_l and gal_b are deliberately left in as features.)
idCols = ['object_id']
ids_train = train[idCols]
ids_test = valid[idCols]  # identifiers of the validation rows
# ids_Submit = test[idCols]

# Features are selected by position: every non-id column except the last one.
lenFeats = train.shape[1] - len(idCols) - 1
X_train = train.drop(idCols, axis=1).iloc[:, :lenFeats]        # X1 - training features
X_validate = valid.drop(idCols, axis=1).iloc[:, :lenFeats]     # X2 - validation features

# The target is taken by position as well: the last column of the split frames.
yOrdinal = train.shape[1] - 1
y_train = train.iloc[:, yOrdinal]       # Y1 - training target
y_validate = valid.iloc[:, yOrdinal]    # Y2 - validation target

X_train.head()


Unnamed: 0,mwebv_scale,distmod_scale,hostgal_photoz_err_scale,hostgal_photoz_scale,ra,decl,gal_l,gal_b,ddf,galactic
3780,-0.391482,0.847385,-0.382538,1.989885,208.300781,-4.780192,330.318318,54.773706,0,0
1468,-0.418045,0.739035,-0.423824,0.573796,33.75,-4.630479,168.146242,-59.949072,1,0
5787,1.467894,0.869126,-0.140819,2.429651,287.226562,-7.632146,28.085822,-7.338392,0,0
5818,-0.325076,0.610163,-0.470769,-0.180534,318.401154,-58.16037,337.79216,-41.322248,0,0
5501,-0.205545,-1.533499,-0.518381,-0.656047,173.144531,-33.868706,284.741349,26.202143,0,1


In [6]:
# Run the test metadata through the same cleaning steps as the training data,
# then drop the identifier columns to leave only features.
# NOTE(review): scalefeatures() calls fit_transform on whatever frame it is
# given, so these test features are standardised with test-set statistics
# rather than the ones computed on the training data - confirm this is intended.
cleanTest = testdf.copy(deep=True)
distmodclean(cleanTest)
cleanTest = scalefeatures(cleanTest)

X_Test = cleanTest.drop(idCols, axis=1)
X_Test.head()

Unnamed: 0,mwebv_scale,distmod_scale,hostgal_photoz_err_scale,hostgal_photoz_scale,ra,decl,gal_l,gal_b,ddf,galactic
0,-0.470316,0.272839,-0.360829,-0.43977,34.453125,-5.229529,169.987075,-59.956185,1,0
1,-0.476762,0.404806,-0.492705,0.257126,33.398438,-4.331149,167.226341,-59.936551,1,0
2,-0.489653,0.458831,-0.337941,0.696638,348.529419,-61.75544,321.29398,-51.763351,1,0
3,-0.444533,0.411281,-0.020421,0.303883,34.804688,-5.829153,171.307861,-60.174401,1,0
4,-0.444533,0.343245,-0.513413,-0.122716,351.321442,-64.198746,317.458993,-50.429931,1,0


In [7]:
# Stack training and validation features so both are encoded by one fitted encoder.
frames = [X_train, X_validate]
feats = pd.concat(frames)  # X - combined features

# One-hot encode the feature columns.
# The encoder is bound to its own name: `le` is the LabelEncoder created in the
# imports cell, and rebinding it here would change what earlier cells pick up
# when they are re-run.
# NOTE(review): `impute_missing` and handle_unknown="ignore" look like options
# from an older category_encoders release - confirm against the installed version.
featureEncoder = ce.OneHotEncoder(return_df=True, impute_missing=False, handle_unknown="ignore")
encodedFeats = featureEncoder.fit_transform(feats)  # X_encoded

# Split the encoded rows back into the original train / validation partitions;
# concat kept the training rows first, so a positional cut is sufficient.
nTrainRows = X_train.shape[0]
feats_encoded_train = encodedFeats.iloc[:nTrainRows, :]  # X_encoded_train
feats_encoded_valid = encodedFeats.iloc[nTrainRows:, :]  # X_encoded_valid
# feats_encoded_train.head()

In [22]:
# Preview the first rows of the encoded training features.
feats_encoded_train.head()

Unnamed: 0,mwebv_scale,distmod_scale,hostgal_photoz_err_scale,hostgal_photoz_scale,ra,decl,gal_l,gal_b,ddf,galactic
3780,-0.391482,0.847385,-0.382538,1.989885,208.300781,-4.780192,330.318318,54.773706,0,0
1468,-0.418045,0.739035,-0.423824,0.573796,33.75,-4.630479,168.146242,-59.949072,1,0
5787,1.467894,0.869126,-0.140819,2.429651,287.226562,-7.632146,28.085822,-7.338392,0,0
5818,-0.325076,0.610163,-0.470769,-0.180534,318.401154,-58.16037,337.79216,-41.322248,0,0
5501,-0.205545,-1.533499,-0.518381,-0.656047,173.144531,-33.868706,284.741349,26.202143,0,1


In [8]:
#%matplotlib inline
# --------------------------------------------------------------------------
# Set tuning
# --------------------------------------------------------------------------
#nTrees = 200
#max_depth = 10
#min_node_size = 10
#verbose = 0

# --------------------------------------------------------------------------
# Fit the model
# --------------------------------------------------------------------------
#clf = RandomForestClassifier(n_estimators=nTrees, max_depth=max_depth, random_state=0, verbose=verbose, min_samples_leaf=min_node_size, n_jobs=-2)
#clf.fit(feats_encoded_train, y_train)
# print(clf.feature_importances_)
# --------------------------------------------------------------------------
# Predict the Testing Data and Evaluate
# --------------------------------------------------------------------------
#y_predicted_valid = clf.predict(feats_encoded_valid) # Y_test_hat

In [11]:
### set all variables

# Number of neurons in each layer.
# Input and output sizes are derived from the prepared data instead of being
# hard-coded (the earlier 28*28 / 10 values describe an image dataset and do
# not match the feature frames built above): one input unit per encoded
# feature column, one output unit per distinct target class.
input_num_units = int(feats_encoded_train.shape[1])
hidden_num_units = 500
output_num_units = int(pd.concat([y_train, y_validate]).nunique())
seed = 42

# Placeholders for a batch of feature rows and the matching label rows
x = tf.placeholder(tf.float32, [None, input_num_units])
y = tf.placeholder(tf.float32, [None, output_num_units])

# Training hyper-parameters
epochs = 5
batch_size = 128
learning_rate = 0.01

### define weights and biases of the neural network
# All parameters start from seeded random-normal draws.

weights = {
    'hidden': tf.Variable(tf.random_normal([input_num_units, hidden_num_units], seed=seed)),
    'output': tf.Variable(tf.random_normal([hidden_num_units, output_num_units], seed=seed))
}

biases = {
    'hidden': tf.Variable(tf.random_normal([hidden_num_units], seed=seed)),
    'output': tf.Variable(tf.random_normal([output_num_units], seed=seed))
}

In [12]:
# Hidden layer: affine transform of the inputs followed by a ReLU activation
hidden_pre_activation = tf.matmul(x, weights['hidden']) + biases['hidden']
hidden_layer = tf.nn.relu(hidden_pre_activation)

# Output layer: affine transform of the hidden activations, left as raw scores
output_layer = tf.add(tf.matmul(hidden_layer, weights['output']), biases['output'])

In [17]:
# Loss: softmax cross-entropy of the output scores against the labels fed
# through `y`, averaged over the batch
per_example_loss = tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=output_layer)
cost = tf.reduce_mean(per_example_loss)

In [18]:
# Training op: one Adam update step that minimises `cost`
adam = tf.train.AdamOptimizer(learning_rate=learning_rate)
optimizer = adam.minimize(cost)

In [21]:
# Op that initialises every tf.Variable in the graph; it is run once at the
# start of the session. tf.global_variables_initializer() is the supported
# replacement for the deprecated tf.initialize_all_variables().
init = tf.global_variables_initializer()

In [None]:
with tf.Session() as sess:
    # create initialized variables
    sess.run(init)

    # NOTE(review): batch_creator, dense_to_one_hot, val_x, val_y and test_x are
    # not defined in the cells visible here - confirm they exist (and match the
    # placeholder shapes) before running this cell.

    ### for each epoch, do:
    ###   for each batch, do:
    ###     create pre-processed batch
    ###     run optimizer by feeding batch
    ###     find cost and reiterate to minimize

    # Number of full batches per epoch; it does not change between epochs.
    total_batch = train.shape[0] // batch_size

    for epoch in range(epochs):
        avg_cost = 0
        for i in range(total_batch):
            batch_x, batch_y = batch_creator(batch_size, feats_encoded_train.shape[0], 'train')
            _, c = sess.run([optimizer, cost], feed_dict={x: batch_x, y: batch_y})

            # Accumulate the mean batch cost for this epoch
            avg_cost += c / total_batch

        # Single pre-formatted string so the output is the same on Python 2 and 3
        print("Epoch: {} cost = {:.5f}".format(epoch + 1, avg_cost))

    print("\nTraining complete!")

    # find predictions on val set: fraction of rows whose highest-scoring
    # output unit matches the position of the largest label entry
    pred_temp = tf.equal(tf.argmax(output_layer, 1), tf.argmax(y, 1))
    accuracy = tf.reduce_mean(tf.cast(pred_temp, "float"))
    val_accuracy = accuracy.eval({x: val_x.reshape(-1, input_num_units), y: dense_to_one_hot(val_y)})
    print("Validation Accuracy: {}".format(val_accuracy))

    # Index of the highest-scoring output unit for every test row
    predict = tf.argmax(output_layer, 1)
    pred = predict.eval({x: test_x.reshape(-1, input_num_units)})