In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import seaborn as sns
import tensorflow as tf
from sklearn.model_selection import train_test_split

In [None]:
# Load the competition CSV files into DataFrames.
# train_old.csv is the training set kept from the first round.
test_data = pd.read_csv("./input/test.csv")
train_data_new = pd.read_csv("./input/train.csv")
train_data_old = pd.read_csv("./input/train_old.csv")

# Stack the new and the old training rows into one frame.
# ignore_index=True rebuilds a unique 0..n-1 index; without it both files keep
# their own row labels, so the combined frame carries duplicate labels and any
# later label-based alignment (join, reindex) becomes ambiguous.
train_data = pd.concat([train_data_new, train_data_old], sort=False, ignore_index=True)

In [None]:
# Feature engineering starts here.
# n_jobs == -1 is taken to mean "use all processors" (scikit-learn convention),
# so it is replaced by 8, which the author notes is the largest n_jobs value
# present in train.csv. The same substitution is applied to both frames.
for _frame in (train_data, test_data):
    _uses_all_cpus = _frame['n_jobs'] == -1
    _frame.loc[_uses_all_cpus, 'n_jobs'] = 8

In [None]:
# Correlation heat map of the training features.
# Only numeric/bool columns are handed to corr(): a text column cannot be
# correlated, and recent pandas versions raise on it instead of silently
# dropping it. The matrix is computed once and reused for the plot.
corr_matrix = train_data.select_dtypes(include=['number', 'bool']).corr()

plt.figure(figsize=(20,20))

# annot=True writes the coefficient into every cell; RdYlGn maps low values to
# red and high values to green.
g = sns.heatmap(corr_matrix, annot=True, cmap="RdYlGn")

In [None]:
# Engineered features for the training frame:
#   ncpc = n_classes * n_clusters_per_class / n_jobs
#   nmis = max_iter * n_samples / n_jobs
#   nfnl = ncpc / n_informative
train_data['ncpc'] = train_data['n_classes'].mul(train_data['n_clusters_per_class']).div(train_data['n_jobs'])
train_data['nmis'] = train_data['max_iter'].mul(train_data['n_samples']).div(train_data['n_jobs'])
train_data['nfnl'] = train_data['ncpc'].div(train_data['n_informative'])

In [None]:
# Engineered features for the test frame:
#   ncpc = n_classes * n_clusters_per_class / n_jobs
#          (n_classes: number of classes/labels of the classification problem,
#           n_clusters_per_class: number of clusters per class)
#   nmis = max_iter * n_samples / n_jobs
#   nfnl = ncpc / n_informative
test_data['ncpc'] = test_data['n_classes'].mul(test_data['n_clusters_per_class']).div(test_data['n_jobs'])
test_data['nmis'] = test_data['max_iter'].mul(test_data['n_samples']).div(test_data['n_jobs'])
test_data['nfnl'] = test_data['ncpc'].div(test_data['n_informative'])

In [None]:
# Keep a handle on the `time` column (used later as the regression label).
time_tag1 = train_data.loc[:, "time"]

In [None]:
# Keep the `penalty` column of each frame aside, then remove the `time`
# column from the training features.
penalty_tag_test = test_data["penalty"]
penalty_tag_train = train_data["penalty"]
train_data = train_data.drop(columns=['time'])

In [None]:
# Numeric dtypes that take part in the standardisation.
dtypes1 = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

newdf_train = train_data.select_dtypes(include=dtypes1)
newdf_test = test_data.select_dtypes(include=dtypes1)

# Standardise the training columns. Note that the mean and the population
# standard deviation (ddof=0) are taken from the *test* frame.
_test_mean = newdf_test.mean()
_test_std = newdf_test.std(ddof=0)
train_temp = newdf_train.sub(_test_mean).div(_test_std)

# Put the `penalty` column back next to the scaled columns.
train = pd.concat([train_temp, penalty_tag_train], axis=1)

# Discard the columns that are not used as model inputs.
train = train.drop(['l1_ratio','scale','random_state','alpha','flip_y'], axis=1)
train.shape

In [None]:
# Standardise the numeric test columns with their own mean and population
# standard deviation (ddof=0), re-attach `penalty`, and drop the columns that
# are not used as model inputs.
test = newdf_test.sub(newdf_test.mean()).div(newdf_test.std(ddof=0))
test = test.join(penalty_tag_test)
test = test.drop(['l1_ratio','scale','random_state','alpha','flip_y'], axis=1)
test.shape

In [None]:
# Preview the first five rows of the prepared training frame.
train.head(n=5)

In [None]:
# Preview the first five rows of the prepared test frame.
test.head(n=5)

In [None]:
# TensorFlow section starts here.
# Training hyper-parameters.
BATCH_SIZE = 200
num_epochs = 10000

# Hold out 15% of the rows (an 85/15 split, as set by test_size=0.15).
# random_state is fixed so the split is identical after a kernel restart;
# without it every run holds out a different set of rows.
X_train, X_test, y_train, y_test = train_test_split(train, time_tag1, test_size=0.15, random_state=42)

In [None]:
# Training input: shuffled batches drawn from the training split only, so the
# held-out rows (X_test / y_test) are never seen while fitting. Feeding the
# full `train` frame here would leak the hold-out into training.
input_train = tf.estimator.inputs.pandas_input_fn(x=X_train, y=y_train, batch_size=BATCH_SIZE, num_epochs=num_epochs, shuffle=True)

# Evaluation input: one ordered pass over the held-out split. Repeating it for
# `num_epochs` shuffled epochs would only re-score the same rows many times.
input_test = tf.estimator.inputs.pandas_input_fn(x=X_test, y=y_test, batch_size=BATCH_SIZE, num_epochs=1, shuffle=False)

In [None]:
# One numeric TensorFlow feature column per raw input feature.
(
    max_iter,
    n_jobs,
    n_samples,
    n_features,
    n_classes,
    n_clusters_per_class,
    n_informative,
) = (
    tf.feature_column.numeric_column(column_key)
    for column_key in (
        "max_iter",
        "n_jobs",
        "n_samples",
        "n_features",
        "n_classes",
        "n_clusters_per_class",
        "n_informative",
    )
)

In [None]:
# Numeric feature columns for the three engineered features.
ncpc, nmis, nfnl = [
    tf.feature_column.numeric_column(key=column_key)
    for column_key in ("ncpc", "nmis", "nfnl")
]

In [None]:
# Categorical feature column for `penalty`, built from the four listed values.
penalty_tf = tf.feature_column.categorical_column_with_vocabulary_list(
    "penalty",
    ["l2", "l1", "none", "elasticnet"],
)

In [None]:
# Numeric feature columns only (raw and engineered).
wide_columns = [
    max_iter,
    n_jobs,
    n_samples,
    n_features,
    n_classes,
    n_clusters_per_class,
    n_informative,
    ncpc,
    nmis,
    nfnl,
]

# The same numeric columns plus `penalty`, wrapped in an indicator column.
Feature_columns = wide_columns + [tf.feature_column.indicator_column(penalty_tf)]

In [None]:
# List the columns in each feature-column list.
# The lists hold feature-column objects, not strings, so calling str.join() on
# them directly raises TypeError; join their `name` attributes instead.
print(", ".join(column.name for column in Feature_columns))
print(", ".join(column.name for column in wide_columns))

In [None]:
# Session configuration with GPU memory growth enabled (allow_growth=True).
config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))

In [None]:
# Wide-and-deep regressor: `wide_columns` feed the linear part and
# `Feature_columns` feed the DNN part (eight hidden layers, leaky-ReLU).
# The session `config` built earlier is passed through a RunConfig; without
# this argument it is never used and the GPU allow_growth setting has no effect.
DNNTF = tf.estimator.DNNLinearCombinedRegressor(
    linear_feature_columns=wide_columns,
    dnn_feature_columns=Feature_columns,
    dnn_hidden_units=[1000, 600, 360, 150, 75, 25, 14, 7],
    dnn_activation_fn=tf.nn.leaky_relu,
    config=tf.estimator.RunConfig(session_config=config))

# No `steps` limit is given, so training runs until `input_train` is exhausted.
DNNTF.train(input_fn=input_train)

print('Tensor Flow has finished running. Training with above feature is now ready.')

In [None]:
# Prediction input: one ordered pass over the prepared test frame, one row per
# batch, so the predictions come back in the same order as the rows of `test`.
predict_input_fn = tf.estimator.inputs.pandas_input_fn(
        x=test,
        batch_size=1,
        num_epochs=1,
        shuffle=False)

# The trained estimator is `DNNTF`; the name `m` is not defined in this notebook
# and raised NameError on a fresh kernel.
predictions = DNNTF.predict(input_fn=predict_input_fn)

# Each yielded item is a dict; its "predictions" entry holds the predicted value.
result = [prediction["predictions"][0] for prediction in predictions]
print(result)

print(len(result))

# Number of negative predictions.
count = sum(1 for value in result if value < 0)
print(count)

In [None]:
# Build the submission table (Id, Time) and write it to final_week.csv.
# Fixes over the previous version of this cell:
#   * the predictions come from `result` (`final_wk_result` was read before it existed),
#   * the Python list is converted to an array before reshaping,
#   * the 2-D prediction column, not the raw list, is concatenated with the ids,
#   * the number of ids follows the number of predictions instead of a fixed 100.

# Negative predictions are replaced by their absolute value.
final_wk_result = np.abs(np.array(result)).reshape(-1, 1)

# Ids are 0..n-1 in prediction order.
test_id = np.arange(len(final_wk_result)).reshape(-1, 1)

final_wk_output = np.concatenate((test_id, final_wk_result), axis=1)

# The multi-field fmt string already contains the comma, so it defines the row layout.
np.savetxt("final_week.csv", final_wk_output, delimiter=",", fmt='%i,%f', header="Id,Time", comments='')