In [121]:
from keras.preprocessing.image import img_to_array, load_img, array_to_img
import os
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import tensorflow as tf
from sklearn.preprocessing import StandardScaler


In [150]:
train_feature_data = pd.read_csv("train.csv")
test_feature_data = pd.read_csv("test.csv")

ID = train_feature_data.pop("id")

train_labels = train_feature_data.pop('species')
le = preprocessing.LabelEncoder()
train_labels = le.fit(train_labels).transform(train_labels) 

# standardize the data by setting the mean to 0 and std to 1
scaler = StandardScaler().fit(train_feature_data)
train_feature_data = scaler.transform(train_feature_data)

X_train, X_val, y_train, y_val = train_test_split(train_feature_data, train_labels, test_size=.1, random_state=2, stratify = train_labels)
y_train = (np.arange(99) == y_train[:,None]).astype(np.float32)
y_val = (np.arange(99) == y_val[:,None]).astype(np.float32)

X_train = X_train.astype(np.float32)
X_val = X_val.astype(np.float32)

test_feature_id = test_feature_data.pop("id")
test_feature_data = scaler.transform(test_feature_data)
test_feature_data = test_feature_data.astype(np.float32)


print("Training Data", X_train.shape, y_train.shape)
print("Validated Data", X_val.shape, y_val.shape)
print("Test Data", test_feature_data.shape)

Training Data (891, 192) (891, 99)
Validated Data (99, 192) (99, 99)
Test Data (594, 192)


In [151]:
def accuracy(predictions, labels):
    return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
            / predictions.shape[0])

def weight_variable(shape):
    initial = tf.truncated_normal(shape, stddev=0.01)
    return tf.Variable(initial)

def bias_variable(shape):
    initial = tf.constant(0.1, shape=shape)
    return tf.Variable(initial)


split_by_half = lambda x,k : int(x/2**k)

In [152]:
print(le.inverse_transform([71, 80]))
print(le.inverse_transform([37, 38]))
print(le.inverse_transform([28, 29]))
print(le.inverse_transform([54, 75]))
print(le.inverse_transform([69, 55]))
print(le.inverse_transform([66, 64]))

['Quercus_Palustris' 'Quercus_Shumardii']
['Lithocarpus_Cleistocarpus' 'Lithocarpus_Edulis']
['Eucalyptus_Glaucescens' 'Eucalyptus_Neglecta']
['Quercus_Canariensis' 'Quercus_Pubescens']
['Quercus_Kewensis' 'Quercus_Castaneifolia']
['Quercus_Ilex' 'Quercus_Greggii']


In [153]:
batch_size = 662
hidden_nodes = 1024
lamb_reg = 0.0

graph = tf.Graph()
with graph.as_default():

    # Input data.
    tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, 192))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, 99))
    tf_valid_dataset = tf.constant(X_val)
    tf_valid_labels = tf.constant(y_val)
    tf_test_dataset = tf.constant(test_feature_data)

    # Variables.
    layer1_weights = weight_variable([192, hidden_nodes])
    layer1_biases = bias_variable([hidden_nodes])
    layer4_weights = weight_variable([hidden_nodes, 99])
    layer4_biases = bias_variable([99])
    
    keep_prob = tf.placeholder("float")
    
    # Model with dropout
    def model(data, proba=keep_prob):
        layer1 = tf.matmul(data, layer1_weights) + layer1_biases
        hidden1 = tf.nn.dropout(tf.nn.relu(layer1), proba)  # dropout on hidden layer
        return tf.matmul(hidden1, layer4_weights) + layer4_biases
    
    # Training computation.
    logits = model(tf_train_dataset, keep_prob)
    
    # loss = tf.reduce_sum(tf.multiply(tf.log(tf.clip_by_value(tf.nn.softmax(logits), 1e-10, 1.0)), tf_train_labels))/-32.0
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels))
   
    regularizers = (tf.nn.l2_loss(layer1_weights) + tf.nn.l2_loss(layer1_biases) + \
                    tf.nn.l2_loss(layer4_weights) + tf.nn.l2_loss(layer4_biases))

    # Add the regularization term to the loss.
    loss = tf.reduce_mean(loss + lamb_reg * regularizers)
    
    # Optimizer.
    optimizer = tf.train.RMSPropOptimizer(1e-4).minimize(loss)

    
    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)
    valid_prediction = tf.nn.softmax(model(tf_valid_dataset, 1.0))
    
    # loss_v = tf.reduce_sum(tf.multiply(tf.log(tf.clip_by_value(valid_prediction, 1e-10, 1.0)), tf_valid_labels))/-(99.0*3)
    loss_v = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(model(tf_valid_dataset,1.0), tf_valid_labels))
    
    test_prediction = tf.nn.softmax(model(tf_test_dataset, 1.0))


In [157]:
# Re-define the function to include the keep probability

def run_session(num_epochs, name, k_prob=1.0):

    with tf.Session(graph=graph) as session:
        merged = tf.merge_all_summaries()  
        writer = tf.train.SummaryWriter("/tmp/tensorflowlogs", session.graph)
        tf.initialize_all_variables().run()
        print("Initialized")
        for epoch in range(num_epochs):
            offset = (epoch * batch_size) % (y_train.shape[0] - batch_size)
            batch_data = X_train[offset:(offset + batch_size), :]
            batch_labels = y_train[offset:(offset + batch_size), :]
            feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels, keep_prob : k_prob}
            _, l, predictions, l_v = session.run([optimizer, loss, train_prediction, loss_v], feed_dict=feed_dict)
            if (epoch % 500 == 0):
                print("Minibatch loss at epoch {}: {}".format(epoch, l))
                print("Validation loss at epoch {}: {}".format(epoch, l_v))
                print("Minibatch accuracy: {:.1f}".format(accuracy(predictions, batch_labels)))
                print("Validation accuracy: {:.1f}".format(accuracy(valid_prediction.eval(), y_val)))
        
        test_prob = test_prediction.eval()
        return test_prob

In [158]:
test_prob = run_session(20000, "Deep_NN", 0.5)

Instructions for updating:
Please switch to tf.summary.merge_all.
Instructions for updating:
Please switch to tf.summary.FileWriter. The interface and behavior is the same; this is just a rename.
Instructions for updating:
Use `tf.global_variables_initializer` instead.
Initialized
Minibatch loss at epoch 0: 4.59370231628418
Validation loss at epoch 0: 4.59503698348999
Minibatch accuracy: 0.6
Validation accuracy: 0.0
Minibatch loss at epoch 500: 0.08406969904899597
Validation loss at epoch 500: 0.13460473716259003
Minibatch accuracy: 99.8
Validation accuracy: 99.0
Minibatch loss at epoch 1000: 0.0011797471670433879
Validation loss at epoch 1000: 0.013119995594024658
Minibatch accuracy: 100.0
Validation accuracy: 100.0
Minibatch loss at epoch 1500: 0.0001918996131280437
Validation loss at epoch 1500: 0.006283221300691366
Minibatch accuracy: 100.0
Validation accuracy: 100.0
Minibatch loss at epoch 2000: 0.0001024752200464718
Validation loss at epoch 2000: 0.004507883917540312
Minibatch ac

In [159]:
test_submit = pd.DataFrame(test_prob, index=test_feature_id,columns=le.inverse_transform(range(99)))

fp = open('submit.csv', 'w')
fp.write(test_submit.to_csv())

test_submit.tail()


Unnamed: 0_level_0,Acer_Capillipes,Acer_Circinatum,Acer_Mono,Acer_Opalus,Acer_Palmatum,Acer_Pictum,Acer_Platanoids,Acer_Rubrum,Acer_Rufinerve,Acer_Saccharinum,...,Salix_Fragilis,Salix_Intergra,Sorbus_Aria,Tilia_Oliveri,Tilia_Platyphyllos,Tilia_Tomentosa,Ulmus_Bergmanniana,Viburnum_Tinus,Viburnum_x_Rhytidophylloides,Zelkova_Serrata
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1576,2.440899e-10,0.9999995,8.831962e-14,4.7682330000000005e-17,2.955674e-07,1.188586e-13,1.947139e-14,2.422856e-10,1.384369e-08,1.447734e-09,...,2.215762e-12,1.992084e-16,4.124587e-13,1.252392e-14,7.429296e-16,1.154957e-12,7.869796e-10,2.169862e-18,1.437015e-14,8.275723e-08
1577,4.083067e-10,7.456544e-10,1.097664e-16,6.771465e-10,8.047896e-14,8.158166e-18,6.095036e-12,2.575843e-07,1.658736e-07,8.135807e-14,...,4.091379e-13,8.341479000000001e-17,1.47211e-07,1.347312e-12,9.415825e-09,1.759933e-08,4.89508e-12,7.492896e-18,1.220798e-13,1.092134e-07
1579,3.385547e-12,7.436422e-13,7.781795e-16,1.848521e-16,5.576487e-10,6.263753e-10,4.617105e-17,4.457102e-14,5.28559e-16,2.75828e-08,...,8.612141e-18,1.842578e-18,4.978316e-12,7.207074e-14,1.402834e-11,9.493672e-21,7.986348e-18,6.301556e-15,1.530689e-11,1.159556e-09
1580,1.854899e-16,2.2130220000000002e-17,2.629771e-11,7.787225e-16,2.44433e-15,8.753763999999999e-19,9.019534e-12,5.15214e-12,1.2323340000000002e-17,2.369852e-20,...,3.160817e-10,1.943864e-11,3.129738e-20,6.482206e-12,1.285599e-20,2.639021e-15,1.332293e-16,4.845224e-16,1.333509e-22,3.582541e-15
1583,1.0961919999999999e-20,1.420207e-14,3.296923e-12,7.646984e-16,7.31546e-13,4.30428e-09,5.703745e-09,4.314401e-15,1.262867e-17,2.398561e-15,...,3.857383e-16,4.660028e-18,5.793973000000001e-17,8.932258e-15,1.903239e-15,1.0953500000000001e-17,2.9228660000000003e-17,1.91673e-14,5.947094e-18,7.070727e-12


In [5]:
from keras.preprocessing.image import img_to_array, load_img, array_to_img
import os
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import tensorflow as tf
from sklearn.preprocessing import StandardScaler

def accuracy(predictions, labels):
    return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
            / predictions.shape[0])

def weight_variable(shape):
    initial = tf.truncated_normal(shape, stddev=0.01)
    return tf.Variable(initial)

def bias_variable(shape):
    initial = tf.constant(0.1, shape=shape)
    return tf.Variable(initial)


def run_session(num_epochs, name, k_prob=1.0):

    with tf.Session(graph=graph) as session:
        merged = tf.merge_all_summaries()  
        writer = tf.train.SummaryWriter("/tmp/tensorflowlogs", session.graph)
        tf.initialize_all_variables().run()
        print("Initialized")
        for epoch in range(num_epochs):
            offset = (epoch * batch_size) % (y_train.shape[0] - batch_size)
            batch_data = X_train[offset:(offset + batch_size), :]
            batch_labels = y_train[offset:(offset + batch_size), :]
            feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels, keep_prob : k_prob}
            _, l, predictions, l_v = session.run([optimizer, loss, train_prediction, loss_v], feed_dict=feed_dict)
            
            if (epoch % 10000 == 0):
                print("Minibatch loss at epoch {}: {}".format(epoch, l))
                print("Validation loss at epoch {}: {}".format(epoch, l_v))
                print("Minibatch accuracy: {:.1f}".format(accuracy(predictions, batch_labels)))
                print("Validation accuracy: {:.1f}".format(accuracy(valid_prediction.eval(), y_val)))
        
        test_prob = test_prediction.eval()
        return test_prob
    
    

split_by_half = lambda x,k : int(x/2**k)


train_feature_data = pd.read_csv("train.csv")
test_feature_data = pd.read_csv("test.csv")

ID = train_feature_data.pop("id")

train_labels = train_feature_data.pop('species')
le = preprocessing.LabelEncoder()
train_labels = le.fit(train_labels).transform(train_labels) 

# standardize the data by setting the mean to 0 and std to 1
scaler = StandardScaler().fit(train_feature_data)
train_feature_data = scaler.transform(train_feature_data)

test_feature_id = test_feature_data.pop("id")
test_feature_data = scaler.transform(test_feature_data)
test_feature_data = test_feature_data.astype(np.float32)

for rdm in range(10, 30):
    
    X_train, X_val, y_train, y_val = train_test_split(train_feature_data, train_labels, test_size=.1, random_state=rdm, stratify = train_labels)
    y_train = (np.arange(99) == y_train[:,None]).astype(np.float32)
    y_val = (np.arange(99) == y_val[:,None]).astype(np.float32)

    X_train = X_train.astype(np.float32)
    X_val = X_val.astype(np.float32)

    batch_size = 662
    hidden_nodes = 1024
    lamb_reg = 0.0

    graph = tf.Graph()
    with graph.as_default():

        # Input data.
        tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, 192))
        tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, 99))
        tf_valid_dataset = tf.constant(X_val)
        tf_valid_labels = tf.constant(y_val)
        tf_test_dataset = tf.constant(test_feature_data)

        # Variables.
        layer1_weights = weight_variable([192, hidden_nodes])
        layer1_biases = bias_variable([hidden_nodes])
        layer4_weights = weight_variable([hidden_nodes, 99])
        layer4_biases = bias_variable([99])

        keep_prob = tf.placeholder("float")

        # Model with dropout
        def model(data, proba=keep_prob):
            layer1 = tf.matmul(data, layer1_weights) + layer1_biases
            hidden1 = tf.nn.dropout(tf.nn.relu(layer1), proba)  # dropout on hidden layer
            return tf.matmul(hidden1, layer4_weights) + layer4_biases

        # Training computation.
        logits = model(tf_train_dataset, keep_prob)

        loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels))

        regularizers = (tf.nn.l2_loss(layer1_weights) + tf.nn.l2_loss(layer1_biases) + \
                        tf.nn.l2_loss(layer4_weights) + tf.nn.l2_loss(layer4_biases))

        # Add the regularization term to the loss.
        loss = tf.reduce_mean(loss + lamb_reg * regularizers)

        # Optimizer.
        optimizer = tf.train.RMSPropOptimizer(1e-4).minimize(loss)


        # Predictions for the training, validation, and test data.
        train_prediction = tf.nn.softmax(logits)
        valid_prediction = tf.nn.softmax(model(tf_valid_dataset, 1.0))

        loss_v = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(model(tf_valid_dataset,1.0), tf_valid_labels))

        test_prediction = tf.nn.softmax(model(tf_test_dataset, 1.0))
        
        test_prob = run_session(20000, "Deep_NN", 0.5)
        
        test_submit = pd.DataFrame(test_prob, index=test_feature_id, columns=le.inverse_transform(range(99)))
        
        print(rdm)
        save_name = 'submit'+ str(rdm) + '.csv'
        
        fp = open(save_name, 'w')
        fp.write(test_submit.to_csv())


Using TensorFlow backend.


Instructions for updating:
Please switch to tf.summary.merge_all.
Instructions for updating:
Please switch to tf.summary.FileWriter. The interface and behavior is the same; this is just a rename.
Instructions for updating:
Use `tf.global_variables_initializer` instead.
Initialized
Minibatch loss at epoch 0: 4.602806568145752
Validation loss at epoch 0: 4.601140022277832
Minibatch accuracy: 0.8
Validation accuracy: 0.0
Minibatch loss at epoch 10000: 7.220596216939157e-06
Validation loss at epoch 10000: 0.0343179851770401
Minibatch accuracy: 100.0
Validation accuracy: 99.0
10
Instructions for updating:
Please switch to tf.summary.merge_all.
Instructions for updating:
Please switch to tf.summary.FileWriter. The interface and behavior is the same; this is just a rename.
Instructions for updating:
Use `tf.global_variables_initializer` instead.
Initialized
Minibatch loss at epoch 0: 4.594104766845703
Validation loss at epoch 0: 4.594125270843506
Minibatch accuracy: 1.5
Validation accuracy: 1

In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
from sklearn import preprocessing
import tensorflow as tf
from sklearn.preprocessing import StandardScaler


In [51]:
train_feature_data = pd.read_csv("train.csv")
test_feature_data = pd.read_csv("test.csv")

need_species = ["Cytisus_Battandieri", "Fagus_Sylvatica", "Populus_Adenopoda"  ]  
train_feature_data = train_feature_data.loc[train_feature_data["species"].isin(need_species), : ]
test_feature_data = test_feature_data.loc[test_feature_data["id"] == 297, :]

ID = train_feature_data.pop("id")

train_labels = train_feature_data.pop('species')
le = preprocessing.LabelEncoder()
train_labels = le.fit(train_labels).transform(train_labels) 

# standardize the data by setting the mean to 0 and std to 1
scaler = StandardScaler().fit(train_feature_data)
train_feature_data = scaler.transform(train_feature_data)


test_feature_id = test_feature_data.pop("id")
test_feature_data = scaler.transform(test_feature_data)
test_feature_data = test_feature_data.astype(np.float32)



In [53]:
leaf_rf = RandomForestClassifier(n_estimators=1000)
leaf_rf.fit(train_feature_data, train_labels)

from sklearn.model_selection import cross_val_score

scores = cross_val_score(leaf_rf, train_feature_data, train_labels, cv=10)
print(scores)

prob = leaf_rf.predict_prob(test_feature_data)
# print(le.inverse_transform(test_pred))
prob

[ 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.]


AttributeError: 'RandomForestClassifier' object has no attribute 'predict_prob'