In [3]:
from keras.preprocessing.image import img_to_array, load_img, array_to_img
import os
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import tensorflow as tf
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# Module-level set of scikit-learn models: an RBF-kernel SVC with probability
# estimates enabled, a random forest, AdaBoost and Gaussian naive Bayes, all
# otherwise at their library defaults.
# Note: the per-sample loop in cell In [4] rebinds `classifiers` to a fresh
# list of the same four models on every iteration.
classifiers = [
    SVC(C=0.025, kernel="rbf", probability=True),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GaussianNB(),
]

def accuracy(predictions, labels):
    """Return the percentage of rows whose arg-max column matches in both arrays.

    Args:
        predictions: 2-D array; the index of the largest value in each row is
            taken as the predicted class.
        labels: 2-D array with the same number of rows; the index of the
            largest value in each row is taken as the true class.

    Returns:
        ``100 * (number of matching rows) / predictions.shape[0]``.
    """
    predicted_classes = np.argmax(predictions, axis=1)
    true_classes = np.argmax(labels, axis=1)
    n_correct = np.sum(predicted_classes == true_classes)
    n_rows = predictions.shape[0]
    return 100.0 * n_correct / n_rows

def weight_variable(shape):
    """Create a ``tf.Variable`` of the given shape with small random initial values.

    The initial values are drawn with ``tf.truncated_normal`` using a standard
    deviation of 0.01.
    """
    return tf.Variable(tf.truncated_normal(shape, stddev=0.01))

def bias_variable(shape):
    """Create a ``tf.Variable`` of the given shape with every element set to 0.1."""
    return tf.Variable(tf.constant(0.1, shape=shape))


def run_session(num_epochs, name, k_prob=1.0):
    """Run ``num_epochs`` mini-batch training steps and return the test predictions.

    The function relies on names defined elsewhere in the notebook: ``graph``,
    ``batch_size``, ``X_train``, ``y_train``, the feed targets
    ``tf_train_dataset`` / ``tf_train_labels`` / ``keep_prob``, and the graph
    nodes ``optimizer``, ``loss``, ``train_prediction`` and ``test_prediction``.

    Args:
        num_epochs: Number of optimisation steps. Each step feeds one
            mini-batch of ``batch_size`` rows, not a full pass over the data.
        name: Not used by this function; kept so existing calls keep working.
        k_prob: Value fed to ``keep_prob`` on every training step.

    Returns:
        The result of ``test_prediction.eval()`` after the last step.
    """
    with tf.Session(graph=graph) as session:
        # The merged-summary handle is not consumed below; the call itself is
        # kept exactly as before.
        merged = tf.merge_all_summaries()
        writer = tf.train.SummaryWriter("/tmp/tensorflowlogs", session.graph)
        try:
            tf.initialize_all_variables().run()
            print("Initialized")

            # Number of valid starting offsets for a full batch. When the
            # training set is not larger than one batch the modulo below would
            # divide by zero (or yield negative offsets), so fall back to 0.
            span = y_train.shape[0] - batch_size
            for epoch in range(num_epochs):
                offset = (epoch * batch_size) % span if span > 0 else 0
                batch_end = offset + batch_size
                feed_dict = {
                    tf_train_dataset: X_train[offset:batch_end, :],
                    tf_train_labels: y_train[offset:batch_end, :],
                    keep_prob: k_prob,
                }
                session.run([optimizer, loss, train_prediction], feed_dict=feed_dict)

            return test_prediction.eval()
        finally:
            # Always flush and release the event file, even if training fails.
            writer.close()
    
    

def split_by_half(x, k):
    """Divide ``x`` by ``2 ** k`` and truncate the quotient to an ``int``."""
    return int(x / 2 ** k)

In [4]:
# For each of the first N_SAMPLES columns of the two `potential_prob_sample_*`
# tables: restrict the training data to the species listed in that column,
# fit a small ensemble on it, and print the averaged class probabilities for
# the single test row whose id is the column header.
potential_prob_sample_predicted = pd.read_csv("potential_prob_sample_predicted.csv")
potential_prob_sample_table = pd.read_csv("potential_prob_sample_table.csv")

# The feature tables are the same for every sample, so load them once instead
# of re-reading both CSV files on every loop iteration.
train_features_all = pd.read_csv("train.csv")
test_features_all = pd.read_csv("test.csv")

N_SAMPLES = 30  # number of leading table columns to process
SEED = 13       # seeds the train/validation split and the stochastic models

# Columns of the per-sample comparison table.
log_cols = ["Classifier", "Accuracy", "Log Loss"]

for sample in range(N_SAMPLES):

    # Species listed in this column; null cells are dropped.
    p_species = potential_prob_sample_table.iloc[:, sample].dropna().tolist()

    # First row of the same column in the "predicted" table (only printed).
    p_pred = potential_prob_sample_predicted.iloc[0, sample]

    # The column header is the id used to select the test row below.
    p_id = int(potential_prob_sample_predicted.columns.values[sample])

    print(p_id, p_species, p_pred)

    n_specials = len(p_species)
    # Explicit copies so the in-place `pop` calls below never touch the
    # tables loaded once above.
    train_feature_data = train_features_all.loc[
        train_features_all["species"].isin(p_species), :
    ].copy()
    test_feature_data = test_features_all.loc[
        test_features_all["id"] == p_id, :
    ].copy()

    ID = train_feature_data.pop("id")

    train_labels = train_feature_data.pop('species')
    le = preprocessing.LabelEncoder()
    train_labels = le.fit_transform(train_labels)

    # standardize the data by setting the mean to 0 and std to 1
    scaler = StandardScaler().fit(train_feature_data)
    train_feature_data = scaler.transform(train_feature_data)

    # The test row is scaled with the statistics fitted on the training rows.
    test_feature_id = test_feature_data.pop("id")
    test_feature_data = scaler.transform(test_feature_data).astype(np.float32)

    X_train, X_val, y_train, y_val = train_test_split(
        train_feature_data, train_labels,
        test_size=.1, random_state=SEED, stratify=train_labels)

    X_train = X_train.astype(np.float32)
    X_val = X_val.astype(np.float32)

    # Fresh, seeded models for every sample so a re-run reproduces the output.
    classifiers = [
        SVC(kernel="rbf", C=0.025, probability=True, random_state=SEED),
        RandomForestClassifier(random_state=SEED),
        AdaBoostClassifier(random_state=SEED),
        GaussianNB()]

    # Logging for Visual Comparison: collect plain rows and build the frame
    # once, instead of growing a DataFrame with the removed `DataFrame.append`.
    log_records = []

    final_prob = np.zeros((1, n_specials), dtype=float)
    for clf in classifiers:
        clf.fit(X_train, y_train)
        name = clf.__class__.__name__

        # Despite the variable name, these are predictions on the held-out
        # validation split.
        train_predictions = clf.predict(X_val)
        acc = accuracy_score(y_val, train_predictions)

        train_predictions = clf.predict_proba(X_val)
        # Pass the classifier's own class order so the probability columns are
        # interpreted correctly even if the validation split lacks a class.
        ll = log_loss(y_val, train_predictions, labels=clf.classes_)

        log_records.append([name, acc * 100, ll])

        test_prob = clf.predict_proba(test_feature_data)

        final_prob = final_prob + test_prob

    log = pd.DataFrame(log_records, columns=log_cols)

    print(le.inverse_transform(range(n_specials)))
    # Average over however many classifiers were fitted (previously a literal 4).
    print(final_prob / len(classifiers))




97 ['Quercus_Brantii', 'Quercus_Castaneifolia', 'Quercus_Kewensis'] Quercus_Castaneifolia
['Quercus_Brantii' 'Quercus_Castaneifolia' 'Quercus_Kewensis']
[[ 0.12084752  0.70566424  0.17348824]]
213 ['Lithocarpus_Cleistocarpus', 'Lithocarpus_Edulis', 'Magnolia_Heptapeta'] Lithocarpus_Cleistocarpus
['Lithocarpus_Cleistocarpus' 'Lithocarpus_Edulis' 'Magnolia_Heptapeta']
[[ 0.43084418  0.16568751  0.4034683 ]]
285 ['Ilex_Cornuta', 'Salix_Intergra'] Salix_Intergra
['Ilex_Cornuta' 'Salix_Intergra']
[[ 0.46415527  0.53584473]]
297 ['Cytisus_Battandieri', 'Fagus_Sylvatica', 'Populus_Adenopoda'] Populus_Adenopoda
['Cytisus_Battandieri' 'Fagus_Sylvatica' 'Populus_Adenopoda']
[[ 0.10834291  0.73176913  0.15988796]]
301 ['Cornus_Controversa', 'Cornus_Macrophylla', 'Eucalyptus_Glaucescens', 'Eucalyptus_Neglecta', 'Quercus_Infectoria_sub', 'Quercus_Semecarpifolia'] Cornus_Controversa
['Cornus_Controversa' 'Cornus_Macrophylla' 'Eucalyptus_Glaucescens'
 'Eucalyptus_Neglecta' 'Quercus_Infectoria_sub' 'Q