In [1]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import pandas as pd

In [17]:
# Load the raw CSVs. `pd.DataFrame.from_csv` has been removed from pandas;
# `read_csv(..., index_col=0)` keeps its behaviour of using the first CSV
# column as the index. Date parsing of the index (a `from_csv` default) is
# intentionally not requested: the test index is later written out verbatim
# as the submission's 'id' column.
train_data = pd.read_csv("./data/train.csv", index_col=0)
test_feature = pd.read_csv("./data/test.csv", index_col=0)

In [18]:
# Split the training frame into the target column ('species') and the
# remaining feature columns, both as plain numpy arrays.
is_feature_column = train_data.columns != 'species'
train_labels = train_data['species'].values
train_feature = train_data.loc[:, is_feature_column].values

In [19]:
# Learn the per-column min/max from the training features only, then apply
# that same scaling to both the training and the test features.
minmax_scaler = MinMaxScaler()
minmax_scaler.fit(np.array(train_feature))
train_feature = minmax_scaler.transform(np.array(train_feature))
test_feature = minmax_scaler.transform(np.array(test_feature))

In [20]:
def train_validation_split(x, y, n_per_class=2, random_state=None):
    """Shuffle the data and hold out a fixed number of samples per class.

    After shuffling, the first ``n_per_class`` occurrences of every label are
    assigned to the validation set; all remaining samples form the training
    set. Classes with fewer than ``n_per_class`` samples end up entirely in
    the validation set.

    Parameters
    ----------
    x : array-like
        Feature rows; converted with ``np.asarray``. Must have the same
        length as ``y``.
    y : array-like
        One hashable label per row of ``x``; converted with ``np.asarray``.
    n_per_class : int, default 2
        Number of samples of each class to place in the validation set.
    random_state : int or None, default None
        Seed for a private ``np.random.RandomState``. When ``None`` the
        global numpy RNG is used, so ``np.random.seed`` still controls the
        shuffle.

    Returns
    -------
    x_train, x_vali, y_train, y_vali : np.ndarray
        The shuffled data partitioned into training and validation parts.

    Raises
    ------
    ValueError
        If ``x`` and ``y`` differ in length or ``n_per_class`` is negative.
    """
    x = np.asarray(x)
    y = np.asarray(y)
    if len(x) != len(y):
        raise ValueError(
            "x and y must have the same length, got {} and {}".format(len(x), len(y)))
    if n_per_class < 0:
        raise ValueError("n_per_class must be non-negative, got {}".format(n_per_class))

    rng = np.random if random_state is None else np.random.RandomState(random_state)
    order = rng.permutation(len(y))
    x = x[order]
    y = y[order]

    # A boolean mask keeps the partition linear in the number of samples
    # (membership tests against a growing list would be quadratic).
    is_vali = np.zeros(len(y), dtype=bool)
    taken_per_class = {}
    for pos, label in enumerate(y):
        taken = taken_per_class.get(label, 0)
        if taken < n_per_class:
            is_vali[pos] = True
            taken_per_class[label] = taken + 1

    is_train = ~is_vali
    return x[is_train], x[is_vali], y[is_train], y[is_vali]

In [22]:
# Hold out a small per-class validation set from the scaled training data.
X_train, X_vali, y_train, y_vali = train_validation_split(
    x=train_feature,
    y=train_labels,
)

In [25]:
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# Candidate baseline models. Each one is later asked for class probabilities
# via predict_proba, which is why the SVM variants enable `probability=True`.
classifiers = [
    KNeighborsClassifier(n_neighbors=3),
    SVC(C=0.025, kernel="rbf", probability=True),
    NuSVC(probability=True),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis(),
]

# Logging for Visual Comparison
# One record per classifier is collected in a plain list and the DataFrame is
# built once after the loop: DataFrame.append was removed in pandas 2.0, and
# growing a frame row by row is quadratic anyway.
log_cols = ["Classifier", "Accuracy", "Log Loss"]
log_records = []

for clf in classifiers:
    clf.fit(X_train, y_train)
    name = clf.__class__.__name__

    print("="*30)
    print(name)

    print('****Results****')
    # Hard class predictions on the held-out validation set.
    vali_predictions = clf.predict(X_vali)
    acc = accuracy_score(y_vali, vali_predictions)
    print("Accuracy: {:.4%}".format(acc))

    # Class probabilities on the same validation set, scored with log loss.
    vali_probabilities = clf.predict_proba(X_vali)
    ll = log_loss(y_vali, vali_probabilities)
    print("Log Loss: {}".format(ll))

    # Accuracy is stored as a percentage, log loss as returned.
    log_records.append([name, acc*100, ll])

print("="*30)

log = pd.DataFrame(log_records, columns=log_cols)

KNeighborsClassifier
****Results****
Accuracy: 98.4848%
Log Loss: 0.21565754827390174
SVC
****Results****
Accuracy: 96.9697%
Log Loss: 4.598463830715084
NuSVC
****Results****
Accuracy: 98.4848%
Log Loss: 2.364591942505799
DecisionTreeClassifier
****Results****
Accuracy: 67.1717%
Log Loss: 11.338487200349562
RandomForestClassifier
****Results****
Accuracy: 87.3737%
Log Loss: 1.3264860795435591
AdaBoostClassifier
****Results****
Accuracy: 3.0303%
Log Loss: 4.676099119921021
GradientBoostingClassifier
****Results****
Accuracy: 55.0505%
Log Loss: 2.961984242868131
GaussianNB
****Results****
Accuracy: 57.0707%
Log Loss: 14.827252492764792
LinearDiscriminantAnalysis
****Results****
Accuracy: 96.9697%
Log Loss: 1.1280478764599071
QuadraticDiscriminantAnalysis
****Results****
Accuracy: 3.5354%
Log Loss: 33.31770854256545


  np.exp(prob, prob)


In [27]:
def prepare_submission_file(clf, file_name):
    """Write class-probability predictions for the test set to a submission CSV.

    Uses the notebook-level ``test_feature`` array as input to
    ``clf.predict_proba`` and writes one column per entry of ``clf.classes_``,
    preceded by an 'id' column taken from the index of ``./data/test.csv``.

    Parameters
    ----------
    clf : fitted classifier
        Must provide ``predict_proba`` and ``classes_``.
    file_name : str
        Base name (without extension) of the CSV created inside
        ``./baseline_submission/``.
    """
    import os

    res = clf.predict_proba(test_feature)
    submission = pd.DataFrame(res, columns=clf.classes_)

    # `DataFrame.from_csv` has been removed from pandas; `read_csv` with
    # index_col=0 yields the same first-column index used here as the ids.
    test_ids = pd.read_csv("./data/test.csv", index_col=0).index.values
    submission.insert(0, 'id', test_ids)

    # Create the output directory on first use so to_csv does not fail.
    out_dir = "./baseline_submission/"
    os.makedirs(out_dir, exist_ok=True)
    submission.to_csv(out_dir + file_name + ".csv", index=False)

In [28]:
# Write one submission file per fitted classifier, named after its class,
# rather than picking a single best model.
for fitted_clf in classifiers:
    submission_name = type(fitted_clf).__name__
    prepare_submission_file(fitted_clf, submission_name)

  np.exp(prob, prob)
