## Import libraries and files

In [1]:
import pandas as pd
import warnings
import time
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import Perceptron
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score

warnings.filterwarnings("ignore")
pd.set_option('display.max_colwidth', -1)

ModuleNotFoundError: No module named 'xgboost'

In [34]:
def get_confusion_matrix_values(y_test, y_pred):
    cm = confusion_matrix(y_test, y_pred)
    return(cm[0][0], cm[0][1], cm[1][0], cm[1][1])

In [35]:
df_output = pd.read_csv("test_file.txt")

In [43]:
df_output

Unnamed: 0,gbif_name,ncbi_name,category_label,match,levenshtein_distance,lcs_sequence_length,lcs_edit_distance,damerau_levenshtein_distance,hamming_distance,jaro_similarity,...,partial_ratio,token_sort_ratio,token_set_ratio,w_ratio,uq_ratio,q_ratio,matching_numbers,matching_numbers_log,log_fuzz_score,log_fuzz_score_numbers
0,plantae;tracheophyta;magnoliopsida;austrobaileyales;schisandraceae;illicium angustisepalum,eukaryota;streptophyta;magnoliopsida;austrobaileyales;schisandraceae;illicium angustisepalum,species,1,13,83,13,13,87,0.823248,...,92,87,91,91,91,91,1,0.693147,5.888878,5.888878
1,plantae;tracheophyta;liliopsida;asparagales;orchidaceae;maxillaria calcarata,eukaryota;streptophyta;magnoliopsida;asparagales;orchidaceae;maxillaria calcarata,species,1,18,67,18,18,74,0.820198,...,86,80,80,85,85,85,1,0.693147,5.802118,5.802118
2,animalia;arthropoda;insecta;coleoptera;curculionidae;acalles indigens,eukaryota;arthropoda;insecta;coleoptera;curculionidae;acalles indigens,species,1,9,63,9,9,69,0.848677,...,91,86,93,91,91,91,1,0.693147,5.886104,5.886104
3,plantae;chlorophyta;trebouxiophyceae;prasiolales;prasiolaceae;rosenvingiellopsis constricta,eukaryota;chlorophyta;trebouxiophyceae;prasiolales;prasiolaceae;rosenvingiellopsis constricta,species,1,8,87,8,8,88,0.895616,...,96,95,95,95,95,95,1,0.693147,5.940171,5.940171
4,animalia;arthropoda;insecta;coleoptera;curculionidae;darumazo,eukaryota;arthropoda;insecta;coleoptera;curculionidae;darumazo,genus,1,9,55,9,9,62,0.852004,...,89,85,92,89,89,89,1,0.693147,5.872118,5.872118
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,plantae;tracheophyta;liliopsida;asparagales;amaryllidaceae;strumaria truncata,eukaryota;streptophyta;magnoliopsida;arecales;arecaceae;phoenix canariensis,species,0,47,43,47,47,71,0.712099,...,57,50,50,57,57,57,1,0.693147,5.361292,5.361292
996,animalia;chordata;aves;passeriformes;alaudidae;calandrella rufescens,eukaryota;streptophyta;magnoliopsida;gentianales;apocynaceae;macropharynx steyermarkii,species,0,65,26,65,65,79,0.666758,...,38,38,38,36,34,34,1,0.693147,4.997212,4.997212
997,chromista;ochrophyta;bacillariophyceae;naviculales;scoliotropidaceae;biremis panamae,eukaryota;streptophyta;magnoliopsida;asparagales;amaryllidaceae;strumaria truncata,species,0,54,42,54,54,76,0.683051,...,51,45,45,51,51,51,1,0.693147,5.252273,5.252273
998,animalia;arthropoda;insecta;mantodea;nanomantidae;bolbena hottentotta,eukaryota;chordata;amphibia;anura;centrolenidae;nymphargus ocellatus,species,0,53,29,53,52,66,0.650400,...,43,44,44,42,42,42,1,0.693147,5.153292,5.153292


## Feature selection (find features that the most contribute to the prediction)

In [36]:
#Correlation with output variable
cor_target = abs(df_output.corr()["match"])
#Selecting highly correlated features
relevant_features = list(pd.DataFrame(cor_target[cor_target>0.9]).index)

In [37]:
features = pd.DataFrame(cor_target[cor_target>0.9]).sort_values("match", ascending=False)

In [38]:
top_feature_list = list(features.head()[1:].index)

## Find best classifiers for the prediction

In [39]:
#creating training and testing data
X = df_output[relevant_features[1:]].values
y = df_output['match'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)

In [45]:
#selecting and comparing state of the art classifiers to choose the best one
classifiers = {
    "DummyClassifier_stratified":DummyClassifier(strategy='stratified', random_state=0),    
    "KNeighborsClassifier":KNeighborsClassifier(3),
    "DecisionTreeClassifier":DecisionTreeClassifier(),
    "AdaBoostClassifier":AdaBoostClassifier(),
    "GradientBoostingClassifier":GradientBoostingClassifier(),
    "Perceptron": Perceptron(max_iter=40, eta0=0.1, random_state=1),
    "SupportVectorMachine":SVC(),
    "MLP": MLPClassifier(max_iter=100),
    "RandomForestClassifier":RandomForestClassifier(),
    "XGBClassifier":XGBClassifier(n_estimators=1000, learning_rate=0.1),
    "XGBClassifier finetuned":XGBClassifier(
                      scale_pos_weight=1,
                      learning_rate=0.01,  
                      colsample_bytree = 0.8,
                      subsample = 0.8,
                      objective='binary:logistic', 
                      n_estimators=1000, 
                      reg_alpha = 0.3,
                      max_depth=9, 
                      gamma=10)

}

df_results = pd.DataFrame(columns=['model', 'accuracy', 'mae', 'precision','recall','f1','roc','run_time','tp','fp','tn','fn'])
for key in classifiers:

    start_time = time.time()
    classifier = classifiers[key]
    model = classifier.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    roc = roc_auc_score(y_test, y_pred)
    classification = classification_report(y_test, y_pred, zero_division=0)
    run_time = format(round((time.time() - start_time)/60,2))
    tp, fp, fn, tn = get_confusion_matrix_values(y_test, y_pred)

    row = {'model': key,
           'accuracy': accuracy,
           'mae': mae,
           'precision': precision,
           'recall': recall,
           'f1': f1,
           'roc': roc,
           'run_time': run_time,
           'tp': tp,
           'fp': fp,
           'tn': tn,
           'fn': fn,
          }
    df_results = df_results.append(row, ignore_index=True)

df_results.sort_values(['accuracy', 'precision'], ascending=[False, False]).reset_index(drop = True)

Unnamed: 0,model,accuracy,mae,precision,recall,f1,roc,run_time,tp,fp,tn,fn
0,DecisionTreeClassifier,1.0,0.0,1.0,1.0,1.0,1.0,0.0,148,0,152,0
1,AdaBoostClassifier,1.0,0.0,1.0,1.0,1.0,1.0,0.0,148,0,152,0
2,GradientBoostingClassifier,1.0,0.0,1.0,1.0,1.0,1.0,0.0,148,0,152,0
3,SupportVectorMachine,1.0,0.0,1.0,1.0,1.0,1.0,0.0,148,0,152,0
4,RandomForestClassifier,1.0,0.0,1.0,1.0,1.0,1.0,0.0,148,0,152,0
5,XGBClassifier,1.0,0.0,1.0,1.0,1.0,1.0,0.0,148,0,152,0
6,XGBClassifier finetuned,1.0,0.0,1.0,1.0,1.0,1.0,0.0,148,0,152,0
7,KNeighborsClassifier,0.996667,0.003333,1.0,0.993421,0.9967,0.996711,0.0,148,0,151,1
8,Perceptron,0.993333,0.006667,1.0,0.986842,0.993377,0.993421,0.0,148,0,150,2
9,MLP,0.993333,0.006667,0.993421,0.993421,0.993421,0.993332,0.0,147,1,151,1
