# FMD experiment

## Part 1

We are using Random Forest to classify images appearing in the FMD database using TDA methods.

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from itertools import combinations

data = pd.read_csv('new.csv')

nes = list(range(1,256))
betti = list(range(256,511))
sil = list(range(511, 766))
sil5 = list(range(766, 1021))
sil10 = list(range(1021, 1276))
#number of estimators
num = 100

We apply the Random Forest algorithm in each possible case. Different functions are combined via concatenation.

In [20]:
fun_list = [nes, betti, sil, sil5, sil10, nes + betti, sil + nes, sil5 + nes, sil10 + nes, betti +  sil,
            betti +  sil5, betti +  sil10, sil + nes + betti, sil5 + nes + betti, sil10 + nes + betti]
aux = []
for fun in fun_list:
    X = data.iloc[:, fun].values
    y = data.iloc[:, 0].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

    #Create a Gaussian Classifier
    clf=RandomForestClassifier(n_estimators=num, random_state = 0)
    #Train the model using the training sets y_pred=clf.predict(X_test)
    clf.fit(X_train,y_train)
    y_pred=clf.predict(X_test)
    # Model Accuracy, how often is the classifier correct?
    aux.append(metrics.accuracy_score(y_test, y_pred))

In [21]:
names = ["nes", "betti", "sil", "sil5", "sil10", "nes + betti", "sil + nes", "sil5 + nes", "sil10 + nes",
         "betti +  sil",  "betti +  sil5", "betti +  sil10", "sil + nes + betti", "sil5 + nes + betti",
         "sil10 + nes + betti"]
for i in range(len(fun_list)):
    print("Accuracy", names[i], ":", aux[i])

Accuracy nes : 0.24
Accuracy betti : 0.285
Accuracy sil : 0.185
Accuracy sil5 : 0.14
Accuracy sil10 : 0.125
Accuracy nes + betti : 0.29
Accuracy sil + nes : 0.25
Accuracy sil5 + nes : 0.245
Accuracy sil10 + nes : 0.24
Accuracy betti +  sil : 0.285
Accuracy betti +  sil5 : 0.26
Accuracy betti +  sil10 : 0.27
Accuracy sil + nes + betti : 0.265
Accuracy sil5 + nes + betti : 0.27
Accuracy sil10 + nes + betti : 0.27


## Part 2 

In this case, we will try to classify categories pairwise. We will see how different TDA methods may be better depending on the data.

In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from itertools import combinations

data = pd.read_csv('new.csv')
data_persim = pd.read_csv('db_persim.csv')
data_persim = data_persim[map(str,range(401))]

nes = list(range(1,256))
betti = list(range(256,511))
sil = list(range(511, 766))

#number of estimators
num = 100
#create the pairs
com = combinations(range(10), 2)

In [24]:
results = pd.DataFrame()

for pair in com:
    auxrow = []
    db = data.loc[[x[0] | x[1] for x in zip(data['labels'] == pair[0],  data['labels'] == pair[1])]]
    y = db.iloc[:, 0].values
    
    X = db.iloc[:, betti].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
    clf=RandomForestClassifier(n_estimators=num, random_state = 0)
    clf.fit(X_train,y_train)
    y_pred=clf.predict(X_test)
    auxrow.append(metrics.accuracy_score(y_test, y_pred))
    
    X = db.iloc[:, sil].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
    clf=RandomForestClassifier(n_estimators=num, random_state = 0)
    clf.fit(X_train,y_train)
    y_pred=clf.predict(X_test)
    auxrow.append(metrics.accuracy_score(y_test, y_pred))
    
    X = db.iloc[:, nes].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
    clf=RandomForestClassifier(n_estimators=num, random_state = 0)
    clf.fit(X_train,y_train)
    y_pred=clf.predict(X_test)
    auxrow.append(metrics.accuracy_score(y_test, y_pred))

    
    db = data_persim.loc[[x[0] | x[1] for x in zip(data_persim['0'] == pair[0],  data_persim['0'] == pair[1])]]
    y = db.iloc[:, 0].values
    X = db.iloc[:, 1:401].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
    clf=RandomForestClassifier(n_estimators=num, random_state = 0)
    clf.fit(X_train,y_train)
    y_pred=clf.predict(X_test)
    auxrow.append(metrics.accuracy_score(y_test, y_pred))
    
    results = pd.DataFrame([auxrow]).append(results, ignore_index=True)


In [25]:
#Create a function to bold the maximum value
def highlight_max(s):
    is_max = s == s.max()
    return ['font-weight: bold' if v else '' for v in is_max]

In [28]:
names = ['betti', 'sil', 'nes', 'persim']
results.columns = names
results.style.apply(highlight_max, axis=1)

Unnamed: 0,betti,sil,nes,persim
0,0.6,0.625,0.5,0.55
1,0.675,0.525,0.6,0.725
2,0.7,0.55,0.7,0.8
3,0.6,0.45,0.5,0.675
4,0.575,0.55,0.45,0.6
5,0.725,0.75,0.725,0.775
6,0.675,0.575,0.7,0.875
7,0.8,0.775,0.8,0.85
8,0.775,0.675,0.775,0.825
9,0.8,0.825,0.85,0.85
