# Libraries

In [1]:
import os
import sys
import warnings
import pandas as pd
from lib.places365 import predict

from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.model_selection import KFold
from sklearn.feature_selection import mutual_info_classif
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC

IMAGES_DIR = "./images"
NO_OF_CHARS = 10
 
# Ignore warnings
warnings.filterwarnings("ignore")

models = {
  'Random Forest': RandomForestClassifier(n_estimators=100, random_state=0),
  'Extra Trees': ExtraTreesClassifier(n_estimators=100, random_state=0),
  'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=0),
  'AdaBoost': AdaBoostClassifier(n_estimators=100, random_state=0),
  'SVM': SVC(kernel='rbf', C=1, random_state=0),
  'MLPClassifer': MLPClassifier(hidden_layer_sizes=(64, 32),max_iter=1000, random_state=0)
}

# Get Descriptors

In [2]:
df = pd.DataFrame(columns=['image', 'description', 'class'])

import time
documents = []

for _class in os.listdir(IMAGES_DIR):
  print("Class:", _class)
  total = len(os.listdir(os.path.join(IMAGES_DIR, _class)))
  for i, img in enumerate(os.listdir(os.path.join(IMAGES_DIR, _class))):
    # Get prediction 
    prediction = predict(os.path.join(IMAGES_DIR, _class, img), NO_OF_CHARS)
    descriptor = list(prediction.keys())

    documents.append(" ".join(descriptor))

    df.loc[len(df.index)] = [os.path.join(IMAGES_DIR, _class, img), descriptor, _class]
    sys.stdout.write(f"\rImage: {i + 1}/{total}")
    sys.stdout.flush()
  print()

Class: mountain
Image: 538/538
Class: beach
Image: 538/538


# Create bag of words w/ TF-IDF

In [3]:
vectorizer = TfidfVectorizer()

docs = vectorizer.fit_transform(documents)

columns = ['image', 'class'] + [f'feature_{i}' for i in range(docs.shape[1])]

_df = pd.DataFrame(columns=columns)

for i, row in df.iterrows():
  description = [" ".join(row['description'])]
  vector = list(vectorizer.transform(description).toarray()[0])

  _df.loc[len(_df.index)] = [row['image'], row['class']] + vector

# Classify

In [4]:
_df = _df.drop('image', axis=1)
X = _df.drop("class", axis=1)
y = _df["class"]

In [None]:
# Split our data in sets of training and testing 
kf = KFold(n_splits=5, shuffle=True, random_state=42)
dict_model = {}

for name, model in models.items():
  accuracies = []
  for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    dict_model[name] = dict_model.get(name,[y_test, y_pred])
    
    accuracies.append(accuracy_score(y_test, y_pred))

  print(f"{name}:", np.mean(accuracies))

Random Forest: 0.9730577088716623
Extra Trees: 0.9721274763135227
Gradient Boosting: 0.9674720068906115
AdaBoost: 0.9674849267872524
SVM: 0.9749138673557278
MLPClassifer: 0.9721274763135229


# Mutual Info

In [6]:
mi = mutual_info_classif(docs, y)
feature_names = vectorizer.get_feature_names_out()

print("La mejor feature para clasificar es:", feature_names[np.argmax(mi)])

La mejor feature para clasificar es: coast


# Classification Report for Models

In [None]:
for model in dict_model:
    print(f'Classification report for {model}:')
    y_test, y_pred = dict_model[model]
    class_report = classification_report(y_test, y_pred)
    print(class_report)
    

Classification report for Random Forest:
              precision    recall  f1-score   support

       beach       0.98      0.94      0.96       108
    mountain       0.95      0.98      0.96       108

    accuracy                           0.96       216
   macro avg       0.96      0.96      0.96       216
weighted avg       0.96      0.96      0.96       216

Classification report for Extra Trees:
              precision    recall  f1-score   support

       beach       0.98      0.94      0.96       108
    mountain       0.95      0.98      0.96       108

    accuracy                           0.96       216
   macro avg       0.96      0.96      0.96       216
weighted avg       0.96      0.96      0.96       216

Classification report for Gradient Boosting:
              precision    recall  f1-score   support

       beach       0.98      0.95      0.97       108
    mountain       0.95      0.98      0.97       108

    accuracy                           0.97       216
   