In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

In [2]:
dataset = pd.read_csv('dataset.csv', index_col=False)
not_features = ["Path", "Emotional_intensity", "Statement", "Gender"]
features_data = dataset.drop(columns=not_features)
features_data
actor_list = [1, 12, 7, 24]
test_set = features_data.loc[features_data['Actor'].isin(actor_list)]
training_set = features_data.drop(test_set.index)
training_actors = training_set['Actor']
training_actors = training_actors.unique

In [3]:
y_train = training_set["Emotions"]
X2d_train = training_set[dataset.columns[6:]]
y_test = test_set["Emotions"]
X2d_test = test_set[dataset.columns[6:]]


In [4]:
accuracy = []
f1 = []

In [5]:
grid = {
    "max_samples": [0.3, 0.6, 0.9],
    "n_estimators": [200, 400, 600]
}
model = RandomForestClassifier(n_estimators=200, max_depth=None, max_samples=0.9, bootstrap=True, n_jobs=-1, random_state=42)

In [6]:
def gridSearchRandomForest(model, grid):
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    rf = RandomForestClassifier(max_depth=None, bootstrap=True, n_jobs=-1, random_state=42)
    grid_search = GridSearchCV(rf, grid, scoring='accuracy', cv=skf)
    grid_search.fit(X_train, y_train)
    print("Best parameters found:", grid_search.best_params_)
    print("Best accuracy score:", grid_search.best_score_)

    best_accuracy = 0
    best_params = {}
    for max_samples in grid["max_samples"]:
        for n_estimators in grid["n_estimators"]:
            model.set_params(max_samples=max_samples, n_estimators=n_estimators)
            model_accuracy = []
            model_f1 = []
            for train_index, test_index in skf.split(X2d_train, y_train["Emotions"]):
                training_set = dataset.iloc[train_index]
                testing_set = dataset.iloc[test_index]

                # Separate male and female actors
                training_actors_m = training_set[training_set['Gender'] == 'M']['Actor'].unique()
                training_actors_f = training_set[training_set['Gender'] == 'F']['Actor'].unique()

                # Cross-validation
                cross_validation(model)

            # Compute average accuracy and f1 score for the current model
            avg_accuracy = np.mean(model_accuracy)
            avg_f1 = np.mean(model_f1)

            # Print results
            print("max_samples:", max_samples, "n_estimators:", n_estimators, "accuracy:", avg_accuracy, "avg_f1:", avg_f1)

#I would like two for loop inside def gridSearchRandomForest,  before model_accuracy's
#initialization that iterates over the two parameters two find the best accuracy of the two arrays


In [7]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

In [8]:
skf = StratifiedKFold(5, shuffle=True, random_state=42)

In [9]:
gs = GridSearchCV(rfm, grid, cv=skf, scoring=make_scorer(f1_score, average="macro"))

NameError: name 'rfm' is not defined

In [None]:
gs.fit(X2d_train, y_train);

In [12]:
gs.best_params_

{'max_samples': 0.9, 'n_estimators': 400}

In [13]:
model = gs.best_estimator_

In [44]:
model.score(X2d_test, y_test)

0.55

In [45]:
y_pred = model.predict(X2d_test)

In [46]:
f1_score(y_test, y_pred, average=None)

array([0.71428571, 0.73417722, 0.53164557, 0.5       , 0.4516129 ,
       0.11111111, 0.36363636, 0.71698113])

In [47]:
f1_score(y_test, y_pred, average="macro")

0.5154312511430743

In [48]:
from sklearn.metrics import classification_report

In [49]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       angry       0.83      0.62      0.71        32
        calm       0.62      0.91      0.73        32
     disgust       0.45      0.66      0.53        32
     fearful       0.58      0.44      0.50        32
       happy       0.47      0.44      0.45        32
     neutral       0.50      0.06      0.11        16
         sad       0.31      0.44      0.36        32
   surprised       0.90      0.59      0.72        32

    accuracy                           0.55       240
   macro avg       0.58      0.52      0.52       240
weighted avg       0.59      0.55      0.54       240



In [50]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
pd.DataFrame(cm, index=model.classes_, columns=model.classes_)

Unnamed: 0,angry,calm,disgust,fearful,happy,neutral,sad,surprised
angry,20,0,6,1,5,0,0,0
calm,0,29,0,0,0,0,3,0
disgust,0,4,21,1,3,1,2,0
fearful,3,0,2,14,4,0,7,2
happy,1,1,6,2,14,0,8,0
neutral,0,7,1,0,0,1,7,0
sad,0,6,8,4,0,0,14,0
surprised,0,0,3,2,4,0,4,19


In [51]:
from xgboost import XGBClassifier

  from pandas import MultiIndex, Int64Index


In [52]:
xgbm = XGBClassifier(objective='binary:logistic', reg_alpha=0.5, reg_lambda=1.0, n_estimators=200, random_state=42)

In [53]:
grid = {
    "n_estimators": [200, 400, 600],
    "reg_alpha": [0.5, 1, 1.5],
    "reg_lambda": [0.5, 1, 1.5]
}