In [None]:
# Import packages
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

In [None]:
# Load the CSV data from the GitHub link
url = 'https://raw.githubusercontent.com/FilipJanikulaS22660/ASI_grupa_6/main/data/prepared_final.csv'

data = pd.read_csv(url, sep=',')
print(data.columns)

Index(['Class', 'RecordingTime [ms]', 'Time of Day [h:m:s:ms]', 'Trial',
       'Stimulus', 'Participant', 'Tracking Ratio [%]', 'Category Group',
       'Category Right', 'Index Right', 'Pupil Diameter Right [mm]',
       'Point of Regard Right X [px]', 'Point of Regard Right Y [px]',
       'AOI Name Right', 'Gaze Vector Right X', 'Gaze Vector Right Y',
       'Gaze Vector Right Z'],
      dtype='object')


In [None]:
data.head()

Unnamed: 0,Class,RecordingTime [ms],Time of Day [h:m:s:ms],Trial,Stimulus,Participant,Tracking Ratio [%],Category Group,Category Right,Index Right,Pupil Diameter Right [mm],Point of Regard Right X [px],Point of Regard Right Y [px],AOI Name Right,Gaze Vector Right X,Gaze Vector Right Y,Gaze Vector Right Z
0,Anxious,612656.5,09:16:28:000,Trial001.21_nsap_f.jpg,K717,67.2,Information,Separator,1,,,,,,,,
1,Anxious,612663.8,09:16:28:007,Trial001.21_nsap_f.jpg,K717,67.2,Eye,Fixation,1,4.9,807.6,537.3,White Space,0.1,0.3,-1.0,
2,Anxious,612666.8,09:16:28:010,Trial001.21_nsap_f.jpg,K717,67.2,Eye,Fixation,1,4.9,808.1,534.9,White Space,0.1,0.3,-1.0,
3,Anxious,612675.1,09:16:28:019,Trial001.21_nsap_f.jpg,K717,67.2,Eye,Fixation,1,4.8,803.8,534.4,White Space,0.0,0.3,-1.0,
4,Anxious,612691.8,09:16:28:035,Trial001.21_nsap_f.jpg,K717,67.2,Eye,Fixation,1,4.8,803.2,534.1,White Space,0.1,0.3,-1.0,


In [None]:
# Check how many records are null
null_counts = pd.DataFrame(data[data.select_dtypes('number').columns].isna().sum(), columns=['Null Counts'])
null_counts

Unnamed: 0,Null Counts
AOI Name Right,2921
Gaze Vector Right X,9881
Gaze Vector Right Y,30032
Gaze Vector Right Z,78520


In [None]:
# Drop column 'Gaze Vector Right Z', because it contains only NaN values
data = data.drop('Gaze Vector Right Z', axis=1)

In [None]:
data.head()

Unnamed: 0,Class,RecordingTime [ms],Time of Day [h:m:s:ms],Trial,Stimulus,Participant,Tracking Ratio [%],Category Group,Category Right,Index Right,Pupil Diameter Right [mm],Point of Regard Right X [px],Point of Regard Right Y [px],AOI Name Right,Gaze Vector Right X,Gaze Vector Right Y
1,Anxious,612663.8,09:16:28:007,Trial001.21_nsap_f.jpg,K717,67.2,Eye,Fixation,1,4.9,807.6,537.3,White Space,0.1,0.3,-1.0
2,Anxious,612666.8,09:16:28:010,Trial001.21_nsap_f.jpg,K717,67.2,Eye,Fixation,1,4.9,808.1,534.9,White Space,0.1,0.3,-1.0
3,Anxious,612675.1,09:16:28:019,Trial001.21_nsap_f.jpg,K717,67.2,Eye,Fixation,1,4.8,803.8,534.4,White Space,0.0,0.3,-1.0
4,Anxious,612691.8,09:16:28:035,Trial001.21_nsap_f.jpg,K717,67.2,Eye,Fixation,1,4.8,803.2,534.1,White Space,0.1,0.3,-1.0
5,Anxious,612700.2,09:16:28:044,Trial001.21_nsap_f.jpg,K717,67.2,Eye,Fixation,1,4.8,801.1,531.5,White Space,0.0,0.3,-1.0


In [None]:
data.dropna(how='any', inplace=True)

In [None]:
X = data.drop('Class', axis=1)  # Assuming 'Class' is the target column
y = data['Class']
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 48488 entries, 1 to 78519
Data columns (total 16 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Class                         48488 non-null  object 
 1   RecordingTime [ms]            48488 non-null  float64
 2   Time of Day [h:m:s:ms]        48488 non-null  object 
 3   Trial                         48488 non-null  object 
 4   Stimulus                      48488 non-null  object 
 5   Participant                   48488 non-null  object 
 6   Tracking Ratio [%]            48488 non-null  object 
 7   Category Group                48488 non-null  object 
 8   Category Right                48488 non-null  object 
 9   Index Right                   48488 non-null  object 
 10  Pupil Diameter Right [mm]     48488 non-null  object 
 11  Point of Regard Right X [px]  48488 non-null  object 
 12  Point of Regard Right Y [px]  48488 non-null  object 
 13  AOI Na

In [None]:
columns_to_be_converted_to_floats = ['RecordingTime [ms]', 'Participant', 'Index Right', 'Pupil Diameter Right [mm]', 'Point of Regard Right X [px]']
data[columns_to_be_converted_to_floats] = data[columns_to_be_converted_to_floats].astype(float)
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 48488 entries, 1 to 78519
Data columns (total 16 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Class                         48488 non-null  object 
 1   RecordingTime [ms]            48488 non-null  float64
 2   Time of Day [h:m:s:ms]        48488 non-null  object 
 3   Trial                         48488 non-null  object 
 4   Stimulus                      48488 non-null  object 
 5   Participant                   48488 non-null  float64
 6   Tracking Ratio [%]            48488 non-null  object 
 7   Category Group                48488 non-null  object 
 8   Category Right                48488 non-null  object 
 9   Index Right                   48488 non-null  float64
 10  Pupil Diameter Right [mm]     48488 non-null  float64
 11  Point of Regard Right X [px]  48488 non-null  float64
 12  Point of Regard Right Y [px]  48488 non-null  object 
 13  AOI Na

In [None]:
# Replace missing values with NaN
# data = data.replace('?', pd.NA)

# Separate features and target
X = data.drop('Class', axis=1)
y = data['Class']

# Encode categorical columns
categorical_cols = X.dtypes == object
categorical_cols = list(categorical_cols[categorical_cols].index)
label_encoder = LabelEncoder()

for col in categorical_cols:
    X[col] = label_encoder.fit_transform(X[col])

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the Random Forest Classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [None]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a Random Forest Classifier model
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model's performance
print("Precision:", accuracy_score(y_test, y_pred))
print("Classification report:\n", classification_report(y_test, y_pred))

# Analyze feature importance
importances = model.feature_importances_
feature_names = X.columns
feature_importances = pd.Series(importances, index=feature_names)
feature_importances.sort_values(ascending=False, inplace=True)
print("Feature Importances:\n", feature_importances)

Precision: 1.0
Classification report:
               precision    recall  f1-score   support

     Anxious       1.00      1.00      1.00      2871
     Control       1.00      1.00      1.00      3620
  Depressive       1.00      1.00      1.00      3207

    accuracy                           1.00      9698
   macro avg       1.00      1.00      1.00      9698
weighted avg       1.00      1.00      1.00      9698

Feature Importances:
 Stimulus                        0.367318
Time of Day [h:m:s:ms]          0.283988
RecordingTime [ms]              0.202267
Participant                     0.074375
Index Right                     0.033511
Gaze Vector Right X             0.012434
Point of Regard Right X [px]    0.009319
Trial                           0.007151
Pupil Diameter Right [mm]       0.003661
Gaze Vector Right Y             0.002842
AOI Name Right                  0.001790
Category Right                  0.000653
Point of Regard Right Y [px]    0.000647
Category Group           