In [1]:
%pip install pandas scikit-learn openpyxl



Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.1.2 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
import joblib

In [3]:
# File paths
file_path_cursive = r'Resources/Cursive Writing in a Dotted Line Emotion Label.xlsx'
file_path_fine_motor = r'Resources/Fine Motor Skill Label.xlsx'
file_path_bold = r'Resources/Bold the Sentence Task Emotion Label.xlsx'

# Load the Excel data
df_cursive = pd.read_excel(file_path_cursive)
df_fine_motor = pd.read_excel(file_path_fine_motor)
df_bold = pd.read_excel(file_path_bold)

In [4]:
df_fine_motor.head(5)

Unnamed: 0,NO,Student ID,Ages (Years),Gender,Grade,Bold The Sentence,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Cursive Writing in a Dotted Line,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Fine Motor Skill Label
0,,,,,,Speed (Detik),Number Stroke,Mean Pressure,Mean Altitude,Modulus Altitude,Mean Azimuth,Modulus Azimuth,Speed,Number Stroke,Mean Pressure,Mean Altitude,Modulus Altitude,Mean Azimuth,Modulus Azimuth,
1,1.0,10001.0,7.083333,Boys,1.0,87,28,474,1467,1520,579,600,165,28,474,1467,1520,579,600,Good
2,2.0,10002.0,6.75,Boys,1.0,106,36,224,1098,1090,446,440,82,25,285,936,900,470,460,Average
3,3.0,10003.0,6.416667,Boys,1.0,116,50,318,1235,1240,501,480,155,44,302,1160,1220,468,460,Less
4,4.0,10004.0,6.166667,Boys,1.0,100,55,295,1073,1070,565,550,128,89,240,1102,1080,559,550,Good


## Preprocess Cursive Writing in a Dotted Line Emotion Label file

In [5]:
# Inspect the first few rows of the Cursive Writing in a Dotted Line Emotion Label
print(df_cursive.head(5))

    NO  Student ID  Ages (Years) Gender  Grade  \
0  NaN         NaN           NaN    NaN    NaN   
1  1.0     10001.0      7.083333   Boys    1.0   
2  2.0     10002.0      6.750000   Boys    1.0   
3  3.0     10003.0      6.416667   Boys    1.0   
4  4.0     10004.0      6.166667   Boys    1.0   

  Cursive Writing in a Dotted Line     Unnamed: 6     Unnamed: 7  \
0                            Speed  Number Stroke  Mean Pressure   
1                              165             28            474   
2                               82             25            285   
3                              155             44            302   
4                              128             89            240   

      Unnamed: 8        Unnamed: 9   Unnamed: 10      Unnamed: 11  \
0  Mean Altitude  Modulus Altitude  Mean Azimuth  Modulus Azimuth   
1           1467              1520           579              600   
2            936               900           470              460   
3           11

In [6]:
# Rename the columns
df_cursive.rename(columns={
    'Cursive Writing in a Dotted Line': 'Speed_C',
    'Unnamed: 6': 'Number Stroke_C',
    'Unnamed: 7': 'Mean Pressure_C',
    'Unnamed: 8': 'Mean Altitude_C',
    'Unnamed: 9': 'Modulus Altitude_C',
    'Unnamed: 10': 'Mean Azimuth_C',
    'Unnamed: 11': 'Modulus Azimuth_C'
}, inplace=True)

# Display the updated columns
print(df_cursive.columns)


Index(['NO', 'Student ID', 'Ages (Years)', 'Gender', 'Grade', 'Speed_C',
       'Number Stroke_C', 'Mean Pressure_C', 'Mean Altitude_C',
       'Modulus Altitude_C', 'Mean Azimuth_C', 'Modulus Azimuth_C',
       'Label Emotion'],
      dtype='object')


## Preprocess Bold the Sentence Task Emotion Label file

In [7]:
# Inspect the first few rows of the Bold the Sentence Task Emotion Label file
print(df_bold.head(5))

    NO  Student ID  Ages (Years) Gender  Grade Bold The Sentence   \
0  NaN         NaN           NaN    NaN    NaN      Speed (Detik)   
1  1.0     10001.0      7.083333   Boys    1.0                 87   
2  2.0     10002.0      6.750000   Boys    1.0                106   
3  3.0     10003.0      6.416667   Boys    1.0                116   
4  4.0     10004.0      6.166667   Boys    1.0                100   

      Unnamed: 6     Unnamed: 7     Unnamed: 8        Unnamed: 9  \
0  Number Stroke  Mean Pressure  Mean Altitude  Modulus Altitude   
1             28            474           1467              1520   
2             36            224           1098              1090   
3             50            318           1235              1240   
4             55            295           1073              1070   

    Unnamed: 10      Unnamed: 11 Label Emotion  
0  Mean Azimuth  Modulus Azimuth           NaN  
1           579              600       positif  
2           446              

In [8]:
# Rename the columns
df_bold.rename(columns={
    'Bold The Sentence ': 'Speed_B',
    'Unnamed: 6': 'Number Stroke_B',
    'Unnamed: 7': 'Mean Pressure_B',
    'Unnamed: 8': 'Mean Altitude_B',
    'Unnamed: 9': 'Modulus Altitude_B',
    'Unnamed: 10': 'Mean Azimuth_B',
    'Unnamed: 11': 'Modulus Azimuth_B'
}, inplace=True)

# Display the updated columns
print(df_bold.columns)


Index(['NO', 'Student ID', 'Ages (Years)', 'Gender', 'Grade', 'Speed_B',
       'Number Stroke_B', 'Mean Pressure_B', 'Mean Altitude_B',
       'Modulus Altitude_B', 'Mean Azimuth_B', 'Modulus Azimuth_B',
       'Label Emotion'],
      dtype='object')


### Merging the Files

In [9]:
# Join the two DataFrames on Student ID
df_merged = pd.merge(df_cursive, df_bold, on='Student ID', how='inner')

# Display the merged DataFrame
print("Merged DataFrame:")
print(df_merged.head())

Merged DataFrame:
   NO_x  Student ID  Ages (Years)_x Gender_x  Grade_x Speed_C Number Stroke_C  \
0   NaN         NaN             NaN      NaN      NaN   Speed   Number Stroke   
1   1.0     10001.0        7.083333     Boys      1.0     165              28   
2   2.0     10002.0        6.750000     Boys      1.0      82              25   
3   3.0     10003.0        6.416667     Boys      1.0     155              44   
4   4.0     10004.0        6.166667     Boys      1.0     128              89   

  Mean Pressure_C Mean Altitude_C Modulus Altitude_C  ... Gender_y Grade_y  \
0   Mean Pressure   Mean Altitude   Modulus Altitude  ...      NaN     NaN   
1             474            1467               1520  ...     Boys     1.0   
2             285             936                900  ...     Boys     1.0   
3             302            1160               1220  ...     Boys     1.0   
4             240            1102               1080  ...     Boys     1.0   

         Speed_B  Number S

In [10]:
# Save the merged DataFrame as a CSV file
df_merged.to_csv('Merged_Cursive_Bold_Data.csv', index=False)


## Merging Fine Motor Skill Label with df_merged

In [11]:
# Load the merged cursive and bold data from the CSV file
df_merged = pd.read_csv('Merged_Cursive_Bold_Data.csv')

In [12]:
# Merge df_merged (cursive + bold) with df_fine_motor on 'Student ID'
df_final_merged = pd.merge(df_merged, df_fine_motor[['Student ID', 'Fine Motor Skill Label']], on='Student ID', how='inner')

# Display the first few rows of the final merged DataFrame
print("Final Merged DataFrame:")
print(df_final_merged.head(5))


Final Merged DataFrame:
   NO_x  Student ID  Ages (Years)_x Gender_x  Grade_x Speed_C Number Stroke_C  \
0   NaN         NaN             NaN      NaN      NaN   Speed   Number Stroke   
1   1.0     10001.0        7.083333     Boys      1.0     165              28   
2   2.0     10002.0        6.750000     Boys      1.0      82              25   
3   3.0     10003.0        6.416667     Boys      1.0     155              44   
4   4.0     10004.0        6.166667     Boys      1.0     128              89   

  Mean Pressure_C Mean Altitude_C Modulus Altitude_C  ... Grade_y  \
0   Mean Pressure   Mean Altitude   Modulus Altitude  ...     NaN   
1             474            1467               1520  ...     1.0   
2             285             936                900  ...     1.0   
3             302            1160               1220  ...     1.0   
4             240            1102               1080  ...     1.0   

         Speed_B Number Stroke_B  Mean Pressure_B  Mean Altitude_B  \
0  S

In [13]:
# Drop the row with index 0
df_final_merged = df_final_merged.drop(0)

# Reset the index if needed
df_final_merged.reset_index(drop=True, inplace=True)

# Display the updated DataFrame to verify
print(df_final_merged.head(5))


   NO_x  Student ID  Ages (Years)_x Gender_x  Grade_x Speed_C Number Stroke_C  \
0   1.0     10001.0        7.083333     Boys      1.0     165              28   
1   2.0     10002.0        6.750000     Boys      1.0      82              25   
2   3.0     10003.0        6.416667     Boys      1.0     155              44   
3   4.0     10004.0        6.166667     Boys      1.0     128              89   
4   5.0     10005.0        6.916667     Boys      1.0     168              40   

  Mean Pressure_C Mean Altitude_C Modulus Altitude_C  ... Grade_y Speed_B  \
0             474            1467               1520  ...     1.0      87   
1             285             936                900  ...     1.0     106   
2             302            1160               1220  ...     1.0     116   
3             240            1102               1080  ...     1.0     100   
4             362            1133               1170  ...     1.0     247   

  Number Stroke_B  Mean Pressure_B  Mean Altitude_

In [14]:
# Check for missing values in the final merged file 
print(df_final_merged.isnull().sum())

NO_x                      0
Student ID                0
Ages (Years)_x            0
Gender_x                  0
Grade_x                   0
Speed_C                   0
Number Stroke_C           0
Mean Pressure_C           0
Mean Altitude_C           0
Modulus Altitude_C        0
Mean Azimuth_C            0
Modulus Azimuth_C         0
Label Emotion_x           4
NO_y                      0
Ages (Years)_y            0
Gender_y                  0
Grade_y                   0
Speed_B                   0
Number Stroke_B           0
Mean Pressure_B           0
Mean Altitude_B           0
Modulus Altitude_B        0
Mean Azimuth_B            0
Modulus Azimuth_B         0
Label Emotion_y           4
Fine Motor Skill Label    0
dtype: int64


In [15]:
# Drop rows with missing values or fill them with appropriate values
df_cleaned = df_final_merged.dropna()
df_cleaned.head(5)

Unnamed: 0,NO_x,Student ID,Ages (Years)_x,Gender_x,Grade_x,Speed_C,Number Stroke_C,Mean Pressure_C,Mean Altitude_C,Modulus Altitude_C,...,Grade_y,Speed_B,Number Stroke_B,Mean Pressure_B,Mean Altitude_B,Modulus Altitude_B,Mean Azimuth_B,Modulus Azimuth_B,Label Emotion_y,Fine Motor Skill Label
0,1.0,10001.0,7.083333,Boys,1.0,165,28,474,1467,1520,...,1.0,87,28,474,1467,1520,579,600,positif,Good
2,3.0,10003.0,6.416667,Boys,1.0,155,44,302,1160,1220,...,1.0,116,50,318,1235,1240,501,480,negatif,Less
3,4.0,10004.0,6.166667,Boys,1.0,128,89,240,1102,1080,...,1.0,100,55,295,1073,1070,565,550,positif,Good
4,5.0,10005.0,6.916667,Boys,1.0,168,40,362,1133,1170,...,1.0,247,44,441,1059,1010,519,520,negatif,Less
5,6.0,10006.0,6.5,Boys,1.0,124,81,274,850,900,...,1.0,76,62,318,857,900,692,710,positif,Less


In [16]:
# Check for missing values in the final merged file 
print(df_cleaned.isnull().sum())

NO_x                      0
Student ID                0
Ages (Years)_x            0
Gender_x                  0
Grade_x                   0
Speed_C                   0
Number Stroke_C           0
Mean Pressure_C           0
Mean Altitude_C           0
Modulus Altitude_C        0
Mean Azimuth_C            0
Modulus Azimuth_C         0
Label Emotion_x           0
NO_y                      0
Ages (Years)_y            0
Gender_y                  0
Grade_y                   0
Speed_B                   0
Number Stroke_B           0
Mean Pressure_B           0
Mean Altitude_B           0
Modulus Altitude_B        0
Mean Azimuth_B            0
Modulus Azimuth_B         0
Label Emotion_y           0
Fine Motor Skill Label    0
dtype: int64


In [17]:
# Drop columns NO_x, NO_y, 'Ages (Years)_y' and 'Grade_y'as NO_x, NO_y are index columns and 'Ages (Years)_y','Grade_y' are duplicates.
df_cleaned.drop(columns=['NO_x', 'NO_y','Ages (Years)_y','Grade_y'], inplace=True)

# Display the first few rows of the DataFrame to confirm the changes
print(df_cleaned.head())


   Student ID  Ages (Years)_x Gender_x  Grade_x Speed_C Number Stroke_C  \
0     10001.0        7.083333     Boys      1.0     165              28   
2     10003.0        6.416667     Boys      1.0     155              44   
3     10004.0        6.166667     Boys      1.0     128              89   
4     10005.0        6.916667     Boys      1.0     168              40   
5     10006.0        6.500000     Boys      1.0     124              81   

  Mean Pressure_C Mean Altitude_C Modulus Altitude_C Mean Azimuth_C  ...  \
0             474            1467               1520            579  ...   
2             302            1160               1220            468  ...   
3             240            1102               1080            559  ...   
4             362            1133               1170            545  ...   
5             274             850                900            741  ...   

  Gender_y Speed_B Number Stroke_B Mean Pressure_B Mean Altitude_B  \
0     Boys      87    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned.drop(columns=['NO_x', 'NO_y','Ages (Years)_y','Grade_y'], inplace=True)


In [18]:
# Create a copy of the DataFrame 
df_cleaned = df_cleaned.copy()

# Check if Label Emotion_x and Label Emotion_y are duplicates (identical across all rows)
duplicates_check = (df_cleaned['Label Emotion_x'] == df_cleaned['Label Emotion_y']).all()

# Output the result and drop 'Label Emotion_x' if they are identical
if duplicates_check:
    print("Label Emotion_x and Label Emotion_y are identical across all rows.")
    # Drop 'Label Emotion_x' as it is redundant
    df_cleaned.drop(columns=['Label Emotion_x'], inplace=True)
else:
    print("Label Emotion_x and Label Emotion_y have differences.")

# Display the first few rows of the DataFrame to confirm
print(df_cleaned.head())


Label Emotion_x and Label Emotion_y are identical across all rows.
   Student ID  Ages (Years)_x Gender_x  Grade_x Speed_C Number Stroke_C  \
0     10001.0        7.083333     Boys      1.0     165              28   
2     10003.0        6.416667     Boys      1.0     155              44   
3     10004.0        6.166667     Boys      1.0     128              89   
4     10005.0        6.916667     Boys      1.0     168              40   
5     10006.0        6.500000     Boys      1.0     124              81   

  Mean Pressure_C Mean Altitude_C Modulus Altitude_C Mean Azimuth_C  ...  \
0             474            1467               1520            579  ...   
2             302            1160               1220            468  ...   
3             240            1102               1080            559  ...   
4             362            1133               1170            545  ...   
5             274             850                900            741  ...   

  Gender_y Speed_B Number

In [19]:
# Step 1: Check if Gender_x and Gender_y are duplicates (identical across all rows)
duplicates_check_gender = (df_cleaned['Gender_x'] == df_cleaned['Gender_y']).all()

# Step 2: Output the result and drop 'Gender_x' if they are identical
if duplicates_check_gender:
    print("Gender_x and Gender_y are identical across all rows.")
    # Drop 'Gender_y' as it is redundant
    df_cleaned.drop(columns=['Gender_y'], inplace=True)
else:
    print("Gender_x and Gender_y have differences.")

# Step 3: Display the first few rows of the DataFrame to confirm
print(df_cleaned.head())


Gender_x and Gender_y are identical across all rows.
   Student ID  Ages (Years)_x Gender_x  Grade_x Speed_C Number Stroke_C  \
0     10001.0        7.083333     Boys      1.0     165              28   
2     10003.0        6.416667     Boys      1.0     155              44   
3     10004.0        6.166667     Boys      1.0     128              89   
4     10005.0        6.916667     Boys      1.0     168              40   
5     10006.0        6.500000     Boys      1.0     124              81   

  Mean Pressure_C Mean Altitude_C Modulus Altitude_C Mean Azimuth_C  \
0             474            1467               1520            579   
2             302            1160               1220            468   
3             240            1102               1080            559   
4             362            1133               1170            545   
5             274             850                900            741   

  Modulus Azimuth_C Speed_B Number Stroke_B Mean Pressure_B Mean Alti

In [20]:
# Encode 'Gender_x' column (Boys: 0, Girls: 1)
df_cleaned['Gender_x'] = df_cleaned['Gender_x'].map({'Boys': 0, 'Girls': 1})

# Encode 'Fine Motor Skill Label' column (Good: 1, Average: 0.5, Less: 0)
df_cleaned['Fine Motor Skill Label'] = df_cleaned['Fine Motor Skill Label'].map({
    'Good': 1, 'Average': 0.5, 'Less': 0
})

# Check the DataFrame after encoding
print(df_cleaned.head())

   Student ID  Ages (Years)_x  Gender_x  Grade_x Speed_C Number Stroke_C  \
0     10001.0        7.083333         0      1.0     165              28   
2     10003.0        6.416667         0      1.0     155              44   
3     10004.0        6.166667         0      1.0     128              89   
4     10005.0        6.916667         0      1.0     168              40   
5     10006.0        6.500000         0      1.0     124              81   

  Mean Pressure_C Mean Altitude_C Modulus Altitude_C Mean Azimuth_C  \
0             474            1467               1520            579   
2             302            1160               1220            468   
3             240            1102               1080            559   
4             362            1133               1170            545   
5             274             850                900            741   

  Modulus Azimuth_C Speed_B Number Stroke_B Mean Pressure_B Mean Altitude_B  \
0               600      87          

In [21]:
# Duplicate rows where Label Emotion_y is 'negative' to balance the data
negative_class = df_cleaned[df_cleaned['Label Emotion_y'] == 'negatif']
df_balanced = pd.concat([df_cleaned, negative_class])  # Combine the original data with duplicated rows


In [22]:
df_balanced.head(5)

Unnamed: 0,Student ID,Ages (Years)_x,Gender_x,Grade_x,Speed_C,Number Stroke_C,Mean Pressure_C,Mean Altitude_C,Modulus Altitude_C,Mean Azimuth_C,Modulus Azimuth_C,Speed_B,Number Stroke_B,Mean Pressure_B,Mean Altitude_B,Modulus Altitude_B,Mean Azimuth_B,Modulus Azimuth_B,Label Emotion_y,Fine Motor Skill Label
0,10001.0,7.083333,0,1.0,165,28,474,1467,1520,579,600,87,28,474,1467,1520,579,600,positif,1.0
2,10003.0,6.416667,0,1.0,155,44,302,1160,1220,468,460,116,50,318,1235,1240,501,480,negatif,0.0
3,10004.0,6.166667,0,1.0,128,89,240,1102,1080,559,550,100,55,295,1073,1070,565,550,positif,1.0
4,10005.0,6.916667,0,1.0,168,40,362,1133,1170,545,530,247,44,441,1059,1010,519,520,negatif,0.0
5,10006.0,6.5,0,1.0,124,81,274,850,900,741,780,76,62,318,857,900,692,710,positif,0.0


In [23]:
df_balanced.info()

<class 'pandas.core.frame.DataFrame'>
Index: 120 entries, 0 to 92
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Student ID              120 non-null    float64
 1   Ages (Years)_x          120 non-null    float64
 2   Gender_x                120 non-null    int64  
 3   Grade_x                 120 non-null    float64
 4   Speed_C                 120 non-null    object 
 5   Number Stroke_C         120 non-null    object 
 6   Mean Pressure_C         120 non-null    object 
 7   Mean Altitude_C         120 non-null    object 
 8   Modulus Altitude_C      120 non-null    object 
 9   Mean Azimuth_C          120 non-null    object 
 10  Modulus Azimuth_C       120 non-null    object 
 11  Speed_B                 120 non-null    object 
 12  Number Stroke_B         120 non-null    object 
 13  Mean Pressure_B         120 non-null    object 
 14  Mean Altitude_B         120 non-null    object 


In [33]:
# 'Label Emotion_y' is the column with values 'positif' and 'negative'
df_balanced['Label Emotion_y'] = df_balanced['Label Emotion_y'].replace({'positif': 'positive', 'negatif': 'negative'})

# Display the updated DataFrame to verify the changes
print(df_balanced.head(5))


   Student ID  Ages (Years)_x  Gender_x  Grade_x  Speed_C  Number Stroke_C  \
0     10001.0        7.083333         0      1.0      165               28   
2     10003.0        6.416667         0      1.0      155               44   
3     10004.0        6.166667         0      1.0      128               89   
4     10005.0        6.916667         0      1.0      168               40   
5     10006.0        6.500000         0      1.0      124               81   

   Mean Pressure_C  Mean Altitude_C  Modulus Altitude_C  Mean Azimuth_C  \
0              474             1467                1520             579   
2              302             1160                1220             468   
3              240             1102                1080             559   
4              362             1133                1170             545   
5              274              850                 900             741   

   Modulus Azimuth_C  Speed_B  Number Stroke_B  Mean Pressure_B  \
0            

In [34]:
# Convert all columns of object type to numeric except 'Label Emotion_y'
columns_to_convert = df_balanced.select_dtypes(include=['object']).columns.tolist()

# Exclude 'Label Emotion_y' from the conversion process
columns_to_convert.remove('Label Emotion_y')

# Convert the remaining object columns to numeric, coercing errors (invalid parsing will be set to NaN)
for column in columns_to_convert:
    df_balanced[column] = pd.to_numeric(df_balanced[column], errors='coerce')

# Check the DataFrame info to ensure proper conversion
print(df_balanced.info())


<class 'pandas.core.frame.DataFrame'>
Index: 120 entries, 0 to 92
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Student ID              120 non-null    float64
 1   Ages (Years)_x          120 non-null    float64
 2   Gender_x                120 non-null    int64  
 3   Grade_x                 120 non-null    float64
 4   Speed_C                 120 non-null    int64  
 5   Number Stroke_C         120 non-null    int64  
 6   Mean Pressure_C         120 non-null    int64  
 7   Mean Altitude_C         120 non-null    int64  
 8   Modulus Altitude_C      120 non-null    int64  
 9   Mean Azimuth_C          120 non-null    int64  
 10  Modulus Azimuth_C       120 non-null    int64  
 11  Speed_B                 120 non-null    int64  
 12  Number Stroke_B         120 non-null    int64  
 13  Mean Pressure_B         120 non-null    int64  
 14  Mean Altitude_B         120 non-null    int64  


In [35]:
df_balanced.head(10)

Unnamed: 0,Student ID,Ages (Years)_x,Gender_x,Grade_x,Speed_C,Number Stroke_C,Mean Pressure_C,Mean Altitude_C,Modulus Altitude_C,Mean Azimuth_C,Modulus Azimuth_C,Speed_B,Number Stroke_B,Mean Pressure_B,Mean Altitude_B,Modulus Altitude_B,Mean Azimuth_B,Modulus Azimuth_B,Label Emotion_y,Fine Motor Skill Label
0,10001.0,7.083333,0,1.0,165,28,474,1467,1520,579,600,87,28,474,1467,1520,579,600,positive,1.0
2,10003.0,6.416667,0,1.0,155,44,302,1160,1220,468,460,116,50,318,1235,1240,501,480,negative,0.0
3,10004.0,6.166667,0,1.0,128,89,240,1102,1080,559,550,100,55,295,1073,1070,565,550,positive,1.0
4,10005.0,6.916667,0,1.0,168,40,362,1133,1170,545,530,247,44,441,1059,1010,519,520,negative,0.0
5,10006.0,6.5,0,1.0,124,81,274,850,900,741,780,76,62,318,857,900,692,710,positive,0.0
6,10007.0,6.083333,0,1.0,198,48,405,875,900,572,560,159,44,544,935,900,596,590,negative,0.0
7,10008.0,6.5,0,1.0,179,54,583,1134,1130,538,510,205,45,686,1135,1170,526,530,positive,1.0
8,10009.0,8.916667,1,2.0,109,58,523,1273,1330,645,650,135,35,552,933,900,664,670,positive,1.0
9,10010.0,8.583333,1,2.0,165,56,298,1104,1140,634,630,150,57,411,1130,1160,667,630,positive,0.0
11,10012.0,7.916667,0,2.0,120,55,538,803,780,586,580,114,24,575,971,930,542,520,positive,0.5


In [36]:
# Save the cleaned DataFrame to a CSV file
df_balanced.to_csv('Cleaned_Merged_Data.csv', index=False)

In [37]:
# Define features (X) and target (y)
X = df_balanced.drop(['Label Emotion_y','Student ID','Fine Motor Skill Label'], axis=1)  # Drop unnecessary columns
y = df_balanced['Label Emotion_y']  # Set the target variable

# Check the shapes of X and y
print("Features shape:", X.shape)
print("Target shape:", y.shape)


Features shape: (120, 17)
Target shape: (120,)


In [38]:
# Step 1: Split the data into training (90%) and testing (1%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Step 2: Scale the features using StandardScaler
scaler = StandardScaler()

# Fit the scaler on the training data and transform both training and test sets
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


# Random Forest Model

In [39]:
# Initialize Random Forest model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_scaled, y_train)

In [40]:
# Save the model to a file
with open('rf_model.joblib', 'wb') as model_file:
    joblib.dump(rf_model, model_file)

print("Random Forest model saved as 'rf_model.joblib'")


Random Forest model saved as 'rf_model.joblib'


Random Forest Model Accuracy: 0.9166666666666666

Classification Report:
              precision    recall  f1-score   support

     negatif       0.83      1.00      0.91         5
     positif       1.00      0.86      0.92         7

    accuracy                           0.92        12
    macro avg      0.92      0.93      0.92        12
    weighted avg   0.93      0.92      0.92        12


### Optimization
1. Hyperparameter Tuning with GridSearchCV

In [41]:
# Set up a parameter grid for GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 200],  # Number of trees
    'max_depth': [None, 10, 20, 30],  # Max depth of the tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split a node
    'min_samples_leaf': [1, 2, 4],  # Minimum number of samples required to be at a leaf node
    'bootstrap': [True, False]  # Whether bootstrap samples are used when building trees
}

# Initialize a RandomForestClassifier
rf = RandomForestClassifier(random_state=42)

# Set up GridSearchCV with 5-fold cross-validation
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

# Fit the GridSearchCV on the scaled training data
grid_search.fit(X_train_scaled, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print(f"Best Parameters: {best_params}")

# Train the Random Forest model with the best parameters
best_rf_model = grid_search.best_estimator_

# Make predictions
y_pred_rf = best_rf_model.predict(X_test_scaled)

# Evaluate the model
accuracy_rf = accuracy_score(y_test, y_pred_rf)
classification_report_rf = classification_report(y_test, y_pred_rf)

print(f"Optimized Random Forest Model Accuracy: {accuracy_rf}")
print(f"Classification Report:\n{classification_report_rf}")

# Save the optimized model
with open('optimized_rf_model.joblib', 'wb') as model_file:
    joblib.dump(best_rf_model, model_file)

print("Optimized Random Forest model saved as 'optimized_rf_model.joblib'")

Fitting 5 folds for each of 216 candidates, totalling 1080 fits
Best Parameters: {'bootstrap': True, 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}
Optimized Random Forest Model Accuracy: 1.0
Classification Report:
              precision    recall  f1-score   support

    negative       1.00      1.00      1.00         5
    positive       1.00      1.00      1.00         7

    accuracy                           1.00        12
   macro avg       1.00      1.00      1.00        12
weighted avg       1.00      1.00      1.00        12

Optimized Random Forest model saved as 'optimized_rf_model.joblib'
