In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv("2proctor_results.csv")

In [4]:
df

Unnamed: 0,H-Hand Detected,H-Distance,H-Illegal Objects,H-Prohibited Item,F-Hand Detected,F-Distance,F-Illegal Objects,F-Prohibited Item,num_faces,iris_pos,...,z_rotation,radial_distance,gaze_direction,gaze_zone,verification_result,Cheat Score,timestamp,face_is_cheating,hand_is_cheating,is_cheating
0,True,,0,[],False,,1,['headphone'],1,center,...,0,64803.772643,left,red,True,50,0-05-04-266437,0,0,0
1,True,,0,[],False,,1,['headphone'],1,center,...,0,64774.583423,left,red,True,50,0-04-11-966477,0,0,0
2,True,,0,[],True,,0,[],1,center,...,0,64789.866504,left,red,True,50,0-04-32-299795,0,0,0
3,True,,0,[],False,,0,[],0,,...,0,0.000000,,,False,50,0-03-59-833153,0,0,0
4,True,,0,[],False,,0,[],1,right,...,0,5083.970138,right,red,True,50,0-02-29-433221,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13270,True,,0,[],False,,0,[],1,center,...,0,64922.339720,left,red,True,50,0-02-23-599892,0,0,0
13271,True,,0,[],False,,0,[],0,,...,0,0.000000,,,False,50,0-03-37-033170,0,0,0
13272,True,,0,[],False,,0,[],1,center,...,0,64701.664751,left,red,False,50,0-06-00-599728,1,0,1
13273,False,,0,[],False,,0,[],0,,...,0,0.000000,,,False,50,0-05-00-633107,1,0,1


In [5]:
df.dtypes.sort_values()

H-Hand Detected           bool
verification_result       bool
F-Hand Detected           bool
face_is_cheating         int64
Cheat Score              int64
z_rotation               int64
y_rotation               int64
x_rotation               int64
hand_is_cheating         int64
is_cheating              int64
num_faces                int64
F-Illegal Objects        int64
H-Illegal Objects        int64
iris_ratio             float64
F-Distance             float64
radial_distance        float64
H-Distance             float64
mouth_area             float64
mouth_zone              object
F-Prohibited Item       object
gaze_direction          object
gaze_zone               object
H-Prohibited Item       object
timestamp               object
iris_pos                object
dtype: object

In [6]:
drop_cols = [
    "H-Hand Detected",
    "F-Hand Detected",
    "face_is_cheating",
    "Cheat Score",
    "hand_is_cheating",
]
df1 = df.drop(columns=drop_cols)

In [7]:
df1.dtypes.sort_values()

verification_result       bool
is_cheating              int64
H-Illegal Objects        int64
F-Illegal Objects        int64
z_rotation               int64
num_faces                int64
y_rotation               int64
x_rotation               int64
radial_distance        float64
mouth_area             float64
H-Distance             float64
iris_ratio             float64
F-Distance             float64
iris_pos                object
F-Prohibited Item       object
gaze_direction          object
gaze_zone               object
H-Prohibited Item       object
timestamp               object
mouth_zone              object
dtype: object

In [8]:
df1.verification_result = df1.verification_result.astype(np.int64)

In [9]:
df1.dtypes.sort_values()

is_cheating              int64
H-Illegal Objects        int64
verification_result      int64
F-Illegal Objects        int64
z_rotation               int64
num_faces                int64
y_rotation               int64
x_rotation               int64
radial_distance        float64
mouth_area             float64
H-Distance             float64
iris_ratio             float64
F-Distance             float64
iris_pos                object
F-Prohibited Item       object
gaze_direction          object
gaze_zone               object
H-Prohibited Item       object
timestamp               object
mouth_zone              object
dtype: object

In [10]:
def convert_timestamp(ts):
    parts = ts.split('-')
    
    if len(parts) == 4:  # Format: 0-01-06-066617
        _, hh, mm, sec = parts
    elif len(parts) == 3:  # Format: 0-07-13.866465
        _, hh, rest = parts
        mm, sec = rest.split('.')
    else:
        return None  # Handle unexpected formats

    # Convert to HH:MM:SS format
    return f"{int(hh):02}:{int(mm):02}:{int(float(sec)):02}"


In [11]:
df1["timestamp"] = pd.to_timedelta(df1['timestamp'].apply(convert_timestamp)).dt.total_seconds()

In [12]:
obj_cols = df1.select_dtypes(include=[object]).columns
obj_cols

Index(['H-Prohibited Item', 'F-Prohibited Item', 'iris_pos', 'mouth_zone',
       'gaze_direction', 'gaze_zone'],
      dtype='object')

In [13]:
df1["H-Prohibited Item"].unique()

array(['[]', "['cell phone']", "['watch']", "['cell phone', 'watch']",
       "['watch', 'cell phone']", "['watch', 'watch', 'cell phone']",
       "['cell phone', 'cell phone']", "['watch', 'sheet']",
       "['watch', 'watch']", "['closedbook']", "['sheet']"], dtype=object)

In [14]:
all_objects = {'cell phone', 'chits', 'closedbook', 'earpiece', 'headphone', 'openbook', 'sheet', 'watch'}

def one_hot_encode(row, all_objects):
    observed = set(row['F-Prohibited Item']) | set(row['H-Prohibited Item'])  # Merge both columns
    return {obj: int(obj in observed) for obj in all_objects}

In [15]:
one_hot_df = df1.apply(lambda row: one_hot_encode(row, all_objects), axis=1)

In [16]:
one_hot_df = pd.DataFrame(one_hot_df.tolist())

In [17]:
df2 = pd.concat([df1, one_hot_df], axis=1)

In [18]:
df2.drop(columns=['F-Prohibited Item', 'H-Prohibited Item', 'H-Illegal Objects', "F-Illegal Objects"], inplace=True)

In [19]:
df2

Unnamed: 0,H-Distance,F-Distance,num_faces,iris_pos,iris_ratio,mouth_zone,mouth_area,x_rotation,y_rotation,z_rotation,...,timestamp,is_cheating,chits,earpiece,cell phone,headphone,openbook,closedbook,watch,sheet
0,,,1,center,0.486753,YELLOW,365.0,0,0,0,...,284677.0,0,0,0,0,0,0,0,0,0
1,,,1,center,0.479780,GREEN,83.0,0,0,0,...,981537.0,0,0,0,0,0,0,0,0,0
2,,,1,center,0.426601,GREEN,131.5,0,0,0,...,316115.0,0,0,0,0,0,0,0,0,0
3,,,0,,0.000000,,0.0,0,0,0,...,847493.0,0,0,0,0,0,0,0,0,0
4,,,1,right,0.364729,GREEN,9.5,0,0,0,...,442161.0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13270,,,1,center,0.424701,GREEN,72.5,0,0,0,...,608472.0,0,0,0,0,0,0,0,0,0
13271,,,0,,0.000000,,0.0,0,0,0,...,46190.0,0,0,0,0,0,0,0,0,0
13272,,,1,center,0.428700,GREEN,87.0,0,0,0,...,621328.0,1,0,0,0,0,0,0,0,0
13273,,,0,,0.000000,,0.0,0,0,0,...,651107.0,1,0,0,0,0,0,0,0,0


In [20]:
mappings = {
    'iris_pos': {'center': 0, 'left': 1, 'right': 2},
    'mouth_zone': {'GREEN': 0, 'YELLOW': 1, 'ORANGE': 2, 'RED': 3},
    'gaze_direction': {'forward': 0, 'left': 1, 'right': 2, 'up': 3, 'down': 4},
    'gaze_zone': {'white': 0, 'yellow': 1, 'red': 2}
}

In [21]:
df2 = df2.replace(mappings)

  df2 = df2.replace(mappings)


In [22]:
df2.dtypes.sort_values()

sheet                    int64
closedbook               int64
num_faces                int64
openbook                 int64
headphone                int64
cell phone               int64
earpiece                 int64
x_rotation               int64
y_rotation               int64
z_rotation               int64
chits                    int64
watch                    int64
is_cheating              int64
verification_result      int64
H-Distance             float64
gaze_zone              float64
radial_distance        float64
mouth_area             float64
mouth_zone             float64
iris_ratio             float64
iris_pos               float64
F-Distance             float64
timestamp              float64
gaze_direction         float64
dtype: object

In [23]:
df2.isna().sum()

H-Distance             12072
F-Distance             13186
num_faces                  0
iris_pos                1507
iris_ratio                 0
mouth_zone              1507
mouth_area                 0
x_rotation                 0
y_rotation                 0
z_rotation                 0
radial_distance            0
gaze_direction          1507
gaze_zone               1507
verification_result        0
timestamp                  0
is_cheating                0
chits                      0
earpiece                   0
cell phone                 0
headphone                  0
openbook                   0
closedbook                 0
watch                      0
sheet                      0
dtype: int64

In [24]:
# nan mapping dictionary
nan_mappings = {
    'iris_pos': -1,
    'mouth_zone': -1,
    'gaze_direction': -1,
    'gaze_zone': -1,
    "H-Distance": 1000,
    "F-Distance": 1000,
}

In [25]:
df2.fillna(nan_mappings, inplace=True)

In [26]:
df2.is_cheating.value_counts()

is_cheating
0    8715
1    4560
Name: count, dtype: int64

In [27]:
df2.columns

Index(['H-Distance', 'F-Distance', 'num_faces', 'iris_pos', 'iris_ratio',
       'mouth_zone', 'mouth_area', 'x_rotation', 'y_rotation', 'z_rotation',
       'radial_distance', 'gaze_direction', 'gaze_zone', 'verification_result',
       'timestamp', 'is_cheating', 'chits', 'earpiece', 'cell phone',
       'headphone', 'openbook', 'closedbook', 'watch', 'sheet'],
      dtype='object')

In [32]:
columns = [
    'timestamp',
    'verification_result',
    'num_faces',
    'iris_pos', 
    'iris_ratio', 
    'mouth_zone', 
    'mouth_area',
    'x_rotation', 
    'y_rotation', 
    'z_rotation', 
    'radial_distance',
    'gaze_direction', 
    'gaze_zone',
    'watch', 
    'headphone', 
    'closedbook', 
    'earpiece', 
    'cell phone',
    'openbook', 
    'chits', 
    'sheet',
    'H-Distance',
    'F-Distance', 
    'is_cheating'
]

In [33]:
df3 = df2[columns]

In [34]:
df3.to_csv("proctor_results_final_cleaned.csv", index=False)

In [35]:
df3["is_cheating"].value_counts()

is_cheating
0    8715
1    4560
Name: count, dtype: int64

In [37]:
df3 = df3.sort_values(by='timestamp')

In [38]:
df3.head()

Unnamed: 0,timestamp,verification_result,num_faces,iris_pos,iris_ratio,mouth_zone,mouth_area,x_rotation,y_rotation,z_rotation,...,headphone,closedbook,earpiece,cell phone,openbook,chits,sheet,H-Distance,F-Distance,is_cheating
12535,33333.0,1,1,0.0,0.525789,0.0,8.5,0,0,0,...,0,0,0,0,0,0,0,283.53483,1000.0,0
5695,33393.0,1,1,0.0,0.522151,0.0,14.0,0,0,0,...,0,0,0,0,0,0,0,279.234668,1000.0,0
544,33452.0,1,1,0.0,0.511408,0.0,33.0,0,0,0,...,0,0,0,0,0,0,0,272.214989,1000.0,0
695,33511.0,1,1,0.0,0.538145,0.0,112.5,0,0,0,...,0,0,0,0,0,0,0,272.214989,1000.0,0
6505,33570.0,1,1,0.0,0.52216,0.0,27.0,0,0,0,...,0,0,0,0,0,0,0,79.611557,1000.0,0


In [39]:
df3.to_csv("proctor_results_final_cleaned.csv", index=False)