# Feature Engineering

In [36]:
import pandas as pd
from sklearn.model_selection import train_test_split
import sys
import os

# Add the parent directory (the project root, per the original note) to the import path
# so that project-local modules can be imported from this notebook.
sys.path.append(os.path.abspath(".."))

In [37]:
# Load the cleaned dataset; the relative path is resolved against the current working directory
df = pd.read_csv("../data/clean/data.csv")

In [38]:
# Preview the first rows of the loaded frame
df.head()

Unnamed: 0,Video,Evento,PCB,Start_Frame,End_Frame,duracion
0,Abuse001_x264.mp4,Abuse,237,327,397,160
1,Abuse002_x264.mp4,Abuse,0,174,266,266
2,Abuse003_x264.mp4,Abuse,351,951,3614,3263
3,Abuse004_x264.mp4,Abuse,558,898,5779,5221
4,Abuse005_x264.mp4,Abuse,210,717,933,723


In [39]:
# Build the file path of every video: <dataset root>/<Evento>/<Video>.
# Vectorised string concatenation replaces the row-wise apply; astype(str) keeps the
# f-string behaviour of formatting whatever value each cell holds.
# NOTE(review): rows with Evento == 'Normal' also resolve under Anomaly-Videos/Normal/ --
# confirm that this matches the dataset's folder layout.
df['Path'] = (
    "../data/Anomaly-Detection-Dataset/Anomaly-Videos/"
    + df['Evento'].astype(str)
    + "/"
    + df['Video'].astype(str)
)

In [40]:
# Peek at the generated paths
df['Path'].head()

0    ../data/Anomaly-Detection-Dataset/Anomaly-Vide...
1    ../data/Anomaly-Detection-Dataset/Anomaly-Vide...
2    ../data/Anomaly-Detection-Dataset/Anomaly-Vide...
3    ../data/Anomaly-Detection-Dataset/Anomaly-Vide...
4    ../data/Anomaly-Detection-Dataset/Anomaly-Vide...
Name: Path, dtype: object

In [41]:
# Add the binary target column: 0 for 'Normal' videos, 1 for every other event type.
# A vectorised comparison replaces the element-wise apply; int64 matches the original dtype.
df['is_crime'] = df['Evento'].ne('Normal').astype('int64')

In [42]:
# Count the rows of each class and report how far apart the two counts are.
# The index of `values` holds the class labels (0 and 1), so .loc looks them up by label.
values = df['is_crime'].value_counts()
value = abs(values.loc[1] - values.loc[0])
print(f"Se necesita eliminar {value} registros para balancear el dataset")

Se necesita eliminar 6 registros para balancear el dataset


In [43]:
# Randomly drop rows from the majority class until both classes have the same size.
# - The surplus is recomputed here, so re-running this cell is a no-op once the data
#   is balanced (the original dropped another batch on every re-run).
# - The rows are taken from whichever class is larger, matching the absolute
#   difference reported above.
# - random_state makes the selection, and therefore every later split, reproducible.
class_counts = df['is_crime'].value_counts()
surplus = int(class_counts.max() - class_counts.min())
majority_rows = df[df['is_crime'] == class_counts.idxmax()]
df = df.drop(majority_rows.sample(n=surplus, random_state=42).index)

In [44]:
# Confirm the class counts after dropping rows
df['is_crime'].value_counts()

0    323
1    323
Name: is_crime, dtype: int64

In [45]:
# For 'Normal' videos, overwrite End_Frame with the value of duracion;
# every other row keeps its existing End_Frame.
# Series.mask does this column-wise instead of calling a lambda once per row.
df['End_Frame'] = df['End_Frame'].mask(df['Evento'] == 'Normal', df['duracion'])

In [46]:
# Check the result: show duracion next to End_Frame for the first 'Normal' rows
df.loc[df['Evento'] == 'Normal', ['duracion', 'End_Frame']].head()

Unnamed: 0,duracion,End_Frame
329,544,544
330,1663,1663
331,917,917
332,386,386
333,985,985


## Separación de train, test y validation

In [47]:
# Split into train (80%), validation (10%) and test (10%).
# First hold out 20% of the rows, then cut that hold-out in half; both splits are
# stratified on Evento and use the same fixed seed.
train_df, test_valid_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['Evento'],
)
valid_df, test_df = train_test_split(
    test_valid_df,
    test_size=0.5,
    random_state=42,
    stratify=test_valid_df['Evento'],
)

In [48]:
def count_events(df, name):
    """Print the percentage of rows per event type in a dataset.

    Args:
        df: DataFrame with an 'Evento' column.
        name: Label printed as a heading before the percentages.

    Returns:
        None. The heading, the percentages and a trailing blank line are printed.
    """
    percentages = df['Evento'].value_counts(normalize=True).mul(100)
    # A single print call emits the heading, the series and the blank separator line
    print(f"{name}:", percentages, sep="\n", end="\n\n")

In [49]:
# Print the event-type percentages of each split to compare them
count_events(train_df, "Train")
count_events(valid_df, "Valid")
count_events(test_df, "Test")

Train:
Normal         50.000000
Robbery        15.310078
Stealing        5.038760
Abuse           5.038760
Assault         4.844961
Fighting        4.844961
Burglary        4.263566
Arson           4.069767
Vandalism       2.713178
Shoplifting     2.325581
Shooting        1.550388
Name: Evento, dtype: float64

Valid:
Normal         49.230769
Robbery        15.384615
Abuse           6.153846
Assault         4.615385
Stealing        4.615385
Fighting        4.615385
Arson           4.615385
Burglary        3.076923
Vandalism       3.076923
Shoplifting     3.076923
Shooting        1.538462
Name: Evento, dtype: float64

Test:
Normal         50.769231
Robbery        15.384615
Fighting        6.153846
Stealing        6.153846
Assault         4.615385
Abuse           4.615385
Burglary        4.615385
Arson           3.076923
Shoplifting     1.538462
Shooting        1.538462
Vandalism       1.538462
Name: Evento, dtype: float64



In [50]:
# Inspect the test split
test_df

Unnamed: 0,Video,Evento,PCB,Start_Frame,End_Frame,duracion,Path,is_crime
149,Fighting047_x264.mp4,Fighting,82,384,1898,1816,../data/Anomaly-Detection-Dataset/Anomaly-Vide...,1
569,Normal_Videos359_x264.mp4,Normal,0,0,903,903,../data/Anomaly-Detection-Dataset/Anomaly-Vide...,0
598,Normal_Videos405_x264.mp4,Normal,0,0,1065,1065,../data/Anomaly-Detection-Dataset/Anomaly-Vide...,0
500,Normal_Videos263_x264.mp4,Normal,0,0,1800,1800,../data/Anomaly-Detection-Dataset/Anomaly-Vide...,0
67,Assault007_x264.mp4,Assault,330,668,1099,769,../data/Anomaly-Detection-Dataset/Anomaly-Vide...,1
...,...,...,...,...,...,...,...,...
480,Normal_Videos236_x264.mp4,Normal,0,0,3380,3380,../data/Anomaly-Detection-Dataset/Anomaly-Vide...,0
73,Assault015_x264.mp4,Assault,0,444,1395,1395,../data/Anomaly-Detection-Dataset/Anomaly-Vide...,1
63,Assault003_x264.mp4,Assault,148,381,1789,1641,../data/Anomaly-Detection-Dataset/Anomaly-Vide...,1
622,Normal_Videos435_x264.mp4,Normal,0,0,1148,1148,../data/Anomaly-Detection-Dataset/Anomaly-Vide...,0


In [52]:
# Drop the columns that will not be used from all three splits.
# NOTE: this cell is not re-runnable as is -- once the columns are gone, drop() raises KeyError.
columnas = ['duracion', 'Evento', 'Video']
train_df, valid_df, test_df = (
    split.drop(columns=columnas) for split in (train_df, valid_df, test_df)
)

In [53]:
# Preview the training split after dropping the unused columns
train_df.head()

Unnamed: 0,PCB,Start_Frame,End_Frame,Path,is_crime
433,0,0,1169,../data/Anomaly-Detection-Dataset/Anomaly-Vide...,0
146,143,295,1302,../data/Anomaly-Detection-Dataset/Anomaly-Vide...,1
413,0,0,4330,../data/Anomaly-Detection-Dataset/Anomaly-Vide...,0
522,0,0,4302,../data/Anomaly-Detection-Dataset/Anomaly-Vide...,0
231,0,140,535,../data/Anomaly-Detection-Dataset/Anomaly-Vide...,1


In [54]:
# Save each split to its own CSV file (without the index column).
# The output folder is created first so the cell also works when ../data/ml does not exist yet.
os.makedirs("../data/ml", exist_ok=True)
train_df.to_csv("../data/ml/train.csv", index=False)
valid_df.to_csv("../data/ml/valid.csv", index=False)
test_df.to_csv("../data/ml/test.csv", index=False)