# Feature Engineering

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import sys
import os

# Agrega el path al directorio raíz del proyecto
sys.path.append(os.path.abspath(".."))

In [2]:
df = pd.read_csv("../data/clean/data.csv")

In [3]:
df.head()

Unnamed: 0,Video,Evento,PCB,Start_Frame,End_Frame,duracion
0,Abuse001_x264.mp4,Abuse,237,327,397,160
1,Abuse002_x264.mp4,Abuse,0,174,266,266
2,Abuse003_x264.mp4,Abuse,351,951,3614,3263
3,Abuse004_x264.mp4,Abuse,558,898,5779,5221
4,Abuse005_x264.mp4,Abuse,210,717,933,723


In [None]:
# Crear una columna con el path
df['Path'] = df.apply(lambSda row: f"../data/Anomaly-Detection-Dataset/Anomaly-Videos/{row['Evento']}/{row['Video']}", axis=1)

In [5]:
df['Path'].head()

0    ../data/Anomaly-Detection-Dataset/Anomaly-Vide...
1    ../data/Anomaly-Detection-Dataset/Anomaly-Vide...
2    ../data/Anomaly-Detection-Dataset/Anomaly-Vide...
3    ../data/Anomaly-Detection-Dataset/Anomaly-Vide...
4    ../data/Anomaly-Detection-Dataset/Anomaly-Vide...
Name: Path, dtype: object

In [6]:
# Añadir la columna is_crime
df['is_crime'] = df['Evento'].apply(lambda x: 0 if x == 'Normal' else 1)

In [7]:
values = df['is_crime'].value_counts()
value = abs(values[0] - values[1])
print(f"Se necesita eliminar {value} registros para balancear el dataset")

Se necesita eliminar 6 registros para balancear el dataset


In [8]:
# Elimina value aleatorias de las que no son normales
df = df.drop(df[df['is_crime'] == 1].sample(value).index)

In [9]:
df['is_crime'].value_counts()

0    323
1    323
Name: is_crime, dtype: int64

In [10]:
# En los videos normales poner de End_frame el valor de Duration
df['End_Frame'] = df.apply(lambda row: row['duracion'] if row['Evento'] == 'Normal' else row['End_Frame'], axis=1)

In [11]:
df[df['Evento'] == 'Normal'][['duracion', 'End_Frame']].head()

Unnamed: 0,duracion,End_Frame
329,544,544
330,1663,1663
331,917,917
332,386,386
333,985,985


## Separación de train, test y validation

In [12]:
# Dividir el dataset en train (80%), valid (10%), test (10%)
train_df, test_valid_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['Evento'])
valid_df, test_df = train_test_split(test_valid_df, test_size=0.5, random_state=42, stratify=test_valid_df['Evento'])

In [13]:
# Función para ver cuantos eventos hay en cada dataset en porcentaje
def count_events(df, name):
    print(f"{name}:")
    print(df['Evento'].value_counts(normalize=True) * 100)
    print()

In [14]:
count_events(train_df, "Train")
count_events(valid_df, "Valid")
count_events(test_df, "Test")

Train:
Normal         50.000000
Robbery        14.922481
Stealing        5.426357
Abuse           5.426357
Fighting        4.844961
Assault         4.844961
Arson           4.069767
Burglary        4.069767
Vandalism       2.713178
Shoplifting     2.325581
Shooting        1.356589
Name: Evento, dtype: float64

Valid:
Normal         49.230769
Robbery        13.846154
Abuse           6.153846
Stealing        4.615385
Burglary        4.615385
Assault         4.615385
Arson           4.615385
Fighting        4.615385
Shoplifting     3.076923
Vandalism       3.076923
Shooting        1.538462
Name: Evento, dtype: float64

Test:
Normal         50.769231
Robbery        15.384615
Stealing        6.153846
Assault         6.153846
Abuse           4.615385
Burglary        4.615385
Fighting        4.615385
Arson           3.076923
Vandalism       1.538462
Shoplifting     1.538462
Shooting        1.538462
Name: Evento, dtype: float64



In [15]:
test_df

Unnamed: 0,Video,Evento,PCB,Start_Frame,End_Frame,duracion,Path,is_crime
151,Fighting051_x264.mp4,Fighting,989,1294,3355,2366,../data/Anomaly-Detection-Dataset/Anomaly-Vide...,1
380,Normal_Videos086_x264.mp4,Normal,0,0,3391,3391,../data/Anomaly-Detection-Dataset/Anomaly-Vide...,0
598,Normal_Videos405_x264.mp4,Normal,0,0,1065,1065,../data/Anomaly-Detection-Dataset/Anomaly-Vide...,0
414,Normal_Videos134_x264.mp4,Normal,0,0,4476,4476,../data/Anomaly-Detection-Dataset/Anomaly-Vide...,0
67,Assault007_x264.mp4,Assault,330,668,1099,769,../data/Anomaly-Detection-Dataset/Anomaly-Vide...,1
...,...,...,...,...,...,...,...,...
636,Normal_Videos456_x264.mp4,Normal,0,0,3943,3943,../data/Anomaly-Detection-Dataset/Anomaly-Vide...,0
72,Assault013_x264.mp4,Assault,465,615,1022,557,../data/Anomaly-Detection-Dataset/Anomaly-Vide...,1
89,Assault047_x264.mp4,Assault,0,112,1790,1790,../data/Anomaly-Detection-Dataset/Anomaly-Vide...,1
390,Normal_Videos101_x264.mp4,Normal,0,0,1538,1538,../data/Anomaly-Detection-Dataset/Anomaly-Vide...,0


In [16]:
# Quiero que copies los archivos de test en una carpeta de test
import shutil

# Funcion para crear la carpeta de test, valid y train
def create_folder(folder_name):
    if not os.path.exists(folder_name):
        os.makedirs(folder_name)
    else:
        # Si la carpeta ya existe, eliminarla
        shutil.rmtree(folder_name)
        print(f"La carpeta {folder_name} ya existe.")
        # Crear la carpeta de nuevo
        os.makedirs(folder_name)
# Crear carpetas
create_folder("../data/test")
create_folder("../data/train")
create_folder("../data/valid")
# Copiar los archivos de test
for index, row in test_df.iterrows():
    shutil.copy(row['Path'], f"../data/test/{row['Video']}")
    # Cambia el path 
    test_df.at[index, 'Path'] = f"../data/test/{row['Video']}"

print(test_df.head(5))
input()
# Copiar los archivos de train
for index, row in train_df.iterrows():
    shutil.copy(row['Path'], f"../data/train/{row['Video']}")
    # Cambia el path
    train_df.at[index, 'Path'] = f"../data/train/{row['Video']}"
# Copiar los archivos de valid
for index, row in valid_df.iterrows():
    shutil.copy(row['Path'], f"../data/valid/{row['Video']}")
    # Cambia el path
    valid_df.at[index, 'Path'] = f"../data/valid/{row['Video']}"

                         Video    Evento  PCB  Start_Frame  End_Frame  \
151       Fighting051_x264.mp4  Fighting  989         1294       3355   
380  Normal_Videos086_x264.mp4    Normal    0            0       3391   
598  Normal_Videos405_x264.mp4    Normal    0            0       1065   
414  Normal_Videos134_x264.mp4    Normal    0            0       4476   
67         Assault007_x264.mp4   Assault  330          668       1099   

     duracion                                    Path  is_crime  
151      2366       ../data/test/Fighting051_x264.mp4         1  
380      3391  ../data/test/Normal_Videos086_x264.mp4         0  
598      1065  ../data/test/Normal_Videos405_x264.mp4         0  
414      4476  ../data/test/Normal_Videos134_x264.mp4         0  
67        769        ../data/test/Assault007_x264.mp4         1  


In [17]:
"""# Eliminar columnas que no se van a usar
columnas = ['duracion', 'Evento', 'Video']
train_df = train_df.drop(columns=columnas)
valid_df = valid_df.drop(columns=columnas)
test_df = test_df.drop(columns=columnas)"""

"# Eliminar columnas que no se van a usar\ncolumnas = ['duracion', 'Evento', 'Video']\ntrain_df = train_df.drop(columns=columnas)\nvalid_df = valid_df.drop(columns=columnas)\ntest_df = test_df.drop(columns=columnas)"

In [19]:
valid_df.head()

Unnamed: 0,Video,Evento,PCB,Start_Frame,End_Frame,duracion,Path,is_crime
565,Normal_Videos355_x264.mp4,Normal,0,0,888,888,../data/valid/Normal_Videos355_x264.mp4,0
201,Robbery060_x264.mp4,Robbery,0,98,2106,2106,../data/valid/Robbery060_x264.mp4,1
341,Normal_Videos020_x264.mp4,Normal,0,0,485,485,../data/valid/Normal_Videos020_x264.mp4,0
316,Vandalism015_x264.mp4,Vandalism,1385,2039,2366,981,../data/valid/Vandalism015_x264.mp4,1
63,Assault003_x264.mp4,Assault,148,381,1789,1641,../data/valid/Assault003_x264.mp4,1


In [20]:
# Guardar los dataframes en csv
train_df.to_csv("../data/ml/train.csv", index=False)
valid_df.to_csv("../data/ml/valid.csv", index=False)
test_df.to_csv("../data/ml/test.csv", index=False)