# DATA DE TRAINING

## Cargando features de video de Train

In [2]:
# Directorio donde están almacenados los archivos .npy
import os
import numpy as np

# Obtener el directorio de trabajo actual
current_dir = os.getcwd()

# Definir el subdirectorio de características relativo al directorio de trabajo actual
feature_dir = os.path.join(current_dir, 'train_subset_features')

# Listar todos los archivos .npy en el subdirectorio
feature_files = [f for f in os.listdir(feature_dir) if f.endswith('.npy')]



# Cargar y procesar cada archivo .npy
all_data_video = []
all_names_video= []
for file_name in feature_files:
    file_path = os.path.join(feature_dir, file_name)
    data = np.load(file_path)
    if data.shape == (0,):
        print(f'Skipping {file_name} because it is empty.')
    elif data.shape[1] == 512:
        print(f'Loaded {file_name[0:11]} with shape: {data.shape}')
        all_data_video.append(data)
        all_names_video.append(file_name[0:11])
    else:
        print(f'Skipping {file_name} due to incorrect shape: {data.shape}')

print(f'Loaded {len(all_data_video)} files.')


Loaded KEEaIcb92EI with shape: (18, 512)
Loaded keHoZJf3t2U with shape: (15, 512)
Loaded keK3qv5Nljc with shape: (15, 512)
Loaded KeMyLHQiTCI with shape: (18, 512)
Loaded kenEvdqCrL8 with shape: (18, 512)
Loaded KEnS0kYSsAY with shape: (18, 512)
Loaded KeOkjMaA4F8 with shape: (18, 512)
Loaded keR89NmDqUM with shape: (15, 512)
Loaded kETFtgRjueU with shape: (15, 512)
Loaded kEUk24tUH-c with shape: (6, 512)
Loaded kEUKP5OFbOk with shape: (18, 512)
Loaded Kf-sdMllGps with shape: (12, 512)
Loaded Kf5xVWGbJbQ with shape: (5, 512)
Loaded Kf8qJ4OBRuA with shape: (15, 512)
Loaded KfC9-6vnQks with shape: (17, 512)
Loaded kfd1wOdsh8k with shape: (18, 512)
Loaded KFDz2FXrL-4 with shape: (18, 512)
Loaded kfKK1tDOVKY with shape: (18, 512)
Loaded kFU7J-VIzAg with shape: (18, 512)
Loaded kg2U6Rv0tkY with shape: (13, 512)
Loaded KGbqEyo02ck with shape: (15, 512)
Loaded kgIk22yks0k with shape: (8, 512)
Loaded KgKnbf13xtU with shape: (15, 512)
Loaded KGkNULmOvOQ with shape: (14, 512)
Loaded KgmJrAGREEM 

## Cargando Features de sonido de Train

In [3]:
import os
import pandas as pd

# Obtener el directorio de trabajo actual
current_dir = os.getcwd()

# Definir el subdirectorio de características relativo al directorio de trabajo actual
feature_dir = os.path.join(current_dir, 'train_csv')

# Listar todos los archivos .csv en el subdirectorio
feature_files = [f for f in os.listdir(feature_dir) if f.endswith('.csv')]

# Inicializar listas para almacenar los datos y nombres de los archivos
all_data_sound = []
all_names_sound = []

# Cargar y procesar cada archivo .csv
for file_name in feature_files:
    file_path = os.path.join(feature_dir, file_name)
    try:
        data = pd.read_csv(file_path)
        if data.empty:
            print(f'Skipping {file_name} because it is empty.')
        elif data.shape[1] == 128:
            print(f'Loaded {file_name[0:11]} with shape: {data.shape}')
            all_data_sound.append(data)
            all_names_sound.append(file_name[0:11])
        else:
            print(f'Skipping {file_name} due to incorrect shape: {data.shape}')
    except pd.errors.EmptyDataError:
        print(f'Skipping {file_name} because it is empty or unreadable.')
    except Exception as e:
        print(f'Error loading {file_name}: {e}')

print(f'Loaded {len(all_data_sound)} files.')


Loaded --gx7yb1-x0 with shape: (10, 128)
Loaded --Ntf6n-j9Q with shape: (10, 128)
Loaded -07Ke73N4zI with shape: (10, 128)
Loaded -1MXpPymXFU with shape: (10, 128)
Loaded -2KvnLMnrA0 with shape: (4, 128)
Loaded -36efvC2K54 with shape: (10, 128)
Loaded -3tVVBhz8-o with shape: (10, 128)
Loaded -5oULXqj45c with shape: (5, 128)
Loaded -5s-IR39XaY with shape: (7, 128)
Loaded -5Vx7UtZpzk with shape: (7, 128)
Loaded -5ZUMpdBPyw with shape: (10, 128)
Loaded -6d7xfYKptE with shape: (10, 128)
Loaded -6fbWHuoX1s with shape: (10, 128)
Loaded -6usjfP8hys with shape: (10, 128)
Loaded -6zh0xWFWT4 with shape: (10, 128)
Loaded -8S87U-FVJI with shape: (10, 128)
Loaded -9oHBPWsXqg with shape: (10, 128)
Loaded -AFUY-wuzdU with shape: (9, 128)
Loaded -B9lBd4S48o with shape: (5, 128)
Loaded -b9VMb0QwtY with shape: (10, 128)
Loaded -bN9HFn0Lng with shape: (10, 128)
Loaded -BptOURMao8 with shape: (10, 128)
Loaded -BtQqYdMQU8 with shape: (10, 128)
Loaded -c5ZxVi0ZME with shape: (10, 128)
Loaded -cARmC7Jdx0 wit

## Trabajando con la media para cada dato

In [5]:
data_sound_train = []

for i in range(len(all_data_sound)):
    meandata = np.mean(all_data_sound[i], axis=0) 
    data_sound_train.append(meandata)

# convertir a numpy array
data_sound_train = np.array(data_sound_train)
print(data_sound_train.shape)         # datanueva con la media de cada video
print(len(all_names_sound))           # nombres de los videos


# juntar los nombres de los videos con las medias en un pandas
import pandas as pd
df_sound = pd.DataFrame(data_sound_train)
df_sound['video'] = all_names_sound

(10681, 128)
10681


In [6]:
# SACANDO MEDIA PARA CADA DATO DE all_data
# Esto se hace para que nuestra data sea más manejable
data_video_train = []

for i in range(len(all_data_video)):
    meandata = np.mean(all_data_video[i], axis=0) 
    data_video_train.append(meandata)

# convertir a numpy array
data_video_train = np.array(data_video_train)
print(data_video_train.shape)         # datanueva con la media de cada video
print(len(all_names_video))           # nombres de los videos

# juntar los nombres de los videos con las medias en un pandas
import pandas as pd
df = pd.DataFrame(data_video_train)
df['video'] = all_names_video

(5351, 512)
5351


### Uniendo ambos dataframe
- Para sacar la data final de entrenamiento

In [7]:
df_final = pd.merge(df, df_sound, on='video')
data_train = df_final.drop(columns=['video']).to_numpy()
print(data_train.shape)   

(5345, 640)


## ENTRENANDO EL DATA_TRAIN CON KMEANS

In [46]:
# Aplicar KMeans
from modelos.Kmeans import KMeans
kmeans = KMeans(num_cluster=20,random_state=42)
labels = kmeans.fit(data_train)
print(len(labels))

iteración 0
iteración 1
iteración 2
iteración 3
iteración 4
iteración 5
iteración 6
iteración 7
iteración 8
iteración 9
iteración 10
iteración 11
iteración 12
iteración 13
iteración 14
iteración 15
iteración 16
iteración 17
iteración 18
iteración 19
iteración 20
iteración 21
iteración 22
iteración 23
iteración 24
iteración 25
iteración 26
iteración 27
iteración 28
iteración 29
iteración 30
iteración 31
Convergió en la iteración 31
centroides_last [[1.28216493 1.98558655 0.43727291 ... 0.         0.14379167 0.06281302]
 [0.31810754 0.89295488 0.45523053 ... 0.01612037 0.11529065 0.19745432]
 [1.15433201 2.11048382 0.71126151 ... 0.02816885 0.19238459 0.07443249]
 ...
 [0.63474856 0.83507746 0.61909735 ... 0.06783786 0.21041288 0.07272223]
 [0.74881124 0.63317357 0.50022822 ... 0.04022814 0.09678558 0.04562992]
 [0.76331626 0.68929985 1.43094392 ... 0.08007036 0.12720981 0.21657801]]
centroides_new [[1.28216493 1.98558655 0.43727291 ... 0.         0.14379167 0.06281302]
 [0.31810754 0.89

### Concatenando los  **label predicho** con su respectivo **label real** al df_final

In [9]:
# añadiendo los label predichos al dataframe
df_final['predic_label'] = labels

# cargar las etiquetas de los videos
categorical_labels = pd.read_csv("train_subset.csv")
categorical_labels.set_index("youtube_id")
labels_dict = categorical_labels.set_index('youtube_id')['label'].to_dict() 
categorical_labels = []

for names in df_final['video']:
    categorical_labels.append(labels_dict[names])

df_final['real_label'] = categorical_labels    

true_labels = df_final['real_label']
pred_labels = df_final['predic_label']

## METRICAS PARA LA DATA DE TRAINING

In [11]:
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.metrics.cluster import adjusted_rand_score ,adjusted_mutual_info_score
s_core = silhouette_score(data_train,pred_labels)
ari = adjusted_rand_score(true_labels, pred_labels)
ami = adjusted_mutual_info_score(true_labels, pred_labels)
print('----------------------------------------')
print(f"Silhouette Score: {s_core}")
print('----------------------------------------')
print(f"Adjusted Rand Index: {ari}")
print('----------------------------------------')
print(f"Mutual Info Index: {ami}")


----------------------------------------
Silhouette Score: 0.14204064467734603
----------------------------------------
Adjusted Rand Index: 0.8071472855769588
----------------------------------------
Mutual Info Index: 0.926789769392992


# DATA DE VALIDATION

## Cargando features de video de Validation

In [3]:
# Directorio donde están almacenados los archivos .npy
import os
import numpy as np

# Obtener el directorio de trabajo actual
current_dir = os.getcwd()

# Definir el subdirectorio de características relativo al directorio de trabajo actual
feature_dir = os.path.join(current_dir, 'val_subset_features')

# Listar todos los archivos .npy en el subdirectorio
feature_files = [f for f in os.listdir(feature_dir) if f.endswith('.npy')]



# Cargar y procesar cada archivo .npy
all_data_video = []
all_names_video= []
for file_name in feature_files:
    file_path = os.path.join(feature_dir, file_name)
    data = np.load(file_path)
    if data.shape == (0,):
        print(f'Skipping {file_name} because it is empty.')
    elif data.shape[1] == 512:
        print(f'Loaded {file_name[0:11]} with shape: {data.shape}')
        all_data_video.append(data)
        all_names_video.append(file_name[0:11])
    else:
        print(f'Skipping {file_name} due to incorrect shape: {data.shape}')

print(f'Loaded {len(all_data_video)} files.')


Loaded --33Lscn6sk with shape: (11, 512)
Loaded -0dYbFhZsGU with shape: (15, 512)
Loaded -0WZKTu0xNk with shape: (18, 512)
Loaded -1DARulHLgw with shape: (18, 512)
Loaded -1QTRLQSzhQ with shape: (18, 512)
Loaded -2VKVjgNuE0 with shape: (11, 512)
Loaded -2VXhGGeOWg with shape: (18, 512)
Loaded -2zDnjMmI5U with shape: (10, 512)
Loaded -3cPJnxtl7U with shape: (15, 512)
Loaded -3oo45vpQV4 with shape: (15, 512)
Loaded -4JdZpx3zNk with shape: (18, 512)
Loaded -514AQUrSow with shape: (18, 512)
Loaded -5fnpyU2iE0 with shape: (15, 512)
Loaded -5fqwdtpSOY with shape: (9, 512)
Loaded -65aI53dvdE with shape: (18, 512)
Loaded -7E9WiX7QfA with shape: (18, 512)
Loaded -94nreFhQRg with shape: (12, 512)
Loaded -94oNUNdpQs with shape: (18, 512)
Loaded -9ILBd-ArtM with shape: (18, 512)
Loaded -B-GH43bTjk with shape: (18, 512)
Loaded -bH-BrGmJnM with shape: (18, 512)
Loaded -CaRbdIFhjI with shape: (18, 512)
Loaded -dvwA0Hzj6s with shape: (18, 512)
Loaded -he92lx6vH0 with shape: (9, 512)
Loaded -hjVL6WNC00

## Cargando Features de sonido de Validation

In [4]:
# Directorio donde están almacenados los archivos .npy
import os
import numpy as np

# Obtener el directorio de trabajo actual
current_dir = os.getcwd()

# Definir el subdirectorio de características relativo al directorio de trabajo actual
feature_dir = os.path.join(current_dir, 'val_subset_features_sound')

# Listar todos los archivos .npy en el subdirectorio
feature_files = [f for f in os.listdir(feature_dir) if f.endswith('.npy')]



# Cargar y procesar cada archivo .npy
all_data_sound = []
all_names_sound= []
for file_name in feature_files:
    file_path = os.path.join(feature_dir, file_name)
    data = np.load(file_path)
    if data.shape == (0,):
        print(f'Skipping {file_name} because it is empty.')
    elif data.shape[1] == 128:
        print(f'Loaded {file_name[0:11]} with shape: {data.shape}')
        all_data_sound.append(data)
        all_names_sound.append(file_name[0:11])
    else:
        print(f'Skipping {file_name} due to incorrect shape: {data.shape}')

print(f'Loaded {len(all_data_sound)} files.')


Loaded --33Lscn6sk with shape: (6, 128)
Loaded -0dYbFhZsGU with shape: (10, 128)
Loaded -0WZKTu0xNk with shape: (10, 128)
Loaded -1DARulHLgw with shape: (10, 128)
Loaded -1QTRLQSzhQ with shape: (10, 128)
Loaded -2VKVjgNuE0 with shape: (10, 128)
Loaded -2VXhGGeOWg with shape: (10, 128)
Loaded -2zDnjMmI5U with shape: (5, 128)
Loaded -3cPJnxtl7U with shape: (10, 128)
Loaded -3oo45vpQV4 with shape: (10, 128)
Loaded -4JdZpx3zNk with shape: (10, 128)
Loaded -514AQUrSow with shape: (10, 128)
Loaded -5fnpyU2iE0 with shape: (10, 128)
Loaded -5fqwdtpSOY with shape: (10, 128)
Loaded -65aI53dvdE with shape: (10, 128)
Loaded -7E9WiX7QfA with shape: (10, 128)
Loaded -94nreFhQRg with shape: (6, 128)
Loaded -94oNUNdpQs with shape: (10, 128)
Loaded -9ILBd-ArtM with shape: (10, 128)
Loaded -B-GH43bTjk with shape: (10, 128)
Loaded -bH-BrGmJnM with shape: (10, 128)
Loaded -CaRbdIFhjI with shape: (10, 128)
Loaded -dvwA0Hzj6s with shape: (10, 128)
Loaded -he92lx6vH0 with shape: (10, 128)
Loaded -hjVL6WNC00 

## Trabajando con la media para cada dato

In [5]:
data_sound_val = []

for i in range(len(all_data_sound)):
    meandata = np.mean(all_data_sound[i], axis=0) 
    data_sound_val.append(meandata)

# convertir a numpy array
data_sound_val = np.array(data_sound_val)
print(data_sound_val.shape)         # datanueva con la media de cada video
print(len(all_names_sound))           # nombres de los videos


# juntar los nombres de los videos con las medias en un pandas
import pandas as pd
df_sound = pd.DataFrame(data_sound_val)
df_sound['video'] = all_names_sound

(856, 128)
856


In [6]:
# SACANDO MEDIA PARA CADA DATO DE all_data
# Esto se hace para que nuestra data sea más manejable
data_video_val = []

for i in range(len(all_data_video)):
    meandata = np.mean(all_data_video[i], axis=0) 
    data_video_val.append(meandata)

# convertir a numpy array
data_video_val = np.array(data_video_val)
print(data_video_val.shape)         # datanueva con la media de cada video
print(len(all_names_video))           # nombres de los videos

# juntar los nombres de los videos con las medias en un pandas
import pandas as pd
df = pd.DataFrame(data_video_val)
df['video'] = all_names_video

(855, 512)
855


### Uniendo ambos dataframe
- Para sacar la data final de validacion

In [7]:
df_final = pd.merge(df, df_sound, on='video')
data_val = df_final.drop(columns=['video']).to_numpy()
print(data_val.shape)

(855, 640)


## ENTRENANDO EL DATA_VAL CON KMEANS

In [8]:
# Aplicar KMeans
from modelos.Kmeans import KMeans
kmeans = KMeans(num_cluster=20,random_state=42)
labels = kmeans.fit(data_val)
print(len(labels))

iteración 0
iteración 1
iteración 2
iteración 3
iteración 4
iteración 5
iteración 6
iteración 7
iteración 8
iteración 9
iteración 10
iteración 11
iteración 12
iteración 13
iteración 14
iteración 15
iteración 16
Convergió en la iteración 16
centroides_last [[1.20376829 0.60128281 0.58394045 ... 0.07197315 0.13049545 0.07513471]
 [0.68344957 0.47488022 0.45087259 ... 0.03084506 0.1658038  0.05398273]
 [0.79443945 0.95319586 0.87969659 ... 0.09519648 0.14931027 0.11939387]
 ...
 [0.76099462 0.54785611 0.43700227 ... 0.         0.07709412 0.20692129]
 [0.67258089 0.56285343 1.05420226 ... 0.07204028 0.14947244 0.28364918]
 [0.7806078  0.65109981 0.40828097 ... 0.02570315 0.08134504 0.06610208]]
centroides_new [[1.20376829 0.60128281 0.58394045 ... 0.07197315 0.13049545 0.07513471]
 [0.68344957 0.47488022 0.45087259 ... 0.03084506 0.1658038  0.05398273]
 [0.79443945 0.95319586 0.87969659 ... 0.09519648 0.14931027 0.11939387]
 ...
 [0.76099462 0.54785611 0.43700227 ... 0.         0.07709412 

### Concatenando los  **label predicho** con su respectivo **label real** al df_final

In [9]:
# añadiendo los label predichos al dataframe
df_final['predic_label'] = labels

# cargar las etiquetas de los videos
categorical_labels = pd.read_csv("val.csv")
categorical_labels.set_index("youtube_id")
labels_dict = categorical_labels.set_index('youtube_id')['label'].to_dict() 
categorical_labels = []

for names in df_final['video']:
    categorical_labels.append(labels_dict[names])

df_final['real_label'] = categorical_labels    

true_labels = df_final['real_label']
pred_labels = df_final['predic_label']

## METRICAS PARA LA DATA DE VALIDATION

In [11]:
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.metrics.cluster import adjusted_rand_score ,adjusted_mutual_info_score
s_core = silhouette_score(data_val,pred_labels)
ari = adjusted_rand_score(true_labels, pred_labels)
ami = adjusted_mutual_info_score(true_labels, pred_labels)
print('----------------------------------------')
print(f"Silhouette Score: {s_core}")
print('----------------------------------------')
print(f"Adjusted Rand Index: {ari}")
print('----------------------------------------')
print(f"Mutual Info Index: {ami}")


----------------------------------------
Silhouette Score: 0.10033787411343716
----------------------------------------
Adjusted Rand Index: 0.5713986148484494
----------------------------------------
Mutual Info Index: 0.7583126590720795


# DATA TEST

## Cargando features de video de Test

In [12]:
# Directorio donde están almacenados los archivos .npy
import os
import numpy as np

# Obtener el directorio de trabajo actual
current_dir = os.getcwd()

# Definir el subdirectorio de características relativo al directorio de trabajo actual
feature_dir = os.path.join(current_dir, 'test_subset_features')

# Listar todos los archivos .npy en el subdirectorio
feature_files = [f for f in os.listdir(feature_dir) if f.endswith('.npy')]



# Cargar y procesar cada archivo .npy
all_data_video = []
all_names_video= []
for file_name in feature_files:
    file_path = os.path.join(feature_dir, file_name)
    data = np.load(file_path)
    if data.shape == (0,):
        print(f'Skipping {file_name} because it is empty.')
    elif data.shape[1] == 512:
        print(f'Loaded {file_name[0:11]} with shape: {data.shape}')
        all_data_video.append(data)
        all_names_video.append(file_name[0:11])
    else:
        print(f'Skipping {file_name} due to incorrect shape: {data.shape}')

print(f'Loaded {len(all_data_video)} files.')


Loaded -3T2VPGo2jI with shape: (15, 512)
Loaded -7Wz9S-ZZz4 with shape: (15, 512)
Loaded -a1f1ikmdd8 with shape: (18, 512)
Loaded -AZMzKjls-k with shape: (15, 512)
Loaded -bGVnGCy2yY with shape: (18, 512)
Loaded -BrNqmFy-6Y with shape: (9, 512)
Loaded -CsBjy8T-II with shape: (18, 512)
Loaded -eOxQpTp5zM with shape: (18, 512)
Loaded -FhZ1JPZYl0 with shape: (11, 512)
Loaded -fyfEc1C5Hc with shape: (18, 512)
Loaded -IdatujDsqA with shape: (18, 512)
Loaded -J8jfO0Fp6M with shape: (18, 512)
Loaded -JdfkIKg0w4 with shape: (18, 512)
Loaded -Npm2udxeAE with shape: (18, 512)
Loaded -pBHLSZrn3k with shape: (11, 512)
Loaded -RCv-107T5w with shape: (15, 512)
Loaded -rhOaNjvwWo with shape: (9, 512)
Loaded -rhOXmTJT1M with shape: (18, 512)
Loaded -sODNa60qVI with shape: (18, 512)
Loaded -W2Pcut-jEU with shape: (18, 512)
Loaded -WAMHNq45kY with shape: (18, 512)
Loaded -WKQMew6YXw with shape: (18, 512)
Loaded -XI64LgAgxM with shape: (18, 512)
Loaded -XzpQxfVypI with shape: (18, 512)
Loaded -Yn59LWpYdQ

## Cargando Features de sonido de Test

In [13]:
# Directorio donde están almacenados los archivos .npy
import os
import numpy as np

# Obtener el directorio de trabajo actual
current_dir = os.getcwd()

# Definir el subdirectorio de características relativo al directorio de trabajo actual
feature_dir = os.path.join(current_dir, 'test_subset_features_sound')

# Listar todos los archivos .npy en el subdirectorio
feature_files = [f for f in os.listdir(feature_dir) if f.endswith('.npy')]



# Cargar y procesar cada archivo .npy
all_data_sound = []
all_names_sound= []
for file_name in feature_files:
    file_path = os.path.join(feature_dir, file_name)
    data = np.load(file_path)
    if data.shape == (0,):
        print(f'Skipping {file_name} because it is empty.')
    elif data.shape[1] == 128:
        print(f'Loaded {file_name[0:11]} with shape: {data.shape}')
        all_data_sound.append(data)
        all_names_sound.append(file_name[0:11])
    else:
        print(f'Skipping {file_name} due to incorrect shape: {data.shape}')

print(f'Loaded {len(all_data_sound)} files.')


Loaded -3T2VPGo2jI with shape: (10, 128)
Loaded -7Wz9S-ZZz4 with shape: (10, 128)
Loaded -a1f1ikmdd8 with shape: (10, 128)
Loaded -AZMzKjls-k with shape: (10, 128)
Loaded -bGVnGCy2yY with shape: (10, 128)
Loaded -BrNqmFy-6Y with shape: (10, 128)
Loaded -CsBjy8T-II with shape: (10, 128)
Loaded -eOxQpTp5zM with shape: (10, 128)
Loaded -FhZ1JPZYl0 with shape: (6, 128)
Loaded -fyfEc1C5Hc with shape: (10, 128)
Loaded -IdatujDsqA with shape: (10, 128)
Loaded -J8jfO0Fp6M with shape: (10, 128)
Loaded -JdfkIKg0w4 with shape: (10, 128)
Loaded -Npm2udxeAE with shape: (10, 128)
Loaded -pBHLSZrn3k with shape: (6, 128)
Loaded -RCv-107T5w with shape: (10, 128)
Loaded -rhOaNjvwWo with shape: (10, 128)
Loaded -rhOXmTJT1M with shape: (10, 128)
Loaded -sODNa60qVI with shape: (10, 128)
Loaded -W2Pcut-jEU with shape: (10, 128)
Loaded -WAMHNq45kY with shape: (10, 128)
Loaded -WKQMew6YXw with shape: (10, 128)
Loaded -XI64LgAgxM with shape: (10, 128)
Loaded -XzpQxfVypI with shape: (10, 128)
Loaded -Yn59LWpYdQ

## Trabajando con la media para cada dato

In [14]:
data_sound_test = []

for i in range(len(all_data_sound)):
    meandata = np.mean(all_data_sound[i], axis=0) 
    data_sound_test.append(meandata)

# convertir a numpy array
data_sound_test = np.array(data_sound_test)
print(data_sound_test.shape)         # datanueva con la media de cada video
print(len(all_names_sound))           # nombres de los videos


# juntar los nombres de los videos con las medias en un pandas
import pandas as pd
df_sound = pd.DataFrame(data_sound_test)
df_sound['video'] = all_names_sound

(1622, 128)
1622


In [15]:
# SACANDO MEDIA PARA CADA DATO DE all_data
# Esto se hace para que nuestra data sea más manejable
data_video_test = []

for i in range(len(all_data_video)):
    meandata = np.mean(all_data_video[i], axis=0) 
    data_video_test.append(meandata)

# convertir a numpy array
data_video_test = np.array(data_video_test)
print(data_video_test.shape)         # datanueva con la media de cada video
print(len(all_names_video))           # nombres de los videos

# juntar los nombres de los videos con las medias en un pandas
import pandas as pd
df = pd.DataFrame(data_video_test)
df['video'] = all_names_video

(1626, 512)
1626


### Uniendo ambos dataframe
- Para sacar la data final de test

In [16]:
# concatenar los dos dataframes por la columna 'video' 
df_final = pd.merge(df, df_sound, on='video',how='left')
# los videos de df que no estan en df_sound poner las 128 columnas en 0
# rellenar los valores nan con 0
df_final.fillna(0, inplace=True)

# extrar en un numpy todos los datos menos la columna 'video'
data_test = df_final.drop(columns=['video']).to_numpy()
print(data_test.shape)   

(1626, 640)


## ENTRENANDO EL DATA_TEST CON KMEANS

In [17]:
# Aplicar KMeans
from modelos.Kmeans import KMeans
kmeans = KMeans(num_cluster=20,random_state=42)
labels = kmeans.fit(data_test)
print(len(labels))

iteración 0
iteración 1
iteración 2
iteración 3
iteración 4
iteración 5
iteración 6
iteración 7
iteración 8
iteración 9
iteración 10
iteración 11
iteración 12
iteración 13
iteración 14
iteración 15
iteración 16
iteración 17
iteración 18
iteración 19
iteración 20
iteración 21
iteración 22
iteración 23
Convergió en la iteración 23
centroides_last [[0.68304906 2.29699635 1.89652898 ... 0.04497366 0.14420257 0.06040172]
 [0.72753685 0.53282532 1.56809718 ... 0.02090135 0.09997822 0.36949123]
 [0.81278343 0.68340676 0.46300204 ... 0.01353928 0.08061674 0.05631718]
 ...
 [0.41279556 0.89204092 0.40048556 ... 0.01724396 0.14580806 0.15916171]
 [1.07813023 0.58845867 0.66519064 ... 0.01737318 0.12913108 0.04509237]
 [0.87582952 1.422446   1.31878087 ... 0.045682   0.13667885 0.0947848 ]]
centroides_new [[0.68304906 2.29699635 1.89652898 ... 0.04497366 0.14420257 0.06040172]
 [0.72753685 0.53282532 1.56809718 ... 0.02090135 0.09997822 0.36949123]
 [0.81278343 0.68340676 0.46300204 ... 0.0135392

### Concatenando los  **label predicho** al df_final

In [18]:
df_final['predic_label'] = labels
print(df_final['video'].head())

0    -3T2VPGo2jI
1    -7Wz9S-ZZz4
2    -a1f1ikmdd8
3    -AZMzKjls-k
4    -bGVnGCy2yY
Name: video, dtype: object


### Extrayendo los videos de test que pide el kaggle

In [19]:
# cargamos el test_subset_10.csv
df_test = pd.read_csv('test_subset_10.csv')

# los df_test youtube_id los pasamos a una lista
youtube_id = df_test['youtube_id'].tolist()
#print(youtube_id)

# extraemos de df los videos que estan en youtube_id
df_final = df_final[df_final['video'].isin(youtube_id)]

print(df_final[['video','predic_label']])

            video  predic_label
1     -7Wz9S-ZZz4            14
2     -a1f1ikmdd8            18
4     -bGVnGCy2yY            17
5     -BrNqmFy-6Y            18
6     -CsBjy8T-II            17
...           ...           ...
1620  ZTTEkAxEnUs             5
1622  zujZagp-4jQ            18
1623  ZuvChCAjbak            18
1624  ZxpwgIZg4lI            12
1625  zz1YMml9Z6k             3

[805 rows x 2 columns]


## Metrica de silhouette_score para el test

In [20]:
from sklearn.metrics import silhouette_score, davies_bouldin_score
s_core = silhouette_score(data_test,labels)

print('----------------------------------------')
print(f"Silhouette Score: {s_core}")

----------------------------------------
Silhouette Score: 0.08111943577632434


## Guardando la data de test

In [21]:
labels = np.array(df_final['predic_label'])
youtube_id = np.array(df_final['video'])
print(youtube_id.shape)
print(labels.shape)

# crear una lista de tuplas con los youtube_id y los labels
y_pred = []
for i in range(len(youtube_id)):
    y_pred.append((youtube_id[i], labels[i]))

# 
df_y_pred = pd.DataFrame(y_pred, columns=['youtube_id', 'result'])
# no tener el indice
df_y_pred.set_index('youtube_id', inplace=True)
df_y_pred.to_csv("y_pred_seed_42_sound_&&_video.csv")

(805,)
(805,)
