In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Load the feature table. The first CSV column is used as the row index and the
# last column is treated as the class label throughout this cell.
df = pd.read_csv("CNN_features_dataset.csv", index_col=0)

# Single source of truth for the per-class sample budget: it is both the minimum
# number of rows a class needs in order to be kept and the number of rows retained
# for every kept class. Keeping it in one place stops the two uses drifting apart.
SAMPLES_PER_CLASS = 20

# Count occurrences of each class
class_labels = df.iloc[:, -1]
class_counts = class_labels.value_counts()

# Keep only the classes that have at least SAMPLES_PER_CLASS rows
valid_classes = class_counts[class_counts >= SAMPLES_PER_CLASS].index
df_filtered = df[class_labels.isin(valid_classes)]

# Take the first SAMPLES_PER_CLASS rows of each class.
# NOTE: `.head()` keeps rows in their existing file order; this is not a random sample.
df_limited = df_filtered.groupby(df_filtered.columns[-1]).head(SAMPLES_PER_CLASS)

# Sanity check: every retained class contributes exactly SAMPLES_PER_CLASS rows.
assert (df_limited.iloc[:, -1].value_counts() == SAMPLES_PER_CLASS).all(), (
    "each retained class should have exactly SAMPLES_PER_CLASS rows"
)

df_limited.to_csv("CNN_features_dataset_limited.csv")




Dataset size: (1240, 2049)
Training size: (992, 2048), Testing size: (248, 2048)


In [69]:
# Split the table into the feature matrix (every column except the last)
# and the label vector (the last column).
X, y = df_limited.iloc[:, :-1], df_limited.iloc[:, -1]

# Turn each class label into an integer code (LabelEncoder numbers the
# classes in sorted order).
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Stratified hold-out split. The 4/20 test fraction is meant to yield
# 16 training and 4 testing rows per class.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=4 / 20,
    stratify=y_encoded,
    random_state=42,
)

# Standardize the features: the scaler is fitted on the training split only
# and the same fitted transform is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

print(f"Dataset size: {df_limited.shape}")
print(f"Training size: {X_train.shape}, Testing size: {X_test.shape}")

Dataset size: (1240, 2049)
Training size: (992, 2048), Testing size: (248, 2048)


In [70]:

import numpy as np

# Tally how many samples of each encoded class ended up in each split.
unique_train, counts_train = np.unique(y_train, return_counts=True)
unique_test, counts_test = np.unique(y_test, return_counts=True)

# Build {class code: count} dictionaries. `.tolist()` converts the NumPy scalars
# to plain Python ints, so the printed dicts read `{0: 16, ...}` instead of
# `{np.int64(0): np.int64(16), ...}` (the scalar repr used by NumPy >= 2).
# Plain ints hash and compare equal to the NumPy integers, so lookups still work.
y_train_counts = dict(zip(unique_train.tolist(), counts_train.tolist()))
y_test_counts = dict(zip(unique_test.tolist(), counts_test.tolist()))

print("Counts of unique values in y_train:", y_train_counts)
print("Counts of unique values in y_test:", y_test_counts)




Counts of unique values in y_train: {np.int64(0): np.int64(16), np.int64(1): np.int64(16), np.int64(2): np.int64(16), np.int64(3): np.int64(16), np.int64(4): np.int64(16), np.int64(5): np.int64(16), np.int64(6): np.int64(16), np.int64(7): np.int64(16), np.int64(8): np.int64(16), np.int64(9): np.int64(16), np.int64(10): np.int64(16), np.int64(11): np.int64(16), np.int64(12): np.int64(16), np.int64(13): np.int64(16), np.int64(14): np.int64(16), np.int64(15): np.int64(16), np.int64(16): np.int64(16), np.int64(17): np.int64(16), np.int64(18): np.int64(16), np.int64(19): np.int64(16), np.int64(20): np.int64(16), np.int64(21): np.int64(16), np.int64(22): np.int64(16), np.int64(23): np.int64(16), np.int64(24): np.int64(16), np.int64(25): np.int64(16), np.int64(26): np.int64(16), np.int64(27): np.int64(16), np.int64(28): np.int64(16), np.int64(29): np.int64(16), np.int64(30): np.int64(16), np.int64(31): np.int64(16), np.int64(32): np.int64(16), np.int64(33): np.int64(16), np.int64(34): np.int6

In [71]:
# Map the first four integer class codes back to their original labels
first_codes = [0, 1, 2, 3]
y_decoded = label_encoder.inverse_transform(first_codes)
print(y_decoded)

['Alejandro_Toledo' 'Alvaro_Uribe' 'Amelie_Mauresmo' 'Andre_Agassi']
