# Human Voice Classification and Clustering - Data Analysis

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

    # To make plots appear inline in the notebook
%matplotlib inline

    # Optional: Improve the aesthetics of plots
plt.style.use('seaborn-v0_8-whitegrid') # Adjust style name if needed

print("Libraries imported successfully!")


Libraries imported successfully!


In [6]:
# Define the path to the data file using the project structure
data_file_path = "../data/vocal_gender_features_new.csv"

# Load the data into a Pandas DataFrame
# The dataset doesn't have headers in the CSV file itself
df = pd.read_csv(data_file_path, header=None)

print("Data loaded successfully!")
print(f"Dataset shape: {df.shape}")
df.head() # Display the first 5 rows

Data loaded successfully!
Dataset shape: (16149, 44)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,34,35,36,37,38,39,40,41,42,43
0,mean_spectral_centroid,std_spectral_centroid,mean_spectral_bandwidth,std_spectral_bandwidth,mean_spectral_contrast,mean_spectral_flatness,mean_spectral_rolloff,zero_crossing_rate,rms_energy,mean_pitch,...,mfcc_9_std,mfcc_10_mean,mfcc_10_std,mfcc_11_mean,mfcc_11_std,mfcc_12_mean,mfcc_12_std,mfcc_13_mean,mfcc_13_std,label
1,2247.3317386694885,1158.5377479627396,1870.4154617213685,370.40524126981836,21.44070973910965,0.036878686,4419.438073394495,0.16924096903669725,0.08255184,1592.1033,...,21.73624,2.3030853,8.983318,-17.410305,9.115154,0.30180356,10.452693,-3.0808318,10.146248,0
2,1790.7198885341024,996.5548248590285,1757.8986167468183,410.7103177834014,21.513383444311525,0.018935613,3635.7421875,0.10806783040364583,0.055476625,1112.6351,...,13.937135,-0.9539423,10.831742,-0.08877484,10.29769,-7.2811418,10.926579,-0.450248,8.489134,0
3,1977.9233634651953,1010.1486667795253,1747.0995546714869,461.4583789847496,20.476282729226075,0.03261628,3873.291015625,0.14463297526041666,0.060388464,1557.5225,...,14.900779,0.26009753,14.031009,-0.42066953,10.810292,-0.19982924,11.986182,3.3729858,9.285437,0
4,2037.7655496871362,1311.4406297606188,1745.2248519221325,419.05648424257623,19.516014491022776,0.028481863,3826.5845070422533,0.14893265845070422,0.029559094,1481.0868,...,15.957924,-1.4861215,14.461978,-8.479608,12.550333,3.997028,9.912608,-6.9469657,10.574301,0


In [7]:
# --- Basic Data Inspection ---

# 1. Shape of the dataset (rows, columns)
print("Dataset Shape (rows, columns):", df.shape)

# 2. Column names (should now be correct)
print("\nColumn Names:")
print(df.columns.tolist())

# 3. Data types of each column
print("\nData Types:")
print(df.dtypes)

# 4. Basic statistics for numerical columns
print("\nBasic Statistics (Numerical Columns):")
df.describe() # This will be displayed as a table

# 5. Check for missing values
print("\nMissing Values per Column:")
missing_values = df.isnull().sum()
print(missing_values[missing_values > 0]) # Show only columns with missing values, if any
if missing_values.sum() == 0:
    print("No missing values found.")

# 6. Check the 'label' column specifically
print("\nLabel Column Value Counts:")
print(df['label'].value_counts())
print(f"Unique labels: {df['label'].unique()}")

Dataset Shape (rows, columns): (16149, 44)

Column Names:
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43]

Data Types:
0     object
1     object
2     object
3     object
4     object
5     object
6     object
7     object
8     object
9     object
10    object
11    object
12    object
13    object
14    object
15    object
16    object
17    object
18    object
19    object
20    object
21    object
22    object
23    object
24    object
25    object
26    object
27    object
28    object
29    object
30    object
31    object
32    object
33    object
34    object
35    object
36    object
37    object
38    object
39    object
40    object
41    object
42    object
43    object
dtype: object

Basic Statistics (Numerical Columns):

Missing Values per Column:
Series([], dtype: int64)
No missing values found.

Label Column Value Counts:


KeyError: 'label'

In [8]:
# --- Reload Data Correctly ---

# Path to the data file
data_file_path = "../data/vocal_gender_features_new.csv"

# Define the correct column names based on the PDF and the last row of the CSV
column_names = [
    'mean_spectral_centroid', 'std_spectral_centroid',
    'mean_spectral_bandwidth', 'std_spectral_bandwidth',
    'mean_spectral_contrast', 'mean_spectral_flatness',
    'mean_spectral_rolloff', 'zero_crossing_rate', 'rms_energy',
    'mean_pitch', 'min_pitch', 'max_pitch', 'std_pitch',
    'spectral_skew', 'spectral_kurtosis',
    'energy_entropy', 'log_energy',
    'mfcc_1_mean', 'mfcc_1_std', 'mfcc_2_mean', 'mfcc_2_std',
    'mfcc_3_mean', 'mfcc_3_std', 'mfcc_4_mean', 'mfcc_4_std',
    'mfcc_5_mean', 'mfcc_5_std', 'mfcc_6_mean', 'mfcc_6_std',
    'mfcc_7_mean', 'mfcc_7_std', 'mfcc_8_mean', 'mfcc_8_std',
    'mfcc_9_mean', 'mfcc_9_std', 'mfcc_10_mean', 'mfcc_10_std',
    'mfcc_11_mean', 'mfcc_11_std', 'mfcc_12_mean', 'mfcc_12_std',
    'mfcc_13_mean', 'mfcc_13_std', 'label'
]

# Reload the data:
# 1. Read the CSV, skipping the last row (which contains the column names)
#    We need to specify engine='python' for skipfooter to work
df = pd.read_csv(data_file_path, header=None, skipfooter=1, engine='python')

# 2. Assign the correct column names to the DataFrame
df.columns = column_names

print("Data reloaded successfully with correct column names!")
print(f"Dataset shape: {df.shape}")
print("\nFirst few rows:")
df.head() # Display the first 5 rows with correct names


Data reloaded successfully with correct column names!
Dataset shape: (16148, 44)

First few rows:


Unnamed: 0,mean_spectral_centroid,std_spectral_centroid,mean_spectral_bandwidth,std_spectral_bandwidth,mean_spectral_contrast,mean_spectral_flatness,mean_spectral_rolloff,zero_crossing_rate,rms_energy,mean_pitch,...,mfcc_9_std,mfcc_10_mean,mfcc_10_std,mfcc_11_mean,mfcc_11_std,mfcc_12_mean,mfcc_12_std,mfcc_13_mean,mfcc_13_std,label
0,mean_spectral_centroid,std_spectral_centroid,mean_spectral_bandwidth,std_spectral_bandwidth,mean_spectral_contrast,mean_spectral_flatness,mean_spectral_rolloff,zero_crossing_rate,rms_energy,mean_pitch,...,mfcc_9_std,mfcc_10_mean,mfcc_10_std,mfcc_11_mean,mfcc_11_std,mfcc_12_mean,mfcc_12_std,mfcc_13_mean,mfcc_13_std,label
1,2247.3317386694885,1158.5377479627396,1870.4154617213685,370.40524126981836,21.44070973910965,0.036878686,4419.438073394495,0.16924096903669725,0.08255184,1592.1033,...,21.73624,2.3030853,8.983318,-17.410305,9.115154,0.30180356,10.452693,-3.0808318,10.146248,0
2,1790.7198885341024,996.5548248590285,1757.8986167468183,410.7103177834014,21.513383444311525,0.018935613,3635.7421875,0.10806783040364583,0.055476625,1112.6351,...,13.937135,-0.9539423,10.831742,-0.08877484,10.29769,-7.2811418,10.926579,-0.450248,8.489134,0
3,1977.9233634651953,1010.1486667795253,1747.0995546714869,461.4583789847496,20.476282729226075,0.03261628,3873.291015625,0.14463297526041666,0.060388464,1557.5225,...,14.900779,0.26009753,14.031009,-0.42066953,10.810292,-0.19982924,11.986182,3.3729858,9.285437,0
4,2037.7655496871362,1311.4406297606188,1745.2248519221325,419.05648424257623,19.516014491022776,0.028481863,3826.5845070422533,0.14893265845070422,0.029559094,1481.0868,...,15.957924,-1.4861215,14.461978,-8.479608,12.550333,3.997028,9.912608,-6.9469657,10.574301,0


In [9]:
# --- Basic Data Inspection (After Correct Loading) ---

# 1. Shape of the dataset (rows, columns)
print("Dataset Shape (rows, columns):", df.shape)

# 2. Column names (should now be correct)
print("\nColumn Names:")
# print(df.columns.tolist()) # This list is long, let's just check the first and last few
print(f"First 5 columns: {df.columns[:5].tolist()}")
print(f"Last 5 columns: {df.columns[-5:].tolist()}")

# 3. Data types of each column
print("\nData Types:")
print(df.dtypes)
# Check if all are numeric (float64 or int64) except potentially 'label'
# Let's specifically check the 'label' column type
print(f"\nData type of 'label' column: {df['label'].dtype}")

# 4. Basic statistics for numerical columns (this should work better now)
print("\nBasic Statistics (Numerical Columns):")
# describe() might try to include 'label'. Let's select only numerical columns first
numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
# Remove 'label' from numerical cols if it's numeric (it's categorical)
if 'label' in numerical_cols:
    numerical_cols.remove('label')

print(df[numerical_cols].describe()) # Display stats for numerical features

# 5. Check for missing values
print("\nMissing Values per Column:")
missing_values = df.isnull().sum()
print(missing_values[missing_values > 0]) # Show only columns with missing values, if any
if missing_values.sum() == 0:
    print("No missing values found.")

# 6. Check the 'label' column specifically
print("\nLabel Column Value Counts:")
print(df['label'].value_counts())
print(f"\nUnique labels: {df['label'].unique()}")
print(f"Data type of 'label' column: {df['label'].dtype}")


Dataset Shape (rows, columns): (16148, 44)

Column Names:
First 5 columns: ['mean_spectral_centroid', 'std_spectral_centroid', 'mean_spectral_bandwidth', 'std_spectral_bandwidth', 'mean_spectral_contrast']
Last 5 columns: ['mfcc_12_mean', 'mfcc_12_std', 'mfcc_13_mean', 'mfcc_13_std', 'label']

Data Types:
mean_spectral_centroid     object
std_spectral_centroid      object
mean_spectral_bandwidth    object
std_spectral_bandwidth     object
mean_spectral_contrast     object
mean_spectral_flatness     object
mean_spectral_rolloff      object
zero_crossing_rate         object
rms_energy                 object
mean_pitch                 object
min_pitch                  object
max_pitch                  object
std_pitch                  object
spectral_skew              object
spectral_kurtosis          object
energy_entropy             object
log_energy                 object
mfcc_1_mean                object
mfcc_1_std                 object
mfcc_2_mean                object
mfcc_2_std   

ValueError: Cannot describe a DataFrame without columns

In [11]:
# --- Reload Data Correctly ---

# Path to the data file
data_file_path = "../data/vocal_gender_features_new.csv"

# Define the correct column names based on the PDF and the structure
column_names = [
    'mean_spectral_centroid', 'std_spectral_centroid',
    'mean_spectral_bandwidth', 'std_spectral_bandwidth',
    'mean_spectral_contrast', 'mean_spectral_flatness',
    'mean_spectral_rolloff', 'zero_crossing_rate', 'rms_energy',
    'mean_pitch', 'min_pitch', 'max_pitch', 'std_pitch',
    'spectral_skew', 'spectral_kurtosis',
    'energy_entropy', 'log_energy',
    'mfcc_1_mean', 'mfcc_1_std', 'mfcc_2_mean', 'mfcc_2_std',
    'mfcc_3_mean', 'mfcc_3_std', 'mfcc_4_mean', 'mfcc_4_std',
    'mfcc_5_mean', 'mfcc_5_std', 'mfcc_6_mean', 'mfcc_6_std',
    'mfcc_7_mean', 'mfcc_7_std', 'mfcc_8_mean', 'mfcc_8_std',
    'mfcc_9_mean', 'mfcc_9_std', 'mfcc_10_mean', 'mfcc_10_std',
    'mfcc_11_mean', 'mfcc_11_std', 'mfcc_12_mean', 'mfcc_12_std',
    'mfcc_13_mean', 'mfcc_13_std', 'label'
]

# Load the raw data without headers
df_raw = pd.read_csv(data_file_path, header=None)

# Assign the last row as column names
df_raw.columns = df_raw.iloc[-1]

# Drop the last row (which was used as column names)
df = df_raw.drop(df_raw.index[-1]).reset_index(drop=True)

# Convert all columns to numeric, except 'label' if needed
for col in df.columns[:-1]:  # Exclude the last column ('label')
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Try to convert 'label' to integer if possible
try:
    df['label'] = pd.to_numeric(df['label'], downcast='integer')
except ValueError:
    print("Warning: 'label' column contains non-numeric values.")

print("Data loaded and processed successfully!")
print(f"Dataset shape: {df.shape}")
print("\nFirst few rows:")
print(df.head())
print("\nData Types:")
print(df.dtypes)

KeyError: 'label'