In [1]:
import librosa
import numpy as np
import pandas as pd
import os

# Function to extract features from an audio segment
def extract_features(y, sr):

    features = {}
    features.update({f"mfcc_{i}": val for i, val in enumerate(np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13), axis=1))})
    features["chroma_mean"] = np.mean(librosa.feature.chroma_stft(y=y, sr=sr))  # Single chroma feature (mean of all bins)
    features["spectral_centroid"] = np.mean(librosa.feature.spectral_centroid(y=y, sr=sr))
    features["spectral_bandwidth"] = np.mean(librosa.feature.spectral_bandwidth(y=y, sr=sr))
    features["spectral_rolloff"] = np.mean(librosa.feature.spectral_rolloff(y=y, sr=sr))
    features["zero_crossing_rate"] = np.mean(librosa.feature.zero_crossing_rate(y=y))
    features["rms_energy"] = np.mean(librosa.feature.rms(y=y))
    return features

# Function to process an audio file in 1-second intervals
def process_audio_in_intervals(file_path, type, interval_duration=1):
    y, sr = librosa.load(file_path, sr=None)  # Load audio file
    interval_samples = sr * interval_duration  # Number of samples per interval

    features_list = []
    total_samples = len(y)

    # Iterate over intervals
    for start_sample in range(0, total_samples, interval_samples):
        end_sample = min(start_sample + interval_samples, total_samples)
        y_segment = y[start_sample:end_sample]  # Extract segment

        # Extract features for this segment
        features = extract_features(y_segment, sr)
        features["type"] = type  # Add type (real or fake)
        features["interval_start"] = start_sample / sr  # Start time of the interval (in seconds)
        features["interval_end"] = end_sample / sr  # End time of the interval (in seconds)
        features_list.append(features)

    return features_list

# Function to process all files in a directory
def process_directory(directory_path, type, interval_duration=1):
    all_features = []
    for file_name in os.listdir(directory_path):
        file_path = os.path.join(directory_path, file_name)
        if os.path.isfile(file_path) and file_name.lower().endswith(('.wav', '.mp3', '.flac')):  # Check for audio files
            print(f"Processing file: {file_name}")
            file_features = process_audio_in_intervals(file_path, type,interval_duration)
            for feature_set in file_features:
                feature_set["file_name"] = file_name  # Add file name for reference
            all_features.extend(file_features)

    return pd.DataFrame(all_features)

# Example usage
directory_path = "/content/drive/MyDrive/KAGGLE/AUDIO/FAKE"  # Replace with the path to your directory
df = process_directory(directory_path, "FAKE")
df.to_csv("FAKE_audio_features_directory.csv", index=False)
# Example usage
directory_path = "/content/drive/MyDrive/KAGGLE/AUDIO/REAL"  # Replace with the path to your directory
df2 = process_directory(directory_path, "REAL")
#combined_df = pd.concat([df, df2], axis=1)

# Save to CSV
#combined_df.to_csv("combined_columns.csv", index=False)
# Save to a CSV file (optional)
df2.to_csv("REAL_audio_features_directory.csv", index=False)

# Print the DataFrame
print(df2)

Processing file: Obama-to-Biden.wav
Processing file: Obama-to-Trump.wav
Processing file: biden-to-Obama.wav
Processing file: biden-to-Trump.wav
Processing file: biden-to-linus.wav
Processing file: biden-to-margot.wav
Processing file: biden-to-musk.wav
Processing file: biden-to-ryan.wav
Processing file: biden-to-taylor.wav
Processing file: linus-to-biden.wav
Processing file: linus-to-margot.wav
Processing file: linus-to-musk.wav
Processing file: linus-to-obama.wav
Processing file: linus-to-ryan.wav
Processing file: linus-to-taylor.wav
Processing file: margot-to-biden.wav
Processing file: linus-to-trump.wav
Processing file: margot-to-linus.wav
Processing file: margot-to-musk.wav
Processing file: margot-to-obama.wav
Processing file: margot-to-ryan.wav
Processing file: margot-to-taylor.wav
Processing file: margot-to-trump.wav
Processing file: musk-to-biden.wav
Processing file: musk-to-linus.wav
Processing file: musk-to-margot.wav
Processing file: musk-to-obama.wav
Processing file: musk-to-



Processing file: margot-original.wav
Processing file: linus-original.wav
Processing file: musk-original.wav
Processing file: ryan-original.wav
Processing file: obama-original.wav
Processing file: taylor-original.wav
Processing file: trump-original.wav
          mfcc_0      mfcc_1      mfcc_2     mfcc_3     mfcc_4     mfcc_5  \
0    -344.940247  184.372253  -53.522011 -10.656567 -31.722338   0.496074   
1    -331.827148  180.274582  -66.526703   2.252668 -26.897394   8.383003   
2    -301.955811  203.748108 -100.482590 -13.155165 -38.819607 -17.975796   
3    -268.484070  208.112518 -109.855804 -19.142450 -45.374626 -19.284824   
4    -287.688446  210.582413  -98.025452 -15.918679 -48.244888 -12.454594   
...          ...         ...         ...        ...        ...        ...   
3745 -359.348450  143.910110  -53.743927  12.865443 -21.503048  -5.505956   
3746 -342.633911  168.411026  -63.005383  27.147928 -13.705704  21.142042   
3747 -294.075684  198.536072  -86.627548   6.403258 -32

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score

# Load the provided CSV files
real_audio_features_path = '/content/drive/MyDrive/KAGGLE/REAL_audio_features_directory.csv'
fake_audio_features_path = '/content/drive/MyDrive/KAGGLE/FAKE_audio_features_directory.csv'

real_audio_features = pd.read_csv(real_audio_features_path)
fake_audio_features = pd.read_csv(fake_audio_features_path)

# Combine the datasets and label them appropriately
audio_features = pd.concat([real_audio_features, fake_audio_features], ignore_index=True)
audio_features.to_csv("audio_features.csv", index=False)

# Drop non-feature columns that are not useful for training
columns_to_drop = ['file_name', 'interval_start', 'interval_end', 'type']  # 'type' will be used as the label
X = audio_features.drop(columns=columns_to_drop)
y = audio_features['type'].map({'REAL': 0, 'FAKE': 1})  # Encode labels as 0 for REAL and 1 for FAKE

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the KNN classifier
knn = KNeighborsClassifier(n_neighbors=5)  # Using 5 neighbors as a default
knn.fit(X_train_scaled, y_train)

# Evaluate the model on the test data
y_pred = knn.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:")
print(report)


Accuracy: 98.56%
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.92      0.94       750
           1       0.99      0.99      0.99      5243

    accuracy                           0.99      5993
   macro avg       0.97      0.96      0.97      5993
weighted avg       0.99      0.99      0.99      5993



In [4]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# Load the provided CSV files
real_audio_features_path = '/content/drive/MyDrive/KAGGLE/REAL_audio_features_directory.csv'
fake_audio_features_path = '/content/drive/MyDrive/KAGGLE/FAKE_audio_features_directory.csv'

real_audio_features = pd.read_csv(real_audio_features_path)
fake_audio_features = pd.read_csv(fake_audio_features_path)

# Combine the datasets and label them appropriately
audio_features = pd.concat([real_audio_features, fake_audio_features], ignore_index=True)

# Drop non-feature columns that are not useful for training
columns_to_drop = ['file_name', 'interval_start', 'interval_end']  # Drop metadata
X = audio_features.drop(columns=columns_to_drop + ['type'])  # Keep all 19 feature columns
y = audio_features['type'].map({'REAL': 0, 'FAKE': 1})  # Encode labels as 0 for REAL and 1 for FAKE

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Standardize the features (necessary for SVM)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the SVM classifier
svm = SVC(kernel='linear', C=1.0, random_state=42)  # Linear kernel and regularization parameter C=1.0
svm.fit(X_train_scaled, y_train)

# Evaluate the model on the test data
y_pred = svm.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Print results
print(f"Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:")
print(report)


Accuracy: 93.54%
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.57      0.69       750
           1       0.94      0.99      0.96      5243

    accuracy                           0.94      5993
   macro avg       0.91      0.78      0.83      5993
weighted avg       0.93      0.94      0.93      5993



In [5]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, accuracy_score

# Load the provided CSV files
real_audio_features_path = '/content/drive/MyDrive/KAGGLE/REAL_audio_features_directory.csv'
fake_audio_features_path = '/content/drive/MyDrive/KAGGLE/FAKE_audio_features_directory.csv'

real_audio_features = pd.read_csv(real_audio_features_path)
fake_audio_features = pd.read_csv(fake_audio_features_path)

# Combine the datasets and label them appropriately
audio_features = pd.concat([real_audio_features, fake_audio_features], ignore_index=True)

# Drop non-feature columns that are not useful for training
columns_to_drop = ['file_name', 'interval_start', 'interval_end']  # Drop metadata
X = audio_features.drop(columns=columns_to_drop + ['type'])  # Keep all 19 feature columns
y = audio_features['type'].map({'REAL': 0, 'FAKE': 1})  # Encode labels as 0 for REAL and 1 for FAKE

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Standardize the features (Naive Bayes doesn't require this but can help with certain feature ranges)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the Naive Bayes classifier
naive_bayes = GaussianNB()
naive_bayes.fit(X_train_scaled, y_train)

# Evaluate the model on the test data
y_pred = naive_bayes.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Print results
print(f"Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:")
print(report)


Accuracy: 84.20%
Classification Report:
              precision    recall  f1-score   support

           0       0.39      0.46      0.42       750
           1       0.92      0.90      0.91      5243

    accuracy                           0.84      5993
   macro avg       0.65      0.68      0.66      5993
weighted avg       0.85      0.84      0.85      5993

