## Generating Metafile

In [8]:
import os
import csv

In [5]:
# Both locations live under the same project root on this machine.
_project_root = "/home/arunb/Abhijeet_2021509"

# Root folder that holds the labelled audio sub-folders.
directory = f"{_project_root}/data"
# CSV that will list every audio file together with its label.
metafile_path = f"{_project_root}/scam-legit/openSMILE/metafile.csv"

In [None]:
# Sub-folder name (relative to `directory`) -> label given to every file found under it.
# Both "*+legit" folders collapse into the single "legit" class.
# NOTE(review): there is no "human+scam" entry — confirm the dataset has no such folder.
folder_to_label = {
    **dict.fromkeys(("human+legit", "robot+legit"), "legit"),
    "robot+scam": "scam",
}

In [7]:
# Build one [file_path, label] row for every file found (recursively) under each
# labelled folder. Rows are produced in a fixed, sorted order so that re-running
# the cell on the same data always yields the same metafile.
file_data = []

for folder, label in folder_to_label.items():
    folder_path = os.path.join(directory, folder)
    if not os.path.isdir(folder_path):
        # Skipping silently would drop an entire class from the metafile unnoticed.
        print(f"Warning: folder not found, skipping: {folder_path}")
        continue
    for root, dirs, files in os.walk(folder_path):
        # os.walk yields entries in OS-dependent order; sorting `dirs` in place
        # fixes the order in which sub-folders are visited.
        dirs.sort()
        for file in sorted(files):
            # NOTE(review): every file is listed regardless of extension — confirm
            # the folders contain audio files only.
            file_data.append([os.path.join(root, file), label])


In [10]:
# Write the metafile in one pass: the header row first, then one row per collected file.
# newline='' leaves line-ending handling to the csv module.
with open(metafile_path, mode='w', newline='', encoding='utf-8') as metafile:
    csv.writer(metafile).writerows([["file_location", "label"], *file_data])

print(f"Metafile generated at: {metafile_path}")

Metafile generated at: /home/arunb/Abhijeet_2021509/scam-legit/openSMILE/metafile.csv


## Feature Extraction

In [7]:
import os
import pandas as pd
import opensmile
from multiprocessing import Pool, cpu_count
from tqdm import tqdm

In [8]:
# Module-level openSMILE extractor; process_audio reads it as a global.
# Configured for the ComParE 2016 feature set at the "functionals" level.
smile = opensmile.Smile(
    feature_level=opensmile.FeatureLevel.Functionals,
    feature_set=opensmile.FeatureSet.ComParE_2016,
)

In [9]:
def process_audio(row):
    """Extract openSMILE features for one metafile row and attach its numeric label.

    Args:
        row: Mapping-like object (e.g. a pandas Series or a dict) providing
            'file_location' (path of the audio file) and 'label'
            ("scam" or "legit").

    Returns:
        The object returned by ``smile.process_file`` for the audio file, with an
        added 'label' entry: 1 for "scam", 0 for "legit".

    Raises:
        ValueError: If the label is neither "scam" nor "legit". Previously any
            value other than "scam" (a typo, a missing value, a new class) was
            silently encoded as 0, i.e. as legit.
    """
    label_codes = {"scam": 1, "legit": 0}
    label = row['label']
    # Validate before the expensive extraction so that bad rows fail fast.
    if label not in label_codes:
        raise ValueError(
            f"Unknown label {label!r} for file {row['file_location']!r}; "
            f"expected one of {sorted(label_codes)}"
        )
    features = smile.process_file(row['file_location'])
    features['label'] = label_codes[label]
    return features


In [10]:
# The metafile (input) and the features table (output) sit in the same folder.
_opensmile_dir = "/home/arunb/Abhijeet_2021509/scam-legit/openSMILE"
metafile_path = f"{_opensmile_dir}/metafile.csv"
features_file = f"{_opensmile_dir}/features.csv"

# Load the metafile into a DataFrame; its 'file_location' and 'label' columns
# are what process_audio reads from each row.
metadata = pd.read_csv(filepath_or_buffer=metafile_path)

In [11]:
# Materialise the metafile rows once so they can be handed to the worker pool.
rows = [row for _, row in metadata.iterrows()]

# Run process_audio over the rows with one worker process per CPU core.
# pool.imap yields results in input order; tqdm only wraps it to show progress.
# The results must be consumed inside the `with` block, while the pool is alive.
with Pool(processes=cpu_count()) as pool:
    progress = tqdm(pool.imap(process_audio, rows), total=len(rows), desc="Extracting Features")
    results = list(progress)

# Stack the per-file results into one table, replacing their own index with 0..n-1.
all_features = pd.concat(results, ignore_index=True)

# Write the combined table to disk without an index column.
all_features.to_csv(features_file, index=False)
print(f"Combined features saved to: {features_file}")

Extracting Features: 100%|██████████| 15295/15295 [04:02<00:00, 62.98it/s] 


Combined features saved to: /home/arunb/Abhijeet_2021509/scam-legit/openSMILE/features.csv


## Classification

In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.utils import shuffle

In [13]:
# Read back the feature table written by the feature-extraction step.
features_file = "/home/arunb/Abhijeet_2021509/scam-legit/openSMILE/features.csv"
data = pd.read_csv(filepath_or_buffer=features_file)

In [15]:
# Reorder the rows of the feature table using a fixed seed, so the order is repeatable.
# NOTE(review): this cell rebinds `data`, so running it a second time shuffles the
# already-shuffled table; re-run the load cell first to get the same order again.
data = shuffle(data, random_state=42)

In [18]:
# Separate the target column from the feature columns.
y = data["label"]
X = data.drop("label", axis=1)

# Hold out 20% of the rows for testing; stratify=y keeps the class ratio
# the same in the training and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Fit the scaler on the training rows only, then apply the same scaling to
# both parts, so no statistics from the test rows are used.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [23]:
# Seed shared by every estimator below that takes a random_state.
_seed = 42

# Display name -> untrained estimator. The evaluation loop iterates this dict,
# so insertion order is the order in which the models are trained and reported.
classifiers = dict([
    ("Random Forest", RandomForestClassifier(n_estimators=100, random_state=_seed)),
    ("Logistic Regression", LogisticRegression(random_state=_seed)),
    ("Support Vector Machine (SVM)", SVC(random_state=_seed)),
    ("K-Nearest Neighbors (KNN)", KNeighborsClassifier()),
    ("Gradient Boosting", GradientBoostingClassifier(random_state=_seed)),
])

In [24]:
# Fit every configured classifier on the training split and report its
# accuracy and per-class metrics on the held-out test split.
# NOTE(review): the recorded run shows accuracy at or very near 1.0 for these
# models; worth confirming that no clip (or near-duplicate of one) ends up in
# both the training and the test split.
for name, model in classifiers.items():
    print(f"\nTraining and evaluating {name}...")

    # fit() returns the fitted estimator, so training and prediction can be chained.
    y_pred = model.fit(X_train, y_train).predict(X_test)

    # Compare the predictions with the true test labels.
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy}\n\nClassification Report:\n")
    print(classification_report(y_test, y_pred))


Training and evaluating Random Forest...
Accuracy: 1.0

Classification Report:

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2774
           1       1.00      1.00      1.00       285

    accuracy                           1.00      3059
   macro avg       1.00      1.00      1.00      3059
weighted avg       1.00      1.00      1.00      3059


Training and evaluating Logistic Regression...
Accuracy: 0.9993461915658712

Classification Report:

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2774
           1       0.99      1.00      1.00       285

    accuracy                           1.00      3059
   macro avg       1.00      1.00      1.00      3059
weighted avg       1.00      1.00      1.00      3059


Training and evaluating Support Vector Machine (SVM)...
Accuracy: 0.9980385746976136

Classification Report:

              precision    recall  f1-score   suppor