In [1]:
import numpy as np
import pickle

# Path to saved features
features_file = "/Volumes/T7/Capstone Proj/kagg/extracted_features.pkl"

# Load extracted features
with open(features_file, "rb") as f:
    X, y = pickle.load(f)

print(f"✅ Loaded Features: {X.shape}")
print(f"✅ Loaded Labels: {y.shape}")


✅ Loaded Features: (39145, 181)
✅ Loaded Labels: (39145,)


In [2]:
import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC

# Load saved features
features_file = "/Volumes/T7/Capstone Proj/kagg/extracted_features.pkl"
with open(features_file, "rb") as f:
    X, y = pickle.load(f)

# Split dataset into training & testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Standardize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Store models & results
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric="logloss"),
    "SVM": SVC(kernel="linear", probability=True)
}

# Train and evaluate each model
results = {}
for name, model in models.items():
    print(f"\n🚀 Training {name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    results[name] = {
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1
    }

    print(f"✅ {name} Results:")
    print(f"  📊 Accuracy: {accuracy:.4f}")
    print(f"  🎯 Precision: {precision:.4f}")
    print(f"  🔄 Recall: {recall:.4f}")
    print(f"  🏆 F1 Score: {f1:.4f}")

# Convert results to DataFrame and display
results_df = pd.DataFrame(results).T
print("\n📊 Model Performance Comparison:\n")
print(results_df)



🚀 Training Logistic Regression...


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


✅ Logistic Regression Results:
  📊 Accuracy: 0.9587
  🎯 Precision: 0.9623
  🔄 Recall: 0.9683
  🏆 F1 Score: 0.9653

🚀 Training Random Forest...
✅ Random Forest Results:
  📊 Accuracy: 0.9826
  🎯 Precision: 0.9839
  🔄 Recall: 0.9869
  🏆 F1 Score: 0.9854

🚀 Training XGBoost...


Parameters: { "use_label_encoder" } are not used.



✅ XGBoost Results:
  📊 Accuracy: 0.9914
  🎯 Precision: 0.9929
  🔄 Recall: 0.9927
  🏆 F1 Score: 0.9928

🚀 Training SVM...
✅ SVM Results:
  📊 Accuracy: 0.9586
  🎯 Precision: 0.9609
  🔄 Recall: 0.9696
  🏆 F1 Score: 0.9652

📊 Model Performance Comparison:

                     Accuracy  Precision    Recall  F1 Score
Logistic Regression  0.958743   0.962296  0.968312  0.965295
Random Forest        0.982629   0.983881  0.986851  0.985364
XGBoost              0.991442   0.992885  0.992671  0.992778
SVM                  0.958615   0.960906  0.969606  0.965236


In [5]:
import joblib

# Save the best model
model_save_path = "/Volumes/T7/Capstone Proj/kagg/models_1/modelsbest_model_XGBoost.pkl"
joblib.dump(models["XGBoost"], model_save_path)

print(f"🎯 Best Model (XGBoost) Saved at: {model_save_path}")


🎯 Best Model (XGBoost) Saved at: /Volumes/T7/Capstone Proj/kagg/models_1/modelsbest_model_XGBoost.pkl


# 🚀 Deep Learning Model Training Plan

In [6]:
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

# Load extracted features
features_file = "/Volumes/T7/Capstone Proj/kagg/extracted_features.pkl"
X, y = joblib.load(features_file)

print(f"✅ Loaded Features: {X.shape}")
print(f"✅ Loaded Labels: {y.shape}")

# Normalize features
X = X / np.max(np.abs(X), axis=0)  # Normalize for stability

# Convert labels to categorical for DL models
y_categorical = to_categorical(y, num_classes=2)

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y_categorical, test_size=0.2, random_state=42)

print(f"✅ Data Split: {X_train.shape[0]} train, {X_test.shape[0]} test")


✅ Loaded Features: (39145, 181)
✅ Loaded Labels: (39145,)
✅ Data Split: 31316 train, 7829 test


In [7]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Define DNN Model
model_dnn = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.5),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(2, activation='softmax')  # 2 classes (Real/Fake)
])

# Compile Model
model_dnn.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train Model
history_dnn = model_dnn.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=20, batch_size=32)


Epoch 1/20


2025-03-16 18:00:38.909140: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [8]:
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten

# Reshape X for CNN (add extra dimension)
X_train_cnn = X_train[..., np.newaxis]
X_test_cnn = X_test[..., np.newaxis]

# Define CNN Model
model_cnn = Sequential([
    Conv1D(64, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)),
    MaxPooling1D(pool_size=2),
    Conv1D(128, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(2, activation='softmax')
])

# Compile Model
model_cnn.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train Model
history_cnn = model_cnn.fit(X_train_cnn, y_train, validation_data=(X_test_cnn, y_test), epochs=20, batch_size=32)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [9]:
from tensorflow.keras.layers import LSTM, Reshape

# Reshape X for LSTM (time-series format)
X_train_lstm = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test_lstm = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

# Define LSTM Model
model_lstm = Sequential([
    LSTM(128, return_sequences=True, input_shape=(X_train.shape[1], 1)),
    LSTM(64, return_sequences=False),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(2, activation='softmax')
])

# Compile Model
model_lstm.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train Model
history_lstm = model_lstm.fit(X_train_lstm, y_train, validation_data=(X_test_lstm, y_test), epochs=20, batch_size=32)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [10]:
def evaluate_model(model, X_test, y_test):
    loss, accuracy = model.evaluate(X_test, y_test)
    print(f"✅ Test Accuracy: {accuracy:.4f}")

print("\n📊 Evaluating Models:")
print("📌 DNN Model:")
evaluate_model(model_dnn, X_test, y_test)

print("\n📌 CNN Model:")
evaluate_model(model_cnn, X_test_cnn, y_test)

print("\n📌 LSTM Model:")
evaluate_model(model_lstm, X_test_lstm, y_test)



📊 Evaluating Models:
📌 DNN Model:
✅ Test Accuracy: 0.9889

📌 CNN Model:
✅ Test Accuracy: 0.9937

📌 LSTM Model:
✅ Test Accuracy: 0.9659
