In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras import layers, regularizers, callbacks
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV


In [2]:
# Mount Google Drive at /content/drive so the dataset stored there can be read.
# NOTE(review): `google.colab` is only importable inside a Colab runtime, so
# this cell will fail in any other environment.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Read the combined CSV from the mounted Drive folder into a DataFrame.
# NOTE(review): the path is hard-coded to one particular Drive layout; it has
# to be adjusted when the notebook is run from a different account or machine.
file_path = '/content/drive/MyDrive/Project - 1 | Machine Learning/combinedcsv.csv'
df = pd.read_csv(file_path)


In [4]:
# Build the scaler first; it is applied once the non-feature columns are gone.
scaler = StandardScaler()

# Keep the 'malicious' column aside as the label, then remove it together with
# the 'md5_hash' column so that only the remaining columns are scaled.
# NOTE(review): `df` is overwritten here, so re-running this cell without
# reloading the CSV raises a KeyError on the already-dropped columns.
target = df['malicious']
df = df.drop(['malicious', 'md5_hash'], axis=1)

# Standardize the data (recommended for PCA)
df_scaled = scaler.fit_transform(df)

In [5]:
# First pass: fit an unrestricted PCA purely to inspect the variance spectrum.
# Only the fitted attributes are needed here, so `fit` is used instead of
# `fit_transform`, which would compute a full projection that is thrown away.
pca = PCA()
pca.fit(df_scaled)

explained_variance_ratio = pca.explained_variance_ratio_
cumulative_variance = np.cumsum(explained_variance_ratio)
# argmax on the boolean mask gives the first index at which the cumulative
# ratio reaches 95%; +1 turns that zero-based index into a component count.
n_components_95 = np.argmax(cumulative_variance >= 0.95) + 1

# Second pass: refit with only the components needed for 95% of the variance
# and project the scaled data onto them. `random_state` only matters when the
# 'auto' solver policy selects a randomized solver, but fixing it keeps the
# projection identical across reruns in that case.
# NOTE(review): this PCA is fit on every row of `df_scaled`, i.e. before the
# train/validation/test split performed later in the notebook.
pca = PCA(n_components=n_components_95, random_state=42)
pca_result = pca.fit_transform(df_scaled)

In [None]:
# Splitting data: 70% goes to training; the held-out 30% is then divided 2:1,
# giving a validation set (20% of all rows) and a test set (10% of all rows).
# NOTE(review): `pca_result` comes from a scaler and a PCA that were fit on the
# full dataset before this split, so the held-out rows influenced the
# preprocessing. Fitting those steps on X_train only would remove that leakage.
X_train, X_temp, y_train, y_temp = train_test_split(pca_result, target, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=1/3, random_state=42)

# Define the AdaBoost classifier. The seed is fixed so that repeated runs of
# this cell build the same ensemble.
ada_classifier = AdaBoostClassifier(random_state=42)

# Define the parameter grid for grid search
param_grid = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.01, 0.1, 1]
}

# Create the GridSearchCV object: 3-fold cross-validation on the training
# split, ranked by accuracy. The candidate fits are independent, so they are
# spread over all cores (the learning-curve cell uses n_jobs=-1 as well).
grid_search = GridSearchCV(
    estimator=ada_classifier,
    param_grid=param_grid,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_

# GridSearchCV refits the winning configuration on all of X_train by default
# (refit=True), so that fitted model is reused instead of training a second
# copy. This also guarantees that the scores below describe the same object
# that is later accessed as `grid_search.best_estimator_`.
best_ada_classifier = grid_search.best_estimator_

# Predictions
y_train_pred = best_ada_classifier.predict(X_train)
y_val_pred = best_ada_classifier.predict(X_val)
y_test_pred = best_ada_classifier.predict(X_test)

# Calculate scores
train_score = accuracy_score(y_train, y_train_pred)
val_score = accuracy_score(y_val, y_val_pred)
test_score = accuracy_score(y_test, y_test_pred)

# Confusion Matrix (test split only)
conf_matrix = confusion_matrix(y_test, y_test_pred)

# Classification Report (test split only)
class_report = classification_report(y_test, y_test_pred)

# Print the results
print("1) Training Score:", train_score)
print("   Validation Score:", val_score)
print("   Testing Score:", test_score)
print("\n2) Confusion Matrix:\n", conf_matrix)
print("\n3) Classification Report:\n", class_report)
print("\n4) Best Parameters:", best_params)


In [None]:
from sklearn.model_selection import learning_curve

def plot_learning_curve(estimator, X, y, title="Learning Curve", ylim=None, cv=None, n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
    """Draw training and cross-validation score curves for an estimator.

    A new pyplot figure is opened, `learning_curve` is run on the given data,
    and for each of the two score sets a shaded band of one standard deviation
    around the mean is drawn, followed by the mean line itself.

    Parameters:
        estimator: model object handed to `learning_curve`.
        X: feature data handed to `learning_curve`.
        y: target values handed to `learning_curve`.
        title: text used as the figure title.
        ylim: optional (low, high) pair; when given it is unpacked into
            `plt.ylim` before anything is plotted.
        cv: forwarded unchanged to `learning_curve`.
        n_jobs: forwarded unchanged to `learning_curve`.
        train_sizes: forwarded unchanged to `learning_curve`; defaults to five
            evenly spaced values from 0.1 to 1.0.

    Returns:
        The `matplotlib.pyplot` module, so the caller can continue with
        `plt.show()` or further pyplot calls.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    sizes, fit_scores, cv_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    # One entry per curve: (mean per training size, std per training size,
    # colour, legend label). Statistics are taken across the CV folds (axis 1).
    curves = [
        (scores.mean(axis=1), scores.std(axis=1), colour, label)
        for scores, colour, label in (
            (fit_scores, "r", "Training score"),
            (cv_scores, "g", "Cross-validation score"),
        )
    ]

    plt.grid()

    # Shaded +/- one standard deviation bands are drawn first ...
    for mean, std, colour, _ in curves:
        plt.fill_between(sizes, mean - std, mean + std, alpha=0.1, color=colour)

    # ... and the mean lines go on top of them.
    for mean, _, colour, label in curves:
        plt.plot(sizes, mean, 'o-', color=colour, label=label)

    plt.legend(loc="best")
    return plt

# Learning curve for the estimator refit by the grid search, evaluated with
# 3-fold cross-validation on the training split and using all available cores.
title = "Learning Curves (AdaBoost)"
plot_learning_curve(
    grid_search.best_estimator_,
    X_train,
    y_train,
    title=title,
    ylim=(0.7, 1.01),
    cv=3,
    n_jobs=-1,
)

# Render the figure
plt.show()
