In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import joblib  # For model persistence

# Load data from CSV
combined_data = pd.read_csv("combined_dataset.csv")

# Assuming 'Label' column contains the labels (1 for positive, 0 for negative)
# If not, adjust the code accordingly
X = combined_data.drop(['Name', 'Label'], axis=1)
y = combined_data['Label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define multiple models
models = {
    'SVM': SVC(),
    'KNN': KNeighborsClassifier(),
    'NB': GaussianNB(),
    'DT': DecisionTreeClassifier(),
    'RF': RandomForestClassifier(random_state=42),
    'MLP': MLPClassifier(random_state=42),
    'ADA': AdaBoostClassifier(random_state=42),
    'LR': LogisticRegression(random_state=42)
}

# Train individual models and evaluate performance
for model_name, model in models.items():
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    print(f"{model_name} Accuracy: {accuracy}")


# Ensemble by averaging predictions
ensemble_preds = sum(model.predict(X_test) for model in models.values()) / len(models)
ensemble_accuracy = accuracy_score(y_test, np.round(ensemble_preds))
print("Ensemble Accuracy:", ensemble_accuracy)

SVM Accuracy: 0.7464788732394366
KNN Accuracy: 0.7183098591549296
NB Accuracy: 0.647887323943662
DT Accuracy: 0.6619718309859155
RF Accuracy: 0.704225352112676




MLP Accuracy: 0.7605633802816901
ADA Accuracy: 0.6619718309859155
LR Accuracy: 0.7183098591549296
Ensemble Accuracy: 0.7887323943661971


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import joblib  # For model persistence
from imblearn.over_sampling import SMOTE
from collections import Counter

# Load data from CSV
combined_data = pd.read_csv("combined_dataset.csv")

# Assuming 'Label' column contains the labels (1 for positive, 0 for negative)
# If not, adjust the code accordingly
X = combined_data.drop(['Name', 'Label'], axis=1)
y = combined_data['Label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Oversample the training data using SMOTE
oversampler = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train, y_train)

# Print the distribution of classes after oversampling
print("Class distribution after oversampling:", Counter(y_train_resampled))

# Define multiple models
models = {
    'SVM': SVC(),
    'KNN': KNeighborsClassifier(),
    'NB': GaussianNB(),
    'DT': DecisionTreeClassifier(),
    'RF': RandomForestClassifier(random_state=42),
    'MLP': MLPClassifier(random_state=42),
    'ADA': AdaBoostClassifier(random_state=42),
    'LR': LogisticRegression(random_state=42)
}

# Train individual models and evaluate performance
for model_name, model in models.items():
    model.fit(X_train_resampled, y_train_resampled)
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    print(f"{model_name} Accuracy: {accuracy}")

    # Save individual models
    joblib.dump(model, f'{model_name}_model.pkl')

# Ensemble by averaging predictions
ensemble_preds = sum(model.predict(X_test) for model in models.values()) / len(models)
ensemble_accuracy = accuracy_score(y_test, np.round(ensemble_preds))
print("Ensemble Accuracy:", ensemble_accuracy)

# Save ensemble predictions
joblib.dump(ensemble_preds, 'ensemble_model.pkl')


Class distribution after oversampling: Counter({0: 173, 1: 173})
SVM Accuracy: 0.7323943661971831
KNN Accuracy: 0.704225352112676
NB Accuracy: 0.676056338028169
DT Accuracy: 0.647887323943662
RF Accuracy: 0.7746478873239436




MLP Accuracy: 0.7464788732394366
ADA Accuracy: 0.676056338028169
LR Accuracy: 0.7183098591549296
Ensemble Accuracy: 0.7887323943661971


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


['ensemble_model.pkl']

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import joblib
from imblearn.over_sampling import SMOTE
from collections import Counter

# Load data from CSV
combined_data = pd.read_csv("combined_dataset.csv")

# Assuming 'Label' column contains the labels (1 for positive, 0 for negative)
# If not, adjust the code accordingly
X = combined_data.drop(['Name', 'Label'], axis=1)
y = combined_data['Label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Oversample the training data using SMOTE
oversampler = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train, y_train)

# Print the distribution of classes after oversampling
print("Class distribution after oversampling:", Counter(y_train_resampled))

# Duplicate random rows to increase the dataset size to 3000 for both positive and negative data
positive_indices = y_train_resampled[y_train_resampled == 1].index
negative_indices = y_train_resampled[y_train_resampled == 0].index

# Duplicate positive samples
positive_duplicates = X_train_resampled.loc[positive_indices].sample(n=3000 - len(positive_indices), replace=True)
X_train_resampled = pd.concat([X_train_resampled, positive_duplicates])
y_train_resampled = pd.concat([y_train_resampled, pd.Series([1] * len(positive_duplicates))])

# Duplicate negative samples
negative_duplicates = X_train_resampled.loc[negative_indices].sample(n=3000 - len(negative_indices), replace=True)
X_train_resampled = pd.concat([X_train_resampled, negative_duplicates])
y_train_resampled = pd.concat([y_train_resampled, pd.Series([0] * len(negative_duplicates))])

# Print the new class distribution
print("New class distribution:", Counter(y_train_resampled))

# Define multiple models
models = {
    'SVM': SVC(),
    'KNN': KNeighborsClassifier(),
    'NB': GaussianNB(),
    'DT': DecisionTreeClassifier(),
    'RF': RandomForestClassifier(random_state=42),
    'MLP': MLPClassifier(random_state=42),
    'ADA': AdaBoostClassifier(random_state=42),
    'LR': LogisticRegression(random_state=42, max_iter=1000),
}

# Train individual models and evaluate performance
for model_name, model in models.items():
    model.fit(X_train_resampled, y_train_resampled)
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    print(f"{model_name} Accuracy: {accuracy}")

# Ensemble by averaging predictions
ensemble_preds = sum(model.predict(X_test) for model in models.values()) / len(models)
ensemble_accuracy = accuracy_score(y_test, np.round(ensemble_preds))
print("Ensemble Accuracy:", ensemble_accuracy)

Class distribution after oversampling: Counter({0: 173, 1: 173})
New class distribution: Counter({0: 3000, 1: 3000})
SVM Accuracy: 0.7605633802816901
KNN Accuracy: 0.676056338028169
NB Accuracy: 0.676056338028169
DT Accuracy: 0.647887323943662
RF Accuracy: 0.8309859154929577
MLP Accuracy: 0.7887323943661971
ADA Accuracy: 0.8028169014084507
LR Accuracy: 0.704225352112676
Ensemble Accuracy: 0.8169014084507042


In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import joblib
from imblearn.over_sampling import SMOTE
from collections import Counter

# Load data from CSV
combined_data = pd.read_csv("combined_dataset.csv")

# Assuming 'Label' column contains the labels (1 for positive, 0 for negative)
# If not, adjust the code accordingly
X = combined_data.drop(['Name', 'Label'], axis=1)
y = combined_data['Label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Oversample the training data using SMOTE
oversampler = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train, y_train)

# Scale features using StandardScaler
scaler = StandardScaler()
X_train_resampled_scaled = scaler.fit_transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)

# Convert back to DataFrames after scaling
X_train_resampled_scaled_df = pd.DataFrame(X_train_resampled_scaled, columns=X_train.columns)

# Print the distribution of classes after oversampling
print("Class distribution after oversampling:", Counter(y_train_resampled))

# Duplicate random rows to increase the dataset size to 3000 for both positive and negative data
positive_indices = y_train_resampled[y_train_resampled == 1].index
negative_indices = y_train_resampled[y_train_resampled == 0].index

# Duplicate positive samples
positive_duplicates = X_train_resampled_scaled_df.loc[positive_indices].sample(n=3000 - len(positive_indices), replace=True)
X_train_resampled_scaled_df = pd.concat([X_train_resampled_scaled_df, positive_duplicates])
y_train_resampled = pd.concat([y_train_resampled, pd.Series([1] * len(positive_duplicates))])

# Duplicate negative samples
negative_duplicates = X_train_resampled_scaled_df.loc[negative_indices].sample(n=3000 - len(negative_indices), replace=True)
X_train_resampled_scaled_df = pd.concat([X_train_resampled_scaled_df, negative_duplicates])
y_train_resampled = pd.concat([y_train_resampled, pd.Series([0] * len(negative_duplicates))])

# Print the new class distribution
print("New class distribution:", Counter(y_train_resampled))

# Define multiple models
models = {
    'SVM': SVC(),
    'KNN': KNeighborsClassifier(),
    'NB': GaussianNB(),
    'DT': DecisionTreeClassifier(),
    'RF': RandomForestClassifier(random_state=42),
    'MLP': MLPClassifier(random_state=42),
    'ADA': AdaBoostClassifier(random_state=42),
    'LR': LogisticRegression(random_state=42, max_iter=1000),
}

# Train individual models and evaluate performance
for model_name, model in models.items():
    model.fit(X_train_resampled_scaled_df.values, y_train_resampled.values)
    predictions = model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, predictions)
    print(f"{model_name} Accuracy: {accuracy}")

# Ensemble by averaging predictions
ensemble_preds = sum(model.predict(X_test_scaled) for model in models.values()) / len(models)
ensemble_accuracy = accuracy_score(y_test, np.round(ensemble_preds))
print("Ensemble Accuracy:", ensemble_accuracy)


Class distribution after oversampling: Counter({0: 173, 1: 173})
New class distribution: Counter({0: 3000, 1: 3000})
SVM Accuracy: 0.7464788732394366
KNN Accuracy: 0.7183098591549296
NB Accuracy: 0.7183098591549296
DT Accuracy: 0.5774647887323944
RF Accuracy: 0.7464788732394366
MLP Accuracy: 0.7605633802816901
ADA Accuracy: 0.676056338028169
LR Accuracy: 0.7183098591549296
Ensemble Accuracy: 0.7887323943661971


In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.exceptions import ConvergenceWarning
from sklearn.model_selection import cross_val_score
import warnings

# Suppress ConvergenceWarnings
warnings.simplefilter("ignore", ConvergenceWarning)

# Load data from CSV
combined_data = pd.read_csv("combined_dataset.csv")

# Assuming 'Label' column contains the labels (1 for positive, 0 for negative)
# If not, adjust the code accordingly
X = combined_data.drop(['Name', 'Label'], axis=1)
y = combined_data['Label']

# Oversample the data using SMOTE
oversampler = SMOTE(random_state=42)
X_resampled, y_resampled = oversampler.fit_resample(X, y)

# Scale features using StandardScaler
scaler = StandardScaler()
X_resampled_scaled = scaler.fit_transform(X_resampled)

# Convert back to DataFrame after scaling
X_resampled_scaled_df = pd.DataFrame(X_resampled_scaled, columns=X.columns)

# Print the distribution of classes after oversampling
print("Class distribution after oversampling:", Counter(y_resampled))

# Use StratifiedKFold for 100-fold cross-validation
cv = StratifiedKFold(n_splits=100, shuffle=True, random_state=42)

# Define multiple models
models = {
    'SVM': SVC(),
    'KNN': KNeighborsClassifier(),
    'NB': GaussianNB(),
    'DT': DecisionTreeClassifier(),
    'RF': RandomForestClassifier(random_state=42),
    'MLP': MLPClassifier(random_state=42),
    'ADA': AdaBoostClassifier(random_state=42),
    'LR': LogisticRegression(random_state=42, max_iter=1000),
}

# Train individual models and evaluate performance using cross-validation
for model_name, model in models.items():
    # Perform cross-validation
    scores = cross_val_score(model, X_resampled_scaled_df, y_resampled, scoring='accuracy', cv=cv)
    print(f"{model_name} Cross-Validation Accuracy: {np.mean(scores)}")

    # Optionally, you can use the entire dataset for training
    model.fit(X_resampled_scaled_df.values, y_resampled.values)

    # Evaluate on the test set
    predictions = model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, predictions)
    print(f"{model_name} Test Set Accuracy: {accuracy}")

# Ensemble by averaging predictions
ensemble_preds = sum(model.predict(X_test_scaled) for model in models.values()) / len(models)
ensemble_accuracy = accuracy_score(y_test, np.round(ensemble_preds))
print("Ensemble Accuracy:", ensemble_accuracy)


Class distribution after oversampling: Counter({1: 215, 0: 215})
SVM Cross-Validation Accuracy: 0.7755000000000001
SVM Test Set Accuracy: 0.8873239436619719
KNN Cross-Validation Accuracy: 0.7454999999999999
KNN Test Set Accuracy: 0.8591549295774648
NB Cross-Validation Accuracy: 0.551
NB Test Set Accuracy: 0.6619718309859155
DT Cross-Validation Accuracy: 0.6725
DT Test Set Accuracy: 1.0
RF Cross-Validation Accuracy: 0.7985
RF Test Set Accuracy: 1.0
MLP Cross-Validation Accuracy: 0.782
MLP Test Set Accuracy: 1.0
ADA Cross-Validation Accuracy: 0.7440000000000001
ADA Test Set Accuracy: 0.9014084507042254
LR Cross-Validation Accuracy: 0.698
LR Test Set Accuracy: 0.8169014084507042
Ensemble Accuracy: 0.971830985915493


In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
from collections import Counter
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Load data from CSV
combined_data = pd.read_csv("combined_dataset.csv")

# Assuming 'Label' column contains the labels (1 for positive, 0 for negative)
# If not, adjust the code accordingly
X = combined_data.drop(['Name', 'Label'], axis=1)
y = combined_data['Label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Oversample the training data using SMOTE
oversampler = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train, y_train)

# Scale features using StandardScaler
scaler = StandardScaler()
X_train_resampled_scaled = scaler.fit_transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)

# Convert back to DataFrames after scaling
X_train_resampled_scaled_df = pd.DataFrame(X_train_resampled_scaled, columns=X_train.columns)

# Print the distribution of classes after oversampling
print("Class distribution after oversampling:", Counter(y_train_resampled))

# Duplicate random rows to increase the dataset size to 3000 for both positive and negative data
positive_indices = y_train_resampled[y_train_resampled == 1].index
negative_indices = y_train_resampled[y_train_resampled == 0].index

# Duplicate positive samples
positive_duplicates = X_train_resampled_scaled_df.loc[positive_indices].sample(n=3000 - len(positive_indices), replace=True)
X_train_resampled_scaled_df = pd.concat([X_train_resampled_scaled_df, positive_duplicates])
y_train_resampled = pd.concat([y_train_resampled, pd.Series([1] * len(positive_duplicates))])

# Duplicate negative samples
negative_duplicates = X_train_resampled_scaled_df.loc[negative_indices].sample(n=3000 - len(negative_indices), replace=True)
X_train_resampled_scaled_df = pd.concat([X_train_resampled_scaled_df, negative_duplicates])
y_train_resampled = pd.concat([y_train_resampled, pd.Series([0] * len(negative_duplicates))])

# Print the new class distribution
print("New class distribution:", Counter(y_train_resampled))

model = keras.Sequential([
    layers.Dense(1024, activation='relu', input_shape=(X_train_resampled_scaled_df.shape[1],)),
    layers.Dropout(0.5),
    layers.Dense(512, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(256, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(32, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(16, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(8, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model with more epochs
model.fit(X_train_resampled_scaled_df, y_train_resampled, epochs=15, batch_size=32, validation_split=0.2)

# Evaluate on the test set
predictions = model.predict(X_test_scaled)
binary_predictions = np.round(predictions)
accuracy = accuracy_score(y_test, binary_predictions)
print("Test Accuracy:", accuracy)


Class distribution after oversampling: Counter({0: 173, 1: 173})
New class distribution: Counter({0: 3000, 1: 3000})
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Test Accuracy: 0.6619718309859155
