import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter

data = pd.read_csv('pen_digits.csv')
#data.head()


X = data.iloc[:, 1:-1].values
y = data.iloc[:, -1].values

unique, counts = np.unique(y, return_counts=True)
class_dist_before = dict(zip(unique, counts))
print(f"Class distribution before augmentation: {class_dist_before}")

#Separate minority class
X_minority = X[y == 1]

X_train_orig, X_test, y_train_orig, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

clf_orig = RandomForestClassifier(random_state=42)
clf_orig.fit(X_train_orig, y_train_orig)

y_pred_orig = clf_orig.predict(X_test)
recall_orig = recall_score(y_test, y_pred_orig)
f1_orig = f1_score(y_test, y_pred_orig)

print(f"Recall score (original data): {recall_orig:.4f}")
print(f"F1 score (original data): {f1_orig:.4f}")


majority_class_count = np.sum(y_train_orig == -1)
minority_class_count = np.sum(y_train_orig == 1)

# Set the target number of samples for the minority class
desired_minority_class_size = 2154

# Ensure the target size is greater than the current minority class size
if desired_minority_class_size > minority_class_count:
    smote_strategy = {1: desired_minority_class_size}

    # Apply SMOTE with the desired number of synthetic samples
    smote = SMOTE(sampling_strategy=smote_strategy, random_state=42)
    X_train_res, y_train_res = smote.fit_resample(X_train_orig, y_train_orig)


# Print class distribution after SMOTE
print("\nClass distribution after augmentation (on training data):")
print(Counter(y_train_res))

# Print the number of samples generated
num_generated_samples = len(X_train_res) - len(X_train_orig)
print(f"\nNumber of samples generated by SMOTE: {num_generated_samples}")


# Train a Random Forest Classifier after augmentation
clf_res = RandomForestClassifier(random_state=42)
clf_res.fit(X_train_res, y_train_res)
y_pred_test_res = clf_res.predict(X_test)

# Print classification report, precision, recall, and F1 score after SMOTE (on test data)
print("\nClassification report after augmentation (on test data):")
print(classification_report(y_test, y_pred_test_res))

precision_after = precision_score(y_test, y_pred_test_res, pos_label=1)
recall_after = recall_score(y_test, y_pred_test_res, pos_label=1)
f1_after = f1_score(y_test, y_pred_test_res, pos_label=1)

print(f"Precision after augmentation: {precision_after:.4f}")
print(f"Recall after augmentation: {recall_after:.4f}")
print(f"F1 Score after augmentation: {f1_after:.4f}")

from openTSNE import TSNE


# Apply openTSNE for t-SNE visualization
tsne = TSNE(
    n_components=2, perplexity=30, metric="euclidean",
    n_jobs=-1  # Use all available CPU cores for speedup
)

# Fit openTSNE on the PCA-reduced minority class data
X_tsne_minority = tsne.fit(X_minority)
X_tsne_res_minority = tsne.fit(X_train_res[y_train_res == 1])

# Create t-SNE Plot for minority class before and after SMOTE
fig, ax = plt.subplots(1, 1, figsize=(6, 4))

# Plot for original minority class data
ax.scatter(X_tsne_minority[:, 0], X_tsne_minority[:, 1], c='blue', label='Minority Class (Original)', alpha=0.6)
# Plot for SMOTE-generated synthetic minority class data
ax.scatter(X_tsne_res_minority[:, 0], X_tsne_res_minority[:, 1], c='orange', label='Minority Class (SMOTE)', alpha=0.6)

# Add labels and legends
ax.set_title('t-SNE of Minority Class (Before and After SMOTE)')
ax.legend()

# Display the plot
plt.tight_layout()
plt.show()

# Mammography Data

# 50% of minority data

import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter

data = pd.read_csv('mammography_dataset.csv')
#data.head()


X = data.iloc[:, 1:-1].values
y = data.iloc[:, -1].values

unique, counts = np.unique(y, return_counts=True)
class_dist_before = dict(zip(unique, counts))
print(f"Class distribution before augmentation: {class_dist_before}")

#Separate minority class
X_minority = X[y == 1]

X_train_orig, X_test, y_train_orig, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

clf_orig = RandomForestClassifier(random_state=42)
clf_orig.fit(X_train_orig, y_train_orig)

y_pred_orig = clf_orig.predict(X_test)
recall_orig = recall_score(y_test, y_pred_orig)
f1_orig = f1_score(y_test, y_pred_orig)

print(f"Recall score (original data): {recall_orig:.4f}")
print(f"F1 score (original data): {f1_orig:.4f}")


majority_class_count = np.sum(y_train_orig == -1)
minority_class_count = np.sum(y_train_orig == 1)

# Set the target number of samples for the minority class
desired_minority_class_size = len(X_minority)+(len(X_minority)//2)

# Adjust sampling based on the desired size
if desired_minority_class_size < minority_class_count:
    # Reduce the size of the minority class
    minority_indices = np.where(y_train_orig == 1)[0]
    majority_indices = np.where(y_train_orig != 1)[0]

    # Randomly select the desired number of samples
    reduced_minority_indices = np.random.choice(
        minority_indices, 
        size=desired_minority_class_size, 
        replace=False
    )

    # Combine reduced minority with majority class
    combined_indices = np.concatenate([reduced_minority_indices, majority_indices])
    X_train_res = X_train_orig[combined_indices]
    y_train_res = y_train_orig[combined_indices]

    print("\nClass distribution after reduction (on training data):")
    print(Counter(y_train_res))

elif desired_minority_class_size > minority_class_count:
    # Use SMOTE to oversample if the desired size is greater
    smote_strategy = {1: desired_minority_class_size}
    smote = SMOTE(sampling_strategy=smote_strategy, random_state=42)
    X_train_res, y_train_res = smote.fit_resample(X_train_orig, y_train_orig)

    # Print class distribution after SMOTE
    print("\nClass distribution after augmentation :")
    print(Counter(y_train_res))

    # Print the number of samples generated
    num_generated_samples = len(X_train_res) - len(X_train_orig)
    print(f"\nNumber of samples generated by SMOTE: {num_generated_samples}")

else:
    # If the desired size is equal to the current size, use the original data
    X_train_res, y_train_res = X_train_orig, y_train_orig
    print("\nNo augmentation needed. Class distribution remains unchanged.")

# Train a Random Forest Classifier after augmentation
clf_res = RandomForestClassifier(random_state=42)
clf_res.fit(X_train_res, y_train_res)
y_pred_test_res = clf_res.predict(X_test)

# Print classification report, precision, recall, and F1 score after SMOTE (on test data)
print("\nClassification report after augmentation (on test data):")
print(classification_report(y_test, y_pred_test_res))

precision_after = precision_score(y_test, y_pred_test_res, pos_label=1)
recall_after = recall_score(y_test, y_pred_test_res, pos_label=1)
f1_after = f1_score(y_test, y_pred_test_res, pos_label=1)

print(f"Precision after augmentation: {precision_after:.4f}")
print(f"Recall after augmentation: {recall_after:.4f}")
print(f"F1 Score after augmentation: {f1_after:.4f}")


# Mammography Data

# 50% of minority class

In [5]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter

data = pd.read_csv('mammography_dataset.csv')
#data.head()


X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

unique, counts = np.unique(y, return_counts=True)
class_dist_before = dict(zip(unique, counts))
print(f"Class distribution before augmentation: {class_dist_before}")

#Separate minority class
X_minority = X[y == 1]

X_train_orig, X_test, y_train_orig, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

clf_orig = RandomForestClassifier(random_state=42)
clf_orig.fit(X_train_orig, y_train_orig)

y_pred_orig = clf_orig.predict(X_test)
recall_orig = recall_score(y_test, y_pred_orig)
f1_orig = f1_score(y_test, y_pred_orig)

print(f"Recall score (original data): {recall_orig:.4f}")
print(f"F1 score (original data): {f1_orig:.4f}")


majority_class_count = np.sum(y_train_orig == -1)
minority_class_count = np.sum(y_train_orig == 1)

# Set the target number of samples for the minority class
desired_minority_class_size = len(X_minority)+(len(X_minority)//2)

# Adjust sampling based on the desired size
if desired_minority_class_size < minority_class_count:
    # Reduce the size of the minority class
    minority_indices = np.where(y_train_orig == 1)[0]
    majority_indices = np.where(y_train_orig != 1)[0]

    # Randomly select the desired number of samples
    reduced_minority_indices = np.random.choice(
        minority_indices, 
        size=desired_minority_class_size, 
        replace=False
    )

    # Combine reduced minority with majority class
    combined_indices = np.concatenate([reduced_minority_indices, majority_indices])
    X_balanced = X[combined_indices]
    y_balanced = y[combined_indices]

    print("\nClass distribution after reduction (on training data):")
    print(Counter(y_balanced))

elif desired_minority_class_size > minority_class_count:
    # Use SMOTE to oversample if the desired size is greater
    smote_strategy = {1: desired_minority_class_size}
    smote = SMOTE(sampling_strategy=smote_strategy, random_state=42)
    X_balanced, y_balanced = smote.fit_resample(X, y)

    # Print class distribution after SMOTE
    print("\nClass distribution after augmentation :")
    print(Counter(y_balanced))

    # Print the number of samples generated
    num_generated_samples = len(X_balanced) - len(X)
    print(f"\nNumber of samples generated by SMOTE: {num_generated_samples}")

else:
    # If the desired size is equal to the current size, use the original data
    X_balanced, y_balanced= X, y
    print("\nNo augmentation needed. Class distribution remains unchanged.")

# Train a Random Forest Classifier after augmentation
X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_balanced, y_balanced, test_size=0.3, random_state=42, stratify=y_balanced)
clf_res = RandomForestClassifier(random_state=42)
clf_res.fit(X_train_bal, y_train_bal)
y_pred_bal = clf_res.predict(X_test)

# Print classification report, precision, recall, and F1 score after SMOTE (on test data)
print("\nClassification report after augmentation:")
print(classification_report(y_test, y_pred_bal))

precision_after = precision_score(y_test, y_pred_bal, pos_label=1)
recall_after = recall_score(y_test, y_pred_bal, pos_label=1)
f1_after = f1_score(y_test, y_pred_bal, pos_label=1)

print(f"Precision after augmentation: {precision_after:.4f}")
print(f"Recall after augmentation: {recall_after:.4f}")
print(f"F1 Score after augmentation: {f1_after:.4f}")


Class distribution before augmentation: {-1: 10923, 1: 260}
Recall score (original data): 0.5000
F1 score (original data): 0.6240

Class distribution after augmentation :
Counter({-1: 10923, 1: 390})

Number of samples generated by SMOTE: 130

Classification report after augmentation:
              precision    recall  f1-score   support

          -1       0.99      1.00      0.99      3277
           1       0.82      0.71      0.76        78

    accuracy                           0.99      3355
   macro avg       0.91      0.85      0.88      3355
weighted avg       0.99      0.99      0.99      3355

Precision after augmentation: 0.8209
Recall after augmentation: 0.7051
F1 Score after augmentation: 0.7586


# 100% of minority class

In [None]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter

data = pd.read_csv('mammography_dataset.csv')
#data.head()


X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

unique, counts = np.unique(y, return_counts=True)
class_dist_before = dict(zip(unique, counts))
print(f"Class distribution before augmentation: {class_dist_before}")

#Separate minority class
X_minority = X[y == 1]

X_train_orig, X_test, y_train_orig, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

clf_orig = RandomForestClassifier(random_state=42)
clf_orig.fit(X_train_orig, y_train_orig)

y_pred_orig = clf_orig.predict(X_test)
recall_orig = recall_score(y_test, y_pred_orig)
f1_orig = f1_score(y_test, y_pred_orig)

print(f"Recall score (original data): {recall_orig:.4f}")
print(f"F1 score (original data): {f1_orig:.4f}")


majority_class_count = np.sum(y_train_orig == -1)
minority_class_count = np.sum(y_train_orig == 1)

# Set the target number of samples for the minority class
desired_minority_class_size = len(X_minority)+(len(X_minority))

# Adjust sampling based on the desired size
if desired_minority_class_size < minority_class_count:
    # Reduce the size of the minority class
    minority_indices = np.where(y_train_orig == 1)[0]
    majority_indices = np.where(y_train_orig != 1)[0]

    # Randomly select the desired number of samples
    reduced_minority_indices = np.random.choice(
        minority_indices, 
        size=desired_minority_class_size, 
        replace=False
    )

    # Combine reduced minority with majority class
    combined_indices = np.concatenate([reduced_minority_indices, majority_indices])
    X_balanced = X[combined_indices]
    y_balanced = y[combined_indices]

    print("\nClass distribution after reduction (on training data):")
    print(Counter(y_balanced))

elif desired_minority_class_size > minority_class_count:
    # Use SMOTE to oversample if the desired size is greater
    smote_strategy = {1: desired_minority_class_size}
    smote = SMOTE(sampling_strategy=smote_strategy, random_state=42)
    X_balanced, y_balanced = smote.fit_resample(X, y)

    # Print class distribution after SMOTE
    print("\nClass distribution after augmentation :")
    print(Counter(y_balanced))

    # Print the number of samples generated
    num_generated_samples = len(X_balanced) - len(X)
    print(f"\nNumber of samples generated by SMOTE: {num_generated_samples}")

else:
    # If the desired size is equal to the current size, use the original data
    X_balanced, y_balanced= X, y
    print("\nNo augmentation needed. Class distribution remains unchanged.")

# Train a Random Forest Classifier after augmentation
X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_balanced, y_balanced, test_size=0.3, random_state=42, stratify=y_balanced)
clf_res = RandomForestClassifier(random_state=42)
clf_res.fit(X_train_bal, y_train_bal)
y_pred_bal = clf_res.predict(X_test)

# Print classification report, precision, recall, and F1 score after SMOTE (on test data)
print("\nClassification report after augmentation:")
print(classification_report(y_test, y_pred_bal))

precision_after = precision_score(y_test, y_pred_bal, pos_label=1)
recall_after = recall_score(y_test, y_pred_bal, pos_label=1)
f1_after = f1_score(y_test, y_pred_bal, pos_label=1)

print(f"Precision after augmentation: {precision_after:.4f}")
print(f"Recall after augmentation: {recall_after:.4f}")
print(f"F1 Score after augmentation: {f1_after:.4f}")


Class distribution before augmentation: {-1: 10923, 1: 260}
Recall score (original data): 0.5000
F1 score (original data): 0.6240

Class distribution after augmentation :
Counter({-1: 10923, 1: 520})

Number of samples generated by SMOTE: 260

Classification report after augmentation:
              precision    recall  f1-score   support

          -1       1.00      1.00      1.00      3277
           1       0.80      0.82      0.81        78

    accuracy                           0.99      3355
   macro avg       0.90      0.91      0.90      3355
weighted avg       0.99      0.99      0.99      3355

Precision after augmentation: 0.8000
Recall after augmentation: 0.8205
F1 Score after augmentation: 0.8101


# 200% of minority class

In [None]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter

data = pd.read_csv('mammography_dataset.csv')
#data.head()


X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

unique, counts = np.unique(y, return_counts=True)
class_dist_before = dict(zip(unique, counts))
print(f"Class distribution before augmentation: {class_dist_before}")

#Separate minority class
X_minority = X[y == 1]

X_train_orig, X_test, y_train_orig, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

clf_orig = RandomForestClassifier(random_state=42)
clf_orig.fit(X_train_orig, y_train_orig)

y_pred_orig = clf_orig.predict(X_test)
recall_orig = recall_score(y_test, y_pred_orig)
f1_orig = f1_score(y_test, y_pred_orig)

print(f"Recall score (original data): {recall_orig:.4f}")
print(f"F1 score (original data): {f1_orig:.4f}")


majority_class_count = np.sum(y_train_orig == -1)
minority_class_count = np.sum(y_train_orig == 1)

# Set the target number of samples for the minority class
desired_minority_class_size = len(X_minority)+((len(X_minority))*2)

# Adjust sampling based on the desired size
if desired_minority_class_size < minority_class_count:
    # Reduce the size of the minority class
    minority_indices = np.where(y_train_orig == 1)[0]
    majority_indices = np.where(y_train_orig != 1)[0]

    # Randomly select the desired number of samples
    reduced_minority_indices = np.random.choice(
        minority_indices, 
        size=desired_minority_class_size, 
        replace=False
    )

    # Combine reduced minority with majority class
    combined_indices = np.concatenate([reduced_minority_indices, majority_indices])
    X_balanced = X[combined_indices]
    y_balanced = y[combined_indices]

    print("\nClass distribution after reduction (on training data):")
    print(Counter(y_balanced))

elif desired_minority_class_size > minority_class_count:
    # Use SMOTE to oversample if the desired size is greater
    smote_strategy = {1: desired_minority_class_size}
    smote = SMOTE(sampling_strategy=smote_strategy, random_state=42)
    X_balanced, y_balanced = smote.fit_resample(X, y)

    # Print class distribution after SMOTE
    print("\nClass distribution after augmentation :")
    print(Counter(y_balanced))

    # Print the number of samples generated
    num_generated_samples = len(X_balanced) - len(X)
    print(f"\nNumber of samples generated by SMOTE: {num_generated_samples}")

else:
    # If the desired size is equal to the current size, use the original data
    X_balanced, y_balanced= X, y
    print("\nNo augmentation needed. Class distribution remains unchanged.")

# Train a Random Forest Classifier after augmentation
X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_balanced, y_balanced, test_size=0.3, random_state=42, stratify=y_balanced)
clf_res = RandomForestClassifier(random_state=42)
clf_res.fit(X_train_bal, y_train_bal)
y_pred_bal = clf_res.predict(X_test)

# Print classification report, precision, recall, and F1 score after SMOTE (on test data)
print("\nClassification report after augmentation:")
print(classification_report(y_test, y_pred_bal))

precision_after = precision_score(y_test, y_pred_bal, pos_label=1)
recall_after = recall_score(y_test, y_pred_bal, pos_label=1)
f1_after = f1_score(y_test, y_pred_bal, pos_label=1)

print(f"Precision after augmentation: {precision_after:.4f}")
print(f"Recall after augmentation: {recall_after:.4f}")
print(f"F1 Score after augmentation: {f1_after:.4f}")


Class distribution before augmentation: {-1: 10923, 1: 260}
Recall score (original data): 0.5000
F1 score (original data): 0.6240

Class distribution after augmentation :
Counter({-1: 10923, 1: 780})

Number of samples generated by SMOTE: 520

Classification report after augmentation:
              precision    recall  f1-score   support

          -1       1.00      1.00      1.00      3277
           1       0.81      0.85      0.83        78

    accuracy                           0.99      3355
   macro avg       0.91      0.92      0.91      3355
weighted avg       0.99      0.99      0.99      3355

Precision after augmentation: 0.8148
Recall after augmentation: 0.8462
F1 Score after augmentation: 0.8302


# HTRU_2 Data

# 50% of minority class

In [8]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter

data = pd.read_csv('HTRU_2.csv')
#data.head()


X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

unique, counts = np.unique(y, return_counts=True)
class_dist_before = dict(zip(unique, counts))
print(f"Class distribution before augmentation: {class_dist_before}")

#Separate minority class
X_minority = X[y == 1]

X_train_orig, X_test, y_train_orig, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

clf_orig = RandomForestClassifier(random_state=42)
clf_orig.fit(X_train_orig, y_train_orig)

y_pred_orig = clf_orig.predict(X_test)
recall_orig = recall_score(y_test, y_pred_orig)
f1_orig = f1_score(y_test, y_pred_orig)

print(f"Recall score (original data): {recall_orig:.4f}")
print(f"F1 score (original data): {f1_orig:.4f}")


majority_class_count = np.sum(y_train_orig == -1)
minority_class_count = np.sum(y_train_orig == 1)

# Set the target number of samples for the minority class
desired_minority_class_size = len(X_minority)+(len(X_minority)//2)

# Adjust sampling based on the desired size
if desired_minority_class_size < minority_class_count:
    # Reduce the size of the minority class
    minority_indices = np.where(y_train_orig == 1)[0]
    majority_indices = np.where(y_train_orig != 1)[0]

    # Randomly select the desired number of samples
    reduced_minority_indices = np.random.choice(
        minority_indices, 
        size=desired_minority_class_size, 
        replace=False
    )

    # Combine reduced minority with majority class
    combined_indices = np.concatenate([reduced_minority_indices, majority_indices])
    X_balanced = X[combined_indices]
    y_balanced = y[combined_indices]

    print("\nClass distribution after reduction (on training data):")
    print(Counter(y_balanced))

elif desired_minority_class_size > minority_class_count:
    # Use SMOTE to oversample if the desired size is greater
    smote_strategy = {1: desired_minority_class_size}
    smote = SMOTE(sampling_strategy=smote_strategy, random_state=42)
    X_balanced, y_balanced = smote.fit_resample(X, y)

    # Print class distribution after SMOTE
    print("\nClass distribution after augmentation :")
    print(Counter(y_balanced))

    # Print the number of samples generated
    num_generated_samples = len(X_balanced) - len(X)
    print(f"\nNumber of samples generated by SMOTE: {num_generated_samples}")

else:
    # If the desired size is equal to the current size, use the original data
    X_balanced, y_balanced= X, y
    print("\nNo augmentation needed. Class distribution remains unchanged.")

# Train a Random Forest Classifier after augmentation
X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_balanced, y_balanced, test_size=0.3, random_state=42, stratify=y_balanced)
clf_res = RandomForestClassifier(random_state=42)
clf_res.fit(X_train_bal, y_train_bal)
y_pred_bal = clf_res.predict(X_test)

# Print classification report, precision, recall, and F1 score after SMOTE (on test data)
print("\nClassification report after augmentation:")
print(classification_report(y_test, y_pred_bal))

precision_after = precision_score(y_test, y_pred_bal, pos_label=1)
recall_after = recall_score(y_test, y_pred_bal, pos_label=1)
f1_after = f1_score(y_test, y_pred_bal, pos_label=1)

print(f"Precision after augmentation: {precision_after:.4f}")
print(f"Recall after augmentation: {recall_after:.4f}")
print(f"F1 Score after augmentation: {f1_after:.4f}")


Class distribution before augmentation: {0: 16258, 1: 1639}
Recall score (original data): 0.8415
F1 score (original data): 0.8743

Class distribution after augmentation :
Counter({0: 16258, 1: 2458})

Number of samples generated by SMOTE: 819

Classification report after augmentation:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4878
           1       0.91      0.92      0.92       492

    accuracy                           0.98      5370
   macro avg       0.95      0.96      0.95      5370
weighted avg       0.98      0.98      0.98      5370

Precision after augmentation: 0.9100
Recall after augmentation: 0.9248
F1 Score after augmentation: 0.9173


# 100% of minority class

In [9]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter

data = pd.read_csv('HTRU_2.csv')
#data.head()


X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

unique, counts = np.unique(y, return_counts=True)
class_dist_before = dict(zip(unique, counts))
print(f"Class distribution before augmentation: {class_dist_before}")

#Separate minority class
X_minority = X[y == 1]

X_train_orig, X_test, y_train_orig, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

clf_orig = RandomForestClassifier(random_state=42)
clf_orig.fit(X_train_orig, y_train_orig)

y_pred_orig = clf_orig.predict(X_test)
recall_orig = recall_score(y_test, y_pred_orig)
f1_orig = f1_score(y_test, y_pred_orig)

print(f"Recall score (original data): {recall_orig:.4f}")
print(f"F1 score (original data): {f1_orig:.4f}")


majority_class_count = np.sum(y_train_orig == -1)
minority_class_count = np.sum(y_train_orig == 1)

# Set the target number of samples for the minority class
desired_minority_class_size = len(X_minority)+len(X_minority)

# Adjust sampling based on the desired size
if desired_minority_class_size < minority_class_count:
    # Reduce the size of the minority class
    minority_indices = np.where(y_train_orig == 1)[0]
    majority_indices = np.where(y_train_orig != 1)[0]

    # Randomly select the desired number of samples
    reduced_minority_indices = np.random.choice(
        minority_indices, 
        size=desired_minority_class_size, 
        replace=False
    )

    # Combine reduced minority with majority class
    combined_indices = np.concatenate([reduced_minority_indices, majority_indices])
    X_balanced = X[combined_indices]
    y_balanced = y[combined_indices]

    print("\nClass distribution after reduction (on training data):")
    print(Counter(y_balanced))

elif desired_minority_class_size > minority_class_count:
    # Use SMOTE to oversample if the desired size is greater
    smote_strategy = {1: desired_minority_class_size}
    smote = SMOTE(sampling_strategy=smote_strategy, random_state=42)
    X_balanced, y_balanced = smote.fit_resample(X, y)

    # Print class distribution after SMOTE
    print("\nClass distribution after augmentation :")
    print(Counter(y_balanced))

    # Print the number of samples generated
    num_generated_samples = len(X_balanced) - len(X)
    print(f"\nNumber of samples generated by SMOTE: {num_generated_samples}")

else:
    # If the desired size is equal to the current size, use the original data
    X_balanced, y_balanced= X, y
    print("\nNo augmentation needed. Class distribution remains unchanged.")

# Train a Random Forest Classifier after augmentation
X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_balanced, y_balanced, test_size=0.3, random_state=42, stratify=y_balanced)
clf_res = RandomForestClassifier(random_state=42)
clf_res.fit(X_train_bal, y_train_bal)
y_pred_bal = clf_res.predict(X_test)

# Print classification report, precision, recall, and F1 score after SMOTE (on test data)
print("\nClassification report after augmentation:")
print(classification_report(y_test, y_pred_bal))

precision_after = precision_score(y_test, y_pred_bal, pos_label=1)
recall_after = recall_score(y_test, y_pred_bal, pos_label=1)
f1_after = f1_score(y_test, y_pred_bal, pos_label=1)

print(f"Precision after augmentation: {precision_after:.4f}")
print(f"Recall after augmentation: {recall_after:.4f}")
print(f"F1 Score after augmentation: {f1_after:.4f}")


Class distribution before augmentation: {0: 16258, 1: 1639}
Recall score (original data): 0.8415
F1 score (original data): 0.8743

Class distribution after augmentation :
Counter({0: 16258, 1: 3278})

Number of samples generated by SMOTE: 1639

Classification report after augmentation:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4878
           1       0.89      0.94      0.92       492

    accuracy                           0.98      5370
   macro avg       0.94      0.96      0.95      5370
weighted avg       0.98      0.98      0.98      5370

Precision after augmentation: 0.8936
Recall after augmentation: 0.9390
F1 Score after augmentation: 0.9158


# 200% of minority class

In [10]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter

data = pd.read_csv('HTRU_2.csv')
#data.head()


X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

unique, counts = np.unique(y, return_counts=True)
class_dist_before = dict(zip(unique, counts))
print(f"Class distribution before augmentation: {class_dist_before}")

#Separate minority class
X_minority = X[y == 1]

X_train_orig, X_test, y_train_orig, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

clf_orig = RandomForestClassifier(random_state=42)
clf_orig.fit(X_train_orig, y_train_orig)

y_pred_orig = clf_orig.predict(X_test)
recall_orig = recall_score(y_test, y_pred_orig)
f1_orig = f1_score(y_test, y_pred_orig)

print(f"Recall score (original data): {recall_orig:.4f}")
print(f"F1 score (original data): {f1_orig:.4f}")


majority_class_count = np.sum(y_train_orig == -1)
minority_class_count = np.sum(y_train_orig == 1)

# Set the target number of samples for the minority class
desired_minority_class_size = len(X_minority)+(len(X_minority)*2)

# Adjust sampling based on the desired size
if desired_minority_class_size < minority_class_count:
    # Reduce the size of the minority class
    minority_indices = np.where(y_train_orig == 1)[0]
    majority_indices = np.where(y_train_orig != 1)[0]

    # Randomly select the desired number of samples
    reduced_minority_indices = np.random.choice(
        minority_indices, 
        size=desired_minority_class_size, 
        replace=False
    )

    # Combine reduced minority with majority class
    combined_indices = np.concatenate([reduced_minority_indices, majority_indices])
    X_balanced = X[combined_indices]
    y_balanced = y[combined_indices]

    print("\nClass distribution after reduction (on training data):")
    print(Counter(y_balanced))

elif desired_minority_class_size > minority_class_count:
    # Use SMOTE to oversample if the desired size is greater
    smote_strategy = {1: desired_minority_class_size}
    smote = SMOTE(sampling_strategy=smote_strategy, random_state=42)
    X_balanced, y_balanced = smote.fit_resample(X, y)

    # Print class distribution after SMOTE
    print("\nClass distribution after augmentation :")
    print(Counter(y_balanced))

    # Print the number of samples generated
    num_generated_samples = len(X_balanced) - len(X)
    print(f"\nNumber of samples generated by SMOTE: {num_generated_samples}")

else:
    # If the desired size is equal to the current size, use the original data
    X_balanced, y_balanced= X, y
    print("\nNo augmentation needed. Class distribution remains unchanged.")

# Train a Random Forest Classifier after augmentation
X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_balanced, y_balanced, test_size=0.3, random_state=42, stratify=y_balanced)
clf_res = RandomForestClassifier(random_state=42)
clf_res.fit(X_train_bal, y_train_bal)
y_pred_bal = clf_res.predict(X_test)

# Print classification report, precision, recall, and F1 score after SMOTE (on test data)
print("\nClassification report after augmentation:")
print(classification_report(y_test, y_pred_bal))

precision_after = precision_score(y_test, y_pred_bal, pos_label=1)
recall_after = recall_score(y_test, y_pred_bal, pos_label=1)
f1_after = f1_score(y_test, y_pred_bal, pos_label=1)

print(f"Precision after augmentation: {precision_after:.4f}")
print(f"Recall after augmentation: {recall_after:.4f}")
print(f"F1 Score after augmentation: {f1_after:.4f}")


Class distribution before augmentation: {0: 16258, 1: 1639}
Recall score (original data): 0.8415
F1 score (original data): 0.8743

Class distribution after augmentation :
Counter({0: 16258, 1: 4917})

Number of samples generated by SMOTE: 3278

Classification report after augmentation:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99      4878
           1       0.89      0.95      0.92       492

    accuracy                           0.98      5370
   macro avg       0.94      0.97      0.96      5370
weighted avg       0.99      0.98      0.98      5370

Precision after augmentation: 0.8866
Recall after augmentation: 0.9533
F1 Score after augmentation: 0.9187


# Credit Card Data

# 50% of minority class

In [11]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter

data = pd.read_csv('creditcard.csv')
#data.head()


X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

unique, counts = np.unique(y, return_counts=True)
class_dist_before = dict(zip(unique, counts))
print(f"Class distribution before augmentation: {class_dist_before}")

#Separate minority class
X_minority = X[y == 1]

X_train_orig, X_test, y_train_orig, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

clf_orig = RandomForestClassifier(random_state=42)
clf_orig.fit(X_train_orig, y_train_orig)

y_pred_orig = clf_orig.predict(X_test)
recall_orig = recall_score(y_test, y_pred_orig)
f1_orig = f1_score(y_test, y_pred_orig)

print(f"Recall score (original data): {recall_orig:.4f}")
print(f"F1 score (original data): {f1_orig:.4f}")


majority_class_count = np.sum(y_train_orig == -1)
minority_class_count = np.sum(y_train_orig == 1)

# Set the target number of samples for the minority class
desired_minority_class_size = len(X_minority)+(len(X_minority)//2)

# Adjust sampling based on the desired size
if desired_minority_class_size < minority_class_count:
    # Reduce the size of the minority class
    minority_indices = np.where(y_train_orig == 1)[0]
    majority_indices = np.where(y_train_orig != 1)[0]

    # Randomly select the desired number of samples
    reduced_minority_indices = np.random.choice(
        minority_indices, 
        size=desired_minority_class_size, 
        replace=False
    )

    # Combine reduced minority with majority class
    combined_indices = np.concatenate([reduced_minority_indices, majority_indices])
    X_balanced = X[combined_indices]
    y_balanced = y[combined_indices]

    print("\nClass distribution after reduction (on training data):")
    print(Counter(y_balanced))

elif desired_minority_class_size > minority_class_count:
    # Use SMOTE to oversample if the desired size is greater
    smote_strategy = {1: desired_minority_class_size}
    smote = SMOTE(sampling_strategy=smote_strategy, random_state=42)
    X_balanced, y_balanced = smote.fit_resample(X, y)

    # Print class distribution after SMOTE
    print("\nClass distribution after augmentation :")
    print(Counter(y_balanced))

    # Print the number of samples generated
    num_generated_samples = len(X_balanced) - len(X)
    print(f"\nNumber of samples generated by SMOTE: {num_generated_samples}")

else:
    # If the desired size is equal to the current size, use the original data
    X_balanced, y_balanced= X, y
    print("\nNo augmentation needed. Class distribution remains unchanged.")

# Train a Random Forest Classifier after augmentation
X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_balanced, y_balanced, test_size=0.3, random_state=42, stratify=y_balanced)
clf_res = RandomForestClassifier(random_state=42)
clf_res.fit(X_train_bal, y_train_bal)
y_pred_bal = clf_res.predict(X_test)

# Print classification report, precision, recall, and F1 score after SMOTE (on test data)
print("\nClassification report after augmentation:")
print(classification_report(y_test, y_pred_bal))

precision_after = precision_score(y_test, y_pred_bal, pos_label=1)
recall_after = recall_score(y_test, y_pred_bal, pos_label=1)
f1_after = f1_score(y_test, y_pred_bal, pos_label=1)

print(f"Precision after augmentation: {precision_after:.4f}")
print(f"Recall after augmentation: {recall_after:.4f}")
print(f"F1 Score after augmentation: {f1_after:.4f}")


Class distribution before augmentation: {0: 284315, 1: 492}
Recall score (original data): 0.7568
F1 score (original data): 0.8453

Class distribution after augmentation :
Counter({0: 284315, 1: 738})

Number of samples generated by SMOTE: 246

Classification report after augmentation:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85295
           1       0.96      0.86      0.91       148

    accuracy                           1.00     85443
   macro avg       0.98      0.93      0.95     85443
weighted avg       1.00      1.00      1.00     85443

Precision after augmentation: 0.9552
Recall after augmentation: 0.8649
F1 Score after augmentation: 0.9078


# 100% of minority class

In [12]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter

data = pd.read_csv('creditcard.csv')

X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

unique, counts = np.unique(y, return_counts=True)
class_dist_before = dict(zip(unique, counts))
print(f"Class distribution before augmentation: {class_dist_before}")

#Separate minority class
X_minority = X[y == 1]

X_train_orig, X_test, y_train_orig, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

clf_orig = RandomForestClassifier(random_state=42)
clf_orig.fit(X_train_orig, y_train_orig)

y_pred_orig = clf_orig.predict(X_test)
recall_orig = recall_score(y_test, y_pred_orig)
f1_orig = f1_score(y_test, y_pred_orig)

print(f"Recall score (original data): {recall_orig:.4f}")
print(f"F1 score (original data): {f1_orig:.4f}")


majority_class_count = np.sum(y_train_orig == -1)
minority_class_count = np.sum(y_train_orig == 1)

# Set the target number of samples for the minority class
desired_minority_class_size = len(X_minority)+(len(X_minority))

# Adjust sampling based on the desired size
if desired_minority_class_size < minority_class_count:
    # Reduce the size of the minority class
    minority_indices = np.where(y_train_orig == 1)[0]
    majority_indices = np.where(y_train_orig != 1)[0]

    # Randomly select the desired number of samples
    reduced_minority_indices = np.random.choice(
        minority_indices, 
        size=desired_minority_class_size, 
        replace=False
    )

    # Combine reduced minority with majority class
    combined_indices = np.concatenate([reduced_minority_indices, majority_indices])
    X_balanced = X[combined_indices]
    y_balanced = y[combined_indices]

    print("\nClass distribution after reduction (on training data):")
    print(Counter(y_balanced))

elif desired_minority_class_size > minority_class_count:
    # Use SMOTE to oversample if the desired size is greater
    smote_strategy = {1: desired_minority_class_size}
    smote = SMOTE(sampling_strategy=smote_strategy, random_state=42)
    X_balanced, y_balanced = smote.fit_resample(X, y)

    # Print class distribution after SMOTE
    print("\nClass distribution after augmentation :")
    print(Counter(y_balanced))

    # Print the number of samples generated
    num_generated_samples = len(X_balanced) - len(X)
    print(f"\nNumber of samples generated by SMOTE: {num_generated_samples}")

else:
    # If the desired size is equal to the current size, use the original data
    X_balanced, y_balanced= X, y
    print("\nNo augmentation needed. Class distribution remains unchanged.")

# Train a Random Forest Classifier after augmentation
X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_balanced, y_balanced, test_size=0.3, random_state=42, stratify=y_balanced)
clf_res = RandomForestClassifier(random_state=42)
clf_res.fit(X_train_bal, y_train_bal)
y_pred_bal = clf_res.predict(X_test)

# Print classification report, precision, recall, and F1 score after SMOTE (on test data)
print("\nClassification report after augmentation:")
print(classification_report(y_test, y_pred_bal))

precision_after = precision_score(y_test, y_pred_bal, pos_label=1)
recall_after = recall_score(y_test, y_pred_bal, pos_label=1)
f1_after = f1_score(y_test, y_pred_bal, pos_label=1)

print(f"Precision after augmentation: {precision_after:.4f}")
print(f"Recall after augmentation: {recall_after:.4f}")
print(f"F1 Score after augmentation: {f1_after:.4f}")


Class distribution before augmentation: {0: 284315, 1: 492}
Recall score (original data): 0.7568
F1 score (original data): 0.8453

Class distribution after augmentation :
Counter({0: 284315, 1: 984})

Number of samples generated by SMOTE: 492

Classification report after augmentation:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85295
           1       0.93      0.88      0.90       148

    accuracy                           1.00     85443
   macro avg       0.96      0.94      0.95     85443
weighted avg       1.00      1.00      1.00     85443

Precision after augmentation: 0.9286
Recall after augmentation: 0.8784
F1 Score after augmentation: 0.9028


# 200% of minority class

In [13]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter

data = pd.read_csv('creditcard.csv')

X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

unique, counts = np.unique(y, return_counts=True)
class_dist_before = dict(zip(unique, counts))
print(f"Class distribution before augmentation: {class_dist_before}")

#Separate minority class
X_minority = X[y == 1]

X_train_orig, X_test, y_train_orig, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

clf_orig = RandomForestClassifier(random_state=42)
clf_orig.fit(X_train_orig, y_train_orig)

y_pred_orig = clf_orig.predict(X_test)
recall_orig = recall_score(y_test, y_pred_orig)
f1_orig = f1_score(y_test, y_pred_orig)

print(f"Recall score (original data): {recall_orig:.4f}")
print(f"F1 score (original data): {f1_orig:.4f}")


majority_class_count = np.sum(y_train_orig == -1)
minority_class_count = np.sum(y_train_orig == 1)

# Set the target number of samples for the minority class
desired_minority_class_size = len(X_minority)+(len(X_minority)*2)

# Adjust sampling based on the desired size
if desired_minority_class_size < minority_class_count:
    # Reduce the size of the minority class
    minority_indices = np.where(y_train_orig == 1)[0]
    majority_indices = np.where(y_train_orig != 1)[0]

    # Randomly select the desired number of samples
    reduced_minority_indices = np.random.choice(
        minority_indices, 
        size=desired_minority_class_size, 
        replace=False
    )

    # Combine reduced minority with majority class
    combined_indices = np.concatenate([reduced_minority_indices, majority_indices])
    X_balanced = X[combined_indices]
    y_balanced = y[combined_indices]

    print("\nClass distribution after reduction (on training data):")
    print(Counter(y_balanced))

elif desired_minority_class_size > minority_class_count:
    # Use SMOTE to oversample if the desired size is greater
    smote_strategy = {1: desired_minority_class_size}
    smote = SMOTE(sampling_strategy=smote_strategy, random_state=42)
    X_balanced, y_balanced = smote.fit_resample(X, y)

    # Print class distribution after SMOTE
    print("\nClass distribution after augmentation :")
    print(Counter(y_balanced))

    # Print the number of samples generated
    num_generated_samples = len(X_balanced) - len(X)
    print(f"\nNumber of samples generated by SMOTE: {num_generated_samples}")

else:
    # If the desired size is equal to the current size, use the original data
    X_balanced, y_balanced= X, y
    print("\nNo augmentation needed. Class distribution remains unchanged.")

# Train a Random Forest Classifier after augmentation
X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_balanced, y_balanced, test_size=0.3, random_state=42, stratify=y_balanced)
clf_res = RandomForestClassifier(random_state=42)
clf_res.fit(X_train_bal, y_train_bal)
y_pred_bal = clf_res.predict(X_test)

# Print classification report, precision, recall, and F1 score after SMOTE (on test data)
print("\nClassification report after augmentation:")
print(classification_report(y_test, y_pred_bal))

precision_after = precision_score(y_test, y_pred_bal, pos_label=1)
recall_after = recall_score(y_test, y_pred_bal, pos_label=1)
f1_after = f1_score(y_test, y_pred_bal, pos_label=1)

print(f"Precision after augmentation: {precision_after:.4f}")
print(f"Recall after augmentation: {recall_after:.4f}")
print(f"F1 Score after augmentation: {f1_after:.4f}")


Class distribution before augmentation: {0: 284315, 1: 492}
Recall score (original data): 0.7568
F1 score (original data): 0.8453

Class distribution after augmentation :
Counter({0: 284315, 1: 1476})

Number of samples generated by SMOTE: 984

Classification report after augmentation:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85295
           1       0.91      0.93      0.92       148

    accuracy                           1.00     85443
   macro avg       0.96      0.96      0.96     85443
weighted avg       1.00      1.00      1.00     85443

Precision after augmentation: 0.9133
Recall after augmentation: 0.9257
F1 Score after augmentation: 0.9195


# Reduced Diabetes Data (Reduced minority: 200)

# 50% of minority class

In [14]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter

data = pd.read_csv("diabetes.csv")

# Separate the minority and majority classes
minority_class = data[data['Outcome'] == 1]
majority_class = data[data['Outcome'] == 0]

# Randomly sample 68 rows to drop from the minority class
minority_class_reduced = minority_class.sample(n=len(minority_class) - 68, random_state=42)

# Combine the reduced minority class with the majority class
balanced_data = pd.concat([majority_class, minority_class_reduced])

# Shuffle the dataset
balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Split into X (features) and y (target)
X = balanced_data.drop(columns=['Outcome'])  # Drop the target column to get features
y = balanced_data['Outcome']  # Target variable

# Print results
print("Class distribution after balancing:")
print(balanced_data['Outcome'].value_counts())
print("\nShape of X:", X.shape)
print("Shape of y:", y.shape)
unique, counts = np.unique(y, return_counts=True)
class_dist_before = dict(zip(unique, counts))
print(f"Class distribution before augmentation: {class_dist_before}")

#Separate minority class
X_minority = X[y == 1]

X_train_orig, X_test, y_train_orig, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

clf_orig = RandomForestClassifier(random_state=42)
clf_orig.fit(X_train_orig, y_train_orig)

y_pred_orig = clf_orig.predict(X_test)
recall_orig = recall_score(y_test, y_pred_orig)
f1_orig = f1_score(y_test, y_pred_orig)

print(f"Recall score (original data): {recall_orig:.4f}")
print(f"F1 score (original data): {f1_orig:.4f}")


majority_class_count = np.sum(y_train_orig == -1)
minority_class_count = np.sum(y_train_orig == 1)

# Set the target number of samples for the minority class
desired_minority_class_size = len(X_minority)+(len(X_minority)//2)

# Adjust sampling based on the desired size
if desired_minority_class_size < minority_class_count:
    # Reduce the size of the minority class
    minority_indices = np.where(y_train_orig == 1)[0]
    majority_indices = np.where(y_train_orig != 1)[0]

    # Randomly select the desired number of samples
    reduced_minority_indices = np.random.choice(
        minority_indices, 
        size=desired_minority_class_size, 
        replace=False
    )

    # Combine reduced minority with majority class
    combined_indices = np.concatenate([reduced_minority_indices, majority_indices])
    X_balanced = X[combined_indices]
    y_balanced = y[combined_indices]

    print("\nClass distribution after reduction (on training data):")
    print(Counter(y_balanced))

elif desired_minority_class_size > minority_class_count:
    # Use SMOTE to oversample if the desired size is greater
    smote_strategy = {1: desired_minority_class_size}
    smote = SMOTE(sampling_strategy=smote_strategy, random_state=42)
    X_balanced, y_balanced = smote.fit_resample(X, y)

    # Print class distribution after SMOTE
    print("\nClass distribution after augmentation :")
    print(Counter(y_balanced))

    # Print the number of samples generated
    num_generated_samples = len(X_balanced) - len(X)
    print(f"\nNumber of samples generated by SMOTE: {num_generated_samples}")

else:
    # If the desired size is equal to the current size, use the original data
    X_balanced, y_balanced= X, y
    print("\nNo augmentation needed. Class distribution remains unchanged.")

# Train a Random Forest Classifier after augmentation
X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_balanced, y_balanced, test_size=0.3, random_state=42, stratify=y_balanced)
clf_res = RandomForestClassifier(random_state=42)
clf_res.fit(X_train_bal, y_train_bal)
y_pred_bal = clf_res.predict(X_test)

# Print classification report, precision, recall, and F1 score after SMOTE (on test data)
print("\nClassification report after augmentation:")
print(classification_report(y_test, y_pred_bal))

precision_after = precision_score(y_test, y_pred_bal, pos_label=1)
recall_after = recall_score(y_test, y_pred_bal, pos_label=1)
f1_after = f1_score(y_test, y_pred_bal, pos_label=1)

print(f"Precision after augmentation: {precision_after:.4f}")
print(f"Recall after augmentation: {recall_after:.4f}")
print(f"F1 Score after augmentation: {f1_after:.4f}")


Class distribution after balancing:
Outcome
0    500
1    200
Name: count, dtype: int64

Shape of X: (700, 8)
Shape of y: (700,)
Class distribution before augmentation: {0: 500, 1: 200}
Recall score (original data): 0.5333
F1 score (original data): 0.5872

Class distribution after augmentation :
Counter({0: 500, 1: 300})

Number of samples generated by SMOTE: 100

Classification report after augmentation:
              precision    recall  f1-score   support

           0       0.92      0.87      0.89       150
           1       0.71      0.80      0.75        60

    accuracy                           0.85       210
   macro avg       0.81      0.83      0.82       210
weighted avg       0.86      0.85      0.85       210

Precision after augmentation: 0.7059
Recall after augmentation: 0.8000
F1 Score after augmentation: 0.7500


# 100% of minority class

In [15]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter

data = pd.read_csv("diabetes.csv")

# Separate the minority and majority classes
minority_class = data[data['Outcome'] == 1]
majority_class = data[data['Outcome'] == 0]

# Randomly sample 68 rows to drop from the minority class
minority_class_reduced = minority_class.sample(n=len(minority_class) - 68, random_state=42)

# Combine the reduced minority class with the majority class
balanced_data = pd.concat([majority_class, minority_class_reduced])

# Shuffle the dataset
balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Split into X (features) and y (target)
X = balanced_data.drop(columns=['Outcome'])  # Drop the target column to get features
y = balanced_data['Outcome']  # Target variable

# Print results
print("Class distribution after balancing:")
print(balanced_data['Outcome'].value_counts())
print("\nShape of X:", X.shape)
print("Shape of y:", y.shape)
unique, counts = np.unique(y, return_counts=True)
class_dist_before = dict(zip(unique, counts))
print(f"Class distribution before augmentation: {class_dist_before}")

#Separate minority class
X_minority = X[y == 1]

X_train_orig, X_test, y_train_orig, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

clf_orig = RandomForestClassifier(random_state=42)
clf_orig.fit(X_train_orig, y_train_orig)

y_pred_orig = clf_orig.predict(X_test)
recall_orig = recall_score(y_test, y_pred_orig)
f1_orig = f1_score(y_test, y_pred_orig)

print(f"Recall score (original data): {recall_orig:.4f}")
print(f"F1 score (original data): {f1_orig:.4f}")


majority_class_count = np.sum(y_train_orig == -1)
minority_class_count = np.sum(y_train_orig == 1)

# Set the target number of samples for the minority class
desired_minority_class_size = len(X_minority)+(len(X_minority))

# Adjust sampling based on the desired size
if desired_minority_class_size < minority_class_count:
    # Reduce the size of the minority class
    minority_indices = np.where(y_train_orig == 1)[0]
    majority_indices = np.where(y_train_orig != 1)[0]

    # Randomly select the desired number of samples
    reduced_minority_indices = np.random.choice(
        minority_indices, 
        size=desired_minority_class_size, 
        replace=False
    )

    # Combine reduced minority with majority class
    combined_indices = np.concatenate([reduced_minority_indices, majority_indices])
    X_balanced = X[combined_indices]
    y_balanced = y[combined_indices]

    print("\nClass distribution after reduction (on training data):")
    print(Counter(y_balanced))

elif desired_minority_class_size > minority_class_count:
    # Use SMOTE to oversample if the desired size is greater
    smote_strategy = {1: desired_minority_class_size}
    smote = SMOTE(sampling_strategy=smote_strategy, random_state=42)
    X_balanced, y_balanced = smote.fit_resample(X, y)

    # Print class distribution after SMOTE
    print("\nClass distribution after augmentation :")
    print(Counter(y_balanced))

    # Print the number of samples generated
    num_generated_samples = len(X_balanced) - len(X)
    print(f"\nNumber of samples generated by SMOTE: {num_generated_samples}")

else:
    # If the desired size is equal to the current size, use the original data
    X_balanced, y_balanced= X, y
    print("\nNo augmentation needed. Class distribution remains unchanged.")

# Train a Random Forest Classifier after augmentation
X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_balanced, y_balanced, test_size=0.3, random_state=42, stratify=y_balanced)
clf_res = RandomForestClassifier(random_state=42)
clf_res.fit(X_train_bal, y_train_bal)
y_pred_bal = clf_res.predict(X_test)

# Print classification report, precision, recall, and F1 score after SMOTE (on test data)
print("\nClassification report after augmentation:")
print(classification_report(y_test, y_pred_bal))

precision_after = precision_score(y_test, y_pred_bal, pos_label=1)
recall_after = recall_score(y_test, y_pred_bal, pos_label=1)
f1_after = f1_score(y_test, y_pred_bal, pos_label=1)

print(f"Precision after augmentation: {precision_after:.4f}")
print(f"Recall after augmentation: {recall_after:.4f}")
print(f"F1 Score after augmentation: {f1_after:.4f}")


Class distribution after balancing:
Outcome
0    500
1    200
Name: count, dtype: int64

Shape of X: (700, 8)
Shape of y: (700,)
Class distribution before augmentation: {0: 500, 1: 200}
Recall score (original data): 0.5333
F1 score (original data): 0.5872

Class distribution after augmentation :
Counter({0: 500, 1: 400})

Number of samples generated by SMOTE: 200

Classification report after augmentation:
              precision    recall  f1-score   support

           0       0.94      0.83      0.88       150
           1       0.68      0.87      0.76        60

    accuracy                           0.84       210
   macro avg       0.81      0.85      0.82       210
weighted avg       0.86      0.84      0.85       210

Precision after augmentation: 0.6753
Recall after augmentation: 0.8667
F1 Score after augmentation: 0.7591


# 200% of minority class

In [16]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter

data = pd.read_csv("diabetes.csv")

# Separate the minority and majority classes
minority_class = data[data['Outcome'] == 1]
majority_class = data[data['Outcome'] == 0]

# Randomly sample 68 rows to drop from the minority class
minority_class_reduced = minority_class.sample(n=len(minority_class) - 68, random_state=42)

# Combine the reduced minority class with the majority class
balanced_data = pd.concat([majority_class, minority_class_reduced])

# Shuffle the dataset
balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Split into X (features) and y (target)
X = balanced_data.drop(columns=['Outcome'])  # Drop the target column to get features
y = balanced_data['Outcome']  # Target variable

# Print results
print("Class distribution after balancing:")
print(balanced_data['Outcome'].value_counts())
print("\nShape of X:", X.shape)
print("Shape of y:", y.shape)
unique, counts = np.unique(y, return_counts=True)
class_dist_before = dict(zip(unique, counts))
print(f"Class distribution before augmentation: {class_dist_before}")

#Separate minority class
X_minority = X[y == 1]

X_train_orig, X_test, y_train_orig, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

clf_orig = RandomForestClassifier(random_state=42)
clf_orig.fit(X_train_orig, y_train_orig)

y_pred_orig = clf_orig.predict(X_test)
recall_orig = recall_score(y_test, y_pred_orig)
f1_orig = f1_score(y_test, y_pred_orig)

print(f"Recall score (original data): {recall_orig:.4f}")
print(f"F1 score (original data): {f1_orig:.4f}")


majority_class_count = np.sum(y_train_orig == -1)
minority_class_count = np.sum(y_train_orig == 1)

# Set the target number of samples for the minority class
desired_minority_class_size = len(X_minority)+(len(X_minority)*2)

# Adjust sampling based on the desired size
if desired_minority_class_size < minority_class_count:
    # Reduce the size of the minority class
    minority_indices = np.where(y_train_orig == 1)[0]
    majority_indices = np.where(y_train_orig != 1)[0]

    # Randomly select the desired number of samples
    reduced_minority_indices = np.random.choice(
        minority_indices, 
        size=desired_minority_class_size, 
        replace=False
    )

    # Combine reduced minority with majority class
    combined_indices = np.concatenate([reduced_minority_indices, majority_indices])
    X_balanced = X[combined_indices]
    y_balanced = y[combined_indices]

    print("\nClass distribution after reduction (on training data):")
    print(Counter(y_balanced))

elif desired_minority_class_size > minority_class_count:
    # Use SMOTE to oversample if the desired size is greater
    smote_strategy = {1: desired_minority_class_size}
    smote = SMOTE(sampling_strategy=smote_strategy, random_state=42)
    X_balanced, y_balanced = smote.fit_resample(X, y)

    # Print class distribution after SMOTE
    print("\nClass distribution after augmentation :")
    print(Counter(y_balanced))

    # Print the number of samples generated
    num_generated_samples = len(X_balanced) - len(X)
    print(f"\nNumber of samples generated by SMOTE: {num_generated_samples}")

else:
    # If the desired size is equal to the current size, use the original data
    X_balanced, y_balanced= X, y
    print("\nNo augmentation needed. Class distribution remains unchanged.")

# Train a Random Forest Classifier after augmentation
X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_balanced, y_balanced, test_size=0.3, random_state=42, stratify=y_balanced)
clf_res = RandomForestClassifier(random_state=42)
clf_res.fit(X_train_bal, y_train_bal)
y_pred_bal = clf_res.predict(X_test)

# Print classification report, precision, recall, and F1 score after SMOTE (on test data)
print("\nClassification report after augmentation:")
print(classification_report(y_test, y_pred_bal))

precision_after = precision_score(y_test, y_pred_bal, pos_label=1)
recall_after = recall_score(y_test, y_pred_bal, pos_label=1)
f1_after = f1_score(y_test, y_pred_bal, pos_label=1)

print(f"Precision after augmentation: {precision_after:.4f}")
print(f"Recall after augmentation: {recall_after:.4f}")
print(f"F1 Score after augmentation: {f1_after:.4f}")


Class distribution after balancing:
Outcome
0    500
1    200
Name: count, dtype: int64

Shape of X: (700, 8)
Shape of y: (700,)
Class distribution before augmentation: {0: 500, 1: 200}
Recall score (original data): 0.5333
F1 score (original data): 0.5872

Class distribution after augmentation :
Counter({1: 600, 0: 500})

Number of samples generated by SMOTE: 400

Classification report after augmentation:
              precision    recall  f1-score   support

           0       0.97      0.78      0.87       150
           1       0.63      0.95      0.76        60

    accuracy                           0.83       210
   macro avg       0.80      0.86      0.81       210
weighted avg       0.88      0.83      0.84       210

Precision after augmentation: 0.6333
Recall after augmentation: 0.9500
F1 Score after augmentation: 0.7600


# Oil Data

# 50% of minority class

In [1]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter

data = pd.read_csv('oil.csv')



X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

unique, counts = np.unique(y, return_counts=True)
class_dist_before = dict(zip(unique, counts))
print(f"Class distribution before augmentation: {class_dist_before}")

#Separate minority class
X_minority = X[y == 1]

X_train_orig, X_test, y_train_orig, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

clf_orig = RandomForestClassifier(random_state=42)
clf_orig.fit(X_train_orig, y_train_orig)

y_pred_orig = clf_orig.predict(X_test)
recall_orig = recall_score(y_test, y_pred_orig)
f1_orig = f1_score(y_test, y_pred_orig)

print(f"Recall score (original data): {recall_orig:.4f}")
print(f"F1 score (original data): {f1_orig:.4f}")


majority_class_count = np.sum(y_train_orig == -1)
minority_class_count = np.sum(y_train_orig == 1)

# Set the target number of samples for the minority class
desired_minority_class_size = len(X_minority)+(len(X_minority)//2)

# Adjust sampling based on the desired size
if desired_minority_class_size < minority_class_count:
    # Reduce the size of the minority class
    minority_indices = np.where(y_train_orig == 1)[0]
    majority_indices = np.where(y_train_orig != 1)[0]

    # Randomly select the desired number of samples
    reduced_minority_indices = np.random.choice(
        minority_indices, 
        size=desired_minority_class_size, 
        replace=False
    )

    # Combine reduced minority with majority class
    combined_indices = np.concatenate([reduced_minority_indices, majority_indices])
    X_balanced = X[combined_indices]
    y_balanced = y[combined_indices]

    print("\nClass distribution after reduction (on training data):")
    print(Counter(y_balanced))

elif desired_minority_class_size > minority_class_count:
    # Use SMOTE to oversample if the desired size is greater
    smote_strategy = {1: desired_minority_class_size}
    smote = SMOTE(sampling_strategy=smote_strategy, random_state=42)
    X_balanced, y_balanced = smote.fit_resample(X, y)

    # Print class distribution after SMOTE
    print("\nClass distribution after augmentation :")
    print(Counter(y_balanced))

    # Print the number of samples generated
    num_generated_samples = len(X_balanced) - len(X)
    print(f"\nNumber of samples generated by SMOTE: {num_generated_samples}")

else:
    # If the desired size is equal to the current size, use the original data
    X_balanced, y_balanced= X, y
    print("\nNo augmentation needed. Class distribution remains unchanged.")

# Train a Random Forest Classifier after augmentation
X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_balanced, y_balanced, test_size=0.3, random_state=42, stratify=y_balanced)
clf_res = RandomForestClassifier(random_state=42)
clf_res.fit(X_train_bal, y_train_bal)
y_pred_bal = clf_res.predict(X_test)

# Print classification report, precision, recall, and F1 score after SMOTE (on test data)
print("\nClassification report after augmentation:")
print(classification_report(y_test, y_pred_bal))

precision_after = precision_score(y_test, y_pred_bal, pos_label=1)
recall_after = recall_score(y_test, y_pred_bal, pos_label=1)
f1_after = f1_score(y_test, y_pred_bal, pos_label=1)

print(f"Precision after augmentation: {precision_after:.4f}")
print(f"Recall after augmentation: {recall_after:.4f}")
print(f"F1 Score after augmentation: {f1_after:.4f}")


Class distribution before augmentation: {-1: 896, 1: 41}
Recall score (original data): 0.3333
F1 score (original data): 0.4706

Class distribution after augmentation :
Counter({-1: 896, 1: 61})

Number of samples generated by SMOTE: 20

Classification report after augmentation:
              precision    recall  f1-score   support

          -1       0.98      0.99      0.99       270
           1       0.75      0.50      0.60        12

    accuracy                           0.97       282
   macro avg       0.86      0.75      0.79       282
weighted avg       0.97      0.97      0.97       282

Precision after augmentation: 0.7500
Recall after augmentation: 0.5000
F1 Score after augmentation: 0.6000


# 100% of minority class

In [2]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter

data = pd.read_csv('oil.csv')



X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

unique, counts = np.unique(y, return_counts=True)
class_dist_before = dict(zip(unique, counts))
print(f"Class distribution before augmentation: {class_dist_before}")

#Separate minority class
X_minority = X[y == 1]

X_train_orig, X_test, y_train_orig, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

clf_orig = RandomForestClassifier(random_state=42)
clf_orig.fit(X_train_orig, y_train_orig)

y_pred_orig = clf_orig.predict(X_test)
recall_orig = recall_score(y_test, y_pred_orig)
f1_orig = f1_score(y_test, y_pred_orig)

print(f"Recall score (original data): {recall_orig:.4f}")
print(f"F1 score (original data): {f1_orig:.4f}")


majority_class_count = np.sum(y_train_orig == -1)
minority_class_count = np.sum(y_train_orig == 1)

# Set the target number of samples for the minority class
desired_minority_class_size = len(X_minority)+(len(X_minority))

# Adjust sampling based on the desired size
if desired_minority_class_size < minority_class_count:
    # Reduce the size of the minority class
    minority_indices = np.where(y_train_orig == 1)[0]
    majority_indices = np.where(y_train_orig != 1)[0]

    # Randomly select the desired number of samples
    reduced_minority_indices = np.random.choice(
        minority_indices, 
        size=desired_minority_class_size, 
        replace=False
    )

    # Combine reduced minority with majority class
    combined_indices = np.concatenate([reduced_minority_indices, majority_indices])
    X_balanced = X[combined_indices]
    y_balanced = y[combined_indices]

    print("\nClass distribution after reduction (on training data):")
    print(Counter(y_balanced))

elif desired_minority_class_size > minority_class_count:
    # Use SMOTE to oversample if the desired size is greater
    smote_strategy = {1: desired_minority_class_size}
    smote = SMOTE(sampling_strategy=smote_strategy, random_state=42)
    X_balanced, y_balanced = smote.fit_resample(X, y)

    # Print class distribution after SMOTE
    print("\nClass distribution after augmentation :")
    print(Counter(y_balanced))

    # Print the number of samples generated
    num_generated_samples = len(X_balanced) - len(X)
    print(f"\nNumber of samples generated by SMOTE: {num_generated_samples}")

else:
    # If the desired size is equal to the current size, use the original data
    X_balanced, y_balanced= X, y
    print("\nNo augmentation needed. Class distribution remains unchanged.")

# Train a Random Forest Classifier after augmentation
X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_balanced, y_balanced, test_size=0.3, random_state=42, stratify=y_balanced)
clf_res = RandomForestClassifier(random_state=42)
clf_res.fit(X_train_bal, y_train_bal)
y_pred_bal = clf_res.predict(X_test)

# Print classification report, precision, recall, and F1 score after SMOTE (on test data)
print("\nClassification report after augmentation:")
print(classification_report(y_test, y_pred_bal))

precision_after = precision_score(y_test, y_pred_bal, pos_label=1)
recall_after = recall_score(y_test, y_pred_bal, pos_label=1)
f1_after = f1_score(y_test, y_pred_bal, pos_label=1)

print(f"Precision after augmentation: {precision_after:.4f}")
print(f"Recall after augmentation: {recall_after:.4f}")
print(f"F1 Score after augmentation: {f1_after:.4f}")


Class distribution before augmentation: {-1: 896, 1: 41}
Recall score (original data): 0.3333
F1 score (original data): 0.4706

Class distribution after augmentation :
Counter({-1: 896, 1: 82})

Number of samples generated by SMOTE: 41

Classification report after augmentation:
              precision    recall  f1-score   support

          -1       0.98      0.99      0.98       270
           1       0.64      0.58      0.61        12

    accuracy                           0.97       282
   macro avg       0.81      0.78      0.80       282
weighted avg       0.97      0.97      0.97       282

Precision after augmentation: 0.6364
Recall after augmentation: 0.5833
F1 Score after augmentation: 0.6087


# 200% of minority class

In [3]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter

data = pd.read_csv('oil.csv')



X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

unique, counts = np.unique(y, return_counts=True)
class_dist_before = dict(zip(unique, counts))
print(f"Class distribution before augmentation: {class_dist_before}")

#Separate minority class
X_minority = X[y == 1]

X_train_orig, X_test, y_train_orig, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

clf_orig = RandomForestClassifier(random_state=42)
clf_orig.fit(X_train_orig, y_train_orig)

y_pred_orig = clf_orig.predict(X_test)
recall_orig = recall_score(y_test, y_pred_orig)
f1_orig = f1_score(y_test, y_pred_orig)

print(f"Recall score (original data): {recall_orig:.4f}")
print(f"F1 score (original data): {f1_orig:.4f}")


majority_class_count = np.sum(y_train_orig == -1)
minority_class_count = np.sum(y_train_orig == 1)

# Set the target number of samples for the minority class
desired_minority_class_size = len(X_minority)+(len(X_minority)*2)

# Adjust sampling based on the desired size
if desired_minority_class_size < minority_class_count:
    # Reduce the size of the minority class
    minority_indices = np.where(y_train_orig == 1)[0]
    majority_indices = np.where(y_train_orig != 1)[0]

    # Randomly select the desired number of samples
    reduced_minority_indices = np.random.choice(
        minority_indices, 
        size=desired_minority_class_size, 
        replace=False
    )

    # Combine reduced minority with majority class
    combined_indices = np.concatenate([reduced_minority_indices, majority_indices])
    X_balanced = X[combined_indices]
    y_balanced = y[combined_indices]

    print("\nClass distribution after reduction (on training data):")
    print(Counter(y_balanced))

elif desired_minority_class_size > minority_class_count:
    # Use SMOTE to oversample if the desired size is greater
    smote_strategy = {1: desired_minority_class_size}
    smote = SMOTE(sampling_strategy=smote_strategy, random_state=42)
    X_balanced, y_balanced = smote.fit_resample(X, y)

    # Print class distribution after SMOTE
    print("\nClass distribution after augmentation :")
    print(Counter(y_balanced))

    # Print the number of samples generated
    num_generated_samples = len(X_balanced) - len(X)
    print(f"\nNumber of samples generated by SMOTE: {num_generated_samples}")

else:
    # If the desired size is equal to the current size, use the original data
    X_balanced, y_balanced= X, y
    print("\nNo augmentation needed. Class distribution remains unchanged.")

# Train a Random Forest Classifier after augmentation
X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_balanced, y_balanced, test_size=0.3, random_state=42, stratify=y_balanced)
clf_res = RandomForestClassifier(random_state=42)
clf_res.fit(X_train_bal, y_train_bal)
y_pred_bal = clf_res.predict(X_test)

# Print classification report, precision, recall, and F1 score after SMOTE (on test data)
print("\nClassification report after augmentation:")
print(classification_report(y_test, y_pred_bal))

precision_after = precision_score(y_test, y_pred_bal, pos_label=1)
recall_after = recall_score(y_test, y_pred_bal, pos_label=1)
f1_after = f1_score(y_test, y_pred_bal, pos_label=1)

print(f"Precision after augmentation: {precision_after:.4f}")
print(f"Recall after augmentation: {recall_after:.4f}")
print(f"F1 Score after augmentation: {f1_after:.4f}")


Class distribution before augmentation: {-1: 896, 1: 41}
Recall score (original data): 0.3333
F1 score (original data): 0.4706

Class distribution after augmentation :
Counter({-1: 896, 1: 123})

Number of samples generated by SMOTE: 82

Classification report after augmentation:
              precision    recall  f1-score   support

          -1       0.99      0.99      0.99       270
           1       0.73      0.67      0.70        12

    accuracy                           0.98       282
   macro avg       0.86      0.83      0.84       282
weighted avg       0.97      0.98      0.97       282

Precision after augmentation: 0.7273
Recall after augmentation: 0.6667
F1 Score after augmentation: 0.6957


# Reduced Spambase (Reduced minority: 1000)

# 50% of minority class

In [4]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter

data = pd.read_csv('Spambase.csv')

# Separate the minority and majority classes
minority_class = data[data.iloc[:, -1] == 1]  # Rows where the last column (target) is 1
majority_class = data[data.iloc[:, -1] == 0]  # Rows where the last column (target) is 0

# Randomly select 813 samples to drop from the minority class
drop_indices = np.random.choice(minority_class.index, 813, replace=False)
minority_class = minority_class.drop(drop_indices)

# Combine the modified minority class and the majority class
balanced_data = pd.concat([majority_class, minority_class])

# Shuffle the dataset to ensure randomness
balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Split into X and y
X = balanced_data.iloc[:, :-1]  
y = balanced_data.iloc[:, -1]   

unique, counts = np.unique(y, return_counts=True)
class_dist_before = dict(zip(unique, counts))
print(f"Class distribution before augmentation: {class_dist_before}")

#Separate minority class
X_minority = X[y == 1]

X_train_orig, X_test, y_train_orig, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

clf_orig = RandomForestClassifier(random_state=42)
clf_orig.fit(X_train_orig, y_train_orig)

y_pred_orig = clf_orig.predict(X_test)
recall_orig = recall_score(y_test, y_pred_orig)
f1_orig = f1_score(y_test, y_pred_orig)

print(f"Recall score (original data): {recall_orig:.4f}")
print(f"F1 score (original data): {f1_orig:.4f}")


majority_class_count = np.sum(y_train_orig == -1)
minority_class_count = np.sum(y_train_orig == 1)

# Set the target number of samples for the minority class
desired_minority_class_size = len(X_minority)+(len(X_minority)//2)

# Adjust sampling based on the desired size
if desired_minority_class_size < minority_class_count:
    # Reduce the size of the minority class
    minority_indices = np.where(y_train_orig == 1)[0]
    majority_indices = np.where(y_train_orig != 1)[0]

    # Randomly select the desired number of samples
    reduced_minority_indices = np.random.choice(
        minority_indices, 
        size=desired_minority_class_size, 
        replace=False
    )

    # Combine reduced minority with majority class
    combined_indices = np.concatenate([reduced_minority_indices, majority_indices])
    X_balanced = X[combined_indices]
    y_balanced = y[combined_indices]

    print("\nClass distribution after reduction (on training data):")
    print(Counter(y_balanced))

elif desired_minority_class_size > minority_class_count:
    # Use SMOTE to oversample if the desired size is greater
    smote_strategy = {1: desired_minority_class_size}
    smote = SMOTE(sampling_strategy=smote_strategy, random_state=42)
    X_balanced, y_balanced = smote.fit_resample(X, y)

    # Print class distribution after SMOTE
    print("\nClass distribution after augmentation :")
    print(Counter(y_balanced))

    # Print the number of samples generated
    num_generated_samples = len(X_balanced) - len(X)
    print(f"\nNumber of samples generated by SMOTE: {num_generated_samples}")

else:
    # If the desired size is equal to the current size, use the original data
    X_balanced, y_balanced= X, y
    print("\nNo augmentation needed. Class distribution remains unchanged.")

# Train a Random Forest Classifier after augmentation
X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_balanced, y_balanced, test_size=0.3, random_state=42, stratify=y_balanced)
clf_res = RandomForestClassifier(random_state=42)
clf_res.fit(X_train_bal, y_train_bal)
y_pred_bal = clf_res.predict(X_test)

# Print classification report, precision, recall, and F1 score after SMOTE (on test data)
print("\nClassification report after augmentation:")
print(classification_report(y_test, y_pred_bal))

precision_after = precision_score(y_test, y_pred_bal, pos_label=1)
recall_after = recall_score(y_test, y_pred_bal, pos_label=1)
f1_after = f1_score(y_test, y_pred_bal, pos_label=1)

print(f"Precision after augmentation: {precision_after:.4f}")
print(f"Recall after augmentation: {recall_after:.4f}")
print(f"F1 Score after augmentation: {f1_after:.4f}")


Class distribution before augmentation: {0: 2788, 1: 1000}
Recall score (original data): 0.8600
F1 score (original data): 0.8897

Class distribution after augmentation :
Counter({0: 2788, 1: 1500})

Number of samples generated by SMOTE: 500

Classification report after augmentation:
              precision    recall  f1-score   support

           0       0.97      0.97      0.97       837
           1       0.91      0.92      0.92       300

    accuracy                           0.96      1137
   macro avg       0.94      0.94      0.94      1137
weighted avg       0.96      0.96      0.96      1137

Precision after augmentation: 0.9109
Recall after augmentation: 0.9200
F1 Score after augmentation: 0.9154


# 100% of minority class

In [5]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter

data = pd.read_csv('Spambase.csv')

# Separate the minority and majority classes
minority_class = data[data.iloc[:, -1] == 1]  # Rows where the last column (target) is 1
majority_class = data[data.iloc[:, -1] == 0]  # Rows where the last column (target) is 0

# Randomly select 813 samples to drop from the minority class
drop_indices = np.random.choice(minority_class.index, 813, replace=False)
minority_class = minority_class.drop(drop_indices)

# Combine the modified minority class and the majority class
balanced_data = pd.concat([majority_class, minority_class])

# Shuffle the dataset to ensure randomness
balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Split into X and y
X = balanced_data.iloc[:, :-1]  
y = balanced_data.iloc[:, -1]   

unique, counts = np.unique(y, return_counts=True)
class_dist_before = dict(zip(unique, counts))
print(f"Class distribution before augmentation: {class_dist_before}")

#Separate minority class
X_minority = X[y == 1]

X_train_orig, X_test, y_train_orig, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

clf_orig = RandomForestClassifier(random_state=42)
clf_orig.fit(X_train_orig, y_train_orig)

y_pred_orig = clf_orig.predict(X_test)
recall_orig = recall_score(y_test, y_pred_orig)
f1_orig = f1_score(y_test, y_pred_orig)

print(f"Recall score (original data): {recall_orig:.4f}")
print(f"F1 score (original data): {f1_orig:.4f}")


majority_class_count = np.sum(y_train_orig == -1)
minority_class_count = np.sum(y_train_orig == 1)

# Set the target number of samples for the minority class
desired_minority_class_size = len(X_minority)+(len(X_minority))

# Adjust sampling based on the desired size
if desired_minority_class_size < minority_class_count:
    # Reduce the size of the minority class
    minority_indices = np.where(y_train_orig == 1)[0]
    majority_indices = np.where(y_train_orig != 1)[0]

    # Randomly select the desired number of samples
    reduced_minority_indices = np.random.choice(
        minority_indices, 
        size=desired_minority_class_size, 
        replace=False
    )

    # Combine reduced minority with majority class
    combined_indices = np.concatenate([reduced_minority_indices, majority_indices])
    X_balanced = X[combined_indices]
    y_balanced = y[combined_indices]

    print("\nClass distribution after reduction (on training data):")
    print(Counter(y_balanced))

elif desired_minority_class_size > minority_class_count:
    # Use SMOTE to oversample if the desired size is greater
    smote_strategy = {1: desired_minority_class_size}
    smote = SMOTE(sampling_strategy=smote_strategy, random_state=42)
    X_balanced, y_balanced = smote.fit_resample(X, y)

    # Print class distribution after SMOTE
    print("\nClass distribution after augmentation :")
    print(Counter(y_balanced))

    # Print the number of samples generated
    num_generated_samples = len(X_balanced) - len(X)
    print(f"\nNumber of samples generated by SMOTE: {num_generated_samples}")

else:
    # If the desired size is equal to the current size, use the original data
    X_balanced, y_balanced= X, y
    print("\nNo augmentation needed. Class distribution remains unchanged.")

# Train a Random Forest Classifier after augmentation
X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_balanced, y_balanced, test_size=0.3, random_state=42, stratify=y_balanced)
clf_res = RandomForestClassifier(random_state=42)
clf_res.fit(X_train_bal, y_train_bal)
y_pred_bal = clf_res.predict(X_test)

# Print classification report, precision, recall, and F1 score after SMOTE (on test data)
print("\nClassification report after augmentation:")
print(classification_report(y_test, y_pred_bal))

precision_after = precision_score(y_test, y_pred_bal, pos_label=1)
recall_after = recall_score(y_test, y_pred_bal, pos_label=1)
f1_after = f1_score(y_test, y_pred_bal, pos_label=1)

print(f"Precision after augmentation: {precision_after:.4f}")
print(f"Recall after augmentation: {recall_after:.4f}")
print(f"F1 Score after augmentation: {f1_after:.4f}")


Class distribution before augmentation: {0: 2788, 1: 1000}
Recall score (original data): 0.9067
F1 score (original data): 0.9189

Class distribution after augmentation :
Counter({0: 2788, 1: 2000})

Number of samples generated by SMOTE: 1000

Classification report after augmentation:
              precision    recall  f1-score   support

           0       0.99      0.97      0.98       837
           1       0.93      0.96      0.95       300

    accuracy                           0.97      1137
   macro avg       0.96      0.97      0.96      1137
weighted avg       0.97      0.97      0.97      1137

Precision after augmentation: 0.9323
Recall after augmentation: 0.9633
F1 Score after augmentation: 0.9475


# 200% of minority class

In [6]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter

data = pd.read_csv('Spambase.csv')

# Separate the minority and majority classes
minority_class = data[data.iloc[:, -1] == 1]  # Rows where the last column (target) is 1
majority_class = data[data.iloc[:, -1] == 0]  # Rows where the last column (target) is 0

# Randomly select 813 samples to drop from the minority class
drop_indices = np.random.choice(minority_class.index, 813, replace=False)
minority_class = minority_class.drop(drop_indices)

# Combine the modified minority class and the majority class
balanced_data = pd.concat([majority_class, minority_class])

# Shuffle the dataset to ensure randomness
balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Split into X and y
X = balanced_data.iloc[:, :-1]  
y = balanced_data.iloc[:, -1]   

unique, counts = np.unique(y, return_counts=True)
class_dist_before = dict(zip(unique, counts))
print(f"Class distribution before augmentation: {class_dist_before}")

#Separate minority class
X_minority = X[y == 1]

X_train_orig, X_test, y_train_orig, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

clf_orig = RandomForestClassifier(random_state=42)
clf_orig.fit(X_train_orig, y_train_orig)

y_pred_orig = clf_orig.predict(X_test)
recall_orig = recall_score(y_test, y_pred_orig)
f1_orig = f1_score(y_test, y_pred_orig)

print(f"Recall score (original data): {recall_orig:.4f}")
print(f"F1 score (original data): {f1_orig:.4f}")


majority_class_count = np.sum(y_train_orig == -1)
minority_class_count = np.sum(y_train_orig == 1)

# Set the target number of samples for the minority class
desired_minority_class_size = len(X_minority)+(len(X_minority)*2)

# Adjust sampling based on the desired size
if desired_minority_class_size < minority_class_count:
    # Reduce the size of the minority class
    minority_indices = np.where(y_train_orig == 1)[0]
    majority_indices = np.where(y_train_orig != 1)[0]

    # Randomly select the desired number of samples
    reduced_minority_indices = np.random.choice(
        minority_indices, 
        size=desired_minority_class_size, 
        replace=False
    )

    # Combine reduced minority with majority class
    combined_indices = np.concatenate([reduced_minority_indices, majority_indices])
    X_balanced = X[combined_indices]
    y_balanced = y[combined_indices]

    print("\nClass distribution after reduction (on training data):")
    print(Counter(y_balanced))

elif desired_minority_class_size > minority_class_count:
    # Use SMOTE to oversample if the desired size is greater
    smote_strategy = {1: desired_minority_class_size}
    smote = SMOTE(sampling_strategy=smote_strategy, random_state=42)
    X_balanced, y_balanced = smote.fit_resample(X, y)

    # Print class distribution after SMOTE
    print("\nClass distribution after augmentation :")
    print(Counter(y_balanced))

    # Print the number of samples generated
    num_generated_samples = len(X_balanced) - len(X)
    print(f"\nNumber of samples generated by SMOTE: {num_generated_samples}")

else:
    # If the desired size is equal to the current size, use the original data
    X_balanced, y_balanced= X, y
    print("\nNo augmentation needed. Class distribution remains unchanged.")

# Train a Random Forest Classifier after augmentation
X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_balanced, y_balanced, test_size=0.3, random_state=42, stratify=y_balanced)
clf_res = RandomForestClassifier(random_state=42)
clf_res.fit(X_train_bal, y_train_bal)
y_pred_bal = clf_res.predict(X_test)

# Print classification report, precision, recall, and F1 score after SMOTE (on test data)
print("\nClassification report after augmentation:")
print(classification_report(y_test, y_pred_bal))

precision_after = precision_score(y_test, y_pred_bal, pos_label=1)
recall_after = recall_score(y_test, y_pred_bal, pos_label=1)
f1_after = f1_score(y_test, y_pred_bal, pos_label=1)

print(f"Precision after augmentation: {precision_after:.4f}")
print(f"Recall after augmentation: {recall_after:.4f}")
print(f"F1 Score after augmentation: {f1_after:.4f}")


Class distribution before augmentation: {0: 2788, 1: 1000}
Recall score (original data): 0.9267
F1 score (original data): 0.9298

Class distribution after augmentation :
Counter({1: 3000, 0: 2788})

Number of samples generated by SMOTE: 2000

Classification report after augmentation:
              precision    recall  f1-score   support

           0       0.99      0.97      0.98       837
           1       0.92      0.98      0.95       300

    accuracy                           0.97      1137
   macro avg       0.96      0.98      0.97      1137
weighted avg       0.97      0.97      0.97      1137

Precision after augmentation: 0.9245
Recall after augmentation: 0.9800
F1 Score after augmentation: 0.9515


# Spambase Data

# 50% of minority class

In [7]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter

data = pd.read_csv('Spambase.csv')



X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

unique, counts = np.unique(y, return_counts=True)
class_dist_before = dict(zip(unique, counts))
print(f"Class distribution before augmentation: {class_dist_before}")

#Separate minority class
X_minority = X[y == 1]

X_train_orig, X_test, y_train_orig, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

clf_orig = RandomForestClassifier(random_state=42)
clf_orig.fit(X_train_orig, y_train_orig)

y_pred_orig = clf_orig.predict(X_test)
recall_orig = recall_score(y_test, y_pred_orig)
f1_orig = f1_score(y_test, y_pred_orig)

print(f"Recall score (original data): {recall_orig:.4f}")
print(f"F1 score (original data): {f1_orig:.4f}")


majority_class_count = np.sum(y_train_orig == -1)
minority_class_count = np.sum(y_train_orig == 1)

# Set the target number of samples for the minority class
desired_minority_class_size = len(X_minority)+(len(X_minority)//2)

# Adjust sampling based on the desired size
if desired_minority_class_size < minority_class_count:
    # Reduce the size of the minority class
    minority_indices = np.where(y_train_orig == 1)[0]
    majority_indices = np.where(y_train_orig != 1)[0]

    # Randomly select the desired number of samples
    reduced_minority_indices = np.random.choice(
        minority_indices, 
        size=desired_minority_class_size, 
        replace=False
    )

    # Combine reduced minority with majority class
    combined_indices = np.concatenate([reduced_minority_indices, majority_indices])
    X_balanced = X[combined_indices]
    y_balanced = y[combined_indices]

    print("\nClass distribution after reduction (on training data):")
    print(Counter(y_balanced))

elif desired_minority_class_size > minority_class_count:
    # Use SMOTE to oversample if the desired size is greater
    smote_strategy = {1: desired_minority_class_size}
    smote = SMOTE(sampling_strategy=smote_strategy, random_state=42)
    X_balanced, y_balanced = smote.fit_resample(X, y)

    # Print class distribution after SMOTE
    print("\nClass distribution after augmentation :")
    print(Counter(y_balanced))

    # Print the number of samples generated
    num_generated_samples = len(X_balanced) - len(X)
    print(f"\nNumber of samples generated by SMOTE: {num_generated_samples}")

else:
    # If the desired size is equal to the current size, use the original data
    X_balanced, y_balanced= X, y
    print("\nNo augmentation needed. Class distribution remains unchanged.")

# Train a Random Forest Classifier after augmentation
X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_balanced, y_balanced, test_size=0.3, random_state=42, stratify=y_balanced)
clf_res = RandomForestClassifier(random_state=42)
clf_res.fit(X_train_bal, y_train_bal)
y_pred_bal = clf_res.predict(X_test)

# Print classification report, precision, recall, and F1 score after SMOTE (on test data)
print("\nClassification report after augmentation:")
print(classification_report(y_test, y_pred_bal))

precision_after = precision_score(y_test, y_pred_bal, pos_label=1)
recall_after = recall_score(y_test, y_pred_bal, pos_label=1)
f1_after = f1_score(y_test, y_pred_bal, pos_label=1)

print(f"Precision after augmentation: {precision_after:.4f}")
print(f"Recall after augmentation: {recall_after:.4f}")
print(f"F1 Score after augmentation: {f1_after:.4f}")


Class distribution before augmentation: {0: 2788, 1: 1813}
Recall score (original data): 0.9283
F1 score (original data): 0.9430

Class distribution after augmentation :
Counter({0: 2788, 1: 2719})

Number of samples generated by SMOTE: 906

Classification report after augmentation:
              precision    recall  f1-score   support

           0       0.98      0.97      0.97       837
           1       0.95      0.97      0.96       544

    accuracy                           0.97      1381
   macro avg       0.97      0.97      0.97      1381
weighted avg       0.97      0.97      0.97      1381

Precision after augmentation: 0.9514
Recall after augmentation: 0.9724
F1 Score after augmentation: 0.9618


# 100% of minority class

In [8]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter

data = pd.read_csv('Spambase.csv')



X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

unique, counts = np.unique(y, return_counts=True)
class_dist_before = dict(zip(unique, counts))
print(f"Class distribution before augmentation: {class_dist_before}")

#Separate minority class
X_minority = X[y == 1]

X_train_orig, X_test, y_train_orig, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

clf_orig = RandomForestClassifier(random_state=42)
clf_orig.fit(X_train_orig, y_train_orig)

y_pred_orig = clf_orig.predict(X_test)
recall_orig = recall_score(y_test, y_pred_orig)
f1_orig = f1_score(y_test, y_pred_orig)

print(f"Recall score (original data): {recall_orig:.4f}")
print(f"F1 score (original data): {f1_orig:.4f}")


majority_class_count = np.sum(y_train_orig == -1)
minority_class_count = np.sum(y_train_orig == 1)

# Set the target number of samples for the minority class
desired_minority_class_size = len(X_minority)+(len(X_minority))

# Adjust sampling based on the desired size
if desired_minority_class_size < minority_class_count:
    # Reduce the size of the minority class
    minority_indices = np.where(y_train_orig == 1)[0]
    majority_indices = np.where(y_train_orig != 1)[0]

    # Randomly select the desired number of samples
    reduced_minority_indices = np.random.choice(
        minority_indices, 
        size=desired_minority_class_size, 
        replace=False
    )

    # Combine reduced minority with majority class
    combined_indices = np.concatenate([reduced_minority_indices, majority_indices])
    X_balanced = X[combined_indices]
    y_balanced = y[combined_indices]

    print("\nClass distribution after reduction (on training data):")
    print(Counter(y_balanced))

elif desired_minority_class_size > minority_class_count:
    # Use SMOTE to oversample if the desired size is greater
    smote_strategy = {1: desired_minority_class_size}
    smote = SMOTE(sampling_strategy=smote_strategy, random_state=42)
    X_balanced, y_balanced = smote.fit_resample(X, y)

    # Print class distribution after SMOTE
    print("\nClass distribution after augmentation :")
    print(Counter(y_balanced))

    # Print the number of samples generated
    num_generated_samples = len(X_balanced) - len(X)
    print(f"\nNumber of samples generated by SMOTE: {num_generated_samples}")

else:
    # If the desired size is equal to the current size, use the original data
    X_balanced, y_balanced= X, y
    print("\nNo augmentation needed. Class distribution remains unchanged.")

# Train a Random Forest Classifier after augmentation
X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_balanced, y_balanced, test_size=0.3, random_state=42, stratify=y_balanced)
clf_res = RandomForestClassifier(random_state=42)
clf_res.fit(X_train_bal, y_train_bal)
y_pred_bal = clf_res.predict(X_test)

# Print classification report, precision, recall, and F1 score after SMOTE (on test data)
print("\nClassification report after augmentation:")
print(classification_report(y_test, y_pred_bal))

precision_after = precision_score(y_test, y_pred_bal, pos_label=1)
recall_after = recall_score(y_test, y_pred_bal, pos_label=1)
f1_after = f1_score(y_test, y_pred_bal, pos_label=1)

print(f"Precision after augmentation: {precision_after:.4f}")
print(f"Recall after augmentation: {recall_after:.4f}")
print(f"F1 Score after augmentation: {f1_after:.4f}")


Class distribution before augmentation: {0: 2788, 1: 1813}
Recall score (original data): 0.9283
F1 score (original data): 0.9430

Class distribution after augmentation :
Counter({1: 3626, 0: 2788})

Number of samples generated by SMOTE: 1813

Classification report after augmentation:
              precision    recall  f1-score   support

           0       0.99      0.96      0.97       837
           1       0.94      0.98      0.96       544

    accuracy                           0.97      1381
   macro avg       0.96      0.97      0.97      1381
weighted avg       0.97      0.97      0.97      1381

Precision after augmentation: 0.9368
Recall after augmentation: 0.9816
F1 Score after augmentation: 0.9587


# 200% of minority class

In [9]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter

data = pd.read_csv('Spambase.csv')



X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

unique, counts = np.unique(y, return_counts=True)
class_dist_before = dict(zip(unique, counts))
print(f"Class distribution before augmentation: {class_dist_before}")

#Separate minority class
X_minority = X[y == 1]

X_train_orig, X_test, y_train_orig, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

clf_orig = RandomForestClassifier(random_state=42)
clf_orig.fit(X_train_orig, y_train_orig)

y_pred_orig = clf_orig.predict(X_test)
recall_orig = recall_score(y_test, y_pred_orig)
f1_orig = f1_score(y_test, y_pred_orig)

print(f"Recall score (original data): {recall_orig:.4f}")
print(f"F1 score (original data): {f1_orig:.4f}")


majority_class_count = np.sum(y_train_orig == -1)
minority_class_count = np.sum(y_train_orig == 1)

# Set the target number of samples for the minority class
desired_minority_class_size = len(X_minority)+(len(X_minority)*2)

# Adjust sampling based on the desired size
if desired_minority_class_size < minority_class_count:
    # Reduce the size of the minority class
    minority_indices = np.where(y_train_orig == 1)[0]
    majority_indices = np.where(y_train_orig != 1)[0]

    # Randomly select the desired number of samples
    reduced_minority_indices = np.random.choice(
        minority_indices, 
        size=desired_minority_class_size, 
        replace=False
    )

    # Combine reduced minority with majority class
    combined_indices = np.concatenate([reduced_minority_indices, majority_indices])
    X_balanced = X[combined_indices]
    y_balanced = y[combined_indices]

    print("\nClass distribution after reduction (on training data):")
    print(Counter(y_balanced))

elif desired_minority_class_size > minority_class_count:
    # Use SMOTE to oversample if the desired size is greater
    smote_strategy = {1: desired_minority_class_size}
    smote = SMOTE(sampling_strategy=smote_strategy, random_state=42)
    X_balanced, y_balanced = smote.fit_resample(X, y)

    # Print class distribution after SMOTE
    print("\nClass distribution after augmentation :")
    print(Counter(y_balanced))

    # Print the number of samples generated
    num_generated_samples = len(X_balanced) - len(X)
    print(f"\nNumber of samples generated by SMOTE: {num_generated_samples}")

else:
    # If the desired size is equal to the current size, use the original data
    X_balanced, y_balanced= X, y
    print("\nNo augmentation needed. Class distribution remains unchanged.")

# Train a Random Forest Classifier after augmentation
X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_balanced, y_balanced, test_size=0.3, random_state=42, stratify=y_balanced)
clf_res = RandomForestClassifier(random_state=42)
clf_res.fit(X_train_bal, y_train_bal)
y_pred_bal = clf_res.predict(X_test)

# Print classification report, precision, recall, and F1 score after SMOTE (on test data)
print("\nClassification report after augmentation:")
print(classification_report(y_test, y_pred_bal))

precision_after = precision_score(y_test, y_pred_bal, pos_label=1)
recall_after = recall_score(y_test, y_pred_bal, pos_label=1)
f1_after = f1_score(y_test, y_pred_bal, pos_label=1)

print(f"Precision after augmentation: {precision_after:.4f}")
print(f"Recall after augmentation: {recall_after:.4f}")
print(f"F1 Score after augmentation: {f1_after:.4f}")


Class distribution before augmentation: {0: 2788, 1: 1813}
Recall score (original data): 0.9283
F1 score (original data): 0.9430

Class distribution after augmentation :
Counter({1: 5439, 0: 2788})

Number of samples generated by SMOTE: 3626

Classification report after augmentation:
              precision    recall  f1-score   support

           0       0.99      0.95      0.97       837
           1       0.93      0.99      0.96       544

    accuracy                           0.97      1381
   macro avg       0.96      0.97      0.96      1381
weighted avg       0.97      0.97      0.97      1381

Precision after augmentation: 0.9277
Recall after augmentation: 0.9908
F1 Score after augmentation: 0.9582


# Diabetes Data

# 50% of minority class

In [1]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter

data = pd.read_csv('diabetes.csv')



X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

unique, counts = np.unique(y, return_counts=True)
class_dist_before = dict(zip(unique, counts))
print(f"Class distribution before augmentation: {class_dist_before}")

#Separate minority class
X_minority = X[y == 1]

X_train_orig, X_test, y_train_orig, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

clf_orig = RandomForestClassifier(random_state=42)
clf_orig.fit(X_train_orig, y_train_orig)

y_pred_orig = clf_orig.predict(X_test)
recall_orig = recall_score(y_test, y_pred_orig)
f1_orig = f1_score(y_test, y_pred_orig)

print(f"Recall score (original data): {recall_orig:.4f}")
print(f"F1 score (original data): {f1_orig:.4f}")


majority_class_count = np.sum(y_train_orig == -1)
minority_class_count = np.sum(y_train_orig == 1)

# Set the target number of samples for the minority class
desired_minority_class_size = len(X_minority)+(len(X_minority)//2)

# Adjust sampling based on the desired size
if desired_minority_class_size < minority_class_count:
    # Reduce the size of the minority class
    minority_indices = np.where(y_train_orig == 1)[0]
    majority_indices = np.where(y_train_orig != 1)[0]

    # Randomly select the desired number of samples
    reduced_minority_indices = np.random.choice(
        minority_indices, 
        size=desired_minority_class_size, 
        replace=False
    )

    # Combine reduced minority with majority class
    combined_indices = np.concatenate([reduced_minority_indices, majority_indices])
    X_balanced = X[combined_indices]
    y_balanced = y[combined_indices]

    print("\nClass distribution after reduction (on training data):")
    print(Counter(y_balanced))

elif desired_minority_class_size > minority_class_count:
    # Use SMOTE to oversample if the desired size is greater
    smote_strategy = {1: desired_minority_class_size}
    smote = SMOTE(sampling_strategy=smote_strategy, random_state=42)
    X_balanced, y_balanced = smote.fit_resample(X, y)

    # Print class distribution after SMOTE
    print("\nClass distribution after augmentation :")
    print(Counter(y_balanced))

    # Print the number of samples generated
    num_generated_samples = len(X_balanced) - len(X)
    print(f"\nNumber of samples generated by SMOTE: {num_generated_samples}")

else:
    # If the desired size is equal to the current size, use the original data
    X_balanced, y_balanced= X, y
    print("\nNo augmentation needed. Class distribution remains unchanged.")

# Train a Random Forest Classifier after augmentation
X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_balanced, y_balanced, test_size=0.3, random_state=42, stratify=y_balanced)
clf_res = RandomForestClassifier(random_state=42)
clf_res.fit(X_train_bal, y_train_bal)
y_pred_bal = clf_res.predict(X_test)

# Print classification report, precision, recall, and F1 score after SMOTE (on test data)
print("\nClassification report after augmentation:")
print(classification_report(y_test, y_pred_bal))

precision_after = precision_score(y_test, y_pred_bal, pos_label=1)
recall_after = recall_score(y_test, y_pred_bal, pos_label=1)
f1_after = f1_score(y_test, y_pred_bal, pos_label=1)

print(f"Precision after augmentation: {precision_after:.4f}")
print(f"Recall after augmentation: {recall_after:.4f}")
print(f"F1 Score after augmentation: {f1_after:.4f}")


Class distribution before augmentation: {0: 500, 1: 268}
Recall score (original data): 0.5309
F1 score (original data): 0.5972

Class distribution after augmentation :
Counter({0: 500, 1: 402})

Number of samples generated by SMOTE: 134

Classification report after augmentation:
              precision    recall  f1-score   support

           0       0.90      0.82      0.86       150
           1       0.72      0.84      0.77        81

    accuracy                           0.83       231
   macro avg       0.81      0.83      0.82       231
weighted avg       0.84      0.83      0.83       231

Precision after augmentation: 0.7158
Recall after augmentation: 0.8395
F1 Score after augmentation: 0.7727


# 100% of minority class

In [2]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter

data = pd.read_csv('diabetes.csv')



X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

unique, counts = np.unique(y, return_counts=True)
class_dist_before = dict(zip(unique, counts))
print(f"Class distribution before augmentation: {class_dist_before}")

#Separate minority class
X_minority = X[y == 1]

X_train_orig, X_test, y_train_orig, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

clf_orig = RandomForestClassifier(random_state=42)
clf_orig.fit(X_train_orig, y_train_orig)

y_pred_orig = clf_orig.predict(X_test)
recall_orig = recall_score(y_test, y_pred_orig)
f1_orig = f1_score(y_test, y_pred_orig)

print(f"Recall score (original data): {recall_orig:.4f}")
print(f"F1 score (original data): {f1_orig:.4f}")


majority_class_count = np.sum(y_train_orig == -1)
minority_class_count = np.sum(y_train_orig == 1)

# Set the target number of samples for the minority class
desired_minority_class_size = len(X_minority)+(len(X_minority))

# Adjust sampling based on the desired size
if desired_minority_class_size < minority_class_count:
    # Reduce the size of the minority class
    minority_indices = np.where(y_train_orig == 1)[0]
    majority_indices = np.where(y_train_orig != 1)[0]

    # Randomly select the desired number of samples
    reduced_minority_indices = np.random.choice(
        minority_indices, 
        size=desired_minority_class_size, 
        replace=False
    )

    # Combine reduced minority with majority class
    combined_indices = np.concatenate([reduced_minority_indices, majority_indices])
    X_balanced = X[combined_indices]
    y_balanced = y[combined_indices]

    print("\nClass distribution after reduction (on training data):")
    print(Counter(y_balanced))

elif desired_minority_class_size > minority_class_count:
    # Use SMOTE to oversample if the desired size is greater
    smote_strategy = {1: desired_minority_class_size}
    smote = SMOTE(sampling_strategy=smote_strategy, random_state=42)
    X_balanced, y_balanced = smote.fit_resample(X, y)

    # Print class distribution after SMOTE
    print("\nClass distribution after augmentation :")
    print(Counter(y_balanced))

    # Print the number of samples generated
    num_generated_samples = len(X_balanced) - len(X)
    print(f"\nNumber of samples generated by SMOTE: {num_generated_samples}")

else:
    # If the desired size is equal to the current size, use the original data
    X_balanced, y_balanced= X, y
    print("\nNo augmentation needed. Class distribution remains unchanged.")

# Train a Random Forest Classifier after augmentation
X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_balanced, y_balanced, test_size=0.3, random_state=42, stratify=y_balanced)
clf_res = RandomForestClassifier(random_state=42)
clf_res.fit(X_train_bal, y_train_bal)
y_pred_bal = clf_res.predict(X_test)

# Print classification report, precision, recall, and F1 score after SMOTE (on test data)
print("\nClassification report after augmentation:")
print(classification_report(y_test, y_pred_bal))

precision_after = precision_score(y_test, y_pred_bal, pos_label=1)
recall_after = recall_score(y_test, y_pred_bal, pos_label=1)
f1_after = f1_score(y_test, y_pred_bal, pos_label=1)

print(f"Precision after augmentation: {precision_after:.4f}")
print(f"Recall after augmentation: {recall_after:.4f}")
print(f"F1 Score after augmentation: {f1_after:.4f}")


Class distribution before augmentation: {0: 500, 1: 268}
Recall score (original data): 0.5309
F1 score (original data): 0.5972

Class distribution after augmentation :
Counter({1: 536, 0: 500})

Number of samples generated by SMOTE: 268

Classification report after augmentation:
              precision    recall  f1-score   support

           0       0.95      0.79      0.86       150
           1       0.70      0.93      0.80        81

    accuracy                           0.84       231
   macro avg       0.83      0.86      0.83       231
weighted avg       0.86      0.84      0.84       231

Precision after augmentation: 0.7009
Recall after augmentation: 0.9259
F1 Score after augmentation: 0.7979


# 200% of minority class

In [3]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter

data = pd.read_csv('diabetes.csv')



X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

unique, counts = np.unique(y, return_counts=True)
class_dist_before = dict(zip(unique, counts))
print(f"Class distribution before augmentation: {class_dist_before}")

#Separate minority class
X_minority = X[y == 1]

X_train_orig, X_test, y_train_orig, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

clf_orig = RandomForestClassifier(random_state=42)
clf_orig.fit(X_train_orig, y_train_orig)

y_pred_orig = clf_orig.predict(X_test)
recall_orig = recall_score(y_test, y_pred_orig)
f1_orig = f1_score(y_test, y_pred_orig)

print(f"Recall score (original data): {recall_orig:.4f}")
print(f"F1 score (original data): {f1_orig:.4f}")


majority_class_count = np.sum(y_train_orig == -1)
minority_class_count = np.sum(y_train_orig == 1)

# Set the target number of samples for the minority class
desired_minority_class_size = len(X_minority)+(len(X_minority)*2)

# Adjust sampling based on the desired size
if desired_minority_class_size < minority_class_count:
    # Reduce the size of the minority class
    minority_indices = np.where(y_train_orig == 1)[0]
    majority_indices = np.where(y_train_orig != 1)[0]

    # Randomly select the desired number of samples
    reduced_minority_indices = np.random.choice(
        minority_indices, 
        size=desired_minority_class_size, 
        replace=False
    )

    # Combine reduced minority with majority class
    combined_indices = np.concatenate([reduced_minority_indices, majority_indices])
    X_balanced = X[combined_indices]
    y_balanced = y[combined_indices]

    print("\nClass distribution after reduction (on training data):")
    print(Counter(y_balanced))

elif desired_minority_class_size > minority_class_count:
    # Use SMOTE to oversample if the desired size is greater
    smote_strategy = {1: desired_minority_class_size}
    smote = SMOTE(sampling_strategy=smote_strategy, random_state=42)
    X_balanced, y_balanced = smote.fit_resample(X, y)

    # Print class distribution after SMOTE
    print("\nClass distribution after augmentation :")
    print(Counter(y_balanced))

    # Print the number of samples generated
    num_generated_samples = len(X_balanced) - len(X)
    print(f"\nNumber of samples generated by SMOTE: {num_generated_samples}")

else:
    # If the desired size is equal to the current size, use the original data
    X_balanced, y_balanced= X, y
    print("\nNo augmentation needed. Class distribution remains unchanged.")

# Train a Random Forest Classifier after augmentation
X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_balanced, y_balanced, test_size=0.3, random_state=42, stratify=y_balanced)
clf_res = RandomForestClassifier(random_state=42)
clf_res.fit(X_train_bal, y_train_bal)
y_pred_bal = clf_res.predict(X_test)

# Print classification report, precision, recall, and F1 score after SMOTE (on test data)
print("\nClassification report after augmentation:")
print(classification_report(y_test, y_pred_bal))

precision_after = precision_score(y_test, y_pred_bal, pos_label=1)
recall_after = recall_score(y_test, y_pred_bal, pos_label=1)
f1_after = f1_score(y_test, y_pred_bal, pos_label=1)

print(f"Precision after augmentation: {precision_after:.4f}")
print(f"Recall after augmentation: {recall_after:.4f}")
print(f"F1 Score after augmentation: {f1_after:.4f}")


Class distribution before augmentation: {0: 500, 1: 268}
Recall score (original data): 0.5309
F1 score (original data): 0.5972

Class distribution after augmentation :
Counter({1: 804, 0: 500})

Number of samples generated by SMOTE: 536

Classification report after augmentation:
              precision    recall  f1-score   support

           0       0.97      0.77      0.86       150
           1       0.69      0.95      0.80        81

    accuracy                           0.84       231
   macro avg       0.83      0.86      0.83       231
weighted avg       0.87      0.84      0.84       231

Precision after augmentation: 0.6937
Recall after augmentation: 0.9506
F1 Score after augmentation: 0.8021
