# Dataset Preprocessing

### Drop Irrelevant (Non-Numeric) Columns and Duplicate Rows

In [None]:
# Keep only the numeric columns and drop exact duplicate rows in one pass.
# The columns ['FILENAME', 'URL', 'DOMAIN', 'TLD', 'TITLE'] are not needed for
# model training and cause issues with the model; they are expected to be
# removed by this numeric-only filter.
# NOTE(review): this assumes those five columns are all non-numeric, and it
# also removes any other non-numeric column -- verify against the dataset.
df = (
    df.select_dtypes(include="number")
    .copy()
    .drop_duplicates()
)

# Show the (rows, columns) that remain after filtering
df.shape

### Train-test Split

In [None]:
# Separate features from target by position: every column but the last is a
# feature, and the last column is used as the target.
# NOTE(review): assumes the label really is the last column of df -- confirm.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

# Hold out 20% of the rows for testing. stratify=y keeps the class proportions
# of y in both splits, and the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Report the size of each split
print(f"Training set shape: {X_train.shape}, {y_train.shape}")
print(f"Testing set shape: {X_test.shape}, {y_test.shape}")

# Preview the first rows of the training features
print("\nSample of X_train:\n")
print(X_train.head())

### Feature Scaling

In [None]:
# Learn the scaling statistics from the training split only, so that nothing
# about the test split influences the fitted scaler
scaler = StandardScaler().fit(X_train)

# Scale both splits with the fitted scaler and wrap each result in a DataFrame
# that carries over the original column labels and row index
X_train_scaled = pd.DataFrame(
    scaler.transform(X_train),
    index=X_train.index,
    columns=X_train.columns,
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test),
    index=X_test.index,
    columns=X_test.columns,
)

print(f"Shape after scaling: X_train: {X_train_scaled.shape}, X_test: {X_test_scaled.shape}")

# Preview the first rows of the scaled training features
print("\nSample of X_train_scaled:\n")
X_train_scaled.head()

### Compare SMOTE, ADASYN, BorderlineSMOTE

In [None]:
# Baseline: class counts of the training labels before any oversampling
print("Original class distribution in training set:")
print(y_train.value_counts())

# Each oversampler below is run on the same scaled training split with the
# same seed, and the resulting class counts are printed for comparison.

# 1) SMOTE
X_smote, y_smote = SMOTE(random_state=42).fit_resample(
    X_train_scaled, y_train
)
print("\nAfter SMOTE:")
print(y_smote.value_counts())

# 2) ADASYN
X_adasyn, y_adasyn = ADASYN(random_state=42).fit_resample(
    X_train_scaled, y_train
)
print("\nAfter ADASYN:")
print(y_adasyn.value_counts())

# 3) BorderlineSMOTE, using its 'borderline-1' variant
X_bsmote, y_bsmote = BorderlineSMOTE(
    kind="borderline-1", random_state=42
).fit_resample(X_train_scaled, y_train)
print("\nAfter BorderlineSMOTE:")
print(y_bsmote.value_counts())

### Apply SMOTE

In [None]:
# Oversample the scaled training split with SMOTE (fixed seed). Only training
# data is passed in here; X_test_scaled is not touched by this cell.
X_train_resampled, y_train_resampled = SMOTE(random_state=42).fit_resample(
    X_train_scaled,
    y_train,
)

# Report the new size and the per-class counts after resampling
print(f"Shape after SMOTE resampling: {X_train_resampled.shape}")
print("\nClass distribution after SMOTE:")
print(y_train_resampled.value_counts())

## Feature Selection

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif
import matplotlib.pyplot as plt

# Number of features to keep (edit the 20 to change it). The value is capped at
# the number of available columns so the selector is never asked for more
# features than exist and the "Top k" labels below stay accurate.
k = min(20, X_train_resampled.shape[1])
selector = SelectKBest(score_func=f_classif, k=k)

# Fit the selector on the resampled training data and keep the chosen columns
X_train_selected = selector.fit_transform(X_train_resampled, y_train_resampled)

# Apply the same, already-fitted selection to the test data (no refitting)
X_test_selected = selector.transform(X_test_scaled)

# Column positions of the kept features, and the scores of ALL input features
selected_indices = selector.get_support(indices=True)
scores = selector.scores_

# Map the kept positions back to column names.
# NOTE(review): names are read from X_train_scaled, which assumes the resampled
# frame has the same column order as X_train_scaled -- confirm.
feature_names = X_train_scaled.columns
selected_feature_names = [feature_names[i] for i in selected_indices]

# Print selected features
print(f"Top {k} selected features:\n")
print(selected_feature_names)

print("\nShape of selected training set:", X_train_selected.shape)
print("Shape of selected testing set:", X_test_selected.shape)

# Plot scores
# --------------------------------------------------
# Create a 1920x1080 pixel figure: 19.2 x 10.8 inches at 100 DPI. The dpi is
# passed explicitly because the default comes from rcParams and may not be 100.
plt.figure(figsize=(19.2, 10.8), dpi=100)

# Horizontal bars: one per selected feature, bar length = its F-score
plt.barh(
    selected_feature_names,
    scores[selected_indices],
    color='skyblue',
    edgecolor='black',
)

# Axis labels and title
plt.xlabel("F-score", fontsize=16)
plt.title(f"Top {k} Features via SelectKBest (ANOVA F-test)", fontsize=20)
# Flip the y-axis so the first listed feature appears at the top
plt.gca().invert_yaxis()

# Add gridlines for better readability
plt.grid(axis='x', linestyle='--', alpha=0.7)

# Adjust tick label sizes
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)

plt.tight_layout()
plt.show()