In [None]:
!pip install tldextract
!pip install xgboost
!pip install pandas numpy beautifulsoup4 textblob scikit-learn
!pip install matplotlib
!pip install seaborn
!pip install scipy
!pip install nltk
!pip install wordcloud
!pip install scipy
!pip install nltk
!pip install wordcloud

In [None]:
import pandas as pd
import re
import urllib.parse
from collections import Counter
import tldextract
import matplotlib.pyplot as plt
import numpy as np


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import make_scorer, f1_score


def load_and_prepare_data(csv_path='final_phishing_dataset.csv', label_column='label'):
    """Load the dataset from a CSV file and split it into features and labels.

    Parameters
    ----------
    csv_path : str
        Location of the CSV file. Defaults to the file name the notebook
        originally hard-coded, so existing zero-argument calls still work.
    label_column : str
        Name of the column holding the target; every other column is
        returned as a feature.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is the DataFrame without ``label_column`` and
        ``y`` is the ``label_column`` Series.

    Raises
    ------
    KeyError
        If ``label_column`` is not a column of the CSV file.
    """
    print("Basic Information About the Dataset")
    df = pd.read_csv(csv_path)

    X = df.drop(columns=[label_column])
    y = df[label_column]

    # Quick sanity summary so a skewed or truncated file is noticed immediately.
    print(f"Dataset shape: {X.shape}")
    print(f"Class distribution:\n{y.value_counts()}")

    return X, y

# Separate the columns into custom features and TF-IDF features: the custom features are standardized, while the TF-IDF features are used as-is.
def identify_feature_types(X, tfidf_count=5000):
    """Split the columns of ``X`` into custom features and TF-IDF features.

    The split is purely positional: the last ``tfidf_count`` columns are
    treated as TF-IDF features and every column before them as a custom
    feature. When ``X`` does not have more than ``tfidf_count`` columns the
    split is impossible, so every column is reported as a custom feature and
    a note is printed (previously this fallback was silent).

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; only ``X.shape`` and ``X.columns`` are read.
    tfidf_count : int
        Number of trailing columns that hold TF-IDF features. Defaults to
        5000, the value that used to be hard-coded.

    Returns
    -------
    tuple
        ``(custom_features, tfidf_features)`` as two lists of column labels.

    Raises
    ------
    ValueError
        If ``tfidf_count`` is negative.
    """
    if tfidf_count < 0:
        raise ValueError(f"tfidf_count must be non-negative, got {tfidf_count}")

    columns = list(X.columns)
    total_features = X.shape[1]

    # tfidf_count == 0 must be excluded from the slicing branch because
    # columns[:-0] is empty and columns[-0:] is everything (an inverted split).
    if 0 < tfidf_count < total_features:
        custom_features = columns[:-tfidf_count]
        tfidf_features = columns[-tfidf_count:]
    else:
        custom_features = columns
        tfidf_features = []
        if tfidf_count:
            print(
                f"Note: expected {tfidf_count} trailing TF-IDF columns but the data "
                f"has only {total_features} columns; treating every column as a "
                f"custom feature."
            )

    print(f"Custom features: {len(custom_features)}")
    print(f"TF-IDF features: {len(tfidf_features)}")

    return custom_features, tfidf_features

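# Build the column-wise preprocessing step: standardize the custom features and pass the TF-IDF features through unchanged.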
def create_preprocessing_pipeline(custom_features, tfidf_features):
    """Build the ColumnTransformer applied to the data before modelling.

    Parameters
    ----------
    custom_features : list
        Column labels to standardize with ``StandardScaler``.
    tfidf_features : list
        Column labels to forward unchanged.

    Returns
    -------
    ColumnTransformer
        Holds one entry per non-empty feature group; columns that belong to
        neither group are dropped.
    """
    column_steps = []

    # Custom features are standardized so their scale is comparable with the
    # TF-IDF features.
    column_steps += [('custom', StandardScaler(), custom_features)] if custom_features else []

    # TF-IDF features are forwarded untouched.
    column_steps += [('tfidf', 'passthrough', tfidf_features)] if tfidf_features else []

    return ColumnTransformer(transformers=column_steps, remainder='drop')

def model_selection_pipeline(X, y, custom_features, tfidf_features):
    """Compare the candidate classifiers with 5-fold stratified cross-validation.

    Every candidate is wrapped in the same pipeline (column preprocessing ->
    univariate feature selection -> classifier) and scored with the weighted
    F1 metric. A candidate whose cross-validation raises ``ValueError`` is
    reported and skipped instead of aborting the whole comparison.

    Parameters
    ----------
    X : feature table handed to ``cross_val_score``.
    y : target labels.
    custom_features : list
        Column labels forwarded to ``create_preprocessing_pipeline``.
    tfidf_features : list
        Column labels forwarded to ``create_preprocessing_pipeline``.

    Returns
    -------
    tuple
        ``(best_model_name, best_model, model_scores)``. ``best_model`` is the
        unfitted estimator with the highest mean score. ``model_scores`` maps
        each successfully evaluated model name to a dict with the keys
        ``'mean_score'``, ``'std_score'``, ``'scores'`` and ``'model'``.

    Raises
    ------
    ValueError
        If no candidate could be cross-validated.
    """
    print("\n" + "="*60)
    print("MODEL SELECTION USING 5-FOLD CROSS-VALIDATION")
    print("="*60)

    # Candidate models, all with (near-)default parameters.
    # NOTE(review): probability=True makes SVC noticeably slower to fit and is
    # not needed for F1 scoring; it is kept because the returned estimator may
    # be used for predict_proba later. Confirm whether it is required.
    models = {
        'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
        'XGBoost': XGBClassifier(n_estimators=100, random_state=42),
        'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
        'SVM': SVC(random_state=42, probability=True),
        'Naive Bayes': MultinomialNB()
    }

    preprocessor = create_preprocessing_pipeline(custom_features, tfidf_features)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Never ask SelectKBest for more columns than the preprocessor emits;
    # depending on the scikit-learn version that either raises or warns.
    n_preprocessed = len(custom_features) + len(tfidf_features)
    k_best = min(1000, n_preprocessed)

    model_scores = {}
    best_model_name = None
    # Start below any achievable score so a model scoring exactly 0 can still win.
    best_score = float('-inf')

    print("\nTesting models with default parameters...")
    print("-" * 60)

    for name, model in models.items():
        print(f"\nTesting {name}...")

        # Identical preprocessing and feature selection for every candidate so
        # that the scores are comparable.
        pipeline = Pipeline([
            ('preprocess', preprocessor),
            ('select', SelectKBest(f_classif, k=k_best)),
            ('classifier', model)
        ])

        try:
            # error_score='raise' surfaces the real failure instead of NaN
            # scores. Typical case: MultinomialNB rejects the negative values
            # that StandardScaler produces.
            scores = cross_val_score(
                pipeline, X, y, cv=cv, scoring='f1_weighted', n_jobs=-1,
                error_score='raise'
            )
        except ValueError as exc:
            print(f"Skipping {name}: cross-validation failed ({exc})")
            continue

        mean_score = scores.mean()
        std_score = scores.std()

        model_scores[name] = {
            'mean_score': mean_score,
            'std_score': std_score,
            'scores': scores,
            'model': model
        }

        print(f"Mean F1 Score: {mean_score}")
        print(f"Individual cross validation scores: {scores}")

        if mean_score > best_score:
            best_score = mean_score
            best_model_name = name

    if best_model_name is None:
        raise ValueError(
            "No candidate model could be cross-validated; see the messages above."
        )

    return best_model_name, models[best_model_name], model_scores

def print_results(best_model_name, model_scores):
    """Print the model ranking followed by a summary of the winning model.

    Parameters
    ----------
    best_model_name : key of ``model_scores`` identifying the winner.
    model_scores : dict
        Maps each model name to a dict that contains at least the key
        ``'mean_score'``.
    """
    def mean_of(entry):
        # entry is a (name, stats) pair taken from model_scores.items()
        return entry[1]['mean_score']

    # Header banner.
    print(f"\n{'='*60}")
    print("MODEL SELECTION RESULTS")
    print(f"{'='*60}")

    # Highest mean score first.
    ranking = sorted(model_scores.items(), key=mean_of, reverse=True)

    print("\nModel Rankings (by F1 Score):")
    print("-" * 40)
    for position, (model_name, stats) in enumerate(ranking, start=1):
        print(f"{position}. {model_name}: {stats['mean_score']}")

    # Winner summary.
    winner_score = model_scores[best_model_name]['mean_score']
    print(f"\n{'='*60}")
    print(f"BEST MODEL: {best_model_name}")
    print(f"Best F1 Score: {winner_score}")
    print(f"{'='*60}")


# Main section: load the data, split the columns by feature type,
# cross-validate every candidate model, then print the ranking.

X, y = load_and_prepare_data()
custom_features, tfidf_features = identify_feature_types(X)
best_model_name, best_model, model_scores = model_selection_pipeline(
    X=X,
    y=y,
    custom_features=custom_features,
    tfidf_features=tfidf_features,
)
print_results(best_model_name, model_scores)

# Closing banner.
print("\n" + "=" * 60)
print("Model Selection Completed")
print("=" * 60)

Basic Information About the Dataset
Dataset shape: (200609, 2521)
Class distribution:
label
0.0    107534
1.0     93075
Name: count, dtype: int64
Custom features: 2521
TF-IDF features: 0

MODEL SELECTION USING 5-FOLD CROSS-VALIDATION

Testing models with default parameters...
------------------------------------------------------------

Testing Random Forest...


NameError: name 'f_classif' is not defined

In [None]:
#Now we need to 