In [60]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Step 1: Load the data set

In [61]:
data = pd.read_csv('Dataset/dataset.csv')
df = pd.DataFrame(data)

# Step 2: Tidy dataset by dropping irrelevant columns

In [62]:
# Droping irrelevant columns
df.drop(columns=['index'], inplace=True)

# Separate features and target variable
X = df.drop(columns=['Result'])  # Features
y = df['Result']  # Target

In [63]:
#Display the first 5 rows of the dataset
data.head()

Unnamed: 0,index,having_IPhaving_IP_Address,URLURL_Length,Shortining_Service,having_At_Symbol,double_slash_redirecting,Prefix_Suffix,having_Sub_Domain,SSLfinal_State,Domain_registeration_length,...,popUpWidnow,Iframe,age_of_domain,DNSRecord,web_traffic,Page_Rank,Google_Index,Links_pointing_to_page,Statistical_report,Result
0,1,-1,1,1,1,-1,-1,-1,-1,-1,...,1,1,-1,-1,-1,-1,1,1,-1,-1
1,2,1,1,1,1,1,-1,0,1,-1,...,1,1,-1,-1,0,-1,1,1,1,-1
2,3,1,0,1,1,1,-1,-1,-1,-1,...,1,1,1,-1,1,-1,1,0,-1,-1
3,4,1,0,1,1,1,-1,-1,-1,1,...,1,1,-1,-1,1,-1,1,-1,1,-1
4,5,1,0,-1,1,1,-1,1,1,-1,...,-1,1,-1,-1,0,-1,1,1,1,1


In [64]:
# Handle missing values
X.fillna(X.median(), inplace=True)

In [65]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11055 entries, 0 to 11054
Data columns (total 32 columns):
 #   Column                       Non-Null Count  Dtype
---  ------                       --------------  -----
 0   index                        11055 non-null  int64
 1   having_IPhaving_IP_Address   11055 non-null  int64
 2   URLURL_Length                11055 non-null  int64
 3   Shortining_Service           11055 non-null  int64
 4   having_At_Symbol             11055 non-null  int64
 5   double_slash_redirecting     11055 non-null  int64
 6   Prefix_Suffix                11055 non-null  int64
 7   having_Sub_Domain            11055 non-null  int64
 8   SSLfinal_State               11055 non-null  int64
 9   Domain_registeration_length  11055 non-null  int64
 10  Favicon                      11055 non-null  int64
 11  port                         11055 non-null  int64
 12  HTTPS_token                  11055 non-null  int64
 13  Request_URL                  11055 non-null  i

In [66]:
data.max(axis=0)

index                          11055
having_IPhaving_IP_Address         1
URLURL_Length                      1
Shortining_Service                 1
having_At_Symbol                   1
double_slash_redirecting           1
Prefix_Suffix                      1
having_Sub_Domain                  1
SSLfinal_State                     1
Domain_registeration_length        1
Favicon                            1
port                               1
HTTPS_token                        1
Request_URL                        1
URL_of_Anchor                      1
Links_in_tags                      1
SFH                                1
Submitting_to_email                1
Abnormal_URL                       1
Redirect                           1
on_mouseover                       1
RightClick                         1
popUpWidnow                        1
Iframe                             1
age_of_domain                      1
DNSRecord                          1
web_traffic                        1
P

# Step 3: Splitting the data into test train sets

In [67]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [68]:
X_train.shape

(8844, 30)

In [None]:
from sklearn.model_selection import StratifiedKFold


skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


In [None]:
# Initialize RandomForest Model
model = RandomForestClassifier(
    n_estimators=400,
    random_state=42,
    n_jobs=-1,
    max_depth=10,
    min_samples_split=3,
    min_samples_leaf=2,
    max_features='sqrt',
    bootstrap=True
)
model.fit(X_train, y_train)

In [None]:
# Perform Cross-Validation
from sklearn.model_selection import cross_val_score


cv_scores = cross_val_score(model, X, y, cv=skf, scoring='accuracy')


# Step 4: Evaluate model

In [72]:
# Print Cross-Validation Results
print(f"Cross-Validation Accuracy Scores: {cv_scores}")
print(f"Mean Accuracy: {np.mean(cv_scores):.4f}")
print(f"Standard Deviation: {np.std(cv_scores):.4f}")

Cross-Validation Accuracy Scores: [0.94934419 0.95929444 0.95070104 0.95296246 0.94708277]
Mean Accuracy: 0.9519
Standard Deviation: 0.0042


In [73]:
# Train the model on the full dataset
model.fit(X, y)

In [74]:
import joblib
joblib.dump(model, 'Final_model3.pkl')

['Final_model3.pkl']