In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import xgboost as xgb
from urllib.parse import urlparse
import re

In [22]:
df = pd.read_csv('C:\\Users\\Abhinav\\Desktop\\RJPOLICE_HACK_224_KubeCentrix_5\\models\\25kmain.csv')

In [23]:
df.head()

Unnamed: 0,url,type
0,http://br-icloud.com.br/,phishing
1,https://mp3raid.com/,benign
2,https://bopsecrets.org/,benign
3,http://www.garage-pirenne.be/,defacement
4,http://adventure-nicaragua.net/,defacement


In [24]:
def extract_features(url):
    features = {}
    
    # Length of URL
    features['length'] = len(url)
    
    # Whether URL contains IP address
    features['has_ip'] = int(bool(re.match(r'\d+\.\d+\.\d+\.\d+', url)))
    
    # Count of special characters
    special_chars = [';', '?', '=', '&']
    features['count_special'] = sum(map(url.count, special_chars))
    
    # Presence of HTTPS
    features['https'] = url.startswith('https')

    return features

In [25]:
feature_data = pd.DataFrame(df['url'].apply(lambda x: extract_features(x)).tolist())

In [26]:
df = df.join(feature_data)

In [27]:
df.head()

Unnamed: 0,url,type,length,has_ip,count_special,https
0,http://br-icloud.com.br/,phishing,24,0,0,False
1,https://mp3raid.com/,benign,20,0,0,True
2,https://bopsecrets.org/,benign,23,0,0,True
3,http://www.garage-pirenne.be/,defacement,29,0,0,False
4,http://adventure-nicaragua.net/,defacement,31,0,0,False


In [28]:
label_mapping = {'benign': 0, 'phishing': 1, 'defacement': 2}
df['type'] = df['type'].map(label_mapping)

In [29]:
df.head()

Unnamed: 0,url,type,length,has_ip,count_special,https
0,http://br-icloud.com.br/,1.0,24,0,0,False
1,https://mp3raid.com/,0.0,20,0,0,True
2,https://bopsecrets.org/,0.0,23,0,0,True
3,http://www.garage-pirenne.be/,2.0,29,0,0,False
4,http://adventure-nicaragua.net/,2.0,31,0,0,False


In [30]:
X = df.drop(['url', 'type'], axis=1)
y = df['type']

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [32]:
xgb_classifier = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

In [35]:
print(df.isnull().sum())

url              0
type             0
length           0
has_ip           0
count_special    0
https            0
dtype: int64


In [36]:
df['type'] = df['type'].astype(int)

In [37]:
X = df.drop(['url', 'type'], axis=1)
y = df['type']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [38]:
xgb_classifier = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
xgb_classifier.fit(X_train, y_train)


In [40]:
y_pred = xgb_classifier.predict(X_test)

In [41]:
print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Accuracy Score:")
print(accuracy_score(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.98      0.98      3820
           1       0.88      0.13      0.23       277
           2       0.73      0.97      0.83       903

    accuracy                           0.93      5000
   macro avg       0.87      0.69      0.68      5000
weighted avg       0.94      0.93      0.92      5000

Accuracy Score:
0.9288


In [54]:
# # Example URL
# new_url = "http://cut1203.party"

# # Extract features from the new URL
# new_url_features = extract_features(new_url)

# # Convert to DataFrame (assuming 'extract_features' returns a dictionary)
# new_url_df = pd.DataFrame([new_url_features])


In [55]:
# # Making the prediction
# predicted_class = xgb_classifier.predict(new_url_df)

# # Decode the prediction (if necessary)
# label_decoder = {0: 'legit', 1: 'phishing', 2: 'defacement'}
# predicted_label = label_decoder[predicted_class[0]]

# print(f"The URL '{new_url}' is classified as: {predicted_label}")

In [56]:
# # Making the prediction
# predicted_class = xgb_classifier.predict(new_url_df)[0]

# # Simplified classification logic
# if predicted_class == 0:
#     predicted_label = 'Legitimate'
# else:
#     predicted_label = 'Suspicious'

# print(f"The URL '{new_url}' is classified as: {predicted_label}")


The URL 'http://cut1203.party' is classified as: Suspicious


In [None]:
import pickle
with open('model.pkl', 'wb') as file:
    pickle.dump(xgb_classifier,file)