In [None]:
import pandas as pd
import warnings
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import joblib

# Suppress every warning for the rest of the session to keep the output short.
warnings.filterwarnings('ignore')

# Read the raw CSV; lines that pandas cannot parse are skipped, not raised.
phish_data = pd.read_csv(
    'phishing_urls.csv',
    encoding='latin1',
    on_bad_lines='skip',
)

# Preview the data and list the header so the label column can be checked.
print(phish_data.head())
print(phish_data.columns)

# Target vector: the 'phishing' column is assumed to hold the class label.
y = phish_data['phishing']

# Feature matrix: every column except the label.
features_df = phish_data.drop('phishing', axis=1)

# Hold out 30% of the rows for evaluation; the fixed seed makes the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    features_df,
    y,
    test_size=0.3,
    random_state=42,
)

# Fit a 100-tree random forest on the training portion (fit returns the
# fitted estimator itself, so it can be bound directly).
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train,
    y_train,
)

# Score the held-out rows and report overall accuracy plus per-class metrics.
y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))

# Persist the fitted estimator to disk for later reuse.
joblib.dump(model, 'phishing_model.pkl')

# Column names, in training order, needed to build single-row inputs later.
feature_names = X_train.columns.tolist()

def predict_url(features):
    """Classify one URL from a mapping of feature name to value.

    Args:
        features: Mapping (anything with ``.get``) keyed by feature name.
            Names in the module-level ``feature_names`` that are absent
            from the mapping are filled with 0; keys that are not in
            ``feature_names`` are ignored.

    Returns:
        ``'Phishing'`` when the module-level ``model`` predicts 1 for the
        row, otherwise ``'Legitimate'``.
    """
    # Build the row in training-column order so the frame's columns line up
    # with what the model was fitted on.
    row = {}
    for column in feature_names:
        row[column] = features.get(column, 0)

    # A single-row frame; predict returns one label per row.
    sample = pd.DataFrame([row])
    predicted_label = model.predict(sample)[0]

    if predicted_label == 1:
        return 'Phishing'
    return 'Legitimate'

# Sample input for predict_url. Only some of the training columns are listed;
# predict_url fills any training feature missing from this dict with 0.
example_features = {
    # Keys ending in "_url" (presumably per-URL character counts; confirm
    # against the dataset's documentation).
    'qty_dot_url': 3,
    'qty_hyphen_url': 0,
    'qty_underline_url': 0,
    'qty_slash_url': 1,
    'qty_questionmark_url': 0,
    'qty_equal_url': 0,
    'qty_at_url': 0,
    'qty_and_url': 0,
    'qty_exclamation_url': 0,
    'qty_space_url': 0,
    # Add all other feature values here
    'qty_ip_resolved': 1,
    'qty_nameservers': 2,
    'qty_mx_servers': 0,
    'ttl_hostname': 892,
    'tls_ssl_certificate': 0,
    'qty_redirects': 0,
    'url_google_index': 0,
    'domain_google_index': 0,
    'url_shortened': 0
}

# Classify the sample, then echo the input alongside the predicted class.
example_prediction = predict_url(example_features)
print(f'The URL with features {example_features} is predicted to be {example_prediction}.')


   qty_dot_url  qty_hyphen_url  qty_underline_url  qty_slash_url  \
0            3               0                  0              1   
1            5               0                  1              3   
2            2               0                  0              1   
3            4               0                  2              5   
4            2               0                  0              0   

   qty_questionmark_url  qty_equal_url  qty_at_url  qty_and_url  \
0                     0              0           0            0   
1                     0              3           0            2   
2                     0              0           0            0   
3                     0              0           0            0   
4                     0              0           0            0   

   qty_exclamation_url  qty_space_url  ...  qty_ip_resolved  qty_nameservers  \
0                    0              0  ...                1                2   
1                    0      