In [11]:
import pandas as pd
import re
from urllib.parse import urlparse
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import seaborn as sb

In [12]:
from google.colab import drive
drive.mount("/content/drive")
df = pd.read_csv("/content/drive/MyDrive/ML/data.csv")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [13]:
# Optimized suspicious keyword list for URL-based detection
SUSPICIOUS_KEYWORDS = [
    "money", "win", "free", "offer", "cash", "prize", "bonus", "credit", "loan", "earn", "income", "profits",
    "investment", "bank", "refinance", "withdraw", "transfer", "deposit", "payment", "wallet", "bitcoin",
    "crypto", "forex", "payout", "reward", "check", "funds", "lottery", "jackpot", "fund", "reward",
    "urgent", "immediate", "act-now", "limited", "hurry", "instant", "exclusive", "offer", "bargain",
    "alert", "verify", "confirm", "secure", "action-required", "final-notice", "important", "warning",
    "secure-account", "final-reminder",
    "porn", "adult", "sex", "xxx", "nude", "hot", "escort", "cams", "dating", "erotic", "free-porn", "fetish",
    "webcam", "naked", "taboo", "gay", "trans", "lingerie", "teen", "milf", "swingers", "escort", "babe",
    "login", "account", "signin", "signup", "password", "username", "credentials", "access", "verify-account",
    "reset-password", "recover", "unlock-account", "pre-approved", "activate", "fraud", "congratulations",
    "unlock", "claim", "winner", "click-here", "verify-identity", "confirm-identity", "urgent-notice",
    "scam", "fraud", "fake", "hidden", "trial", "no-catch", "cancel", "unsubscribe", "survey", "gift",
    "voucher", "donate", "inheritance", "biggest", "luxury", "remove", "subscribe"
]

def extract_features(df):
    """Feature extraction with emphasis on suspicious words and ignoring HTTP/HTTPS and 'www.' prefixes."""

    # Remove the protocol (http/https) and 'www.' from the URLs before feature extraction
    df['cleaned_url'] = df['url'].apply(lambda x: re.sub(r'^https?:\/\/(www\.)?', '', x))

    # URL length (without the protocol and 'www')
    df['url_length'] = df['cleaned_url'].apply(len)

    # Number of dots in URL (subdomains)
    df['num_dots'] = df['cleaned_url'].apply(lambda x: x.count('.'))

    # Number of special characters in URL
    df['num_special_chars'] = df['cleaned_url'].apply(lambda x: len(re.findall(r'[@\-_%?]', x)))

    # Count of suspicious keywords in the URL (focus on this feature)
    df['suspicious_keyword_count'] = df['cleaned_url'].apply(
        lambda x: sum(1 for keyword in SUSPICIOUS_KEYWORDS if keyword in x.lower())
    )

     # **Increase the weight of suspicious_keyword_count**
    df['weighted_suspicious_keyword_count'] = df['suspicious_keyword_count'] * 100  # Scale this by 5x or more


    # Extract the domain (without protocol or 'www') and calculate its length
    df['domain'] = df['cleaned_url'].apply(lambda x: urlparse(x).netloc)
    df['domain_length'] = df['domain'].apply(len)

    # Count subdomains
    df['num_subdomains'] = df['domain'].apply(lambda x: x.count('.'))

    # Drop columns that are not numeric or needed for the model
    df.drop(columns=['url', 'cleaned_url', 'domain'], errors='ignore', inplace=True)

    return df


In [14]:
df = extract_features(df)

In [15]:
from sklearn.preprocessing import StandardScaler

features_to_scale = ['url_length', 'num_dots', 'num_special_chars', 'suspicious_keyword_count']

scaler = StandardScaler()

df[features_to_scale] = scaler.fit_transform(df[features_to_scale])


In [16]:
df

Unnamed: 0,status,url_length,num_dots,num_special_chars,suspicious_keyword_count,weighted_suspicious_keyword_count,domain_length,num_subdomains
0,0,-0.136413,-0.041013,-0.515849,-0.355417,0,0,0
1,0,-0.206756,-0.041013,-0.515849,1.944829,100,0,0
2,0,-0.769501,-0.717157,-0.515849,-0.355417,0,0,0
3,0,-0.230204,-0.041013,-0.515849,-0.355417,0,0,0
4,0,-0.581920,-0.041013,-0.515849,-0.355417,0,0,0
...,...,...,...,...,...,...,...,...
822005,0,-0.839844,-0.717157,-0.515849,-0.355417,0,0,0
822006,0,-0.910187,-0.717157,-0.515849,-0.355417,0,0,0
822007,0,-0.839844,-0.041013,-0.515849,-0.355417,0,0,0
822008,0,-0.675710,-0.041013,-0.515849,-0.355417,0,0,0


In [17]:

X = df.drop(columns=['status'])
y = df['status']  # Target variable

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Use only a smaller subset of the training data for quick testing
X_train_small = X_train.sample(5000, random_state=42)  # Select 5000 samples
y_train_small = y_train.loc[X_train_small.index]  # Ensure labels match the selected samples

svm_model = SVC(kernel='linear', random_state=42)

svm_model.fit(X_train_small, y_train_small)

y_pred_svm = svm_model.predict(X_test)

# Evaluate the model's performance
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print(f"SVM Model Accuracy with smaller training set: {accuracy_svm:.2f}")

print(classification_report(y_test, y_pred_svm))


SVM Model Accuracy with smaller training set: 0.65
              precision    recall  f1-score   support

           0       0.67      0.52      0.58    118377
           1       0.63      0.77      0.69    128226

    accuracy                           0.65    246603
   macro avg       0.65      0.64      0.64    246603
weighted avg       0.65      0.65      0.64    246603



In [18]:
from sklearn.metrics import precision_score
precisionvalue = precision_score(y_test,y_pred_svm)
print(precisionvalue)

0.6328444692837917


In [19]:
# List of suspicious keywords (same as in your feature extraction)
SUSPICIOUS_KEYWORDS = [
    "money", "win", "free", "offer", "cash", "prize", "bonus", "credit", "loan", "earn", "income", "profits", "download",
    "investment", "bank", "refinance", "withdraw", "transfer", "deposit", "payment", "wallet", "bitcoin",
    "crypto", "forex", "payout", "reward", "check", "funds", "lottery", "jackpot", "porn", "adult", "xxx", "sex"
]

# Function to check for suspicious words in a URL
def flag_suspicious_url(url):
    """Check if a URL contains any suspicious words."""
    url_cleaned = re.sub(r'^https?:\/\/(www\.)?', '', url.lower())  # Clean the URL
    return any(keyword in url_cleaned for keyword in SUSPICIOUS_KEYWORDS)

# Function to make predictions using the machine learning model
def model_prediction(url, model, extract_features_func):
    """Use the machine learning model to predict if the URL is scammy."""
    # Create a DataFrame for the single URL
    url_data = pd.DataFrame([{'url': url}])

    # Extract features using the feature extraction function
    url_features = extract_features_func(url_data)

    # Drop unnecessary columns like 'url'
    url_features.drop(columns=['url'], errors='ignore', inplace=True)

    # Use the trained model to make a prediction
    prediction = model.predict(url_features)

    # Return the result (1 = scammy, 0 = legitimate)
    return "Scammy" if prediction[0] == 1 else "Legitimate"

# Function to handle URL scanning with the hybrid approach
def scan_url(url, model, extract_features_func):
    """First scan the URL for suspicious words, if not found, pass to the model."""
    # Step 1: Check for suspicious words
    if flag_suspicious_url(url):
        return f"The URL '{url}' is flagged as: Scammy (based on suspicious keywords)"

    # Step 2: If no suspicious words, use the model
    return f"The URL '{url}' is predicted to be: {model_prediction(url, model, extract_features_func)}"


# Test the hybrid approach on a URL
url_to_test = 'downloadgamees.com'

#Assuming you have a trained model and the extract_features function
result = scan_url(url_to_test, svm_model, extract_features)
print(result)


The URL 'downloadgamees.com' is flagged as: Scammy (based on suspicious keywords)


In [20]:
#saving the model

from joblib import dump
from google.colab import files

# Assuming rf_model is your trained Random Forest model

# Save the model to a file
dump(svm_model, 'svm_model.joblib')

# Download the saved model
files.download('svm_model.joblib')




<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>