# Explore here

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split


In [None]:
# Pull the labelled URL dataset straight from the tutorial repository
url = "https://raw.githubusercontent.com/4GeeksAcademy/NLP-project-tutorial/main/url_spam.csv"
data = pd.read_csv(url)

# Preview the first rows, then the column summary.  DataFrame.info() writes
# its report itself and returns None, so the second print also emits "None".
for summarize in (data.head, data.info):
    print(summarize())


In [None]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer

# Make sure the corpora used by the preprocessing step are installed locally
# (stop-word list for filtering, WordNet for lemmatization)
import nltk
for resource in ('stopwords', 'wordnet'):
    nltk.download(resource)

# Preprocessing function
def preprocess_url(url):
    """Turn a raw URL into a space-separated string of lemmatized word tokens.

    The scheme (``http://`` / ``https://``) and a leading ``www.`` are
    dropped, every non-letter character becomes a separator, and the
    remaining lower-cased tokens are filtered against NLTK's English
    stop-word list and lemmatized with WordNet.

    Args:
        url: The URL to clean. Non-string values (e.g. NaN from a missing
            CSV field) are treated as an empty URL.

    Returns:
        The cleaned tokens joined by single spaces, or ``''`` when no token
        survives.
    """
    if not isinstance(url, str):
        return ''

    # Strip only the scheme / "www." prefix.  A URL contains no whitespace,
    # so a pattern such as http\S+ would swallow the entire input and leave
    # nothing to tokenize.
    url = re.sub(r'^\s*(?:https?://)?(?:www\.)?', '', url, flags=re.IGNORECASE)
    # Every non-letter (dots, slashes, digits, hyphens, ...) becomes a separator
    url = re.sub(r'[^a-zA-Z]', ' ', url)
    tokens = url.lower().split()

    # Build the stop-word set once per call instead of re-reading the corpus
    # list for every token; a set also gives O(1) membership tests.
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(word) for word in tokens)

# Clean every raw URL and keep the result next to the original column
raw_urls = data['url']
data['processed_url'] = raw_urls.apply(preprocess_url)

# Sanity-check the new column on the first few rows
print(data.head())


In [None]:
# Target is the spam flag; the only feature is the cleaned URL text
y = data['is_spam']
X = data['processed_url']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
vectorizer = CountVectorizer()
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)


In [None]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score


In [None]:
# Baseline classifier: linear-kernel SVM, every other hyper-parameter left
# at its default
svm = SVC(kernel='linear')
svm.fit(X=X_train_vect, y=y_train)


In [None]:
# Score the baseline model on the held-out split
y_pred = svm.predict(X_test_vect)

baseline_accuracy = accuracy_score(y_test, y_pred)
baseline_report = classification_report(y_test, y_pred)

print("Accuracy:", baseline_accuracy)
print("Classification Report:\n", baseline_report)


In [None]:
from sklearn.model_selection import GridSearchCV

# Define parameter grid.
# `degree` is only read by the polynomial kernel, so it gets its own
# sub-grid.  Crossing it with 'linear' and 'rbf' would fit the same model
# three times per C value (27 candidates instead of the 15 distinct ones).
# For a non-poly winner, best_params_ therefore carries no 'degree' entry.
param_grid = [
    {'kernel': ['linear', 'rbf'], 'C': [0.1, 1, 10]},
    {'kernel': ['poly'], 'C': [0.1, 1, 10], 'degree': [2, 3, 4]},
]

# 3-fold cross-validated search, candidates ranked by accuracy
grid_search = GridSearchCV(SVC(), param_grid, cv=3, scoring='accuracy')
grid_search.fit(X_train_vect, y_train)

print("Best Parameters:", grid_search.best_params_)


In [None]:
# GridSearchCV refits the winning configuration on the full training set
# (refit=True is its default), so best_estimator_ is already trained and
# fitting it again would only repeat that work.
best_svm = grid_search.best_estimator_

# Score the tuned model on the same held-out split as the baseline
y_pred_optimized = best_svm.predict(X_test_vect)

print("Optimized Accuracy:", accuracy_score(y_test, y_pred_optimized))
print("Optimized Classification Report:\n", classification_report(y_test, y_pred_optimized))


In [None]:
# Load vectorizer and model
# joblib is not imported anywhere else in this file, so bring it into scope
# here; without it the load calls fail with NameError.
import joblib

# NOTE(review): nothing in the visible code writes 'vectorizer.pkl' or
# 'svm_model.pkl' -- presumably they were saved with joblib.dump in an
# earlier session; confirm they exist before running this cell.
# joblib.load unpickles its input, so only load files from a trusted source.
loaded_vectorizer = joblib.load('vectorizer.pkl')
loaded_model = joblib.load('svm_model.pkl')

# Example usage: a new URL must go through the same cleaning and the same
# fitted vocabulary as the training data before it reaches the model
sample_url = "http://example.com/suspicious-link"
sample_url_processed = preprocess_url(sample_url)
sample_url_vect = loaded_vectorizer.transform([sample_url_processed])
prediction = loaded_model.predict(sample_url_vect)
print("Spam" if prediction[0] == 1 else "Not Spam")
