# Natural language processing: spam detection

### 1. Data Loading and Preparation


### 1.1. Load the Data

In [5]:
import pandas as pd

# Read the CSV file from the URL into a DataFrame
data_df = pd.read_csv('https://raw.githubusercontent.com/4GeeksAcademy/NLP-project-tutorial/main/url_spam.csv')

# Drop duplicates if any
data_df.drop_duplicates(inplace=True)
data_df.reset_index(inplace=True, drop=True)

### 1.2. Inspect the Data

In [6]:
# Inspect the first few rows of the dataset
data_df.head()

# Check data types and any missing values
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2369 entries, 0 to 2368
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   url      2369 non-null   object
 1   is_spam  2369 non-null   bool  
dtypes: bool(1), object(1)
memory usage: 20.9+ KB


### 1.3. Train-test Split

In [7]:
from sklearn.model_selection import train_test_split

# Separate features (URL text) from labels (is_spam column)
labels = data_df['is_spam']
features = data_df.drop('is_spam', axis=1)

# Encode the labels as integers (1 for spam, 0 for not spam)
encoded_labels = labels.apply(lambda x: 1 if x else 0).astype(int)

# Split the data into training and testing features and labels
training_features, testing_features, encoded_training_labels, encoded_testing_labels = train_test_split(
    features['url'],  # Assuming the text is in the 'url' column
    encoded_labels,
    test_size=0.25,
    random_state=315
)

## 2. Exploratory Data Analysis (EDA)

### 2.1. Text Preprocessing

In [8]:
import regex as re

# Preprocess the text data (remove special characters, digits, etc.)
def preprocess_text(text):
    # Remove non-alphabetic characters and make everything lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    return text

# Apply preprocessing to the training and testing features
training_features = training_features.apply(preprocess_text)
testing_features = testing_features.apply(preprocess_text)

### 2.2. Lemmatization

In [9]:
from nltk.stem import WordNetLemmatizer
from nltk import download

# Download necessary NLTK data
download('wordnet')

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

# Lemmatize each word in the text
def lemmatize_text(text):
    return ' '.join([lemmatizer.lemmatize(word) for word in text.split()])

# Apply lemmatization to the features
training_features = training_features.apply(lemmatize_text)
testing_features = testing_features.apply(lemmatize_text)

[nltk_data] Downloading package wordnet to /home/vscode/nltk_data...


### 2.3. Vectorization

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Limiting to 5000 features for efficiency

# Fit the vectorizer on the training data and transform both training and testing data
training_features_vec = tfidf_vectorizer.fit_transform(training_features)
testing_features_vec = tfidf_vectorizer.transform(testing_features)

### 3. SVM Model

### 3.1. Baseline Model Performance

In [15]:
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

# Example SVM model
model = SVC(random_state=315)

# Perform cross-validation
scores = cross_val_score(model, training_features_vec, encoded_training_labels, cv=5, n_jobs=-1)

print("Cross-validation scores:", scores)

Cross-validation scores: [0.89325843 0.89577465 0.89577465 0.89295775 0.89295775]


### 3.2. Hyperparameter Optimization

In [21]:
import regex as re
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for hyperparameter optimization
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

# Set up GridSearchCV to tune the hyperparameters using cross-validation
grid_search = GridSearchCV(estimator=SVC(random_state=315), param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the grid search to the training data
grid_search.fit(training_features_vec, encoded_training_labels)

# Display the best hyperparameters
print(f"Best Hyperparameters: {grid_search.best_params_}")

Best Hyperparameters: {'C': 1, 'gamma': 'scale', 'kernel': 'linear'}


### 4. Model Evaluation

In [22]:
from sklearn.metrics import accuracy_score, classification_report

# Get the best model from grid search
best_model = grid_search.best_estimator_

# Make predictions on the test data
predictions = best_model.predict(testing_features_vec)

# Evaluate the model's performance on the test data
accuracy = accuracy_score(encoded_testing_labels, predictions)
print(f"Accuracy on Test Data: {accuracy}")
print("Classification Report:\n", classification_report(encoded_testing_labels, predictions))

Accuracy on Test Data: 0.9106239460370995
Classification Report:
               precision    recall  f1-score   support

           0       0.91      1.00      0.95       537
           1       1.00      0.05      0.10        56

    accuracy                           0.91       593
   macro avg       0.96      0.53      0.53       593
weighted avg       0.92      0.91      0.87       593

