# Natural language processing: spam detection

In [1]:
# Handle imports up-front
import regex as re
import pandas as pd
from sklearn.model_selection import train_test_split
from nltk import download
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from utils import cross_val

## 1. Data loading

### 1.1. Load the data

In [2]:
# Fetch the URL spam dataset from the tutorial repository, discard any
# repeated rows, and renumber the index so it has no gaps afterwards
data_df=(
    pd.read_csv('https://raw.githubusercontent.com/4GeeksAcademy/NLP-project-tutorial/main/url_spam.csv')
    .drop_duplicates()
    .reset_index(drop=True)
)

### 1.2. Inspect the data

In [3]:
# Your code here

Unnamed: 0,url,is_spam
0,https://briefingday.us8.list-manage.com/unsubs...,True
1,https://www.hvper.com/,True
2,https://briefingday.com/m/v4n3i4f3,True
3,https://briefingday.com/n/20200618/m#commentform,False
4,https://briefingday.com/fan,True


### 1.3. Train-test split

In [5]:
# Model inputs are every column except the target
features=data_df.drop(columns='is_spam')

# The target column, still in its raw form
labels=data_df['is_spam']

# Turn each label into an integer: truthy values become 1, falsy values become 0
encoded_labels=labels.map(bool).astype(int)

# Hold out 25% of the rows for testing; the seed is fixed at 315
(
    training_features,
    testing_features,
    encoded_training_labels,
    encoded_testing_labels,
)=train_test_split(features, encoded_labels, test_size=0.25, random_state=315)

## 2. EDA

### 2.1. Text preprocessing

In [None]:
# Your code here - think about how you are cleaning the text. Look at some examples: are there words we should filter out?

### 2.2. Lemmatization

In [7]:
# Your code here

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/siderealyear/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/siderealyear/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,url
311,"[nytimes, wild, removed, html]"
2317,"[digg, cmail, ptuurik, vkjjhbly]"
1953,[snarkmarket]
1872,"[youtube, watch]"
1752,"[wired, story, amazon, shake, self, driving, r..."


### 2.3. Vectorization

In [8]:
# Your code here. Take a look at the features. Do we need all of them? Could we reduce dimensionality or do feature selection?

Encoded features shape: (1776, 386)


## 3. SVM model

### 3.1. Baseline model performance

In [9]:
# Support vector classifier left at its default settings (only the seed is fixed);
# this is the reference point for the tuned model
baseline_model=SVC(random_state=315)

# Score the untuned model with the cross_val helper from utils on the encoded training set
scores=cross_val(
    baseline_model,
    encoded_training_features,
    encoded_training_labels
)

# Start the results collection with the baseline scores, kept for later
results=dict(Baseline=scores)

Cross-validation accuracy: 93.24 +/- 1.46%


### 3.2. Hyperparameter optimization

In [None]:
# Do the optimization

Best hyperparameters: {'C': 1, 'degree': 3, 'gamma': 'scale', 'kernel': 'poly'}


In [None]:
# Use cross-validation to evaluate a new model trained with the best hyperparameter values from the optimization

## 4. Model evaluation

In [None]:
# Finally, try the model out on the test data. Be sure to process the test data the same way you did the training data!