# NLP Project

### Importing Libraries

In [1]:
import pandas as pd
import regex as re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV

## Loading datasets

In [2]:
total_data = pd.read_csv("https://breathecode.herokuapp.com/asset/internal-link?id=435&path=url_spam.csv")
total_data.head()

Unnamed: 0,url,is_spam
0,https://briefingday.us8.list-manage.com/unsubs...,True
1,https://www.hvper.com/,True
2,https://briefingday.com/m/v4n3i4f3,True
3,https://briefingday.com/n/20200618/m#commentform,False
4,https://briefingday.com/fan,True


# Data Processing

In [3]:
total_data["is_spam"].dtype

dtype('bool')

### Categorical to numerical transformation

In [4]:
total_data["is_spam"] = total_data["is_spam"].astype(int)
total_data.head() 

Unnamed: 0,url,is_spam
0,https://briefingday.us8.list-manage.com/unsubs...,1
1,https://www.hvper.com/,1
2,https://briefingday.com/m/v4n3i4f3,1
3,https://briefingday.com/n/20200618/m#commentform,0
4,https://briefingday.com/fan,1


### Elimination of repeated values

In [5]:
print(total_data.shape)
print(f"Spam: {len(total_data.loc[total_data.is_spam == 1])}")
print(f"No Spam: {len(total_data.loc[total_data.is_spam == 0])}")

(2999, 2)
Spam: 696
No Spam: 2303


In [6]:
total_data = total_data.drop_duplicates()
total_data = total_data.reset_index(inplace=False, drop = True)
total_data.shape

(2369, 2)

### Conversion into numbers

In [7]:
X = TfidfVectorizer(analyzer="char", ngram_range=(3, 5), min_df=5, max_df=0.9, lowercase=True).fit_transform(total_data["url"])
y = total_data["is_spam"].astype(int)

### Splitting into train and test

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Initialization and training of the model

In [9]:
# Using SVC to train
model = SVC()
model.fit(X_train, y_train)

# Predicting test
y_pred = model.predict(X_test)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: ", accuracy)

Accuracy:  0.9620253164556962


### Optimizing the model using Random Search

In [10]:
# params
param_dist = {
    "C": [0.1, 1, 3, 10],
    "kernel": ["linear", "rbf"],
    "gamma": ["scale", "auto", 0.1, 0.01]
}

model_RS = RandomizedSearchCV(
    estimator=SVC(),
    param_distributions=param_dist,
    n_iter=10,
    cv=3,
    random_state=42,
    n_jobs=-1
)

# fit the model
model_RS.fit(X_train, y_train)

print("Best parameters: ", model_RS.best_params_)

# model evaluation
RS_y_pred=model_RS.predict(X_test)
RS_accuracy = accuracy_score(y_test, RS_y_pred)
print("Random Search accuracy: ", RS_accuracy)

Best parameters:  {'kernel': 'rbf', 'gamma': 'scale', 'C': 3}
Random Search accuracy:  0.9641350210970464
