# NLP Project Tutorial

In this project we reuse the spam dataset and repeat the cleaning process. However, instead of building a spam detector using an SVM algorithm, we will have a very brief and simple introduction to sentiment analysis by adding two columns to the dataset that detect the polarity and subjectivity of the message text using a new tool.

In [7]:
pip install pandas regex matplotlib sklearn

Collecting sklearn
  Downloading sklearn-0.0.tar.gz (1.1 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting scikit-learn
  Downloading scikit_learn-1.1.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (31.2 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.2/31.2 MB[0m [31m43.5 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
Collecting scipy>=1.3.2
  Downloading scipy-1.8.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (41.6 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.6/41.6 MB[0m [31m40.6 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m0:01[0m:01[0m
[?25hCollecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
Collecting joblib>=1.0.0
  Downloading joblib-1.1.0-py2.py3-none-any.whl (306 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m307.0/307.0 kB[0m [31m66.5 MB/s[0m eta [36m0:00:00

In [8]:
# Importing dependencies

import pandas as pd 
import regex as re
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn import model_selection, svm
from sklearn.metrics import classification_report, accuracy_score

In [9]:
# Load the URL spam dataset directly from the course repository and preview it

df = pd.read_csv(
    "https://raw.githubusercontent.com/4GeeksAcademy/NLP-project-tutorial/main/url_spam.csv"
)
df.head()

Unnamed: 0,url,is_spam
0,https://briefingday.us8.list-manage.com/unsubs...,True
1,https://www.hvper.com/,True
2,https://briefingday.com/m/v4n3i4f3,True
3,https://briefingday.com/n/20200618/m#commentform,False
4,https://briefingday.com/fan,True


In [3]:
# Encode the target variable as integers: 1 = spam, 0 = not spam.
#
# The loaded column holds booleans (True/False), so comparing each value with
# the *string* 'True' never matched and labelled every row 0. Normalising each
# value through str() accepts booleans, "True"/"False" strings and values that
# are already 0/1, which also makes this cell safe to re-run.

df['is_spam'] = df['is_spam'].map(
    lambda x: 1 if str(x).strip().lower() in ('true', '1') else 0
)

In [11]:
# EDA: class balance and overall size, used as a baseline

print(f"spam count: {(df['is_spam'] == 1).sum()}")
print(f"not spam count: {(df['is_spam'] == 0).sum()}")
print(df.shape)

# Store the label as integers from this point on.
df['is_spam'] = df['is_spam'].astype(int)

spam count: 696
not spam count: 2303
(2999, 2)


In [12]:
# Drop exact duplicate rows, rebuild a contiguous 0..n-1 index and keep only
# the two columns used downstream.

df = df.drop_duplicates().reset_index(drop=True)[['url', 'is_spam']]
df.shape

(2369, 2)

In [13]:
# NLP cleaning process


def clean_url(text):
    """Reduce a raw URL to lowercase a-z words separated by single spaces.

    Parameters
    ----------
    text : str
        The raw URL.

    Returns
    -------
    str
        The lowercased text with tag-like ``<...>`` spans removed and every
        run of characters outside a-z (digits, punctuation, whitespace)
        replaced by one space. Leading/trailing spaces are not stripped.
    """
    text = text.lower()

    # Remove tags. This has to run before punctuation is stripped: once the
    # angle brackets are gone the pattern can no longer match anything.
    text = re.sub(r"</?.*?>", " ", text)

    # Remove punctuation, digits and special chars, collapsing each run of
    # non-letters into a single separator.
    text = re.sub(r"[^a-z]+", " ", text)

    return text


# Iterate over the values themselves rather than looking rows up by position,
# so the cell does not depend on the frame having a contiguous 0..n-1 index.
clean_desc = [clean_url(raw_url) for raw_url in df['url']]

# Assign the cleaned descriptions to the data frame
df['url'] = clean_desc

df.head()

Unnamed: 0,url,is_spam
0,https briefingday us list manage com unsubscribe,1
1,https www hvper com,1
2,https briefingday com m v n i f,1
3,https briefingday com n m commentform,0
4,https briefingday com fan,1


In [18]:
# Building a stop words list from scratch.
# Every entry is lowercase: the cleaning step lowercases each URL, so an
# uppercase entry (previously 'I') could never match a token.

stop_words = [
    'is', 'you', 'your', 'and', 'the', 'to', 'from', 'or', 'i', 'for', 'do',
    'get', 'not', 'here', 'in', 'im', 'have', 'on', 're', 'https', 'com',
]

In [None]:
# Exclude stop words -- TODO: this cell is empty, so `stop_words` is not applied before the URLs are vectorized.

In [19]:
# Build the sparse bag-of-words matrix (one row per cleaned URL).
# Note: despite its name, message_vectorizer holds the resulting matrix,
# not the CountVectorizer object.

bag_of_words = CountVectorizer()
message_vectorizer = bag_of_words.fit_transform(df['url'])

# Hold out 45% of the rows for evaluation, with a fixed seed for reproducibility.

X_train, X_test, y_train, y_test = train_test_split(
    message_vectorizer,
    df['is_spam'],
    test_size=0.45,
    random_state=42,
    shuffle=True,
)

In [20]:
# Support-vector classifier with a linear kernel.
# degree and gamma are kept for parity with the original call; per the
# scikit-learn docs they only influence non-linear kernels.
classifier = svm.SVC(
    C=1.0,
    kernel='linear',
    degree=3,
    gamma='auto',
)

In [21]:
# Train on the training split, predict the held-out split and print the
# per-class precision / recall / F1 summary.
predictions = classifier.fit(X_train, y_train).predict(X_test)
report = classification_report(y_test, predictions)
print(report)

              precision    recall  f1-score   support

           0       0.95      0.96      0.96       959
           1       0.65      0.59      0.62       108

    accuracy                           0.93      1067
   macro avg       0.80      0.78      0.79      1067
weighted avg       0.92      0.93      0.93      1067

