In [9]:
import re
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, confusion_matrix
from sklearn.metrics import f1_score  #ytrue, ypred
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report

# NLP toolkits
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
from nltk.corpus import wordnet
nltk.download('punkt_tab')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to /home/vscode/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/vscode/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package punkt to /home/vscode/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/vscode/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/vscode/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [10]:
def grid_SVC(X_train, y_train, performance_metric='f1', resultsGrid=False):
    model = SVC()
    C = np.linspace(0.000001 , 1000, 10)
    kernels = ['poly', 'rbf', 'linear', 'sigmoid']
    gamma = ['scale', 'auto']
    grid = dict(C = C, kernel = kernels, gamma = gamma)
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=1)
    grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv,
                           scoring=performance_metric,error_score='raise')
    grid_result = grid_search.fit(X_train, y_train)
    if resultsGrid==True:
        return grid_result.cv_results_
    else:
        return  grid_result.best_estimator_

lemmatizer = WordNetLemmatizer() # active

def lemmatize_text(text): #Lematización del texto.
    tokens = nltk.word_tokenize(text)
    lemmatized_words = [lemmatizer.lemmatize(word) for word in tokens]
    lemmatized_text = ' '.join(lemmatized_words)
    return lemmatized_text



def clean_text(string):
    string = string.lower()
    string = re.sub(r"http(s)?:*", '', string)
    string = re.sub(r"[-/.#&]", ' ', string)
    string = re.sub(r"w{3}", ' ', string)
    string = string.strip()
    string = ' '.join([word for word in string.split() if word not in stop_words])
    string = lemmatize_text(string)
    return string

In [11]:
url = "https://raw.githubusercontent.com/4GeeksAcademy/NLP-project-tutorial/main/url_spam.csv"

In [12]:
df  = pd.read_csv(url)

In [13]:
df.columns

Index(['url', 'is_spam'], dtype='object')

In [14]:
samples = [df['url'].loc[np.random.randint(0,df.shape[0])] for _ in range(30)]

In [15]:
lemmatizer = WordNetLemmatizer() # active
stop_words.extend(['of', 'yet']) # we can add stop words
list(map(clean_text, samples))

['vanityfair com hollywood 2020 06 ziwe fumudoh baited interview alison roman',
 'briefingday com v4n3i4f3',
 'itsmestevebryant com',
 'propublica org article removed label said medical use prohibited tried sell thousand mask official distribute hospital 970141',
 'mailchi mp bigspaceship big spaceship internet brunch 1729154',
 'theguardian com tv radio 2020 jun 24 netflixs floor lava show save summer',
 'cnet com news dark matter detector pick unexplained unexpected signal',
 'nbcnews com health health news cdc say covid 19 case u may 10 n1232134',
 'youtube com watch ? v=zo0ssjb1tri utm_source=morning_brew',
 'thehustle co 06302020 foraging mushroom',
 'wired com story algorithm predicts criminality based face spark furor',
 'espn com nfl story _ id 29383371 netflix produce 6 part series colin kaepernick',
 'youtube com watch ? v=l3qqqu7qlom feature=emb_title',
 'nytimes com interactive 2020 06 24 magazine reparation slavery html',
 'cnn com 2020 06 23 tech wirecard ceo markus braun

In [16]:
X, y = df['url'], df['is_spam']

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X,y,
                                                    shuffle=True,
                                                    test_size = 0.3,
                                                    random_state=123)

In [18]:
# cleaning data
X_train = X_train.apply(lambda x : clean_text(x))
X_test = X_test.apply(lambda x : clean_text(x))

In [19]:
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer().fit(X_train)
#vect = TfidfVectorizer().fit(X_train)
X_train = vect.transform(X_train)
X_test  = vect.transform(X_test)

In [20]:
y_train = np.where(y_train==True,1,0)

In [21]:
best_ml = grid_SVC(X_train, y_train)
preds = best_ml.predict(X_test)
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

       False       0.97      0.96      0.97       682
        True       0.87      0.92      0.89       218

    accuracy                           0.95       900
   macro avg       0.92      0.94      0.93       900
weighted avg       0.95      0.95      0.95       900

