# URL spam detection with an SVM classifier

In [None]:
import re
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
from nltk.corpus import wordnet
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to /home/vscode/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/vscode/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/vscode/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/vscode/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [23]:
def grid_SVC(X_train, y_train, performance_metric='f1', resultsGrid=False):
    """Tune an ``SVC`` with an exhaustive, cross-validated grid search.

    The grid combines 10 evenly spaced ``C`` values between 1e-6 and 1000,
    four kernels and both ``gamma`` heuristics. Every combination is scored
    with 5-fold stratified cross-validation (a single repeat, seeded with
    ``random_state=1``).

    Parameters
    ----------
    X_train
        Training features, passed unchanged to ``GridSearchCV.fit``.
    y_train
        Training labels, passed unchanged to ``GridSearchCV.fit``.
    performance_metric : default ``'f1'``
        Forwarded as the ``scoring`` argument of ``GridSearchCV``.
    resultsGrid : bool, default ``False``
        When truthy, return the full ``cv_results_`` mapping instead of the
        best estimator.

    Returns
    -------
    ``grid_result.cv_results_`` if ``resultsGrid`` is truthy, otherwise
    ``grid_result.best_estimator_``.

    Raises
    ------
    Any exception raised while fitting a candidate is propagated, because the
    search is configured with ``error_score='raise'``.
    """
    param_grid = {
        'C': np.linspace(0.000001, 1000, 10),
        'kernel': ['poly', 'rbf', 'linear', 'sigmoid'],
        'gamma': ['scale', 'auto'],
    }
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=1)
    grid_search = GridSearchCV(
        estimator=SVC(),
        param_grid=param_grid,
        n_jobs=-1,
        cv=cv,
        scoring=performance_metric,
        error_score='raise',
    )
    grid_result = grid_search.fit(X_train, y_train)

    # Test truthiness rather than `== True`, so any truthy flag selects the
    # results grid instead of silently falling through to the estimator.
    if resultsGrid:
        return grid_result.cv_results_
    return grid_result.best_estimator_


def lemmatize_text(text):
    """Lemmatize each whitespace-separated token of ``text``.

    Uses the module-level ``lemmatizer`` object, which must be defined before
    this function is called on non-empty text. The lemmatized tokens are
    returned joined by single spaces.
    """
    lemmas = []
    for token in text.split():
        lemmas.append(lemmatizer.lemmatize(token))
    return ' '.join(lemmas)



def clean_text(string):
    """Turn a raw URL into a space-separated string of lemmatized tokens.

    Steps, in order:
      1. lower-case the input;
      2. remove ``http://`` / ``https://`` scheme prefixes;
      3. replace the separators ``- / . # &`` with spaces;
      4. drop a standalone ``www`` token;
      5. remove tokens found in the module-level ``stop_words`` list;
      6. lemmatize the remaining tokens with ``lemmatize_text``.

    Parameters
    ----------
    string : str
        The URL to clean.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces.
    """
    string = string.lower()
    # Match the scheme together with its "://" so that "http" occurring inside
    # a host or path word (e.g. "httpbin") is left intact.
    string = re.sub(r"https?://", '', string)
    string = re.sub(r"[-/.#&]", ' ', string)
    # Remove "www" only when it is a whole token; an unanchored pattern would
    # also cut three consecutive w's out of words such as "awwwards".
    string = re.sub(r"\bwww\b", ' ', string)
    # split() discards leading/trailing and repeated whitespace.
    kept = [word for word in string.split() if word not in stop_words]
    return lemmatize_text(' '.join(kept))

In [24]:
# Location of the URL spam dataset (a CSV file hosted on GitHub).
url = "https://raw.githubusercontent.com/4GeeksAcademy/NLP-project-tutorial/main/url_spam.csv"

In [25]:
# Read the CSV found at `url` into a DataFrame.
df  = pd.read_csv(url)

In [26]:
# List the column labels of the loaded frame.
df.columns

Index(['url', 'is_spam'], dtype='object')

In [27]:
# Preview set: 30 URLs drawn with replacement (as the previous random-index
# lookup did), but with a fixed seed so the cleaned samples are reproducible
# after a kernel restart, and without assuming a 0..n-1 integer index.
samples = df['url'].sample(n=30, replace=True, random_state=123).tolist()

In [None]:
lemmatizer = WordNetLemmatizer()
# `stop_words` is a module-level list, so an unconditional extend() would
# append duplicates every time this cell is re-run; add only missing words.
stop_words.extend([word for word in ['of', 'yet'] if word not in stop_words])
# Sanity-check the cleaning pipeline on the sampled URLs.
list(map(clean_text, samples))

['nytimes com 2020 05 30 technology twitter trump dorsey html',
 'bbc com news uk england wiltshire 53132567',
 'morningbrew com daily story 2020 06 30 2020 photo',
 'hvper com',
 'youtube com watch?v=r9ozap my5e',
 'theverge com 2020 6 18 21296180 apple hey email app basecamp rejection response controversy antitrust regulation',
 'cosmopolitan com lifestyle a32980205 jenna marble quitting youtube blackface nicki minaj apology',
 'tiqets com blog black history museum',
 'nirandfar com focus quiz',
 'gimletmedia com show reply n8hwl7 128 crime machine part ii episode player',
 'briefingday com fan',
 'cato org publication commentary dc statehood fool errand',
 'food52 com',
 'usatoday com story news politics election 2020 06 23 june 23 primary aoc win charles booker amy mcgrath result delayed 3235505001',
 'wired co uk article coronavirus beijing second wave lockdown',
 'vox com policy politics 2020 6 24 21281485 bill barr donald trump berman doj david rohde',
 'nikolehannahjones com',


In [29]:
# Model input is the raw URL text; the target is the spam flag column.
X = df['url']
y = df['is_spam']

In [30]:
# Hold out 30% of the rows for testing; the fixed seed keeps the shuffled
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, shuffle=True, random_state=123
)

In [31]:
# Display the training URLs as they come out of the split (not yet cleaned).
X_train

1099    https://www.reuters.com/article/us-usa-trump-o...
1448    https://www.eventbrite.com/e/big-friendship-bo...
2327    https://news.rice.edu/2020/06/29/laser-welded-...
1412    https://creativemornings.com/companies/sdco-pa...
1224    https://www.nytimes.com/2020/06/25/world/afric...
                              ...                        
1147    https://en.wikipedia.org/wiki/Tim_O%27Brien_(a...
2154    https://www.washingtonpost.com/privacy-policy/...
1766    https://www.cnbc.com/2020/06/26/amazon-buys-se...
1122    https://www.amazon.com/Rivers-Tides-Andy-Golds...
1346            https://www.gao.gov/assets/710/707839.pdf
Name: url, Length: 2099, dtype: object

In [32]:
# cleaning data
X_train = X_train.apply(lambda x : clean_text(x))
X_test = X_test.apply(lambda x : clean_text(x))

In [33]:
# Display the training URLs after clean_text has been applied.
X_train

1099    reuters com article u usa trump obamacare trum...
1448    eventbrite com e big friendship book launch ti...
2327    news rice edu 2020 06 29 laser welded sugar sw...
1412            creativemornings com company sdco partner
1224    nytimes com 2020 06 25 world africa ebola cong...
                              ...                        
1147         en wikipedia org wiki tim_o%27brien_(author)
2154    washingtonpost com privacy policy 2011 11 18 g...
1766    cnbc com 2020 06 26 amazon buy self driving te...
1122    amazon com river tide andy goldsworthy dp b001...
1346                         gao gov asset 710 707839 pdf
Name: url, Length: 2099, dtype: object

In [None]:
# Bag-of-words features. CountVectorizer is already imported in the notebook's
# import cell, so it is not imported again here. The vocabulary is learned
# from the training URLs only (fit_transform does the fit and the transform in
# one pass); the test split is encoded with that same vocabulary.
vect = CountVectorizer()
X_train = vect.fit_transform(X_train)
X_test = vect.transform(X_test)

In [35]:
# Encode the training target as integers: 1 where the flag equals True, else 0.
# NOTE(review): y_test is not converted in this cell and keeps the values it
# had after the split, so the evaluation cell compares it against predictions
# from a model trained on 0/1 labels — confirm this mixed labelling is intended.
y_train = np.where(y_train==True,1,0)

In [36]:
# Inspect the encoded training labels.
y_train

array([0, 1, 0, ..., 0, 0, 0], shape=(2099,))

In [37]:
# Run the SVC grid search with grid_SVC's defaults (scoring 'f1'); with
# resultsGrid left at False the call returns the best estimator found.
best_ml = grid_SVC(X_train, y_train)

In [38]:
# Predict labels for the vectorised test split with the selected model.
preds = best_ml.predict(X_test)

In [39]:
# Per-class precision, recall and F1 of the predictions against y_test.
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

       False       0.97      0.96      0.97       682
        True       0.87      0.92      0.89       218

    accuracy                           0.95       900
   macro avg       0.92      0.94      0.93       900
weighted avg       0.95      0.95      0.95       900



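El modelo SVM obtuvo un 95% de accuracy sobre el conjunto de prueba (900 URLs), mostrando un buen desempeño en la detección de URLs spam. Como las clases están desbalanceadas (218 URLs spam frente a 682 que no lo son), conviene fijarse también en las métricas de la clase spam: el alto recall (92%) indica que el modelo identifica correctamente la mayoría de los enlaces maliciosos, con una precisión del 87%, por lo que resulta adecuado para este problema de clasificación.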