### 0. Importing modules and data

In [24]:
import pandas as pd
import os
from edastatmil_milser import edas_tatmil as EDA
from pickle import dump
import matplotlib.pyplot as plt
import numpy as np
import regex as re
from nltk import download
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
from wordcloud import WordCloud

In [15]:
raw_df = pd.read_csv('../data/raw/url_spam.csv')
raw_df.head()

Unnamed: 0,url,is_spam
0,https://briefingday.us8.list-manage.com/unsubs...,True
1,https://www.hvper.com/,True
2,https://briefingday.com/m/v4n3i4f3,True
3,https://briefingday.com/n/20200618/m#commentform,False
4,https://briefingday.com/fan,True


### 1. Preprocess the links
#### 1.1 Transform to numerical

In [16]:
raw_df["is_spam"] = raw_df["is_spam"].apply(lambda x: 1 if x else 0).astype(int)
raw_df.head()

Unnamed: 0,url,is_spam
0,https://briefingday.us8.list-manage.com/unsubs...,1
1,https://www.hvper.com/,1
2,https://briefingday.com/m/v4n3i4f3,1
3,https://briefingday.com/n/20200618/m#commentform,0
4,https://briefingday.com/fan,1


#### 1.2 Drop replicates

In [17]:
raw_df.shape

(2999, 2)

In [18]:
raw_df = raw_df.drop_duplicates()
raw_df = raw_df.reset_index(inplace = False, drop = True)
raw_df.shape

(2369, 2)

#### 1.3 Clean url text

In [19]:
def cleantext(text):
    # Remove not letters or white spaces
    text = re.sub(r'[^a-z ]', " ", text)
    
    # Remove white spaces
    text = re.sub(r'\s+[a-zA-Z]\s+', " ", text)
    text = re.sub(r'\^[a-zA-Z]\s+', " ", text)

    # Multiple white spaces into one
    text = re.sub(r'\s+', " ", text.lower())

    # Remove tags
    text = re.sub("&lt;/?.*?&gt;"," &lt;&gt; ", text)

    return text.split()

In [20]:
df = raw_df.copy()
df["url"] = df["url"].apply(cleantext)
df.head()

Unnamed: 0,url,is_spam
0,"[https, briefingday, us, list, manage, com, un...",1
1,"[https, www, hvper, com]",1
2,"[https, briefingday, com, v, i]",1
3,"[https, briefingday, com, m, commentform]",0
4,"[https, briefingday, com, fan]",1


#### 1.4 Lemmatize

In [21]:
download("wordnet")
lemmatizer = WordNetLemmatizer()

download("stopwords")
stop_words = stopwords.words("english")

[nltk_data] Downloading package wordnet to /home/taticc/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/taticc/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [22]:
def lemmatize(words, lemmatizer = lemmatizer):
    tokens = [lemmatizer.lemmatize(word) for word in words]
    tokens = [word for word in tokens if word not in stop_words]
    tokens = [word for word in tokens if len(word) > 3]
    return tokens

In [23]:
df["url"] = df["url"].apply(lemmatize)
df.head()

Unnamed: 0,url,is_spam
0,"[http, briefingday, list, manage, unsubscribe]",1
1,"[http, hvper]",1
2,"[http, briefingday]",1
3,"[http, briefingday, commentform]",0
4,"[http, briefingday]",1


#### 1.5 Visualization

In [27]:
wordcloud = WordCloud(width = 800, height = 800, background_color = "black", max_words = 1000, min_font_size = 20, random_state = 42).generate(str(df["url"]))

fig = plt.figure(figsize = (8, 8), facecolor = None)
plt.imshow(wordcloud)
plt.axis("off")


ValueError: Only supported for TrueType fonts