In [2]:
import pathlib
import pandas as pd

BASE_DIR = pathlib.Path().resolve().parent
DATASET_DIR = BASE_DIR / "datasets"

EXPORT_DIR = DATASET_DIR / "exports"
EXPORT_DIR.mkdir(exist_ok=True, parents=True)
SPAM_DATASET_PATH = EXPORT_DIR / "spam-dataset.csv"

ZIPS_DIR = DATASET_DIR / "zips"
ZIPS_DIR.mkdir(exist_ok=True, parents=True)

SPAM_SMS_ZIP_PATH = ZIPS_DIR / "sms-spam-dataset.zip"
SPAM_YOUTUBE_ZIP_PATH = ZIPS_DIR / "youtube-spam-dataset.zip"

In [3]:
SMS_SPAM_ZIP = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"
YOUTUBE_SPAM_ZIP = "https://archive.ics.uci.edu/static/public/380/youtube+spam+collection.zip"

In [4]:
!curl $SMS_SPAM_ZIP -o $SPAM_SMS_ZIP_PATH
!curl $YOUTUBE_SPAM_ZIP -o $SPAM_YOUTUBE_ZIP_PATH

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  198k    0  198k    0     0   272k      0 --:--:-- --:--:-- --:--:--  272k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  159k    0  159k    0     0   339k      0 --:--:-- --:--:-- --:--:--  339k


In [5]:
SPAM_CLASSIFICATION_DIR = DATASET_DIR / "spam_classification"
SMS_SPAM_DIR = SPAM_CLASSIFICATION_DIR / "sms-spam"
YOUTUBE_SPAM_DIR = SPAM_CLASSIFICATION_DIR / "youtube-spam"
SMS_SPAM_DIR.mkdir(exist_ok=True, parents=True)
YOUTUBE_SPAM_DIR.mkdir(exist_ok=True, parents=True)

In [6]:
!unzip -o $SPAM_SMS_ZIP_PATH -d $SMS_SPAM_DIR
!unzip -o $SPAM_YOUTUBE_ZIP_PATH -d $YOUTUBE_SPAM_DIR


Archive:  /Users/edkwang/Drive/Code/projects/public/machine-learning/lstm-spam-api/datasets/zips/sms-spam-dataset.zip
  inflating: /Users/edkwang/Drive/Code/projects/public/machine-learning/lstm-spam-api/datasets/spam_classification/sms-spam/SMSSpamCollection  
  inflating: /Users/edkwang/Drive/Code/projects/public/machine-learning/lstm-spam-api/datasets/spam_classification/sms-spam/readme  


Archive:  /Users/edkwang/Drive/Code/projects/public/machine-learning/lstm-spam-api/datasets/zips/youtube-spam-dataset.zip
  inflating: /Users/edkwang/Drive/Code/projects/public/machine-learning/lstm-spam-api/datasets/spam_classification/youtube-spam/Youtube01-Psy.csv  
  inflating: /Users/edkwang/Drive/Code/projects/public/machine-learning/lstm-spam-api/datasets/spam_classification/youtube-spam/__MACOSX/._Youtube01-Psy.csv  
  inflating: /Users/edkwang/Drive/Code/projects/public/machine-learning/lstm-spam-api/datasets/spam_classification/youtube-spam/Youtube02-KatyPerry.csv  
  inflating: /Users/edkwang/Drive/Code/projects/public/machine-learning/lstm-spam-api/datasets/spam_classification/youtube-spam/__MACOSX/._Youtube02-KatyPerry.csv  
  inflating: /Users/edkwang/Drive/Code/projects/public/machine-learning/lstm-spam-api/datasets/spam_classification/youtube-spam/Youtube03-LMFAO.csv  
  inflating: /Users/edkwang/Drive/Code/projects/public/machine-learning/lstm-spam-api/datasets/spam_cl

In [9]:
for path in SMS_SPAM_DIR.glob("*"):
    print(path)
    try:
        print(path.read_text()[:100])
    except:
        pass

/Users/edkwang/Drive/Code/projects/public/machine-learning/lstm-spam-api/datasets/spam_classification/sms-spam/readme
/Users/edkwang/Drive/Code/projects/public/machine-learning/lstm-spam-api/datasets/spam_classification/sms-spam/SMSSpamCollection
ham	Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g


In [11]:
sms_spam_input_path = SMS_SPAM_DIR / "SMSSpamCollection"
sms_spam_input_path.read_text()[:100]


'ham\tGo until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g'

In [12]:
for path in YOUTUBE_SPAM_DIR.glob("*"):
    print(path)

/Users/edkwang/Drive/Code/projects/public/machine-learning/lstm-spam-api/datasets/spam_classification/youtube-spam/Youtube03-LMFAO.csv
/Users/edkwang/Drive/Code/projects/public/machine-learning/lstm-spam-api/datasets/spam_classification/youtube-spam/Youtube04-Eminem.csv
/Users/edkwang/Drive/Code/projects/public/machine-learning/lstm-spam-api/datasets/spam_classification/youtube-spam/Youtube05-Shakira.csv
/Users/edkwang/Drive/Code/projects/public/machine-learning/lstm-spam-api/datasets/spam_classification/youtube-spam/Youtube02-KatyPerry.csv
/Users/edkwang/Drive/Code/projects/public/machine-learning/lstm-spam-api/datasets/spam_classification/youtube-spam/__MACOSX
/Users/edkwang/Drive/Code/projects/public/machine-learning/lstm-spam-api/datasets/spam_classification/youtube-spam/Youtube01-Psy.csv


In [13]:
sms_df = pd.read_csv(sms_spam_input_path, sep="\t", header=None, names=["label", "text"])
sms_df['source'] = 'sms=spam'
sms_df.head()

Unnamed: 0,label,text,source
0,ham,"Go until jurong point, crazy.. Available only ...",sms=spam
1,ham,Ok lar... Joking wif u oni...,sms=spam
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,sms=spam
3,ham,U dun say so early hor... U c already then say...,sms=spam
4,ham,"Nah I don't think he goes to usf, he lives aro...",sms=spam


In [14]:
dfs = []
for path in YOUTUBE_SPAM_DIR.glob("*.csv"):
    raw_df = pd.read_csv(path)
    raw_df['raw_source'] = str(path.name)
    raw_df.rename(columns={"CLASS": "raw_label", "CONTENT":"text"}, inplace=True)
    raw_df['label'] = raw_df['raw_label'].apply(lambda x: "spam" if str(x) == "1" else "ham")
    raw_df['source'] = 'youtube-spam'
    df = raw_df[['label', 'text', 'raw_source','source']].copy()
    dfs.append(df)
yt_df = pd.concat(dfs)
yt_df.head()

Unnamed: 0,label,text,raw_source,source
0,ham,"<a href=""http://www.youtube.com/watch?v=KQ6zr6...",Youtube03-LMFAO.csv,youtube-spam
1,ham,wierd but funny﻿,Youtube03-LMFAO.csv,youtube-spam
2,spam,"Hey guys, I&#39;m a human.<br /><br /><br />Bu...",Youtube03-LMFAO.csv,youtube-spam
3,ham,Party Rock....lol...who wants to shuffle!!!﻿,Youtube03-LMFAO.csv,youtube-spam
4,ham,Party rock﻿,Youtube03-LMFAO.csv,youtube-spam


In [15]:
spam_df = pd.concat([sms_df, yt_df])
spam_df.head()

Unnamed: 0,label,text,source,raw_source
0,ham,"Go until jurong point, crazy.. Available only ...",sms=spam,
1,ham,Ok lar... Joking wif u oni...,sms=spam,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,sms=spam,
3,ham,U dun say so early hor... U c already then say...,sms=spam,
4,ham,"Nah I don't think he goes to usf, he lives aro...",sms=spam,


In [16]:
spam_df.to_csv(SPAM_DATASET_PATH, index=False)