In [25]:
from pprint import pprint

import pandas as pd
import numpy as np
import prepare as p
import acquire as a

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

import nltk
import re

In [3]:
# Load the spam dataset from the local CSV file
spam_raw = pd.read_csv('./spam_clean.csv')

# Run the shared prepare pipeline on the 'text' column, passing a few extra
# tokens to be treated as stopwords
extra_stopwords = ['i', 'ur', '2', 'u', '4']
df = p.prepare_data(spam_raw, 'text', additional_stopwords=extra_stopwords)

In [4]:
# Preview the first rows of the prepared spam dataframe
df.head()

Unnamed: 0,label,text,clean,stemmed,lemmatized
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...,go jurong point crazi avail bugi n great world...,go jurong point crazy available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif oni,ok lar joke wif oni,ok lar joking wif oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry wkly comp win fa cup final tkts 21s...,free entri wkli comp win fa cup final tkt 21st...,free entry wkly comp win fa cup final tkts 21s...
3,ham,U dun say so early hor... U c already then say...,dun say early hor c already say,dun say earli hor c alreadi say,dun say early hor c already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah dont think goes usf lives around though,nah dont think goe usf live around though,nah dont think go usf life around though


In [5]:
def clean(text: str) -> list:
    """Turn a raw string into a list of lemmatized, non-stopword tokens.

    Steps, in order:
      1. Drop every non-ASCII character and lowercase the text.
      2. Remove every character that is neither a word character nor whitespace.
      3. Split on whitespace.
      4. Discard NLTK English stopwords and lemmatize what is left with the
         WordNet lemmatizer.

    Parameters
    ----------
    text : str
        The raw text to clean.

    Returns
    -------
    list
        The lemmatized tokens, in their original order.

    Notes
    -----
    Assumes the NLTK 'stopwords' and 'wordnet' data are already available
    locally -- TODO confirm for a fresh environment.
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()
    english_stopwords = set(nltk.corpus.stopwords.words('english'))

    # Encoding to ASCII with errors ignored silently drops accented/non-ASCII characters
    ascii_bytes = text.encode('ascii', 'ignore')
    normalized = ascii_bytes.decode('utf-8', 'ignore').lower()

    # Strip punctuation, then tokenize on whitespace
    tokens = re.sub(r'[^\w\s]', '', normalized).split()

    lemmas = []
    for token in tokens:
        # Stopwords are filtered on the raw token, before lemmatization
        if token in english_stopwords:
            continue
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

In [10]:
# News categories to request articles for
categories = [
    "business",
    "sports",
    "technology",
    "entertainment",
    "science",
    "world",
]

# Fetch the articles for every category through the acquire module
news_df = a.get_all_news_articles(categories)



  soup = BeautifulSoup(response.text)


In [12]:
# Run the shared prepare pipeline on the article 'content' column.
# The third positional argument is presumably the extra-stopword list
# (passed by keyword as additional_stopwords elsewhere) -- TODO confirm.
extra_news_stopwords = ['said']
news_df = p.prepare_data(news_df, 'content', extra_news_stopwords)

In [13]:
# Preview the first rows of news_df to check the columns returned by prepare_data
news_df.head()

Unnamed: 0,title,content,category,clean,stemmed,lemmatized
0,Need to have commemorative coins depicting Nee...,After javelin thrower Neeraj Chopra won a gold...,business,javelin thrower neeraj chopra gold medal tokyo...,javelin thrower neeraj chopra gold medal tokyo...,javelin thrower neeraj chopra gold medal tokyo...
1,Microsoft Co-founder Paul Allen's superyacht l...,"A 414-foot superyacht, Octopus, once owned by ...",business,414foot superyacht octopus owned microsoft cof...,414foot superyacht octopu own microsoft cofoun...,414foot superyacht octopus owned microsoft cof...
2,Alibaba employee alleges she woke up in hotel ...,China's Alibaba suspended several employees af...,business,chinas alibaba suspended several employees emp...,china alibaba suspend sever employe employe al...,china alibaba suspended several employee emplo...
3,Depict real picture of job loss: Parliamentary...,A Parliamentary panel has asked the Labour Min...,business,parliamentary panel asked labour ministry use ...,parliamentari panel ask labour ministri use re...,parliamentary panel asked labour ministry use ...
4,Snickers in Spain gets accused of homophobia o...,Snickers in Spain has pulled a 20-second TV ad...,business,snickers spain pulled 20second tv advertisemen...,snicker spain pull 20second tv advertis accus ...,snicker spain pulled 20second tv advertisement...


In [14]:
# Learn the vocabulary of the lemmatized articles and count each term per article
cv = CountVectorizer()
bag_of_words = cv.fit_transform(news_df['lemmatized'])

# Show the sparse-matrix summary (shape and number of stored entries)
bag_of_words

<150x2616 sparse matrix of type '<class 'numpy.int64'>'
	with 4745 stored elements in Compressed Sparse Row format>

In [15]:
# Expand the sparse matrix into a dense one to see the raw counts.
# NOTE: this materializes every zero, so only do it on small matrices like this one.
bag_of_words.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 1]])

In [16]:
# Show the counts as a dataframe with one column per vocabulary term.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is the supported replacement (requires scikit-learn >= 1.0).
pd.DataFrame(bag_of_words.todense(), columns=cv.get_feature_names_out()).head()

Unnamed: 0,027,10,100,1000,10000,1000strong,100metre,100million,1012,107th,...,youth,youve,zaranj,zaveri,zealand,zelda,zero,zimbabwe,zombie,zulum
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## TF-IDF

In [18]:
# Fit a TF-IDF vectorizer on the lemmatized article text
tfidf = TfidfVectorizer()
tfidfs = tfidf.fit_transform(news_df.lemmatized)

# Show the TF-IDF weights as a dataframe with one column per vocabulary term.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is the supported replacement (requires scikit-learn >= 1.0).
pd.DataFrame(tfidfs.todense(), columns=tfidf.get_feature_names_out()).head()

Unnamed: 0,027,10,100,1000,10000,1000strong,100metre,100million,1012,107th,...,youth,youve,zaranj,zaveri,zealand,zelda,zero,zimbabwe,zombie,zulum
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Bag of N-grams

In [19]:
# Refit the count vectorizer on two-word sequences only (min n = max n = 2).
# Note that this rebinds cv and bag_of_words, replacing the single-word versions.
bigram_range = (2, 2)
cv = CountVectorizer(ngram_range=bigram_range)
bag_of_words = cv.fit_transform(news_df['lemmatized'])

In [20]:
# Show the n-gram counts as a dataframe with one column per n-gram.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is the supported replacement (requires scikit-learn >= 1.0).
pd.DataFrame(bag_of_words.todense(), columns=cv.get_feature_names_out()).head()

Unnamed: 0,027 market,10 bank,10 constituency,10 day,10 esa,10 gram,10 lakh,10 valued,100 million,100 starring,...,zealand government,zealand new,zealand threeyear,zealand visa,zelda game,zero kriti,zimbabwe 14yearold,zimbabwe two,zombie punk,zulum added
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
# Build the model inputs: clean the raw article content with clean(), re-join the
# tokens into one string per article, and turn those strings into count features
cleaned_content = news_df['content'].apply(clean).apply(' '.join)
cv = CountVectorizer()
X = cv.fit_transform(cleaned_content)

# Target: the article's category label
y = news_df['category']

In [27]:
# Hold out 20% of the articles for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=12,
)

In [28]:
# Fit a shallow decision tree on the training split.
# random_state is fixed because scikit-learn permutes the features at every split,
# so ties between equally good splits (likely on sparse count data) could otherwise
# resolve differently on each run and change the reported scores.
tree = DecisionTreeClassifier(max_depth=5, random_state=12)
tree.fit(X_train, y_train)

# Mean accuracy on the training data
tree.score(X_train, y_train)

0.5166666666666667

In [29]:
# Sanity check: compute training accuracy by hand as the share of correct predictions
train_predictions = tree.predict(X_train)
(train_predictions == y_train).mean()

0.5166666666666667

In [30]:
# Mean accuracy on the held-out test split
tree.score(X_test, y_test)

0.4