In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import nltk
import unicodedata
import re
from env import user, password, host, db, protocol
import acquire

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from wordcloud import WordCloud
from nltk import ngrams

from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder


# Exercises

Do your work for this exercise in a file named explore.

1. Spam Data
* Load the spam data set.
* Create and explore bigrams for the spam data. Visualize them with a word cloud. How do they compare with the ham bigrams?
    * spam seems to be urging a reply for some offer or advertisement. ham words seem to be informative or basic daily conversation

* Is there any overlap in the bigrams for the spam data and the ham data?
    * 'call' is a frequent word in both spam and ham

* Create and explore with trigrams (i.e. a n-gram with an n of 3) for both the spam and ham data.

2. Explore the blog articles using the techniques discussed in the exploration lesson.

3. Explore the news articles using the techniques discussed in the exploration lesson. Use the category variable when exploring.

# Data Exploration

* In this lesson, we'll be taking a look at a data set that contains SMS messages that are labelled as either a spam text message, or an actual text message.

In [None]:
def get_db_url(database, host=host, user=user, password=password):
    """
    Build a SQLAlchemy connection URL for a MySQL database using the pymysql driver.

    Parameters
    ----------
    database : str
        Name of the database (schema) to connect to.
    host, user, password : str
        Connection details; each defaults to the value imported from `env`.

    Returns
    -------
    str
        A URL of the form ``mysql+pymysql://user:password@host/database``.
    """
    # Imported locally so the function carries its own dependency.
    from urllib.parse import quote_plus

    # Characters such as '@', ':' or '/' inside the credentials would be read
    # as URL delimiters, so percent-encode them before embedding.
    return f'mysql+pymysql://{quote_plus(user)}:{quote_plus(password)}@{host}/{database}'


# Connection target and query for the spam table.
url = get_db_url("spam_db")
sql = "SELECT * FROM spam"

# Read the full table into a DataFrame indexed by its `id` column, then preview it.
df = pd.read_sql(sql=sql, con=url, index_col="id")
df.head()

In [None]:
# Write the frame (including its index) to a local CSV file.
filename = 'spamham_data'
df.to_csv(path_or_buf=filename)

In [None]:
ADDITIONAL_STOPWORDS = ['r', 'u', '2', 'ltgt']

def clean(text):
    """
    Normalize a string and return its lemmatized, non-stopword tokens.

    Steps: NFKD-normalize, drop non-ASCII characters, lowercase, strip every
    character that is neither a word character nor whitespace, split on
    whitespace, discard stopwords, and lemmatize what remains.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    list of str
        Lemmatized tokens in their original order.
    """
    wnl = nltk.stem.WordNetLemmatizer()
    # A set gives constant-time membership checks; the original list was
    # scanned once per token, which is needlessly slow on a whole corpus.
    stopwords = set(nltk.corpus.stopwords.words('english')) | set(ADDITIONAL_STOPWORDS)
    text = (unicodedata.normalize('NFKD', text)
            .encode('ascii', 'ignore')
            .decode('utf-8', 'ignore')
            .lower())
    # Punctuation is removed before the stopword check, so tokens are compared
    # in their punctuation-free form.
    words = re.sub(r'[^\w\s]', '', text).split()
    return [wnl.lemmatize(word) for word in words if word not in stopwords]


* Let's first take a look at how many of the messages are spam vs ham:

In [None]:
# Absolute and relative frequency of each label, side by side.
label_counts = df.label.value_counts()
label_shares = df.label.value_counts(normalize=True)

labels = pd.concat([label_counts, label_shares], axis=1)
labels.columns = ['n', 'percent']
labels


In [None]:
def show_counts_and_ratios(df, column):
    """
    Summarize how often each distinct value occurs in one column.

    Takes a dataframe and the name of a single column.
    Returns a dataframe indexed by the column's values with two columns:
    'n' (absolute count) and 'percent' (fraction of all rows).
    """
    counts = df[column].value_counts()
    shares = df[column].value_counts(normalize=True)

    summary = pd.concat([counts, shares], axis=1)
    summary.columns = ['n', 'percent']
    return summary

# Same summary as the previous cell, produced through the reusable helper.
show_counts_and_ratios(df, "label")


Let's break the data up into 3 separate pieces:

* The words that appear in legitimate text messages.

* The words that appear in spam text messages.

* All of the words.

In [None]:
# Boolean masks selecting each class of message.
is_ham = df.label == 'ham'
is_spam = df.label == 'spam'

# Join every message of a class into one string, then clean it into a word list.
ham_words = clean(' '.join(df.loc[is_ham, 'text']))
spam_words = clean(' '.join(df.loc[is_spam, 'text']))
all_words = clean(' '.join(df.text))

Once we have a list of words, we can transform it into a pandas Series, which we can then use to show us how often each of the words occurs.

In [None]:
# Count occurrences of every word in each of the three word lists.
ham_freq, spam_freq, all_freq = (
    pd.Series(words).value_counts()
    for words in (ham_words, spam_words, all_words)
)

spam_freq.head()


Now we'll combine these three together to get one resulting data frame that we can work with:

In [None]:
# Align the three frequency series on their word index. A word missing from a
# class becomes NaN after the concat, so fill with 0 and cast back to integers.
# `set_axis` already returns a new object; the `inplace` keyword was removed in
# pandas 2.0, where passing it raises a TypeError.
word_counts = (pd.concat([all_freq, ham_freq, spam_freq], axis=1, sort=True)
                .set_axis(['all', 'ham', 'spam'], axis=1)
                .fillna(0)
                .astype(int))

word_counts.head()


We can now use this data set to answer some interesting questions:

* What are the most frequently occurring words?

In [None]:
# Rank words by their overall count and show the ten most frequent.
word_counts.sort_values(by='all', ascending=False).head(10)


Are there any words that uniquely identify a spam or ham message?

In [None]:
# Most frequent words that never appear in spam, and those that never appear in ham.
ham_only = word_counts[word_counts.spam == 0].sort_values(by='ham').tail(6)
spam_only = word_counts[word_counts.ham == 0].sort_values(by='spam').tail(6)

pd.concat([ham_only, spam_only])


In [None]:
# Share of each word's occurrences that come from spam vs ham messages.
proportions = word_counts.assign(p_spam=word_counts.spam / word_counts['all'],
                                 p_ham=word_counts.ham / word_counts['all'])

# Keep the 20 words with the highest overall count, ordered by their ham share.
top_20_proportions = (proportions
                      .sort_values(by='all')
                      [['p_spam', 'p_ham']]
                      .tail(20)
                      .sort_values('p_ham'))
top_20_proportions.plot.barh(stacked=True)

plt.title('Proportion of Spam vs Ham for the 20 most common words')


In [None]:
# Restrict to words seen more than 10 times in both classes.
common_words = word_counts[(word_counts.spam > 10) & (word_counts.ham > 10)]

# Spam-to-ham ratio; the small constant is added to the denominator as in the
# original expression.
ranked = (common_words
          .assign(ratio=common_words.spam / (common_words.ham + .01))
          .sort_values(by='ratio'))

# Show both extremes: the lowest and the highest ratios.
pd.concat([ranked.head(), ranked.tail()])

# Word Clouds

The wordcloud allows you to identify the relative frequency of different keywords using an easily digestible visual.

# Common Use Cases

As a visualization technique, this method gives a more qualitative analysis of the topics in the documents.


**Pros**

* It’s intuitive and easy to comprehend.
* It helps identify overall respondent sentiment and the specific factors that drive it.
* It provides direction for further analysis.

**Cons**

* It fails to measure each word’s value in and of itself.
* It allows irrelevant words to appear.
* When words appear similar in size, it becomes difficult to differentiate them.

First we'll take a look at a simple example:

In [None]:
sentence = 'Mary had a little lamb, little lamb, little lamb. Its fleece was white as snow.'

# Build the cloud from raw text; the result can be handed straight to plt.imshow.
cloud = WordCloud(background_color='white')
img = cloud.generate(sentence)

plt.imshow(img)
plt.axis('off')  # axes add nothing to a word cloud
plt.show()


In [None]:
# One cloud per word list; the "all" cloud is tall and narrow to fill the left half.
all_cloud = WordCloud(background_color='white', height=1000, width=400).generate(' '.join(all_words))
ham_cloud = WordCloud(background_color='white', height=600, width=800).generate(' '.join(ham_words))
spam_cloud = WordCloud(background_color='white', height=600, width=800).generate(' '.join(spam_words))

# Hand-placed axes: full-height left panel, two stacked panels on the right.
plt.figure(figsize=(10, 8))
axs = [plt.axes([0, 0, .5, 1]), plt.axes([.5, .5, .5, .5]), plt.axes([.5, 0, .5, .5])]

panels = zip(axs, (all_cloud, ham_cloud, spam_cloud), ('All Words', 'Ham', 'Spam'))
for ax, cloud, title in panels:
    ax.imshow(cloud)
    ax.set_title(title)
    ax.axis('off')


# Bigrams

Bigrams are a specific instance of the broader concept of n-grams, which is a way to combine words together. This lets us measure not just the individual word frequency, but also takes into account which words appear together.

To produce the bigrams, we'll use nltk.

In [None]:
# Toy example: split a sentence on whitespace and pair up adjacent words.
sentence = 'Mary had a little lamb'

# n=2 yields bigrams; the result is materialized with list() below for display.
bigrams = nltk.ngrams(sentence.split(), 2)
list(bigrams)


We can apply the same transformation to our ham data set in order to find out which bigrams are the most frequently occurring.

In [None]:
# Count every adjacent word pair in the ham words, then keep the 20 most frequent.
ham_bigram_counts = pd.Series(nltk.ngrams(ham_words, 2)).value_counts()
top_20_ham_bigrams = ham_bigram_counts.head(20)

top_20_ham_bigrams.head()


# Create and explore bigrams for the spam data. Visualize them with a word cloud. How do they compare with the ham bigrams?
* spam seems to be urging a reply for some offer or advertisement. ham words seem to be informative or basic daily conversation

# Is there any overlap in the bigrams for the spam data and the ham data?
* 'call' is a frequent word in both spam and ham


In [None]:
# Count every adjacent word pair in the spam words, then keep the 20 most frequent.
spam_bigram_counts = pd.Series(nltk.ngrams(spam_words, 2)).value_counts()
top_20_spam_bigrams = spam_bigram_counts.head(20)

top_20_spam_bigrams.head()

In [None]:
# Turn each (word, word) key into a single "word word" phrase mapped to its count.
data = {
    ' '.join(bigram): count
    for bigram, count in top_20_spam_bigrams.to_dict().items()
}

# Size each phrase by its frequency.
cloud = WordCloud(background_color='white', width=800, height=400)
img = cloud.generate_from_frequencies(data)

plt.figure(figsize=(8, 4))
plt.imshow(img)
plt.axis('off')
plt.show()

In [None]:
# Sort once and reuse the sorted series for both the bars and the tick labels.
# Previously the labels came from the unsorted series, so bigrams with tied
# counts could be reordered by the sort and end up under the wrong label.
ham_bigram_counts = top_20_ham_bigrams.sort_values(ascending=False)
ham_bigram_counts.plot.barh(color='blue', width=.9, figsize=(10, 6))

plt.title('20 Most frequently occuring ham bigrams')
plt.ylabel('Bigram')
plt.xlabel('# Occurances')

# make the labels pretty: show each bigram as "word word" instead of a tuple.
# Iterating the index directly avoids relying on reset_index() naming the
# column 'index'.
ticks, _ = plt.yticks()
labels = pd.Series([' '.join(bigram) for bigram in ham_bigram_counts.index])
_ = plt.yticks(ticks, labels)


We can use these bigrams to make a word cloud as well, with a little more effort.

In [None]:
# `generate_from_frequencies` lets us decide how large each word or phrase is
# drawn. It expects a dictionary whose keys are the words (phrases) and whose
# values are the numbers that determine their sizes.
#
# The series is converted to a dictionary, and each tuple key is flattened
# into one space-separated phrase.
data = {
    ' '.join(bigram): count
    for bigram, count in top_20_ham_bigrams.to_dict().items()
}

cloud = WordCloud(background_color='white', width=800, height=400)
img = cloud.generate_from_frequencies(data)

plt.figure(figsize=(8, 4))
plt.imshow(img)
plt.axis('off')
plt.show()


# Create and explore with trigrams (i.e. a n-gram with an n of 3) for both the spam and ham data.

# Spam

In [None]:
# Toy example: the same sentence as before, now grouped into runs of three words.
sentence = 'Mary had a little lamb'

# n=3 yields trigrams; the result is materialized with list() below for display.
trigrams = nltk.ngrams(sentence.split(), 3)
list(trigrams)


In [None]:
# Count every run of three adjacent spam words, then keep the 20 most frequent.
spam_trigram_counts = pd.Series(nltk.ngrams(spam_words, 3)).value_counts()
top_20_spam_trigrams = spam_trigram_counts.head(20)

top_20_spam_trigrams.head()

In [None]:
# Join ALL three words of each trigram into the phrase. Using only the first
# two words dropped the third and made trigrams that share a two-word prefix
# collide on the same dictionary key, with the later count overwriting the
# earlier one.
data = {' '.join(trigram): count for trigram, count in top_20_spam_trigrams.to_dict().items()}

# Size each phrase by its frequency.
img = WordCloud(background_color='white', width=800, height=400).generate_from_frequencies(data)
plt.figure(figsize=(8, 4))
plt.imshow(img)
plt.axis('off')
plt.show()

# Ham

In [None]:
# Same toy trigram example as in the Spam section above, repeated for reference.
sentence = 'Mary had a little lamb'

# n=3 yields trigrams; the result is materialized with list() below for display.
trigrams = nltk.ngrams(sentence.split(), 3)
list(trigrams)


In [None]:
# Count every run of three adjacent ham words, then keep the 20 most frequent.
ham_trigram_counts = pd.Series(nltk.ngrams(ham_words, 3)).value_counts()
top_20_ham_trigrams = ham_trigram_counts.head(20)

top_20_ham_trigrams.head()

In [None]:
# Join ALL three words of each trigram into the phrase. Using only the first
# two words dropped the third and made trigrams that share a two-word prefix
# collide on the same dictionary key, with the later count overwriting the
# earlier one.
data = {' '.join(trigram): count for trigram, count in top_20_ham_trigrams.to_dict().items()}

# Size each phrase by its frequency.
img = WordCloud(background_color='white', width=800, height=400).generate_from_frequencies(data)
plt.figure(figsize=(8, 4))
plt.imshow(img)
plt.axis('off')
plt.show()

# Explore the blog articles using the techniques discussed in the exploration lesson.

In [None]:
# Load the news articles through the project's acquire module.
# NOTE(review): refresh=False presumably reuses a cached copy instead of re-scraping — confirm in acquire.py.
news_df = acquire.get_news_articles_data(refresh=False)

In [None]:
# Load the blog articles through the project's acquire module.
# NOTE(review): refresh=False presumably reuses a cached copy instead of re-scraping — confirm in acquire.py.
codeup_df = acquire.get_blog_articles_data(refresh=False)

# News 

In [None]:
# Display the news articles frame; the cells below read its 'content' column.
news_df

# Word Cloud

In [None]:
# combine all the article texts into a single string
text = ' '.join(news_df['content'])

# generate the word cloud
wordcloud = WordCloud(width=800, height=800, background_color='white', max_words=100).generate(text)

# plot the word cloud
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()


# Bigrams

In [None]:
# tokenize the text
tokens = nltk.word_tokenize(text)

# create a bigram finder
bigram_measures = BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(tokens)

# apply filters to the bigrams
finder.apply_freq_filter(3)
finder.apply_word_filter(lambda word: word in nltk.corpus.stopwords.words('english'))

# get the top 10 bigrams by PMI
bigrams = finder.nbest(bigram_measures.pmi, 10)

# print the top 10 bigrams
print("The top 10 bigrams by PMI:")
for bigram in bigrams:
    print(bigram)


# Trigrams

In [None]:
# combine all the article texts into a single string
text = ' '.join(news_df['content'])

# tokenize the text
tokens = nltk.word_tokenize(text)

# generate trigrams
trigrams = list(ngrams(tokens, 3))


In [None]:
# Show the first five trigrams. The original called next(iter(trigrams)) on
# every pass, which builds a fresh iterator each time and therefore printed
# the first trigram five times.
print("Example trigrams:")
for trigram in trigrams[:5]:
    print(trigram)


In [None]:
# count the frequency of each trigram
freq_dist = nltk.FreqDist(trigrams)

# print the most common trigrams
print("The most common trigrams:")
for trigram, count in freq_dist.most_common(10):
    print(f"{trigram}: {count}")

# Blog 

In [None]:
# Display the blog articles frame; the cells below read its 'content' column.
codeup_df

# Word Cloud

In [None]:
# combine all the article texts into a single string
text = ' '.join(codeup_df['content'])

# generate the word cloud
wordcloud = WordCloud(width=800, height=800, background_color='white', max_words=100).generate(text)

# plot the word cloud
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()

# Bigrams

In [None]:
# combine all the article texts into a single string
text = ' '.join(codeup_df['content'])

# tokenize the text
tokens = nltk.word_tokenize(text)

# create a bigram finder
bigram_measures = BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(tokens)

# apply filters to the bigrams
finder.apply_freq_filter(3)
finder.apply_word_filter(lambda word: word in nltk.corpus.stopwords.words('english'))

# get the top 10 bigrams by PMI
bigrams = finder.nbest(bigram_measures.pmi, 10)

# print the top 10 bigrams
print("The top 10 bigrams by PMI:")
for bigram in bigrams:
    print(bigram)


# Trigrams

In [None]:
# combine all the article texts into a single string
text = ' '.join(codeup_df['content'])

# tokenize the text
tokens = nltk.word_tokenize(text)

# generate trigrams
trigrams = list(ngrams(tokens, 3))


In [None]:
# Show the first five trigrams. The original called next(iter(trigrams)) on
# every pass, which builds a fresh iterator each time and therefore printed
# the first trigram five times.
print("Example trigrams:")
for trigram in trigrams[:5]:
    print(trigram)


In [None]:
# count the frequency of each trigram
freq_dist = nltk.FreqDist(trigrams)

# print the most common trigrams
print("The most common trigrams:")
for trigram, count in freq_dist.most_common(10):
    print(f"{trigram}: {count}")
