# Introduction to Natural Language Processing
## 2. Unsupervised Learning

In [None]:
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer

# Get Vader data for sentiment analysis
# (the lexicon is needed by nltk's `SentimentIntensityAnalyzer`, which is
# used in the sentiment analysis section of this notebook)
nltk.download("vader_lexicon")

# hex colour used for each author's points / bars in the plots below
PARTY_COLOURS = {"trump": "#E91D0E", "obama": "#00A6EF"}

# IPython magic: draw matplotlib figures inline in the notebook
%matplotlib inline

## Data Cleaning 

In [None]:
import re


def clean_tweet(text):
    """Normalise a raw tweet so it can be fed to a text vectorizer.

    Args:
        text: The tweet, either as UTF-8 encoded ``bytes`` or as an
            already-decoded ``str``.

    Returns:
        The tweet as a ``str`` with every comma that directly precedes a
        digit removed, the HTML entity for ``&`` replaced by the word
        "and", and URLs stripped.
    """
    # decode UTF-8 byte strings; already-decoded text has no `.decode` in
    # Python 3, so it is passed through untouched
    if isinstance(text, bytes):
        text = text.decode("utf-8")
    # remove commas in numbers (else vectorizer will split on them)
    text = re.sub(r",([0-9])", r"\1", text)
    # sort out HTML formatting of & (the optional ";" consumes the end of
    # "&amp;" too, otherwise a stray semicolon is left behind)
    text = re.sub(r"&amp;?", "and", text)
    # strip urls
    return re.sub(r"https?://\S*", "", text)


# NOTE(security): `read_pickle` unpickles the file, and unpickling can execute
# arbitrary code -- only load `tweets.pkl` from a source you trust.
df = pd.read_pickle("tweets.pkl")
# clean every tweet; the `text` column is overwritten with the cleaned version
df["text"] = df["text"].map(clean_tweet)

## Dimension Reduction

If we allow our vectorizer to infer a vocabulary from the corpus, then this will typically result in a huge number of sparsely populated features. We can often reduce the number of dimensions while retaining the relevant information (albeit sacrificing some interpretability), which improves the efficiency of our models and analysis.

Let's visualise our tfidf vectors in a few different ways.

### Principal Component Analysis

Principal component analysis (PCA) aims to find a coordinate system in which the correlation between features is minimized. By keeping only the coordinate directions in the new system that explain the most variance, we can reduce the dimensions of our feature space. This transformation is linear, so each of the principal components is a linear combination of the original features.

In [None]:
from sklearn.decomposition import PCA

# dimension reduction algorithms can be pretty slow, so let's work with a sample
# try on the whole data set if you want!
sample_trump = df.loc[df["label"] == 0, ["text", "label"]].sample(500)
sample_obama = df.loc[df["label"] == 1, ["text", "label"]].sample(500)
# `DataFrame.append` was deprecated in pandas 1.4 and removed in 2.0;
# `pd.concat` stacks the two samples the same way (original indices are kept)
sample = pd.concat([sample_trump, sample_obama])

# learn a tf-idf vocabulary from the sample: English stop words are dropped
# and the vocabulary is capped at 10,000 terms
tfidf_vectorizer = TfidfVectorizer(stop_words="english", max_features=10000)
tfidf_vectors = tfidf_vectorizer.fit_transform(sample["text"])

In [None]:
# presumably (number of sampled tweets, size of the learned vocabulary)
tfidf_vectors.shape

**Exercise: Use `PCA` to reduce `tfidf_vectors` to two dimensions, then plot the results using the `scatter_1` function. Pass the labels as colours.**

In [None]:
def scatter_1(x, colors):
    """Scatter-plot 2-D points, coloured by author.

    Args:
        x: Array indexable as ``x[mask, column]``; column 0 is drawn on the
            horizontal axis and column 1 on the vertical axis.
        colors: Per-point labels, ``0`` for Trump and ``1`` for Obama.

    Returns:
        The ``(figure, axes)`` pair that was drawn on.
    """
    figure = plt.figure(figsize=(8, 8))
    axes = plt.subplot(aspect="equal")

    # one scatter layer per author: (label value, colour key, legend entry)
    authors = ((0, "trump", "Trump"), (1, "obama", "Obama"))
    for value, party, legend_name in authors:
        selected = colors == value
        axes.scatter(
            x[selected, 0],
            x[selected, 1],
            c=PARTY_COLOURS[party],
            label=legend_name,
            alpha=0.5,
        )

    # hide both axes (ticks and tick labels)
    axes.axes.get_yaxis().set_visible(False)
    axes.axes.get_xaxis().set_visible(False)
    plt.legend()
    return figure, axes

In [None]:
# dimension reduce and plot here
# keep only the first two principal components so the result can be plotted
pca = PCA(n_components=2)
# dimensional reduction techniques often make use of the `fit_transform` method
# it both fits the model and then transforms the vectors you inputted
# NOTE(review): `tfidf_vectors` is presumably a scipy sparse matrix; depending
# on your scikit-learn version `PCA` may need a dense array (`.toarray()`) -- confirm
X_pca = pca.fit_transform(<your-tfidf-vector>)
# plot your graph now!

### t-SNE

t-SNE is another dimension reduction algorithm, but one that is generally better at preserving the local structure of the data (points that are close together in the original space stay close together in the embedding). In the case of our twitter data it does a much better job than PCA. This transformation is highly non-linear though, so it is hard to understand what the 2-dimensional representation means in reference to the original features.

In [None]:
from sklearn.manifold import TSNE

In [None]:
# we have suggested some parameters below, feel free to experiment
# `random_state` is fixed so the embedding is the same on every run
# NOTE(review): perplexity=800 is large relative to the 1000-tweet sample
# built above -- confirm this is intended
tsne = TSNE(perplexity=800, random_state=42)

**Exercise: Transform `tfidf_vectors` using `tsne` and plot them using `scatter_1`**.

In [None]:
# dimension reduce and plot here

### UMAP

UMAP is another dimensional reduction algorithm. It is in many ways similar to t-SNE but it is a lot faster.

In [None]:
from umap import UMAP

In [None]:
# `n_neighbors` presumably sets how many neighbouring points UMAP considers
# around each tweet -- see the umap-learn documentation; feel free to experiment
umapper = UMAP(n_neighbors=150)

**Exercise: Transform `tfidf_vectors` using `UMAP` and plot them using `scatter_1`**.

In [None]:
# dimension reduce and plot here

## Clustering

Given a numeric representation of our data, there are many clustering algorithms we can try out. Since our feature vectors are extremely high dimensional, it is a good idea to first dimension reduce so that we do not fall foul of the curse of dimensionality. To demonstrate this, we will cluster the original high-dimensional vectors and then the dimension-reduced vectors.

**Exercise: Use KMeans to cluster your tfidf vectors into two classes.**

In [None]:
from sklearn.cluster import KMeans

# cluster tfidf vectors using K-Means
# two clusters, i.e. one hoped-for cluster per author; `n_init=20` asks for
# 20 differently-seeded runs (per the scikit-learn docs the best one is kept)
km = KMeans(n_clusters=2, init='k-means++', n_init=20)
km.fit(<your-tfidf-vectors>)

# the plotting cell below reads the cluster ids from the `kmeans_labels` column
sample['kmeans_labels'] = # your kmeans labels, check the documentation!

In [None]:
# some reorganizing for plotting clusters
df_trump = sample[sample["label"] == 0]
df_obama = sample[sample["label"] == 1]

# Count tweets per cluster for each author. Reindexing on both cluster ids
# guarantees exactly one count per bar even when an author has no tweets in
# one of the clusters; a plain groupby would then return a single count and
# `ax.bar` would fail on the length mismatch.
cluster_ids = [0, 1]
trump_counts = (
    df_trump["kmeans_labels"]
    .value_counts()
    .reindex(cluster_ids, fill_value=0)
    .values
)
obama_counts = (
    df_obama["kmeans_labels"]
    .value_counts()
    .reindex(cluster_ids, fill_value=0)
    .values
)

# grouped bar chart: the two authors sit side by side at each cluster id
f, ax = plt.subplots()
bars11 = ax.bar(
    np.array(cluster_ids) - 0.15,
    trump_counts,
    0.3,
    color=PARTY_COLOURS["trump"],
    label="Trump",
)
bars12 = ax.bar(
    np.array(cluster_ids) + 0.15,
    obama_counts,
    0.3,
    color=PARTY_COLOURS["obama"],
    label="Obama",
)
plt.legend(fontsize=12)
plt.ylabel("Count", fontsize=15)
plt.xticks(cluster_ids)
# the trailing semicolon suppresses the notebook's text output for this cell
plt.xlabel("Cluster", fontsize=15);

**Exercise: Now use k-means clustering to cluster the tsne-vectors**

In [None]:
# run k-means on your tsne vectors (not the original tfidf vectors)

# the plotting cell below reads the cluster ids from the `kmeans_tsne` column
sample['kmeans_tsne'] = # your k-means labels

In [None]:
# some reorganizing for plotting clusters

df_trump = sample[sample["label"] == 0]
df_obama = sample[sample["label"] == 1]

# Count tweets per cluster for each author. Reindexing on both cluster ids
# guarantees exactly one count per bar even when an author has no tweets in
# one of the clusters; a plain groupby would then return a single count and
# `ax.bar` would fail on the length mismatch.
cluster_ids = [0, 1]
trump_counts = (
    df_trump["kmeans_tsne"]
    .value_counts()
    .reindex(cluster_ids, fill_value=0)
    .values
)
obama_counts = (
    df_obama["kmeans_tsne"]
    .value_counts()
    .reindex(cluster_ids, fill_value=0)
    .values
)

# grouped bar chart: the two authors sit side by side at each cluster id
f, ax = plt.subplots()
bars11 = ax.bar(
    np.array(cluster_ids) - 0.15,
    trump_counts,
    0.3,
    color=PARTY_COLOURS["trump"],
    label="Trump",
)
bars12 = ax.bar(
    np.array(cluster_ids) + 0.15,
    obama_counts,
    0.3,
    color=PARTY_COLOURS["obama"],
    label="Obama",
)
plt.legend(fontsize=12)
plt.ylabel("Count", fontsize=15)
plt.xticks(cluster_ids)
# the trailing semicolon suppresses the notebook's text output for this cell
plt.xlabel("Cluster", fontsize=15);

**Exercise: Print a selection of Obama tweets that ended up in the Trump cluster, and a selection of Trump tweets that ended up in the Obama cluster. If the clustering is working well, the Obama tweets should look Trumpian, and the Trump tweets should look Obama-like.**

In [None]:
# print tweets that ended up in the wrong clusters

## Glove vectors - Topic Analysis

In the first notebook we saw how GloVe vectors could be used to determine how similar words are to each other. They can also be used in a similar way to find topics, by first performing dimensionality reduction and then using a clustering algorithm.

In [None]:
import spacy


# load spaCy's large English pipeline; its document vectors (`.vector`) are
# used as the GloVe-style features below
nlp = spacy.load("en_core_web_lg")

The following dataset contains short survey responses to the question: "what's your passion?"

In [None]:
# the free-text answers are read from the `response` column further down
survey = pd.read_csv("survey_responses.csv")

In [None]:
# number of rows (survey responses) loaded
len(survey)

In [None]:
# peek at 10 randomly chosen rows
survey.sample(10)

First we get the GloVe vectors.

In [None]:
# One document vector per survey response, stacked into a 2-D array with one
# row per response. `nlp.pipe` processes the responses as a batch instead of
# calling `nlp` on each one, and `np.vstack` takes the vector width from the
# loaded model rather than hard-coding it.
glove_vectors = np.vstack(
    [doc.vector for doc in nlp.pipe(survey["response"])]
)

Then we use UMAP to perform dimensionality reduction.

In [None]:
# NOTE: this rebinds `umapper`, replacing the instance created for the tweet
# exercise above (n_neighbors=150)
umapper = UMAP(n_neighbors=25)
# columns 0 and 1 of the result are used as plot coordinates further down
umap_vectors = umapper.fit_transform(glove_vectors)

Next we use a clustering algorithm, in this case one called hdbscan, to cluster the vectors.

In [None]:
from hdbscan import HDBSCAN

# cluster the UMAP-reduced vectors with hdbscan's default settings
hdbscanner = HDBSCAN()
hdbscanner.fit(umap_vectors)

# cluster assignment per response; presumably -1 marks points hdbscan treats
# as noise -- confirm against the hdbscan documentation
labels = hdbscanner.labels_

Finally we visualise the clustering. The following code uses an interactive graphing library called `plotly`. It allows you to interact with the plot; try it out!

In [None]:
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot

# set up plotly's offline mode for rendering inside the notebook
# (`connected=True` presumably loads plotly.js from the web rather than
# embedding it in the notebook -- confirm against the plotly docs)
init_notebook_mode(connected=True)


def _label_sort_key(label):
    """Sort numeric-looking labels numerically, all other labels after them."""
    as_text = str(label)
    try:
        return (0, float(as_text), as_text)
    except ValueError:
        return (1, 0.0, as_text)


def scatter_2(x, y, labels, text):
    """Draw an interactive plotly scatter plot with one trace per label.

    Args:
        x: Horizontal coordinates, indexable with a boolean mask.
        y: Vertical coordinates, indexable with a boolean mask.
        labels: Array of per-point labels; ``labels == label`` must produce
            a boolean mask. Each distinct label becomes one legend entry.
        text: Per-point hover text, indexable with a boolean mask.

    Returns:
        None. The figure is displayed with ``iplot``.
    """
    traces = []
    # Iterate the labels in a fixed order: iterating a bare `set` of strings
    # can change order between sessions, which reshuffles the legend and the
    # colour given to each cluster.
    for label in sorted(set(labels), key=_label_sort_key):
        # compute the mask once and reuse it for x, y and the hover text
        members = labels == label
        traces.append(
            go.Scatter(
                x=x[members],
                y=y[members],
                mode="markers",
                opacity=0.7,
                text=text[members],
                name=label,
                marker={"size": 15, "line": {"width": 0.5, "color": "white"}},
            )
        )

    # no grid, tick labels or zero line on either axis
    hidden_axis = {"showgrid": False, "showticklabels": False, "zeroline": False}
    layout = go.Layout(
        xaxis=dict(hidden_axis),
        yaxis=dict(hidden_axis),
        hovermode="closest",
    )
    fig = go.Figure(data=traces, layout=layout)
    iplot(fig, config={"displayModeBar": False})


# plot the two UMAP coordinates; the cluster ids are passed as strings (they
# become the trace names) and each point's hover text is the survey response
scatter_2(
    umap_vectors[:, 0],
    umap_vectors[:, 1],
    labels=labels.astype(str),
    text=survey.response,
)

## Sentiment analysis

Another common task in NLP is sentiment analysis, which is often an unsupervised problem. We show off a tool called `vader` below.

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()

# find a sample of 400 tweets, split between Obama and Trump
# (`DataFrame.append` was removed in pandas 2.0; `pd.concat` is the
# equivalent and already returns a new frame, so no extra `.copy()` is needed)
df_sentiment = pd.concat(
    [
        df[df["label"] == 0].sample(200),
        df[df["label"] == 1].sample(200),
    ]
)

# Score every tweet once, then attach the scores a whole column at a time
# instead of writing single cells through `.loc` inside a loop. This also
# avoids `Series.iteritems`, which was removed in pandas 2.0.
sentiment_scores = pd.DataFrame(
    [sid.polarity_scores(str(tweet)) for tweet in df_sentiment["text"]]
)
for k in sorted(sentiment_scores.columns):
    # assign by position: `sentiment_scores` has a fresh 0..n-1 index that
    # does not line up with the sampled tweets' original index
    df_sentiment[k] = sentiment_scores[k].values

df_sentiment

Each sentence gets a negative, neutral and positive score, as well as a compound score.

Let's do a plot here comparing the sentiment distribution of Trump vs. Obama.

In [None]:
# columns carried into the per-author sentiment tables
score_columns = ["compound", "neg", "pos", "text"]
trump_sentiment = df_sentiment.loc[df_sentiment["label"] == 0, score_columns]
obama_sentiment = df_sentiment.loc[df_sentiment["label"] == 1, score_columns]

# for each author: the negative and positive scores, sorted ascending and
# re-indexed 0..n-1 (these feed the histograms in the next cell)
trump_neg_sentiment, trump_pos_sentiment = (
    trump_sentiment[column].sort_values().reset_index(drop=True)
    for column in ("neg", "pos")
)
obama_neg_sentiment, obama_pos_sentiment = (
    obama_sentiment[column].sort_values().reset_index(drop=True)
    for column in ("neg", "pos")
)

In [None]:
# 2x2 grid sharing both axes: one row per author, positive scores on the
# left and negative scores on the right
fig, ax = plt.subplots(2, 2, figsize=(10, 10), sharex=True, sharey=True)
panels = [
    ((0, 0), trump_pos_sentiment, "trump", "Trump Positive Sentiment"),
    ((0, 1), trump_neg_sentiment, "trump", "Trump Negative Sentiment"),
    ((1, 0), obama_pos_sentiment, "obama", "Obama Positive Sentiment"),
    ((1, 1), obama_neg_sentiment, "obama", "Obama Negative Sentiment"),
]
for position, values, party, title in panels:
    ax[position].hist(values, color=PARTY_COLOURS[party])
    ax[position].set_title(title)
plt.show()

Finally let's compare Trump's positive and negative tweets.

In [None]:
# Rank Trump's tweets (whole rows, text included) from the highest to the
# lowest positive / negative score.
# NOTE: this rebinds `trump_pos_sentiment` and `trump_neg_sentiment`, which
# earlier held the sorted score Series used for the histograms, to whole
# DataFrames -- re-run the cell that builds those Series before re-running
# the histogram cell.
trump_pos_sentiment = trump_sentiment.sort_values(
    by="pos", ascending=False
).copy()
trump_neg_sentiment = trump_sentiment.sort_values(
    by="neg", ascending=False
).copy()

In [None]:
# print the first five tweets of the positive ranking, each followed by a
# blank line
for tweet in trump_pos_sentiment["text"].head().tolist():
    print(tweet, end="\n\n")

In [None]:
# print the first five tweets of the negative ranking, each followed by a
# blank line
for tweet in trump_neg_sentiment["text"].head().tolist():
    print(tweet, end="\n\n")