# Visualizing data

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
from nltk import FreqDist

In [None]:
# English sample corpus: a tab-separated file hosted on GitHub
url = "https://raw.githubusercontent.com/CaptSolo/BSSDH_2023_beginners/main/corpora/en_old_newspapers_5k.tsv"

# read it straight from the URL into a DataFrame
df = pd.read_csv(url, delimiter="\t")

In [None]:
# see also:
# https://dariuslfuller.medium.com/creating-visuals-with-nltks-freqdist-ac4e667e49f3

# split every text on whitespace and gather all tokens in one flat list
all_text = [token for text in df["Text"] for token in text.split()]

# the 20 most frequent tokens
all_fdist = FreqDist(all_text).most_common(20)

In [None]:
# inspect the top-20 result computed in the previous cell
all_fdist

In [None]:
# build a Pandas Series from the frequency data:
# the dict keys become the index and the counts become the values
all_fdist = pd.Series(data=dict(all_fdist))

In [None]:
# Matplotlib bar chart fed from the Series: its index supplies the bar
# positions/labels and its values supply the bar heights
all_plot = plt.bar(x=all_fdist.index, height=all_fdist.values)

# tilt the tick labels so that neighbouring words stay readable
ticks = plt.xticks(rotation=40)

### Stopword removal

For widely used languages such as English we can use NLTK's stopword list.

In [None]:
# The stopword corpus is a separate NLTK data package. Without it the lookup
# below raises LookupError on a fresh environment, so request it first
# (quiet=True suppresses the progress messages).
nltk.download("stopwords", quiet=True)

# list of English stopwords shipped with that data package
stopwords = nltk.corpus.stopwords.words('english')
print(stopwords[:10])

In [None]:
# let's convert the list to a set (sets offer more efficient word lookup,
# which matters when every word of the corpus is checked against the stopwords)
stopword_set = set(stopwords)

In [None]:
# stand-alone symbols that should not be counted as words
spec_chars = ['--', '—', '-']

# one pass over the corpus: keep a token only if it is neither a stopword
# (compared in lower case) nor one of the special symbols
all_text_stopped = [
    token
    for token in all_text
    if token.lower() not in stopword_set and token not in spec_chars
]

all_text_stopped[:6]

In [None]:
# frequency distribution again, this time computed on the filtered tokens

all_fdist_stopped = pd.Series(dict(FreqDist(all_text_stopped).most_common(20)))

# print one "word : count" line per entry
for word, count in all_fdist_stopped.items():
    print(word, ":\t", count)

In [None]:
# vertical bars: words along the x axis, counts as bar heights
all_plot = plt.bar(x=all_fdist_stopped.index, height=all_fdist_stopped.values)

# rotate the tick labels so that the words do not overlap
ticks = plt.xticks(rotation=60)

In [None]:
# the same data as horizontal bars: words along the y axis, counts as bar widths
all_plot = plt.barh(y=all_fdist_stopped.index, width=all_fdist_stopped.values)

In [None]:
# grab the current axes first so that the y axis can be flipped afterwards
ax = plt.gca()

# horizontal bars: words along the y axis, counts as bar widths
all_plot = plt.barh(y=all_fdist_stopped.index, width=all_fdist_stopped.values)

# flip the y axis so that the first entry of the Series is drawn at the top
ax.invert_yaxis()

## Histograms

Let's create a histogram showing the distribution of word lengths in the text.

In [None]:
# peek at the first ten tokens of the corpus
all_text[:10]

In [None]:
# apply len() to every token to get the list of word lengths
word_length = list(map(len, all_text))

# first ten lengths
word_length[:10]

In [None]:
# number of bins the word lengths are grouped into
n_bins = 20

# histogram of word lengths (the value returned by plt.hist is shown as cell output)
plt.hist(x=word_length, bins=n_bins)

In [None]:
# the same distribution drawn with Seaborn, using bins that are 2 units wide
sns.displot(data=word_length, binwidth=2)

In [None]:
# collect the tokens that are 15 or more characters long
long_words = [token for token in all_text if len(token) >= 15]

# show the first ten of them
long_words[:10]

Seaborn - histograms:
* https://seaborn.pydata.org/tutorial/distributions.html

### Seaborn

Seaborn is a Python data visualization library based on matplotlib. It provides a high-level interface for drawing attractive and informative statistical graphics.

https://seaborn.pydata.org/

In [None]:
# top-20 (word, count) pairs of the filtered corpus as a two-column table
data = pd.DataFrame(
    FreqDist(all_text_stopped).most_common(20),
    columns=["Word", "Frequency"],
)

# first ten rows
data.head(10)

In [None]:
# Vertical bar chart of the word frequencies. `data` is passed by keyword so
# the call does not depend on the positional argument order of sns.barplot.
ax = sns.barplot(data=data, x="Word", y="Frequency")

# Only rotate the existing tick labels. The previous version re-specified the
# labels via set_xticklabels() and assigned its return value (a list of label
# objects) back to `ax`, so `ax` no longer referred to the Axes afterwards.
ax.tick_params(axis="x", labelrotation=60)

In [None]:
# horizontal variant: frequencies along the x axis, one bar per word on the y axis
ax = sns.barplot(data=data, x="Frequency", y="Word", orient="h")

More information about Seaborn:
* https://seaborn.pydata.org/tutorial/introduction.html

### Visualizing bigrams

In [None]:
import nltk.collocations as collocations
from nltk import FreqDist, bigrams

In [None]:
# Pair up consecutive tokens. NOTE: the input is the stopword-filtered token
# list, so the two words of a pair were not necessarily adjacent in the
# original text.
# NOTE(review): NLTK documents bigrams() as returning an iterator, so `ngrams`
# is presumably exhausted once FreqDist has consumed it below - re-run this
# cell before using `ngrams` again.
ngrams = bigrams(all_text_stopped)

# the 20 most frequent pairs together with their counts
ngram_freq_list = FreqDist(ngrams).most_common(20)

In [None]:
# Dictionaries keep their insertion order (an implementation detail of
# CPython 3.6, guaranteed by the language since Python 3.7), so the
# frequency ranking of ngram_freq_list is preserved here.

# join the words of every n-gram with "_" to get a single string key
ngram_dict = {"_".join(pair): freq for pair, freq in ngram_freq_list}

print(ngram_dict)

In [None]:
# Series with the joined bigram strings as index and their counts as values
ngram_freqdist = pd.Series(data=ngram_dict)

In [None]:
# create a 10 x 10 inch figure with a single Axes to draw on
fig, ax = plt.subplots(figsize=(10, 10))

# horizontal bars on that Axes: counts along x, bigram labels along y
bar_plot = sns.barplot(
    x=ngram_freqdist.values,
    y=ngram_freqdist.index,
    orient='h',
    ax=ax,
)

# add a title to the current Axes
title = plt.title('Frequency Distribution')

### Stopwords for languages not included in NLTK

Previously, we used stopwords from the NLTK stopword list, but that won't work for Latvian or other languages that are not included in NLTK.

Let's use an existing Latvian stopword list from GitHub:

In [None]:
import requests

# plain-text Latvian stopword list hosted on GitHub
stop_url = "https://raw.githubusercontent.com/Xangis/extra-stopwords/master/latvian"

# the timeout keeps the cell from hanging forever if the server does not respond
res = requests.get(stop_url, timeout=30)
# Stop with an exception on an HTTP error (e.g. 404); otherwise the words of
# the error page would silently be used as the stopword list.
res.raise_for_status()

# NOTE: this rebinds `stopwords` and `stopword_set`, replacing the English
# stopwords loaded earlier in the notebook.
stopwords = res.text.split()
print(stopwords[:10])

stopword_set = set(stopwords)

In [None]:
# load the Latvian sample corpus (a tab-separated file hosted on GitHub)

import pandas as pd

url_2 = "https://raw.githubusercontent.com/CaptSolo/BSSDH_2023_beginners/main/corpora/lv_old_newspapers_5k.tsv"

# read it straight from the URL into a second DataFrame
df_2 = pd.read_csv(url_2, delimiter="\t")

In [None]:
# split every text of the second corpus on whitespace and gather the tokens
# in one flat list (this replaces the token list of the first corpus)
all_text = [token for text in df_2["Text"] for token in text.split()]

# the 20 most frequent tokens
all_fdist = FreqDist(all_text).most_common(20)

In [None]:
# build a Pandas Series from the frequency data:
# the dict keys become the index and the counts become the values
all_fdist = pd.Series(data=dict(all_fdist))

In [None]:
# removing stopwords; words are lower-cased before the lookup (as in the
# English example above) so that capitalized stopwords, e.g. at the start of
# a sentence, are removed as well
# NOTE(review): assumes the entries of stopword_set are lower-case - confirm
all_text_stopped = [word for word in all_text if word.lower() not in stopword_set]

# removing special characters (hyphen, en dash, em dash) that appear as separate tokens
spec_chars = ['-', '–', '—']
all_text_stopped = [word for word in all_text_stopped if word not in spec_chars]

# show the first six remaining tokens
all_text_stopped[:6]

In [None]:
# frequency distribution of the filtered tokens: top 20 as a Series

all_fdist_stopped = pd.Series(dict(FreqDist(all_text_stopped).most_common(20)))

# grab the current axes first so that the y axis can be flipped afterwards
ax = plt.gca()

# horizontal bars: words along the y axis, counts as bar widths
all_plot = plt.barh(y=all_fdist_stopped.index, width=all_fdist_stopped.values)

# flip the y axis so that the first entry of the Series is drawn at the top
ax.invert_yaxis()

## Visualization examples

Additional information about different Matplotlib and Seaborn visualizations:

* https://matplotlib.org/stable/gallery/index.html
* https://seaborn.pydata.org/examples/index.html

Tutorials:

* [Matplotlib tutorial](https://github.com/rougier/matplotlib-tutorial) by Nicolas P. Rougier
* [Pyplot tutorial](https://matplotlib.org/stable/tutorials/introductory/pyplot.html)

## Wordcloud visualization

https://github.com/amueller/word_cloud

In [None]:
## Not needed if the WordCloud library is already installed.
## To install it, uncomment the line below and run this cell
## (in Jupyter, `%pip install wordcloud` installs into the running kernel's environment).
#!pip install wordcloud

In [None]:
import matplotlib.pyplot as plt

from wordcloud import WordCloud

In [None]:
# peek at the first ten tokens that remain after stopword removal
all_text_stopped[:10]

In [None]:
# glue the filtered tokens back together into one space-separated string
text = " ".join(all_text_stopped)

# build a word cloud from that string using the library's default settings
wordcloud = WordCloud().generate(text)

In [None]:
# draw the word cloud as an image, using bilinear interpolation
plt.imshow(X=wordcloud, interpolation='bilinear')

# hide the axes and tick marks
plt.axis("off")

plt.show()

In [None]:
# lower max_font_size, change the maximum number of words and lighten the background:

wordcloud = WordCloud(
    max_font_size=50,
    max_words=40,
    background_color="white",
).generate(text)

# draw the new word cloud on a fresh figure, without axes
plt.figure()
plt.imshow(X=wordcloud)
plt.axis("off")
plt.show()

In [None]:
# Save the image in a file:
# the relative file name means it is written to the current working directory
wordcloud.to_file("wordcloud.png")

Additional information about word cloud generation:

* https://github.com/amueller/word_cloud

---

## Your turn!

Choose a text corpus and **visualize it** using the tools shown in this notebook.

**Write code in notebook cells below**.