In [None]:
import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport
import altair as alt
import altair_data_server

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cluster import KMeans

import wordcloud
import matplotlib.pyplot as plt

from src.data.load_data import Data

In [None]:
# Build a word-frequency table for the crash summaries.
# NOTE(review): `df` is not defined in any cell visible here — presumably it is
# loaded (e.g. via `Data`) in a cell not shown; confirm the notebook survives
# Restart Kernel -> Run All.
cv = CountVectorizer(stop_words="english")

# Sparse document-term matrix: one row per summary, one column per vocabulary word
cv_fit = cv.fit_transform(df['summary'])

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back on older installs.
if hasattr(cv, "get_feature_names_out"):
    words = cv.get_feature_names_out().tolist()
else:
    words = cv.get_feature_names()

# Column sums give the total occurrences of each word (a 1 x n_words array)
counts = np.asarray(cv_fit.sum(axis=0))

# Cast to dictionary of word: frequency
wordcount_dict = dict(zip(words, counts[0]))

# Two-column frame (Word, Count), most frequent word first
word_df = (
    pd.DataFrame.from_dict(wordcount_dict, orient="index", columns=["Count"])
    .reset_index()
    .rename(columns={"index": "Word"})
    .sort_values("Count", ascending=False)
)

In [None]:
# Bar chart of the 20 highest-count words, bars ordered by descending frequency
top_words = word_df.nlargest(20, "Count")

word_axis = alt.X("Word:N", title="Word", sort="-y")
count_axis = alt.Y("Count:Q", title="Frequency")

(
    alt.Chart(top_words)
    .mark_bar()
    .encode(x=word_axis, y=count_axis)
    .properties(title="20 Most Common words in Crash Summary", height=500, width=750)
    .configure_axisX(labelAngle=-40)
)

This is a good start but I think we can do a little better:

* The most common word is "crashed" — that should go without saying, really!
* Other words like "aircraft", "plane", etc. are also a bit redundant.

Really, what I'm looking for here are occurrences of words like "engine", "weather", etc.

So to start improving this I'm going to add some of these redundant words to the stop_words list

In [None]:
# Words that are a given in any crash summary and so say nothing about the cause
crash_stop_words = ["aircraft", "plane", "crash", "crashed", "flight"]

# Merge the stop words currently configured on `cv` with the domain-specific ones;
# CountVectorizer is handed the result as a plain list
new_stop_words = list(cv.get_stop_words() | set(crash_stop_words))

In [None]:
# Rebuild the word-frequency table, this time also ignoring the crash-specific
# stop words collected in `new_stop_words`.
cv = CountVectorizer(stop_words=new_stop_words)

# Sparse document-term matrix: one row per summary, one column per vocabulary word
cv_fit = cv.fit_transform(df['summary'])

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back on older installs.
if hasattr(cv, "get_feature_names_out"):
    words = cv.get_feature_names_out().tolist()
else:
    words = cv.get_feature_names()

# Column sums give the total occurrences of each word (a 1 x n_words array)
counts = np.asarray(cv_fit.sum(axis=0))

# Cast to dictionary of word: frequency
wordcount_dict = dict(zip(words, counts[0]))

# Two-column frame (Word, Count), most frequent word first
word_df = (
    pd.DataFrame.from_dict(wordcount_dict, orient="index", columns=["Count"])
    .reset_index()
    .rename(columns={"index": "Word"})
    .sort_values("Count", ascending=False)
)

In [None]:
# Same top-20 bar chart, now drawn from the table built with the extended stop words
most_common = word_df.nlargest(20, "Count")

bars = alt.Chart(most_common).mark_bar().encode(
    x=alt.X("Word:N", title="Word", sort="-y"),
    y=alt.Y("Count:Q", title="Frequency"),
)

# Size and title the chart, then tilt the x-axis labels so the words do not overlap
bars.properties(
    title="20 Most Common words in Crash Summary",
    height=500,
    width=750,
).configure_axisX(labelAngle=-40)

I believe the done thing at this point is to make a pretty word cloud...

Cue googling

In [None]:
# Concatenate every summary into one string for the word cloud generator
all_summary_text = " ".join(df['summary'])
cloud = wordcloud.WordCloud(stopwords=new_stop_words).generate(all_summary_text)

# Render the cloud as an image with the axes hidden
fig, ax = plt.subplots()
ax.imshow(cloud, interpolation="bilinear")
ax.axis("off")
plt.show()

In [None]:

# TF-IDF weighted document-term matrix for the summaries, using the same
# extended stop-word list as the word counts above
tf_idf_vec = TfidfVectorizer(stop_words=new_stop_words)
X = tf_idf_vec.fit_transform(df["summary"])

In [None]:

# Cluster the TF-IDF vectors with k-means.
# K-means starts from random centroids, so without a fixed seed the cluster
# numbering and membership change on every run; pin it for reproducibility.
RANDOM_SEED = 42

# n_clusters=8 is scikit-learn's default, spelled out so it is easy to tune.
# n_init is pinned because its default differs between scikit-learn releases.
km = KMeans(n_clusters=8, n_init=10, random_state=RANDOM_SEED)

km.fit(X)

# Assign each summary to its nearest fitted centroid
labels = km.predict(X)

# Store the assignment alongside the source rows (adds a column to `df`)
df['cluster_labels'] = labels

In [None]:
# For every cluster, vocabulary column indices ordered from the highest to the
# lowest centroid weight (note: these are indices, not the centroid vectors)
centroids = km.cluster_centers_.argsort()[:, ::-1]
n_clusters = km.cluster_centers_.shape[0]

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back on older installs.
if hasattr(tf_idf_vec, "get_feature_names_out"):
    terms = tf_idf_vec.get_feature_names_out()
else:
    terms = tf_idf_vec.get_feature_names()

# Print the 10 most heavily weighted terms of each cluster
for i in range(n_clusters):
    print(f"Cluster {i}: ")
    for ind in centroids[i, :10]:
        print(f" {terms[ind]}")