In [3]:
!pip install nltk


Defaulting to user installation because normal site-packages is not writeable
Collecting nltk
  Using cached nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting click (from nltk)
  Using cached click-8.1.8-py3-none-any.whl.metadata (2.3 kB)
Collecting joblib (from nltk)
  Using cached joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2024.11.6-cp38-cp38-macosx_10_9_x86_64.whl.metadata (40 kB)
Collecting tqdm (from nltk)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Using cached nltk-3.9.1-py3-none-any.whl (1.5 MB)
Downloading regex-2024.11.6-cp38-cp38-macosx_10_9_x86_64.whl (287 kB)
Using cached click-8.1.8-py3-none-any.whl (98 kB)
Using cached joblib-1.4.2-py3-none-any.whl (301 kB)
Downloading tqdm-4.67.1-py3-none-any.whl (78 kB)
Installing collected packages: tqdm, regex, joblib, click, nltk
Successfully installed click-8.1.8 joblib-1.4.2 nltk-3.9.1 regex-2024.11.6 tqdm-4.67.1


In [5]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /Users/alden/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/alden/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /Users/alden/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
# If not already installed, uncomment the lines below:
# !pip install nltk
# !pip install pandas
# !pip install matplotlib seaborn

# --- IMPORTS ---
import pandas as pd
import nltk
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns

# Download NLTK resources
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

# --- LOAD CLEANED EVENT DATA ---
df = pd.read_csv("20th_century_cleaned_events.csv")

# Combine all events into one string
text = " ".join(df["event"].astype(str))

# --- TOKENIZATION ---
tokens = word_tokenize(text)

# --- REMOVE STOPWORDS & PUNCTUATION ---
stop_words = set(stopwords.words("english"))
words = [word for word in tokens if word.isalpha() and word.lower() not in stop_words]

# --- POS TAGGING ---
tagged_words = pos_tag(words)

# --- CONVERT TO DATAFRAME FOR ANALYSIS ---
df_text = pd.DataFrame(tagged_words, columns=["Word", "POS"])
df_text["POS_category"] = df_text["POS"].str[:2]  # For grouping

# --- MOST COMMON WORDS ---
top_words = Counter(words).most_common(15)
df_top_words = pd.DataFrame(top_words, columns=["Word", "Count"])

# Plot common words
plt.figure(figsize=(10, 5))
sns.barplot(data=df_top_words, x="Word", y="Count", palette="viridis")
plt.title("Top 15 Most Common Words")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# --- MOST COMMON VERBS ---
df_verbs = df_text[df_text["POS_category"] == "VB"]
top_verbs = df_verbs["Word"].value_counts().nlargest(15).reset_index()
top_verbs.columns = ["Word", "Count"]

plt.figure(figsize=(10, 5))
sns.barplot(data=top_verbs, x="Word", y="Count", palette="magma")
plt.title("Top 15 Verbs")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# --- MOST COMMON ADJECTIVES ---
df_adjs = df_text[df_text["POS_category"] == "JJ"]
top_adjs = df_adjs["Word"].value_counts().nlargest(15).reset_index()
top_adjs.columns = ["Word", "Count"]

plt.figure(figsize=(10, 5))
sns.barplot(data=top_adjs, x="Word", y="Count", palette="coolwarm")
plt.title("Top 15 Adjectives")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# --- OPTIONAL: SAVE OUTPUTS ---
df_text.to_csv("20th_century_text_pos_tags.csv", index=False)


ModuleNotFoundError: No module named 'matplotlib'