In [1]:
import pandas as pd
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
import plotly.express as px
import nltk
from nltk.corpus import stopwords
import string
import fasttext
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import HDBSCAN

# Fetch the NLTK resources this notebook relies on: the stopword lists read via
# stopwords.words(...) and the "punkt_tab" data (presumably what backs
# nltk.sent_tokenize / nltk.word_tokenize - confirm against the NLTK version in use).
nltk.download("stopwords")
nltk.download("punkt_tab")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/filipstrozik/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/filipstrozik/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

# Load data

In [3]:
import json

def load_annotations(file_path):
    """Read a JSON annotation export and return its decoded content.

    Args:
        file_path: Path of the JSON file; it is opened as UTF-8 text.

    Returns:
        The Python object the JSON document decodes to.
    """
    with open(file_path, mode="r", encoding="utf-8") as handle:
        raw = handle.read()
    return json.loads(raw)


def extract_annotations(annotations):
    """Flatten exported annotation tasks into a dict keyed by task id.

    Only the first annotation of every task is read. Results whose
    ``from_name`` is ``"label"`` are stored as span labels; any other result
    that carries ``choices`` sets the task's overall label. Results with
    neither (or with an empty label list) are ignored.

    Args:
        annotations: Iterable of task dicts, each with ``id``,
            ``data["text"]`` and, optionally, an ``annotations`` list whose
            entries hold a ``result`` list.

    Returns:
        dict mapping task id to a dict with the keys ``"text"``,
        ``"overall_label"`` and ``"span_labels"``; ``span_labels`` maps
        ``(start, end)`` to ``(stripped span text, first label)``. Tasks
        without annotations keep ``overall_label=None`` and an empty
        ``span_labels``.
    """
    annot_dict = {}
    for task in annotations:
        entry = {
            "text": task["data"]["text"],
            "overall_label": None,
            "span_labels": {},
        }
        annot_dict[task["id"]] = entry

        # A task may carry no annotation at all: keep the empty entry instead
        # of failing on ``[0]``.
        task_annotations = task.get("annotations") or []
        if not task_annotations:
            continue

        # Extract annotation info from the first annotation only
        for annot in task_annotations[0].get("result") or []:
            value = annot.get("value") or {}
            if annot.get("from_name") == "label":
                labels = value.get("labels") or []
                if not labels:
                    continue
                span = (value["start"], value["end"])
                entry["span_labels"][span] = (value["text"].strip(), labels[0])
            elif value.get("choices"):
                entry["overall_label"] = value["choices"][0]
            # Any other kind of result has nothing we can use here.
    return annot_dict

# The annotation export is split over two files (tasks 1-100 and 101-200)
final_1_100 = load_annotations("../annot/1-100/final_1_100.json")
final_101_200 = load_annotations("../annot/101-200/final_101_200.json")

final_1_100_dict = extract_annotations(final_1_100)
final_101_200_dict = extract_annotations(final_101_200)

# Merge both halves; on a duplicate task id the 101-200 entry wins
final_dict = dict(final_1_100_dict)
final_dict.update(final_101_200_dict)



In [12]:
def analyze_word_count(data_dict):
    """Count whitespace-separated words in each entry's ``"text"``.

    Args:
        data_dict: Mapping of key -> dict; a missing ``"text"`` counts as "".

    Returns:
        dict mapping every key of ``data_dict`` to its word count.
    """
    return {
        task_id: len(entry.get("text", "").split())
        for task_id, entry in data_dict.items()
    }

def analyze_sentence_count(data_dict):
    """Count sentences in each entry's ``"text"`` using ``nltk.sent_tokenize``.

    Args:
        data_dict: Mapping of key -> dict; a missing ``"text"`` counts as "".

    Returns:
        dict mapping every key of ``data_dict`` to its sentence count.
    """
    return {
        task_id: len(nltk.sent_tokenize(entry.get("text", "")))
        for task_id, entry in data_dict.items()
    }

# Per-task word and sentence counts for everything in final_dict
word_count_analysis = analyze_word_count(final_dict)
sentence_count_analysis = analyze_sentence_count(final_dict)

# Corpus-level totals
word_count = sum(word_count_analysis.values())
sentence_count = sum(sentence_count_analysis.values())

print(f" Total word count: {word_count}")
print(f" Total sentence count: {sentence_count}")
print(f" Total number of tasks: {len(final_dict)}")


 Total word count: 13013
 Total sentence count: 694
 Total number of tasks: 200


In [6]:
def analyze_label_distribution(data_dict):
    """Count how many entries carry each overall label.

    Entries whose ``"overall_label"`` is missing or falsy are skipped.

    Args:
        data_dict: Mapping of key -> dict with an optional ``"overall_label"``.

    Returns:
        dict mapping label -> number of entries, in first-seen order.
    """
    counts = {}
    for entry in data_dict.values():
        label = entry.get("overall_label", "")
        if not label:
            continue
        counts[label] = counts.get(label, 0) + 1
    return counts

# Count the overall (document-level) labels across all tasks and echo the raw
# counts; the same dict feeds the pie chart in the next cell.
label_distribution = analyze_label_distribution(final_dict)
print(label_distribution)

{'drama': 76, 'romance': 14, 'action': 31, 'comedy': 46, 'fantasy': 17, 'western': 9, 'document': 7}


In [7]:
# Pie chart of how many tasks fall under each overall label
fig = px.pie(
    names=[*label_distribution],
    values=[*label_distribution.values()],
    title="Label Distribution",
)
fig.show()

In [8]:
def analyze_span_label_distribution(data_dict):
    """Count span-level labels over all entries.

    Args:
        data_dict: Mapping of key -> dict whose optional ``"span_labels"``
            maps a span to a ``(text, label)`` pair.

    Returns:
        dict mapping span label -> number of spans, in first-seen order.
    """
    counts = {}
    for entry in data_dict.values():
        for _span_text, label in entry.get("span_labels", {}).values():
            counts[label] = counts.get(label, 0) + 1
    return counts

# Tally the span-level labels over the whole corpus and echo the raw counts
span_label_distribution = analyze_span_label_distribution(final_dict)
print(span_label_distribution)

# Pie chart of the span-label shares
fig = px.pie(
    names=[*span_label_distribution],
    values=[*span_label_distribution.values()],
    title="Span Label Distribution",
)
fig.show()

{'TIME': 58, 'NAME': 961, 'CHARACTER': 892, 'PLACE': 274}


In [9]:
def transform_dict_to_df(data_dict):
    """Build a document-level DataFrame: one row per entry of ``data_dict``.

    Args:
        data_dict: Mapping of id -> dict with optional ``"text"`` and
            ``"overall_label"`` (both fall back to "" when missing).

    Returns:
        pandas DataFrame built from records with the keys ``id``, ``text``
        and ``overall_label``.
    """
    rows = [
        {
            "id": task_id,
            "text": entry.get("text", ""),
            "overall_label": entry.get("overall_label", ""),
        }
        for task_id, entry in data_dict.items()
    ]
    return pd.DataFrame(rows)

# Document-level table (id, text, overall_label) built from the merged
# annotations; preview the first rows.
final_df_text = transform_dict_to_df(final_dict)
final_df_text.head()


Unnamed: 0,id,text,overall_label
0,1,The film tells the story of Elizabeth (Colbert...,drama
1,2,"The eventful life of Swami Vivekananda, his te...",drama
2,3,Jagathalaprathapan is a prince who is to be pu...,romance
3,4,Police inspector Holloway (Patrick Wymark) inv...,action
4,5,Lewis and Clark and George opens with Salvator...,drama


In [11]:

def transform_dict_to_NER_df(data_dict):
    """Build a span-level DataFrame: one row per annotated span.

    Args:
        data_dict: Mapping of id -> dict whose optional ``"span_labels"``
            maps a span to a ``(text, label)`` pair.

    Returns:
        pandas DataFrame with the columns ``id`` (key of the owning entry),
        ``text`` (the span text, not the full document) and ``label``. The
        three columns are present even when there are no spans at all.
    """
    columns = ["id", "text", "label"]
    rows = [
        {"id": key, "text": span_text, "label": label}
        for key, value in data_dict.items()
        for span_text, label in value.get("span_labels", {}).values()
    ]
    # Naming the columns explicitly keeps the schema stable for empty input.
    return pd.DataFrame(rows, columns=columns)

# Span-level table (id, span text, span label) built from the merged
# annotations; preview the first 20 rows.
final_df_NER = transform_dict_to_NER_df(final_dict)
final_df_NER.head(20)


Unnamed: 0,id,text,label
0,1,World War I,TIME
1,1,Elizabeth,NAME
2,1,John,NAME
3,1,John,NAME
4,1,Elizabeth,NAME
5,1,man,NAME
6,1,John,NAME
7,1,daughter,CHARACTER
8,1,son,CHARACTER
9,1,Elizabeth,NAME


In [13]:
# Persist both tables as CSV under ../data (index=False: the row index is not
# written). The following cells reload them from these files.
final_df_text.to_csv("../data/final_df_text.csv", index=False)
final_df_NER.to_csv("../data/final_df_NER.csv", index=False)

In [14]:
# Reload the span-level table from the CSV written above; this rebinds
# final_df_NER to the on-disk version.
final_df_NER = pd.read_csv("../data/final_df_NER.csv")
final_df_NER.head()

Unnamed: 0,id,text,label
0,1,World War I,TIME
1,1,Elizabeth,NAME
2,1,John,NAME
3,1,John,NAME
4,1,Elizabeth,NAME


In [15]:
# Reload the document-level table from the CSV written above; this rebinds
# final_df_text to the on-disk version used by the rest of the notebook.
final_df_text = pd.read_csv("../data/final_df_text.csv")
final_df_text.head()

Unnamed: 0,id,text,overall_label
0,1,The film tells the story of Elizabeth (Colbert...,drama
1,2,"The eventful life of Swami Vivekananda, his te...",drama
2,3,Jagathalaprathapan is a prince who is to be pu...,romance
3,4,Police inspector Holloway (Patrick Wymark) inv...,action
4,5,Lewis and Clark and George opens with Salvator...,drama


# NER expression embeddings

In [18]:
!pip install pandas torch transformers ipywidgets 

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting ipywidgets
  Using cached ipywidgets-8.1.5-py3-none-any.whl.metadata (2.3 kB)
Collecting widgetsnbextension~=4.0.12 (from ipywidgets)
  Using cached widgetsnbextension-4.0.13-py3-none-any.whl.metadata (1.6 kB)
Collecting jupyterlab-widgets~=3.0.12 (from ipywidgets)
  Using cached jupyterlab_widgets-3.0.13-py3-none-any.whl.metadata (4.1 kB)
Using cached ipywidgets-8.1.5-py3-none-any.whl (139 kB)
Using cached jupyterlab_widgets-3.0.13-py3-none-any.whl (214 kB)
Using cached widgetsnbextension-4.0.13-py3-none-any.whl (2.3 MB)
Installing collected packages: widgetsnbextension, jupyterlab-widgets, ipywidgets
Successfully installed ipywidgets-8.1.5 jupyterlab-widgets-3.0.13 widgetsnbextension-4.0.13


In [22]:
import torch
from transformers import AutoModel, AutoTokenizer

# Pre-trained encoder and the tokenizer that belongs to it
tokenizer = AutoTokenizer.from_pretrained("prajjwal1/bert-small")
model = AutoModel.from_pretrained("prajjwal1/bert-small")

# Pick the compute device: CUDA first, then MPS (Apple Silicon), otherwise CPU
device = torch.device(
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)

# Move model to the selected device
model.to(device)


def get_embeddings(text):
    """Embed text with the notebook-level ``model`` using masked mean pooling.

    The input is tokenized with ``tokenizer`` (padded, truncated to 512
    tokens), run through ``model`` without gradient tracking, and the last
    hidden states are averaged over the real tokens only.

    Args:
        text: A string, or a list of strings, accepted by ``tokenizer``.

    Returns:
        numpy array with the pooled embedding(s), after ``squeeze()``: one
        vector for a single text, one row per text for a batch of several.
    """
    # Tokenize input text and move inputs to the selected device
    inputs = tokenizer(
        text, return_tensors="pt", padding=True, truncation=True, max_length=512
    ).to(device)

    # Get model outputs
    with torch.no_grad():
        outputs = model(**inputs)

    # Extract the last hidden states
    last_hidden_states = outputs.last_hidden_state

    # When several texts are batched, the shorter ones are padded. A plain
    # mean over the sequence axis would average those padding positions in,
    # so weight every position by the attention mask instead. For a single
    # text the mask is all ones and this equals the plain mean.
    mask = inputs["attention_mask"].unsqueeze(-1).to(last_hidden_states.dtype)
    summed = (last_hidden_states * mask).sum(dim=1)
    token_counts = mask.sum(dim=1).clamp(min=1e-9)  # guard against division by zero
    pooled = summed / token_counts

    # Move back to CPU for numpy conversion
    return pooled.squeeze().cpu().numpy()


[-2.87632585e-01  6.48084939e-01 -7.96993077e-01 -4.04933512e-01
 -5.51508307e-01 -7.43960023e-01 -5.76235414e-01  5.15497148e-01
 -7.79310912e-02  6.23656154e-01 -1.07959270e+00  1.87924683e-01
 -9.23004150e-01 -6.07855618e-01  3.32713544e-01  7.80829862e-02
 -4.01003301e-01  1.66505992e-01  8.51412296e-01  4.44756210e-01
  3.89101893e-01 -4.32102323e-01 -1.35260433e-01  1.00291789e+00
  6.88637555e-01 -5.86947560e-01 -2.55092144e-01  3.18827868e-01
 -1.30208707e+00  7.87933171e-02  5.37043214e-01  4.79165137e-01
 -2.20728368e-01  9.03079331e-01 -3.25399250e-01  1.18249369e+00
 -1.38104782e-02 -2.50749439e-01  4.79132175e-01  2.80858815e-01
 -1.39467150e-01 -3.97449553e-01  9.31843370e-03  9.82595801e-01
 -1.95800900e-01  4.19800878e-01  6.34099722e-01  6.33377433e-01
 -2.80641913e-01  1.49108386e+00 -1.08077988e-01  1.16526449e+00
 -3.05682063e-01 -8.79098922e-02 -4.75858957e-01 -2.55498469e-01
 -8.12434912e-01  4.41605225e-02  3.91435236e-01 -2.00691327e-01
 -4.87094879e-01 -1.06905

# Preprocess text

In [24]:
def preprocess_texts(text):
    """Normalize a document for the bag-of-words style embeddings below.

    Steps: collapse whitespace, lowercase, tokenize with
    ``nltk.word_tokenize``, drop tokens made up solely of punctuation
    characters, drop English stopwords.

    Args:
        text: Raw document text.

    Returns:
        The surviving tokens joined by single spaces.
    """
    # remove extra whitespace and lowercase
    text = " ".join(text.split()).lower()

    # tokenize text
    tokens = nltk.word_tokenize(text)

    # Build the lookup sets once per call; calling stopwords.words(...) inside
    # the filter would re-read the stopword list for every single token.
    english_stopwords = set(stopwords.words("english"))
    punctuation_chars = set(string.punctuation)

    # A token is punctuation when every character is a punctuation mark. The
    # previous substring test against string.punctuation let multi-character
    # tokens such as "..." through.
    kept = [
        token
        for token in tokens
        if not all(ch in punctuation_chars for ch in token)
        and token not in english_stopwords
    ]
    return " ".join(kept)

In [25]:
# Add the preprocessed version of every document as a new column; the raw
# "text" column is kept as is.
final_df_text["cleaned_text"] = final_df_text["text"].apply(preprocess_texts)
final_df_text.head(1)

Unnamed: 0,id,text,overall_label,cleaned_text
0,1,The film tells the story of Elizabeth (Colbert...,drama,film tells story elizabeth colbert john welles...


# Text embeddings

### FastText

In [49]:
import fasttext.util

# Download the pre-trained English FastText model (the recorded output names
# the file cc.en.300.bin). if_exists="ignore" presumably skips the download
# when the file is already present - confirm in the fasttext.util docs.
fasttext.util.download_model("en", if_exists="ignore")  # English


'cc.en.300.bin'

In [31]:
# Load the FastText binary by relative path, i.e. from the notebook's working
# directory.
ft = fasttext.load_model("cc.en.300.bin")

In [32]:
# Work on a copy so the FastText columns do not end up in final_df_text
ft_df = final_df_text.copy()

# One FastText sentence vector per cleaned document
ft_df["ft_embeddings"] = ft_df["cleaned_text"].apply(ft.get_sentence_vector)
ft_df.head(1)

Unnamed: 0,id,text,overall_label,cleaned_text,ft_embeddings
0,1,The film tells the story of Elizabeth (Colbert...,drama,film tells story elizabeth colbert john welles...,"[-0.0124314, 0.007053868, 0.0022771724, 0.0611..."


In [33]:
# Stack the per-document FastText vectors into one array; it is the input to
# both t-SNE and HDBSCAN further down.
ft_embed_arr = np.array(ft_df["ft_embeddings"].to_list())

In [34]:
# Project the FastText document vectors to 2-D for plotting
tsne = TSNE(n_components=2, random_state=42, perplexity=30, max_iter=1000)
X_tsne = tsne.fit_transform(ft_embed_arr)

# Shorten the hover text to the first five words. It is derived from the
# untouched final_df_text (ft_df is a copy of it, so the index aligns) instead
# of from ft_df["text"] itself; otherwise every re-run of this cell would
# append another "..." to the already shortened text.
ft_df["text"] = final_df_text["text"].apply(lambda x: " ".join(x.split()[:5]) + "...")
ft_df["x-tsne"] = X_tsne[:, 0]
ft_df["y-tsne"] = X_tsne[:, 1]
ft_df.head(1)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Unnamed: 0,id,text,overall_label,cleaned_text,ft_embeddings,x-tsne,y-tsne
0,1,The film tells the story...,drama,film tells story elizabeth colbert john welles...,"[-0.0124314, 0.007053868, 0.0022771724, 0.0611...",2.994706,2.655316


In [36]:
# Interactive 2-D t-SNE map of the FastText vectors, coloured by overall label
fig = px.scatter(
    data_frame=ft_df,
    x="x-tsne",
    y="y-tsne",
    color="overall_label",
    hover_data=dict(text=True, overall_label=True),
    title="Sentences t-SNE Visualization with Fasttext Embeddings",
    height=1200,
    width=1200,
)

# Render the figure
fig.show()

### TF-IDF

In [38]:
# TF-IDF vectorizer capped at 300 features (max_features=300); it is fitted in
# the next cell.
vectorizer = TfidfVectorizer(max_features=300)

In [39]:
# Independent copy for the TF-IDF branch of the analysis
tfidf_df = final_df_text.copy()

# Fit the vectorizer on the cleaned documents and report the matrix size
X = vectorizer.fit_transform(tfidf_df["cleaned_text"])
print(f" Shape of the TF-IDF matrix: {X.shape}")

# Densify the whole matrix once and keep one row vector per document
tfidf_df["embeddings"] = list(X.toarray())

tfidf_emb_arr = np.array(tfidf_df["embeddings"].tolist())

 Shape of the TF-IDF matrix: (200, 300)


In [41]:
# Project the TF-IDF document vectors to 2-D for plotting
tsne = TSNE(n_components=2, random_state=42, perplexity=30, max_iter=1000)
X_tsne = tsne.fit_transform(tfidf_emb_arr)

# Shorten the hover text to the first five words. It is derived from the
# untouched final_df_text (tfidf_df is a copy of it, so the index aligns)
# instead of from tfidf_df["text"] itself; otherwise every re-run of this cell
# would append another "..." to the already shortened text.
tfidf_df["text"] = final_df_text["text"].apply(lambda x: " ".join(x.split()[:5]) + "...")

# Add t-SNE results to the DataFrame
tfidf_df["x-tsne"] = X_tsne[:, 0]
tfidf_df["y-tsne"] = X_tsne[:, 1]

# Create interactive scatter plot
fig_tfidf = px.scatter(
    tfidf_df,
    x="x-tsne",
    y="y-tsne",
    color="overall_label",
    hover_data={"text": True, "overall_label": True},
    title="Sentences t-SNE Visualization with TF-IDF Embeddings",
    width=1200,
    height=1200,
)

# Show plot
fig_tfidf.show()

### HDBSCAN

In [42]:
# Single HDBSCAN estimator used for both embedding types below; each
# fit_predict call fits it again on the array it is given.
hdbscan = HDBSCAN(
    min_cluster_size=5, metric="euclidean", cluster_selection_method="eom"
)

In [43]:
# Cluster the FastText document vectors themselves (ft_embed_arr), not the 2-D
# t-SNE coordinates, and store the cluster id per document.
ft_df["cluster"] = hdbscan.fit_predict(ft_embed_arr)
ft_df.head(1)

Unnamed: 0,id,text,overall_label,cleaned_text,ft_embeddings,x-tsne,y-tsne,cluster
0,1,The film tells the story...,drama,film tells story elizabeth colbert john welles...,"[-0.0124314, 0.007053868, 0.0022771724, 0.0611...",2.994706,2.655316,0


In [45]:
# Cast the cluster ids to strings before colouring by them
ft_df["cluster"] = ft_df["cluster"].astype(str)

# t-SNE map of the FastText vectors, coloured by HDBSCAN cluster
fig = px.scatter(
    data_frame=ft_df,
    x="x-tsne",
    y="y-tsne",
    color="cluster",
    hover_data=dict(text=True, overall_label=True, cluster=True),
    title="Sentences t-SNE Visualization with FastText Embeddings and HDBSCAN Clustering",
    height=1200,
    width=1200,
)

# Render the figure
fig.show()

In [46]:
# Re-fit the same HDBSCAN estimator on the TF-IDF vectors (tfidf_emb_arr, not
# the t-SNE coordinates) and store the cluster id per document.
tfidf_df['cluster'] = hdbscan.fit_predict(tfidf_emb_arr)

In [48]:
# Cast the cluster ids to strings before colouring by them
tfidf_df["cluster"] = tfidf_df["cluster"].astype(str)

# t-SNE map of the TF-IDF vectors, coloured by HDBSCAN cluster
fig_cluster = px.scatter(
    data_frame=tfidf_df,
    x="x-tsne",
    y="y-tsne",
    color="cluster",
    hover_data=dict(text=True, overall_label=True, cluster=True),
    title="Sentences t-SNE Visualization with TF-IDF Embeddings and HDBSCAN Clustering",
    height=1200,
    width=1200,
)
fig_cluster.show()