In [91]:
import pandas as pd
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
import plotly.express as px
import nltk
from nltk.corpus import stopwords
import string
import fasttext
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import HDBSCAN

nltk.download("stopwords")
nltk.download("punkt_tab")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\izam1\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\izam1\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

# Load data

In [92]:
import json

def load_annotations(file_path):
    """Parse a JSON annotation file and return its content.

    Args:
        file_path: Path of a UTF-8 encoded JSON file.

    Returns:
        The object produced by ``json.load`` for that file.
    """
    with open(file_path, mode="r", encoding="utf-8") as handle:
        return json.load(handle)


def extract_annotations(annotations):
    """Flatten exported annotation tasks into a dict keyed by task id.

    Args:
        annotations: Iterable of task dicts. Each task provides ``id``,
            ``data["text"]`` and an ``annotations`` list whose first element
            holds a ``result`` list of labelled items.

    Returns:
        ``{task_id: {"text": str, "overall_label": str | None,
        "span_labels": {(start, end): (span_text, label)}}}``.
        A task without any annotation is kept with ``overall_label`` of
        ``None`` and an empty ``span_labels`` dict.
    """
    annot_dict = {}
    for task in annotations:
        entry = {
            "text": task["data"]["text"],
            "overall_label": None,
            "span_labels": {},
        }
        annot_dict[task["id"]] = entry

        # A task that was never annotated has an empty (or missing) list; keep
        # it with empty labels instead of failing on the [0] lookup below.
        task_annotations = task.get("annotations") or []
        if not task_annotations:
            continue

        # Extract annotation info; only the first annotation of a task is read.
        for annot in task_annotations[0]["result"]:
            value = annot["value"]
            if annot["from_name"] == "label":
                # Span-level item: keyed by its character offsets.
                span = (value["start"], value["end"])
                entry["span_labels"][span] = (
                    value["text"].strip(),
                    value["labels"][0],
                )
            else:
                # Every other item is read as the task-level choice.
                entry["overall_label"] = value["choices"][0]
    return annot_dict

# Parse both annotation batches, then flatten each into an id-keyed dict.
final_1_100 = load_annotations("../annot/1-100/final_1_100.json")
final_101_200 = load_annotations("../annot/101-200/final_101_200.json")
final_1_100_dict = extract_annotations(final_1_100)
final_101_200_dict = extract_annotations(final_101_200)

# Merge the batches; on a duplicate id the entry from the second batch wins.
final_dict = dict(final_1_100_dict)
final_dict.update(final_101_200_dict)



In [93]:
def analyze_word_count(data_dict):
    """Count whitespace-separated words in each record's text.

    Args:
        data_dict: Mapping of key -> record dict; the ``"text"`` entry is
            read and treated as empty when absent.

    Returns:
        Dict mapping each key to its word count.
    """
    return {
        task_id: len(record.get("text", "").split())
        for task_id, record in data_dict.items()
    }

def analyze_sentence_count(data_dict):
    """Count sentences in each record's text using ``nltk.sent_tokenize``.

    Args:
        data_dict: Mapping of key -> record dict; the ``"text"`` entry is
            read and treated as empty when absent.

    Returns:
        Dict mapping each key to the number of sentences found.
    """
    return {
        task_id: len(nltk.sent_tokenize(record.get("text", "")))
        for task_id, record in data_dict.items()
    }

# Corpus size summary: per-task counts are kept in the *_analysis dicts and
# summed for the printed totals.

# Total number of whitespace-separated words over all tasks in final_dict
word_count_analysis = analyze_word_count(final_dict)
word_count = sum(word_count_analysis.values())
print(f" Total word count: {word_count}")

# Total number of sentences over all tasks in final_dict
sentence_count_analysis = analyze_sentence_count(final_dict)
sentence_count = sum(sentence_count_analysis.values())
print(f" Total sentence count: {sentence_count}")

# Number of tasks (entries) in the merged dictionary
print(f" Total number of tasks: {len(final_dict)}")


 Total word count: 13013
 Total sentence count: 694
 Total number of tasks: 200


In [94]:
def analyze_label_distribution(data_dict):
    """Tally how many records carry each overall label.

    Records whose ``"overall_label"`` is missing or falsy are skipped.

    Args:
        data_dict: Mapping of key -> record dict.

    Returns:
        Dict of label -> count, ordered by first appearance of each label.
    """
    counts = {}
    for record in data_dict.values():
        label = record.get("overall_label", "")
        if not label:
            continue
        counts[label] = counts.get(label, 0) + 1
    return counts

# Count tasks per overall label across the merged annotations and print the tally.
label_distribution = analyze_label_distribution(final_dict)
print(label_distribution)

{'drama': 76, 'romance': 14, 'action': 31, 'comedy': 46, 'fantasy': 17, 'western': 9, 'document': 7}


In [95]:
# Pie chart: share of tasks per overall label
fig = px.pie(
    names=list(label_distribution),
    values=list(label_distribution.values()),
    title="Label Distribution",
)
fig.show()

In [96]:
def analyze_span_label_distribution(data_dict):
    """Tally how many annotated spans carry each span label.

    Args:
        data_dict: Mapping of key -> record dict whose ``"span_labels"``
            entry maps a span key to a ``(text, label)`` pair; a missing
            entry counts as no spans.

    Returns:
        Dict of label -> count, ordered by first appearance of each label.
    """
    counts = {}
    for record in data_dict.values():
        for _span_text, label in record.get("span_labels", {}).values():
            counts[label] = counts.get(label, 0) + 1
    return counts

# Tally spans per span label across all tasks and print the raw counts.
span_label_distribution = analyze_span_label_distribution(final_dict)
print(span_label_distribution)

# Pie chart: share of annotated spans per span label
fig = px.pie(
    names=list(span_label_distribution),
    values=list(span_label_distribution.values()),
    title="Span Label Distribution",
)
fig.show()

{'TIME': 58, 'NAME': 961, 'CHARACTER': 892, 'PLACE': 274}


In [97]:
def transform_dict_to_df(data_dict):
    """Build a one-row-per-record DataFrame from the annotation dict.

    Args:
        data_dict: Mapping of key -> record dict; ``"text"`` and
            ``"overall_label"`` are read and default to ``""`` when absent.

    Returns:
        DataFrame built from records with keys ``id``, ``text`` and
        ``overall_label``.
    """
    rows = [
        {
            "id": task_id,
            "text": record.get("text", ""),
            "overall_label": record.get("overall_label", ""),
        }
        for task_id, record in data_dict.items()
    ]
    return pd.DataFrame(rows)

# Task-level table (id, text, overall_label) built from the merged annotations;
# the trailing expression previews the first rows.
final_df_text = transform_dict_to_df(final_dict)
final_df_text.head()


Unnamed: 0,id,text,overall_label
0,1,The film tells the story of Elizabeth (Colbert...,drama
1,2,"The eventful life of Swami Vivekananda, his te...",drama
2,3,Jagathalaprathapan is a prince who is to be pu...,romance
3,4,Police inspector Holloway (Patrick Wymark) inv...,action
4,5,Lewis and Clark and George opens with Salvator...,drama


In [98]:

def transform_dict_to_NER_df(data_dict):
    """Build a one-row-per-span DataFrame from the annotation dict.

    Args:
        data_dict: Mapping of key -> record dict whose ``"span_labels"``
            entry maps a span key to a ``(span_text, label)`` pair; a
            missing entry counts as no spans.

    Returns:
        DataFrame with columns ``id`` (the record key), ``text`` (the span
        text) and ``label``. The three columns are present even when no span
        exists, so column lookups on the result never fail.
    """
    columns = ["id", "text", "label"]
    rows = []
    for task_id, record in data_dict.items():
        # Only the span texts are exported; the record's full text is not
        # needed here, so it is no longer read and then shadowed.
        for span_text, label in record.get("span_labels", {}).values():
            rows.append({"id": task_id, "text": span_text, "label": label})
    # Passing the column list keeps the schema stable for empty input.
    return pd.DataFrame(rows, columns=columns)

# Span-level table (id, text, label) built from the merged annotations;
# the trailing expression previews the first 20 rows.
final_df_NER = transform_dict_to_NER_df(final_dict)
final_df_NER.head(20)


Unnamed: 0,id,text,label
0,1,World War I,TIME
1,1,Elizabeth,NAME
2,1,John,NAME
3,1,John,NAME
4,1,Elizabeth,NAME
5,1,man,NAME
6,1,John,NAME
7,1,daughter,CHARACTER
8,1,son,CHARACTER
9,1,Elizabeth,NAME


In [99]:
# Save both tables as CSV under ../data; index=False keeps the positional
# index out of the files.
final_df_text.to_csv("../data/final_df_text.csv", index=False)
final_df_NER.to_csv("../data/final_df_NER.csv", index=False)

In [100]:
# Reload the span-level table from the CSV written above (replaces the in-memory frame).
# NOTE(review): read_csv's default NA parsing may turn span texts such as "NA"
# or "null" into NaN - confirm no such spans exist, or pass keep_default_na=False.
final_df_NER = pd.read_csv("../data/final_df_NER.csv")
final_df_NER.head()

Unnamed: 0,id,text,label
0,1,World War I,TIME
1,1,Elizabeth,NAME
2,1,John,NAME
3,1,John,NAME
4,1,Elizabeth,NAME


In [101]:
# Reload the task-level table from the CSV written above (replaces the in-memory frame).
final_df_text = pd.read_csv("../data/final_df_text.csv")
final_df_text.head()

Unnamed: 0,id,text,overall_label
0,1,The film tells the story of Elizabeth (Colbert...,drama
1,2,"The eventful life of Swami Vivekananda, his te...",drama
2,3,Jagathalaprathapan is a prince who is to be pu...,romance
3,4,Police inspector Holloway (Patrick Wymark) inv...,action
4,5,Lewis and Clark and George opens with Salvator...,drama


In [102]:
import fasttext.util

# Fetch the pretrained English fastText model; the cell output shows the
# resulting file name ('cc.en.300.bin'), which later cells load by that name.
# NOTE(review): if_exists="ignore" presumably skips the download when the file
# is already present - confirm against the fasttext.util documentation.
fasttext.util.download_model("en", if_exists="ignore")  # English


'cc.en.300.bin'

# NER expression embeddings

In [103]:
!pip install pandas torch transformers ipywidgets 




[notice] A new release of pip is available: 23.2.1 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [164]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances
def get_k_nearest_neighbors(embeddings, words, k=3, metric="cosine"):
    """Return the ``k`` closest pairs among all embedded words.

    Every unordered pair ``(i, j)`` with ``i < j`` is scored once. The pairs
    are then ranked globally, not per word.

    Args:
        embeddings: Sequence/array of vectors, aligned with ``words``.
        words: Sequence of labels, one per embedding.
        k: Number of pairs to return.
        metric: ``"cosine"`` (ranked by highest similarity first) or
            ``"euclidean"`` (ranked by smallest distance first).

    Returns:
        List of ``((word_i, word_j), score)`` tuples, best pairs first.

    Raises:
        ValueError: If ``metric`` is neither ``"cosine"`` nor ``"euclidean"``.
    """
    if metric == "cosine":
        scores = cosine_similarity(embeddings)
        largest_first = True  # higher similarity means closer
    elif metric == "euclidean":
        scores = euclidean_distances(embeddings)
        largest_first = False  # smaller distance means closer
    else:
        raise ValueError(f"Unsupported metric: {metric}")

    item_count = len(embeddings)
    scored_pairs = [
        ((words[first], words[second]), scores[first][second])
        for first in range(item_count)
        for second in range(first + 1, item_count)
    ]

    ranked = sorted(scored_pairs, key=lambda item: item[1], reverse=largest_first)
    return ranked[:k]


## Bert-small

In [104]:
import torch
from transformers import AutoModel, AutoTokenizer

# Pre-trained encoder and its matching tokenizer
model = AutoModel.from_pretrained("prajjwal1/bert-small")
tokenizer = AutoTokenizer.from_pretrained("prajjwal1/bert-small")

# Pick the compute device: CUDA first, then MPS (Apple Silicon), else CPU.
# The MPS check is only evaluated when CUDA is unavailable.
device = torch.device(
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)

# Place the model on the chosen device
model.to(device)


def get_embeddings(text):
    """Embed text with the loaded model using mask-aware mean pooling.

    The input is tokenized (padded, truncated to 512 tokens), run through
    ``model`` without gradients, and the last hidden states are averaged over
    the token axis. Padding positions (attention mask 0) are left out of the
    average, so passing several texts at once gives each text the same vector
    it would get when embedded alone. For a single string the mask is all
    ones and the result equals the plain mean.

    Args:
        text: A string or a list of strings accepted by ``tokenizer``.

    Returns:
        numpy array of the pooled states with size-1 dimensions squeezed out
        (assuming ``last_hidden_state`` is (batch, tokens, hidden), a single
        string yields a 1-D vector).
    """
    # Tokenize input text and move inputs to the selected device
    inputs = tokenizer(
        text, return_tensors="pt", padding=True, truncation=True, max_length=512
    ).to(device)

    # Inference only: no gradient tracking needed
    with torch.no_grad():
        outputs = model(**inputs)

    last_hidden_states = outputs.last_hidden_state

    # Weight every token by its attention mask so padded positions add nothing
    # to the sum and are not counted in the divisor.
    mask = inputs["attention_mask"].unsqueeze(-1).to(last_hidden_states.dtype)
    summed = (last_hidden_states * mask).sum(dim=1)
    token_counts = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
    pooled = summed / token_counts

    # Move back to CPU for numpy conversion
    return pooled.squeeze().cpu().numpy()


### t-SNE

In [105]:
# Work on a copy so the BERT columns added below do not touch final_df_NER.
bert_df = final_df_NER.copy()

In [106]:
# Embed every span text individually (one get_embeddings call per row), then
# stack the per-row vectors into a single array for t-SNE.
bert_df['embedding'] = bert_df['text'].apply(get_embeddings)
bert_embeddings = np.array(bert_df['embedding'].tolist())

In [107]:
# Project the BERT embeddings to 2 components with t-SNE (random_state is fixed)
tsne = TSNE(n_components=2, random_state=42, perplexity=30, max_iter=1000)
X_tsne = tsne.fit_transform(bert_embeddings)

# Store the two t-SNE coordinates next to each expression for plotting
bert_df["x-tsne"] = X_tsne[:, 0]
bert_df["y-tsne"] = X_tsne[:, 1]

In [108]:
# Interactive scatter of the t-SNE coordinates, colored by span label;
# hovering shows the expression text and its label.
fig = px.scatter(
    bert_df,
    x="x-tsne",
    y="y-tsne",
    color="label",
    hover_data={"text": True, "label": True},
    title="Expressions t-SNE Visualization with Bert-small Embeddings",
    width=1200,
    height=1200,
)

# Show plot
fig.show()

### Closest words

In [176]:
# Compare expressions case-insensitively: lowercase the text and keep one row
# (the first occurrence, with its already-computed embedding) per expression.
bert_df_unique = bert_df.copy()
bert_df_unique['text'] = bert_df_unique['text'].str.lower()
bert_df_unique = bert_df_unique.drop_duplicates(subset=["text"])
words = bert_df_unique["text"].values
embeddings = bert_df_unique["embedding"].to_list()

# One pass per metric produces both tables. The banner is assembled from plain
# strings so the cell does not rely on nesting the same quote type inside an
# f-string (accepted only from Python 3.12 on).
report_sections = (
    ("cosine", "Cosine Similarity", 20),
    ("euclidean", "Euclidean Distance", 20),
)
for section_index, (metric, heading, star_count) in enumerate(report_sections):
    if section_index:
        print("\n")  # gap between the two tables

    nearest_neighbors = get_k_nearest_neighbors(
        np.array(embeddings), words, k=8, metric=metric
    )

    stars = "*" * star_count
    print(f"{stars} {heading} {stars}")

    # The column is named "similarity" for both metrics; for "euclidean" the
    # value is a distance, so smaller means closer.
    dtf = {"word1": [], "word2": [], "similarity": []}
    for (word1, word2), similarity in nearest_neighbors:
        dtf["word1"].append(word1)
        dtf["word2"].append(word2)
        dtf["similarity"].append(round(float(similarity), 3))
    print(pd.DataFrame(dtf))

******************** Cosine Similarity ********************
                 word1                 word2  similarity
0       3rd century bc        5th century bc       0.973
1             20 years              10 years       0.956
2        another woman           another man       0.956
3             new wife           new husband       0.950
4            young boy            young girl       0.943
5                1960s                 1970s       0.940
6            young man           young woman       0.939
7  high school student  high school students       0.937


******************** Euclidean Distance ********************
             word1                word2  similarity
0   3rd century bc       5th century bc       3.416
1         20 years             10 years       4.266
2    another woman          another man       4.338
3      vijayakumar  manjula vijayakumar       4.607
4         new wife          new husband       4.669
5  paige's brother       matt's brother       4.817


## FastText

In [168]:
# Load the pretrained fastText model from the local .bin file
ft = fasttext.load_model("cc.en.300.bin")

# Embed every span text on a copy, leaving final_df_NER untouched
ft_df_words = final_df_NER.copy()
ft_df_words["embedding"] = ft_df_words["text"].apply(ft.get_sentence_vector)

### t-SNE

In [169]:
# Stack the per-expression fastText vectors and project them to 2 components
# with t-SNE (random_state is fixed)
ft_embed_arr = np.array(ft_df_words["embedding"].to_list())
tsne = TSNE(n_components=2, random_state=42, perplexity=30, max_iter=1000)
X_tsne = tsne.fit_transform(ft_embed_arr)

# Store the two t-SNE coordinates next to each expression for plotting
ft_df_words["x-tsne"] = X_tsne[:, 0]
ft_df_words["y-tsne"] = X_tsne[:, 1]

In [170]:
# Interactive scatter of the t-SNE coordinates, colored by span label;
# hovering shows the expression text and its label.
fig = px.scatter(
    ft_df_words,
    x="x-tsne",
    y="y-tsne",
    color="label",
    hover_data={"text": True, "label": True},
    title="Expressions t-SNE Visualization with Fasttext Embeddings",
    width=1200,
    height=1200,
)

# Show plot
fig.show()

### Closest words

In [174]:
# Compare expressions case-insensitively: lowercase the text and keep one row
# (the first occurrence, with its already-computed embedding) per expression.
tf_df_unique = ft_df_words.copy()
tf_df_unique['text'] = tf_df_unique['text'].str.lower()
tf_df_unique = tf_df_unique.drop_duplicates(subset=["text"])
words = tf_df_unique["text"].values
embeddings = tf_df_unique["embedding"].to_list()

# One pass per metric produces both tables. The banner is assembled from plain
# strings so the cell does not rely on nesting the same quote type inside an
# f-string (accepted only from Python 3.12 on). Banner widths differ per
# section and are kept as they were.
report_sections = (
    ("cosine", "Cosine Similarity", 15),
    ("euclidean", "Euclidean Distance", 20),
)
for section_index, (metric, heading, star_count) in enumerate(report_sections):
    if section_index:
        print("\n")  # gap between the two tables

    nearest_neighbors = get_k_nearest_neighbors(
        np.array(embeddings), words, k=8, metric=metric
    )

    stars = "*" * star_count
    print(f"{stars} {heading} {stars}")

    # The column is named "similarity" for both metrics; for "euclidean" the
    # value is a distance, so smaller means closer.
    dtf = {"word1": [], "word2": [], "similarity": []}
    for (word1, word2), similarity in nearest_neighbors:
        dtf["word1"].append(word1)
        dtf["word2"].append(word2)
        dtf["similarity"].append(round(float(similarity), 3))
    print(pd.DataFrame(dtf))

*************** Cosine Similarity ***************
            word1           word2  similarity
0            1981            1983       0.982
1        20 years        10 years       0.980
2  3rd century bc  5th century bc       0.979
3            1962            1959       0.978
4            1962            1964       0.968
5            2002            1999       0.964
6           1960s           1970s       0.957
7            1999            1994       0.955


******************** Euclidean Distance ********************
                 word1                 word2  similarity
0       3rd century bc        5th century bc       0.150
1             20 years              10 years       0.166
2                 1981                  1983       0.188
3                 1962                  1959       0.210
4             new wife           new husband       0.230
5  high school student  high school students       0.237
6                 1962                  1964       0.254
7                

# Preprocess text

In [116]:
def preprocess_texts(text):
    """Normalize one document into a space-joined string of content tokens.

    Steps: collapse whitespace, lowercase, tokenize with
    ``nltk.word_tokenize``, drop punctuation tokens, drop English stopwords.

    Args:
        text: Raw document string.

    Returns:
        The surviving tokens joined by single spaces.
    """
    # split() + join trims both ends and collapses inner whitespace runs,
    # so a separate strip() is not needed.
    text = " ".join(text.split()).lower()

    tokens = nltk.word_tokenize(text)

    # Build the stopword set once per call; evaluating stopwords.words(...)
    # inside the token filter would repeat the lookup for every token, and a
    # set makes each membership test constant-time.
    stop_words = set(stopwords.words("english"))

    # NOTE(review): `token in string.punctuation` is a substring test on a
    # str, so only tokens that occur verbatim inside that string are dropped;
    # multi-character tokens such as "..." or "--" pass through. Kept as is
    # to leave the cleaned text unchanged - confirm whether this is intended.
    kept = [
        token
        for token in tokens
        if token not in string.punctuation and token not in stop_words
    ]
    return " ".join(kept)

In [117]:
# Add the preprocessed version of every plot text as a new column;
# the trailing expression previews the first row.
final_df_text["cleaned_text"] = final_df_text["text"].apply(preprocess_texts)
final_df_text.head(1)

Unnamed: 0,id,text,overall_label,cleaned_text
0,1,The film tells the story of Elizabeth (Colbert...,drama,film tells story elizabeth colbert john welles...


# Text embeddings

### FastText

In [118]:
# Load the fastText model for the text-level embeddings below.
# NOTE(review): the same file is also loaded into `ft` in the NER section
# above; when the notebook runs top to bottom this reload looks redundant.
ft = fasttext.load_model("cc.en.300.bin")

In [119]:
# Copy the task-level table so the fastText columns stay separate from final_df_text
ft_df = final_df_text.copy()

# One fastText sentence vector per cleaned text
ft_df["ft_embeddings"] = ft_df["cleaned_text"].apply(ft.get_sentence_vector)
ft_df.head(1)

Unnamed: 0,id,text,overall_label,cleaned_text,ft_embeddings
0,1,The film tells the story of Elizabeth (Colbert...,drama,film tells story elizabeth colbert john welles...,"[-0.0124314, 0.007053868, 0.0022771724, 0.0611..."


In [120]:
# Stack the per-text vectors into one array (reuses the name ft_embed_arr from
# the NER section; from here on it holds the text-level embeddings).
ft_embed_arr = np.array(ft_df["ft_embeddings"].to_list())

In [121]:
# Project the text-level fastText embeddings to 2 components with t-SNE
tsne = TSNE(n_components=2, random_state=42, perplexity=30, max_iter=1000)
X_tsne = tsne.fit_transform(ft_embed_arr)

# Shorten the text to its first five words plus "..." for the hover tooltip.
# NOTE: this overwrites ft_df['text'] in place, so re-running the cell
# appends another "..." to the already shortened text.
ft_df['text'] = ft_df['text'].apply(lambda x: " ".join(x.split()[:5]) + "...")
ft_df["x-tsne"] = X_tsne[:, 0]
ft_df["y-tsne"] = X_tsne[:, 1]
ft_df.head(1)

Unnamed: 0,id,text,overall_label,cleaned_text,ft_embeddings,x-tsne,y-tsne
0,1,The film tells the story...,drama,film tells story elizabeth colbert john welles...,"[-0.0124314, 0.007053868, 0.0022771724, 0.0611...",0.811068,0.94434


In [122]:
# Create interactive scatter plot
fig = px.scatter(
    ft_df,
    x="x-tsne",
    y="y-tsne",
    color="overall_label",
    hover_data={"text": True, "overall_label": True},
    title="Sentences t-SNE Visualization with Fasttext Embeddings",
    width=1200,
    height=1200,
)

# Show plot
fig.show()

### TF-IDF

In [123]:
# TF-IDF vectorizer configured with max_features=300 (the matrix printed
# below has 300 columns).
vectorizer = TfidfVectorizer(max_features=300)

In [124]:
# Copy the task-level table so the TF-IDF columns stay separate from final_df_text
tfidf_df = final_df_text.copy()

# Fit the vectorizer on the cleaned texts and report the matrix size
X = vectorizer.fit_transform(tfidf_df["cleaned_text"])
print(f" Shape of the TF-IDF matrix: {X.shape}")

# Densify once; each row becomes that document's embedding vector
# (rows are copied so the column does not share memory with the array).
tfidf_emb_arr = X.toarray()
tfidf_df["embeddings"] = [row.copy() for row in tfidf_emb_arr]

 Shape of the TF-IDF matrix: (200, 300)


In [125]:
# Project the TF-IDF embeddings to 2 components with t-SNE
tsne = TSNE(n_components=2, random_state=42, perplexity=30, max_iter=1000)
X_tsne = tsne.fit_transform(tfidf_emb_arr)

# Shorten the text to its first five words plus "..." for the hover tooltip.
# NOTE: this overwrites tfidf_df['text'] in place, so re-running the cell
# appends another "..." to the already shortened text.
tfidf_df['text'] = tfidf_df['text'].apply(lambda x: " ".join(x.split()[:5]) + "...")

# Add t-SNE results to the DataFrame
tfidf_df["x-tsne"] = X_tsne[:, 0]
tfidf_df["y-tsne"] = X_tsne[:, 1]

# Create interactive scatter plot, colored by overall label
fig_tfidf = px.scatter(
    tfidf_df,
    x="x-tsne",
    y="y-tsne",
    color="overall_label",
    hover_data={"text": True, "overall_label": True},
    title="Sentences t-SNE Visualization with TF-IDF Embeddings",
    width=1200,
    height=1200,
)
# Show plot

fig_tfidf.show()

### HDBSCAN

In [126]:
# HDBSCAN clusterer shared by both embedding types below: clusters need at
# least 5 members, distances are Euclidean, selection method is "eom".
hdbscan = HDBSCAN(
    min_cluster_size=5, metric="euclidean", cluster_selection_method="eom"
)

In [127]:
# Cluster the text-level fastText embeddings (the full vectors, not the 2-D
# t-SNE coordinates) and store one cluster id per text.
ft_df["cluster"] = hdbscan.fit_predict(ft_embed_arr)
ft_df.head(1)

Unnamed: 0,id,text,overall_label,cleaned_text,ft_embeddings,x-tsne,y-tsne,cluster
0,1,The film tells the story...,drama,film tells story elizabeth colbert john welles...,"[-0.0124314, 0.007053868, 0.0022771724, 0.0611...",0.811068,0.94434,0


In [128]:
# Create interactive scatter plot
ft_df["cluster"] = ft_df["cluster"].astype(str)
fig = px.scatter(
    ft_df,
    x="x-tsne",
    y="y-tsne",
    color="cluster",
    hover_data={"text": True, "overall_label": True, "cluster": True},
    title="Sentences t-SNE Visualization with FastText Embeddings and HDBSCAN Clustering",
    width=1200,
    height=1200,
)

# Show plot
fig.show()

In [129]:
# Re-fit the same HDBSCAN object on the TF-IDF embeddings and store one
# cluster id per text.
tfidf_df['cluster'] = hdbscan.fit_predict(tfidf_emb_arr)

In [130]:
# Scatter of the TF-IDF t-SNE coordinates colored by HDBSCAN cluster.
# The cluster ids are cast to str first - presumably so plotly draws them as
# discrete categories rather than a continuous color scale (confirm).
tfidf_df['cluster'] = tfidf_df['cluster'].astype(str)
fig_cluster = px.scatter(
    tfidf_df,
    x="x-tsne",
    y="y-tsne",
    color="cluster",
    hover_data={"text": True, "overall_label": True, "cluster": True},
    title="Sentences t-SNE Visualization with TF-IDF Embeddings and HDBSCAN Clustering",
    width=1200,
    height=1200,
)
fig_cluster.show()