In [6]:
import pandas as pd
import numpy as np
from numpy.linalg import norm
import pickle
from sklearn.decomposition import PCA

# Mount Google Drive so the pickled embeddings file is reachable from Colab
from google.colab import drive
drive.mount('/content/drive')

# Step 1: read the pickled word -> embedding mapping from Drive
print("Step 1: Loading the NLP model and extracting three-dimensional vectors")
file_path = '/content/drive/My Drive/Colab Notebooks/Numerical Python/word_embeddings_subset.p'
# NOTE(review): pickle.load can execute arbitrary code stored in the file;
# only load embeddings that come from a trusted source.
with open(file_path, 'rb') as f:
    word_embeddings = pickle.load(f)

print("Total words: ", len(word_embeddings))

# Split the mapping into a word list and a matching 2-D array of vectors
# (row i of `vectors` belongs to words[i]).
words = list(word_embeddings.keys())
print("All words in the model: ", words)
vectors = np.array([word_embeddings[w] for w in words])

# Project the embeddings onto their first three principal components
pca = PCA(n_components=3)
vectors_3d = pca.fit_transform(vectors)

# One row per word: the word itself followed by its three PCA coordinates.
# Transposing yields one 1-D array per coordinate axis, in x, y, z order.
df = pd.DataFrame({
    'word': words,
    **dict(zip(('vector_x', 'vector_y', 'vector_z'), vectors_3d.T)),
})
print("Step 1 completed: DataFrame created with three-dimensional vectors.\n")

# Show a preview of the result
print("-" * 20 + " 1 " + "-" * 20 + "\n")
print("Words in the dataset:\n", df['word'].head())
print("\n" + "-" * 20 + " 2 " + "-" * 20 + "\n")
print(df.head())


# Step 2: Function to find the closest word to a given three-dimensional vector
def find_closest_word(vector, df):
    """Return the word whose 3-D coordinates are nearest to ``vector``.

    Nearness is the Euclidean distance between ``vector`` and each row's
    (``vector_x``, ``vector_y``, ``vector_z``) coordinates.

    Args:
        vector: Indexable with at least three numeric components; only
            ``vector[0]``, ``vector[1]`` and ``vector[2]`` are read.
        df: DataFrame with the columns ``word``, ``vector_x``, ``vector_y``
            and ``vector_z``. It is not modified.

    Returns:
        The ``word`` value of the row with the smallest distance.

    Raises:
        ValueError: If ``df`` has no rows.
    """
    if df.empty:
        raise ValueError("df must contain at least one word")

    # Keep the distances in a local Series instead of writing a 'distance'
    # column into the caller's DataFrame: a lookup must not have side effects.
    distances = np.sqrt((df['vector_x'] - vector[0]) ** 2 +
                        (df['vector_y'] - vector[1]) ** 2 +
                        (df['vector_z'] - vector[2]) ** 2)
    # Single .loc[row, column] lookup rather than chained indexing.
    return df.loc[distances.idxmin(), 'word']

# Step 2 demo: look up the word nearest to a hand-picked 3-D point
print("Step 2: Finding the closest word to a sample vector")
example_vector = [0.5, -0.2, 0.3]
closest_word_example = find_closest_word(example_vector, df)
print(f"The closest word to the vector {example_vector}: {closest_word_example}")
# The three lines of the conclusion are joined with newlines by print itself.
print(
    f'Conclusion: The word "{closest_word_example}" has the strongest semantic',
    "connection to the given vector, demonstrating the proximity of their values in the",
    "vector space.",
    sep="\n",
)
print("\n" + "-" * 20 + " 3 " + "-" * 20 + "\n")


# Step 3: Calculating the cross product to find an orthogonal word
def find_orthogonal_word(vector1, vector2, df):
    """Return the word closest to the cross product of two 3-D vectors.

    The cross product ``vector1 x vector2`` is computed with ``np.cross``
    and handed to ``find_closest_word``, which picks the nearest word in
    ``df``.

    Args:
        vector1: First 3-D vector.
        vector2: Second 3-D vector.
        df: DataFrame of words and coordinates, passed through unchanged to
            ``find_closest_word``.

    Returns:
        Whatever ``find_closest_word`` returns for the cross-product vector.
    """
    normal_vector = np.cross(vector1, vector2)
    nearest = find_closest_word(normal_vector, df)
    return nearest

# Step 3 demo: find the word nearest to the cross product of two word vectors
print("Step 3: Finding an orthogonal word for two sample words")
word1, word2 = 'city', 'China'
# Take the first matching row's coordinates for each word; an IndexError here
# means the word is not present in df.
vector1 = df.loc[df['word'] == word1, ['vector_x', 'vector_y', 'vector_z']].values[0]
vector2 = df.loc[df['word'] == word2, ['vector_x', 'vector_y', 'vector_z']].values[0]
orthogonal_word = find_orthogonal_word(vector1, vector2, df)
print(f"Orthogonal word to '{word1}' and '{word2}': {orthogonal_word}")
# The write-up interpolates word1, word2 and orthogonal_word instead of
# hard-coding one particular run's result, so it cannot contradict the line
# printed just above when the data or the chosen words change.
print(f"""
Conclusions:

The result "Orthogonal word to '{word1}' and '{word2}': {orthogonal_word}" means that the word
"{orthogonal_word}" is closest to the vector obtained as a result of the cross product of the vectors
of the words "{word1}" and "{word2}".

Conclusion:
Semantic connection: The cross product represents a new vector that is orthogonal
to the original two. This allows us to identify concepts or words that do not have a direct
semantic connection with "{word1}" and "{word2}" but may be important in another context.

{orthogonal_word} as a result: The vector "{orthogonal_word}" indicates the uniqueness of this word compared to
the original ones ("{word1}" and "{word2}"). This may mean that "{orthogonal_word}" represents a distinct semantic
category (e.g., a geographical entity) but is unique in a context
that does not overlap with the other two.

Practical significance:
The cross product helps find semantically independent (orthogonal) words,
which can be useful in tasks of identifying unique concepts or categories in
the word vector space.
""")
print(f"\n{'-' * 20} 4 {'-' * 20}\n")


# Step 4: Function to calculate the angle between two vectors
def calculate_angle(vector1, vector2):
    """Return the angle between two vectors, in degrees.

    The angle is derived from the cosine formula
    ``dot(v1, v2) / (|v1| * |v2|)``.

    Args:
        vector1: First vector (array-like of numbers).
        vector2: Second vector, same length as ``vector1``.

    Returns:
        The angle in degrees, between 0 and 180.

    Raises:
        ValueError: If either vector has zero length, because the angle is
            undefined in that case.
    """
    magnitude_product = norm(vector1) * norm(vector2)
    if magnitude_product == 0:
        # Dividing by zero would silently turn the result into NaN.
        raise ValueError("cannot compute the angle with a zero-length vector")

    cosine_angle = np.dot(vector1, vector2) / magnitude_product
    # Rounding error can push the cosine slightly outside [-1, 1], where
    # arccos is undefined, so clamp it first.
    angle = np.arccos(np.clip(cosine_angle, -1.0, 1.0))
    return np.degrees(angle)

# Step 4 demo: angle between the two word vectors selected in Step 3
print("Step 4: Calculating the angle between two sample words")
angle_between_words = calculate_angle(vector1, vector2)
print(f"The angle between '{word1}' and '{word2}': {angle_between_words:.2f} degrees")

# The summary reports the computed angle (rounded to whole degrees) and the
# chosen words rather than a hard-coded figure, so it stays consistent with
# the line printed above.
print(f"""Conclusions:
An angle of {angle_between_words:.0f} degrees between the vectors of the words '{word1}' and '{word2}' indicates
their semantic relationship in the vector space. This means that the words have some
similarity in meaning but are not very close or identical.

Summary:

Large angle (close to 90°): indicates a weak semantic relationship or orthogonality of meanings.
Small angle (close to 0°): indicates strong similarity or near identity of meanings.
In this case, '{word1}' and '{word2}' are related concepts (both belong to the geographical context),
but they are not as similar as, for example, 'city' and 'town.' This demonstrates how
vector representations of words allow us to evaluate the relative similarity between concepts.
""")

# Banner separating the overall summary from the step-by-step output
print("\n" + "-" * 20 + " Conclusions " + "-" * 20 + "\n")
# Overall summary of the four steps performed above
print("""
1. The NLP model was loaded, and a three-dimensional representation of words was created for analysis.
2. A function was implemented to find the nearest word to a given vector, demonstrating work with semantic similarities.
3. A vector cross-product was used to find an orthogonal word, allowing for the identification of unique relationships between words.
4. A function for calculating the angle between word vectors was implemented, helping analyze the semantic difference between them.

These methods show how semantic relationships between words can be interpreted in a vector space, which is useful for natural language processing (NLP) tasks.
""")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Step 1: Loading the NLP model and extracting three-dimensional vectors
Total words:  243
All words in the model:  ['country', 'city', 'China', 'Iraq', 'oil', 'town', 'Canada', 'London', 'England', 'Australia', 'Japan', 'Pakistan', 'Iran', 'gas', 'happy', 'Russia', 'Afghanistan', 'France', 'Germany', 'Georgia', 'Baghdad', 'village', 'Spain', 'Italy', 'Beijing', 'Jordan', 'Paris', 'Ireland', 'Turkey', 'Egypt', 'Lebanon', 'Taiwan', 'Tokyo', 'Nigeria', 'Vietnam', 'Moscow', 'Greece', 'Indonesia', 'sad', 'Syria', 'Thailand', 'Libya', 'Zimbabwe', 'Cuba', 'Ottawa', 'Tehran', 'Sudan', 'Kenya', 'Philippines', 'Sweden', 'Poland', 'Ukraine', 'Rome', 'Venezuela', 'Switzerland', 'Berlin', 'Bangladesh', 'Portugal', 'Ghana', 'Athens', 'king', 'Madrid', 'Somalia', 'Dublin', 'Qatar', 'Chile', 'Islamabad', 'Bahrain', 'Nepal', 'Norway', 'Serbia', 'Kabul', 'continent', 'Brussels'