# WORD2VEC FOR HINDI WORDS

In [1]:
import gensim
from gensim.models import Word2Vec

In [2]:
def read_hindi_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        hindi_text = f.read()
    return hindi_text

In [3]:
def preprocess_and_tokenize(text):
    tokenized_text = text.split()
    return tokenized_text

In [4]:
file_path = 'IITB.en-hi.hi'
hindi_text = read_hindi_file(file_path)

In [5]:
tokenized_text = preprocess_and_tokenize(hindi_text)
dataset = [tokenized_text]

In [6]:
model = Word2Vec(sentences=dataset, vector_size=100, window=5, min_count=1)

In [7]:
model.save("hindi_word2vec_model.bin")

In [12]:
from sklearn.metrics.pairwise import cosine_similarity
word1 = "राजा"
word2 = "रानी"
vector1 = model.wv[word1]
vector2 = model.wv[word2]
vector1 = vector1.reshape(1, -1) 
vector2 = vector2.reshape(1, -1) 
similarity = cosine_similarity(vector1, vector2)[0][0]
print("Similarity between ", word1, " and ", word2, " : ", similarity)

Similarity between  राजा  and  रानी  :  0.429853


# VISUALIZATION USING t-SNE

In [13]:
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

In [18]:
def plot_tsne(model, words):
    # Extract embeddings for the specified words
    embeddings = [model.wv[word] for word in words]

    # Perform t-SNE to reduce dimensionality to 2D
    tsne = TSNE(n_components=2, random_state=42)
    embeddings_2d = tsne.fit_transform(embeddings)

    # Plot the words in 2D space
    plt.figure(figsize=(12, 8))
    for i in range(len(words)):
        plt.scatter(embeddings_2d[i, 0], embeddings_2d[i, 1])
        plt.text(embeddings_2d[i, 0] + 0.2, embeddings_2d[i, 1] + 0.2, words[i], fontsize=12)

    plt.title(f't-SNE Visualization of {lang} Word Embeddings')
    plt.xlabel('Dimension 1')
    plt.ylabel('Dimension 2')
    plt.show()

In [None]:
words_to_plot = ['राजा', 'रानी', 'मनुष्य', 'महिला']
plot_tsne(model, words_to_plot)