In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
!pip install nltk
!pip install torch



In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import string
from nltk.corpus import stopwords
import nltk

nltk.download('stopwords')

class Word2Vec(nn.Module):
    """Two-layer word-prediction model: an embedding lookup followed by a linear layer.

    Attributes:
        embeddings: ``nn.Embedding`` with ``vocab_size`` rows of size ``embedding_dim``.
        linear: ``nn.Linear`` mapping an embedding back to ``vocab_size`` scores.
    """

    def __init__(self, vocab_size, embedding_dim):
        """Create the embedding table and the output projection.

        Args:
            vocab_size: Number of rows in the embedding table and number of
                output scores produced per input index.
            embedding_dim: Size of each embedding vector.
        """
        super().__init__()
        self.embeddings = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.linear = nn.Linear(in_features=embedding_dim, out_features=vocab_size)

    def forward(self, x):
        """Look up the embeddings for ``x`` and project them to vocabulary scores.

        Args:
            x: Tensor of integer indices accepted by ``nn.Embedding``.

        Returns:
            The un-normalised output of the linear layer (one score per
            vocabulary entry for every index in ``x``).
        """
        embedded = self.embeddings(x)
        return self.linear(embedded)

def preprocessing(corpus, stop_words=None):
    """Split a raw corpus into sentences of cleaned, lower-cased tokens.

    The corpus is split into sentences on ".", each sentence is split on
    whitespace, and every token has leading/trailing ASCII punctuation
    (``string.punctuation``) removed and is lower-cased.

    Tokens are normalised *before* the stop-word check, so a stop word with
    punctuation attached (e.g. "The,") is still filtered. Tokens that become
    empty after stripping, and sentences left with no tokens (such as the
    empty piece after a final "."), are dropped.

    Args:
        corpus: The text to tokenise.
        stop_words: Optional iterable of words to discard (compared
            lower-cased). When ``None``, NLTK's English stop-word list is
            used, which requires the ``stopwords`` corpus to be downloaded.

    Returns:
        A list of sentences, each a non-empty list of token strings.
    """
    if stop_words is None:
        stop_words = set(stopwords.words('english'))
    else:
        # Tokens are lower-cased before comparison, so the filter must be too.
        stop_words = {word.lower() for word in stop_words}

    training_data = []
    for sentence in corpus.split("."):
        tokens = []
        for raw_word in sentence.split():
            token = raw_word.strip(string.punctuation).lower()
            # Skip punctuation-only words (now empty) and stop words.
            if token and token not in stop_words:
                tokens.append(token)
        if tokens:
            training_data.append(tokens)
    return training_data

def prepare_data_for_training(sentences, window_size=2):
    """Build (center word, context word) index pairs from tokenised sentences.

    Every distinct word is assigned an integer id in order of first
    appearance. For each word position, one training pair is emitted for
    every other position in the same sentence that lies within
    ``window_size`` tokens on either side.

    Args:
        sentences: Iterable of sentences, each a sequence of hashable tokens.
            It is iterated twice, so it must not be a one-shot iterator.
        window_size: Maximum distance between a center word and a context
            word. Defaults to 2, the window the function previously
            hard-coded.

    Returns:
        A tuple ``(X_train, y_train, vocab)`` where ``X_train[k]`` is the id
        of a center word, ``y_train[k]`` is the id of one of its context
        words, and ``vocab`` maps each word to its id.

    Raises:
        ValueError: If ``window_size`` is negative.
    """
    if window_size < 0:
        raise ValueError("window_size must be non-negative")

    vocab = {}
    for sentence in sentences:
        for word in sentence:
            # len(vocab) is evaluated before insertion, so new words get the next free id.
            vocab.setdefault(word, len(vocab))

    X_train = []
    y_train = []
    for sentence in sentences:
        indices = [vocab[word] for word in sentence]
        for i, center_word in enumerate(indices):
            # Clamp the window to the sentence boundaries.
            start = max(0, i - window_size)
            stop = min(len(indices), i + window_size + 1)
            for j in range(start, stop):
                if j != i:
                    X_train.append(center_word)
                    y_train.append(indices[j])

    return X_train, y_train, vocab

def train_model(model, X_train, y_train, epochs, device, lr=0.001):
    """Train ``model`` with per-sample SGD on (center, context) index pairs.

    The model is moved to ``device`` and updated in place. For each pair the
    model is called on the center index and its output is scored against the
    context index with cross-entropy loss. After each epoch the average loss
    over all pairs is printed.

    Args:
        model: ``nn.Module`` mapping a 1-element long tensor of word ids to a
            row of class scores.
        X_train: Sequence of center-word ids.
        y_train: Sequence of context-word ids, aligned with ``X_train``.
        epochs: Number of passes over the training pairs.
        device: ``torch.device`` to train on.
        lr: SGD learning rate. Defaults to 0.001, the value previously
            hard-coded.

    Raises:
        ValueError: If ``X_train`` and ``y_train`` differ in length.
    """
    num_pairs = len(X_train)
    if num_pairs != len(y_train):
        raise ValueError("X_train and y_train must have the same length")

    model.to(device)
    if num_pairs == 0:
        # Nothing to learn from; also avoids dividing by zero in the loss average.
        print("No training pairs provided; skipping training.")
        return

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=lr)

    # Build the index tensors once instead of once per sample per epoch.
    centers = torch.tensor(X_train, dtype=torch.long, device=device)
    contexts = torch.tensor(y_train, dtype=torch.long, device=device)

    for epoch in range(epochs):
        total_loss = 0
        for i in range(num_pairs):
            # Slicing keeps a leading dimension of size 1, as the model expects.
            center_word = centers[i:i + 1]
            context_word = contexts[i:i + 1]

            optimizer.zero_grad()
            output = model(center_word)
            loss = criterion(output, context_word)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
        print(f'Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(X_train)}')

def predict(model, word, vocab, top_k, device):
    """Return the ``top_k`` vocabulary words the model scores highest for ``word``.

    Words are ranked directly on the model's raw output scores. Softmax is
    monotonic, so this gives the same ordering as ranking probabilities,
    without the overflow-to-NaN that exponentiating large scores can cause.

    Args:
        model: Callable mapping a 1-element long tensor of word ids to a
            ``(1, vocab_size)`` tensor of scores.
        word: The query word.
        vocab: Mapping from word to integer id.
        top_k: Number of words to return. Values larger than the vocabulary
            return every word; values of zero or less return an empty list.
        device: ``torch.device`` on which to place the input tensor.

    Returns:
        A list of words ordered from highest to lowest score. Empty if
        ``word`` is not in ``vocab`` (a message is printed in that case) or
        if ``top_k`` is not positive.
    """
    if word not in vocab:
        print("Word not found in dictionary")
        return []
    if top_k <= 0:
        # A slice like [-0:] would select everything, so handle this explicitly.
        return []

    word_tensor = torch.tensor([vocab[word]], dtype=torch.long).to(device)
    with torch.no_grad():
        scores = model(word_tensor).cpu().numpy()[0]

    # Negate so that an ascending sort yields the highest scores first.
    top_k_indices = np.argsort(-scores, kind="stable")[:top_k]

    index_to_word = {index: token for token, index in vocab.items()}
    return [index_to_word[int(i)] for i in top_k_indices]

# Example usage
corpus = "Ayé yìí yípo orún káàkiri ayé. Òṣùpá náà yípo ayé lọ́títọ́. Òrùn ń ràn sí gbogbo ayé. Ilẹ̀ ń gbó ayé ní àpáta. Òjò ń ró sí gbogbo ilẹ̀. Ayé ń yípo sí ayé nínú àwọ̀n èdá."
# Hyperparameters for the demo run.
epochs = 1000
embedding_dim = 10
seed = 42

# Tokenise the corpus and build the (center, context) training pairs.
training_data = preprocessing(corpus)
X_train, y_train, vocab = prepare_data_for_training(training_data)
vocab_size = len(vocab)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Seed before the model is created so that weight initialisation, and hence
# the training run, starts from the same state after a kernel restart.
torch.manual_seed(seed)

model = Word2Vec(vocab_size, embedding_dim)
train_model(model, X_train, y_train, epochs, device)

print(predict(model, "yípo", vocab, 3, device))

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Epoch 1/1000, Loss: 3.114971248166902
Epoch 2/1000, Loss: 3.0999887883663177
Epoch 3/1000, Loss: 3.085448737655367
Epoch 4/1000, Loss: 3.071343047278268
Epoch 5/1000, Loss: 3.05766330233642
Epoch 6/1000, Loss: 3.044400434408869
Epoch 7/1000, Loss: 3.031544955713408
Epoch 8/1000, Loss: 3.0190867483615875
Epoch 9/1000, Loss: 3.007015194211687
Epoch 10/1000, Loss: 2.9953192685331618
Epoch 11/1000, Loss: 2.983987412282399
Epoch 12/1000, Loss: 2.9730077236890793
Epoch 13/1000, Loss: 2.962368062564305
Epoch 14/1000, Loss: 2.952055988567216
Epoch 15/1000, Loss: 2.9420589985592023
Epoch 16/1000, Loss: 2.9323644808360507
Epoch 17/1000, Loss: 2.922959883298193
Epoch 18/1000, Loss: 2.9138327730553493
Epoch 19/1000, Loss: 2.904970863035747
Epoch 20/1000, Loss: 2.8963620981999805
Epoch 21/1000, Loss: 2.8879946874720708
Epoch 22/1000, Loss: 2.8798573708959987
Epoch 23/1000, Los

In [4]:
# Show the three highest-scoring words for this query, using the `model` and `vocab` built in the training cell.
print(predict(model, "káàkiri", vocab, 3, device))

['ayé', 'yípo', 'orún']


In [5]:
# Show the three highest-scoring words for this query, using the `model` and `vocab` built in the training cell.
print(predict(model, "gbogbo", vocab, 3, device))

['sí', 'ayé', 'ró']
