In [None]:
import io
import nltk
from nltk import sent_tokenize,word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
import numpy as np
import numpy.linalg as lin
import pandas as pd
import unicodedata
from gensim.models.word2vec import Word2Vec
from gensim.models import KeyedVectors
import pickle
import warnings

In [None]:
# Fetch the NLTK data packages this notebook asks for.
nltk.download('stopwords')  # corpus read by stopwords.words() just below
nltk.download('punkt_tab')  # NOTE(review): presumably the models behind sent_tokenize/word_tokenize - confirm

# English stop words held in a set so membership tests during filtering are cheap.
stop_words = set(stopwords.words('english'))

# Porter stemmer instance; none of the cells visible here reference it.
ps = PorterStemmer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [None]:
# Load the word vectors (binary word2vec format) used by every embedding helper below.
# NOTE(review): the last cell of this notebook writes 'w2v.bin' with
# save_word2vec_format; presumably that is the file read here, so it has to
# exist under /content before this cell is run.
w2v = KeyedVectors.load_word2vec_format(
    '/content/w2v.bin',
    binary=True,
)

## Preprocess

In [None]:
def strip_accents(s):
    """Return ``s`` with its combining accent marks removed.

    The string is decomposed with Unicode NFD normalisation, then every
    character whose Unicode category is 'Mn' (nonspacing mark) is dropped,
    so for example 'café' becomes 'cafe'.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

In [None]:
def remove_special_characters(text):
    """Replace every character that is not an ASCII letter or whitespace with a space.

    Digits, punctuation and non-ASCII letters each become a single space;
    whitespace (including newlines) is left untouched, so the length of the
    string never changes.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # Raw string: '\s' in a normal literal is an invalid escape sequence and
    # triggers a warning on current Python versions.
    return re.sub(r'[^a-zA-Z\s]', ' ', text)

In [None]:
def word_embedding(word):
    """Look up the vector of ``word`` in the loaded ``w2v`` vectors.

    The word is lower-cased before the lookup.

    Args:
        word: Token to embed.

    Returns:
        The vector stored for the word, or an all-zero float vector of length
        ``w2v.vector_size`` when the word is not in the vocabulary.
    """
    word = word.lower()
    try:
        return w2v.get_vector(word)
    except KeyError:
        # Only "word not in vocabulary" is treated as a miss; a bare except
        # would also hide real errors and swallow KeyboardInterrupt.
        return np.zeros(w2v.vector_size)

In [None]:
def sentence_embedding(sentence):
    """Embed a sentence as the mean of the vectors of its non-stop-word tokens.

    The sentence is accent-stripped, cleaned of non-letter characters and
    tokenised with ``word_tokenize``; stop words are dropped and the remaining
    tokens are embedded with ``word_embedding`` and averaged element-wise.

    Args:
        sentence: Text to embed.

    Returns:
        A 1-D array of length ``w2v.vector_size``. It is all zeros when the
        sentence has no tokens left after cleaning and stop-word removal, so
        every result has the same length.
    """
    cleaned = remove_special_characters(strip_accents(sentence))
    # word_embedding lower-cases before its lookup, so the stop-word test is
    # made case-insensitive as well; otherwise capitalised stop words such as
    # a sentence-initial "The" slip through the filter.
    content_words = [
        word for word in word_tokenize(cleaned)
        if word.lower() not in stop_words
    ]
    if not content_words:
        # Covers both "no tokens at all" and "only stop words"; averaging an
        # empty list would otherwise produce a zero-length array.
        return np.zeros(w2v.vector_size)
    return np.mean([word_embedding(word) for word in content_words], axis=0)

In [None]:
# Source table; the cells below read its 'Commentary' column row by row.
data = pd.read_csv(filepath_or_buffer='/content/English.csv')

## Sentence by Sentence

In [None]:
# Three per-verse representations built from the sentences of each commentary:
#   verse_embeddings_sentence - the list of all sentence vectors of the verse
#   verse_embeddings_max      - the sentence vector with the largest L2 norm
#   verse_embeddings_mean     - the element-wise mean of the sentence vectors
verse_embeddings_sentence = []
verse_embeddings_max = []
verse_embeddings_mean = []
# Iterate over the column directly instead of data.loc[i]['Commentary'], which
# only works while the frame still has its default 0..n-1 index.
for commentary in data['Commentary']:
    # A missing cell (NaN) is treated as empty text so that the three output
    # lists stay aligned with the rows of `data`.
    text = '' if pd.isna(commentary) else strip_accents(str(commentary))
    sentence_vectors = [sentence_embedding(sentence) for sentence in sent_tokenize(text)]
    if not sentence_vectors:
        # No sentence found: fall back to one zero vector rather than failing
        # on max() of an empty list below.
        sentence_vectors = [np.zeros(w2v.vector_size)]

    # Sentence level: keep every sentence vector.
    verse_embeddings_sentence.append(sentence_vectors)

    # "Max pooling": pick the sentence vector with the greatest norm
    # (the first one wins on ties).
    norms = [lin.norm(vector) for vector in sentence_vectors]
    verse_embeddings_max.append(sentence_vectors[int(np.argmax(norms))])

    # Mean pooling, component by component. zip() stops at the shortest
    # vector, so this relies on all sentence vectors having the same length.
    mean_vector = np.array([sum(column) / len(column) for column in zip(*sentence_vectors)])
    verse_embeddings_mean.append(mean_vector)

## Whole Sentence

In [None]:
# One vector per verse: the whole commentary is embedded as if it were a
# single sentence (no sentence splitting).
verse_embeddings_whole = [
    sentence_embedding(strip_accents(data.loc[row]['Commentary']))
    for row in range(len(data))
]

In [None]:
# Persist the per-sentence embeddings. The with-block closes the file even
# when pickle.dump raises.
with open('sentence.pkl', 'wb') as file:
    pickle.dump(verse_embeddings_sentence, file)

In [None]:
# Persist the whole-commentary embeddings. The with-block closes the file even
# when pickle.dump raises.
with open('whole.pkl', 'wb') as file:
    pickle.dump(verse_embeddings_whole, file)

In [None]:
# Persist the max-pooled and mean-pooled verse embeddings, one pickle each.
# The with-blocks close each file even when pickle.dump raises.
with open('max.pkl', 'wb') as file:
    pickle.dump(verse_embeddings_max, file)
with open('mean.pkl', 'wb') as file:
    pickle.dump(verse_embeddings_mean, file)

## Model training


In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from gensim.utils import simple_preprocess
from gensim.models import Word2Vec

# Make sure the stop-word corpus read by preprocess_text() below is available.
nltk.download('stopwords')

def load_data(file_path):
    """Read a CSV file into a DataFrame.

    Args:
        file_path: Path (or file-like object) handed to ``pd.read_csv``.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    # Read the path that was passed in; it used to be ignored in favour of a
    # hard-coded location.
    return pd.read_csv(file_path)

def preprocess_text(text):
    """Turn a raw commentary into a list of training tokens.

    Steps: delete every character that is not an ASCII letter or whitespace,
    lower-case, tokenise with gensim's ``simple_preprocess`` (``deacc=True``,
    ``min_len=3``) and drop English stop words.

    Args:
        text: Raw text; any value is accepted and converted with ``str()``.

    Returns:
        List of token strings.
    """
    # flags must be passed by keyword: the 4th positional argument of re.sub
    # is `count`, so passing re.I|re.A there capped the number of replacements
    # at 258 and applied no flags at all.
    text = re.sub(r'[^a-zA-Z\s]', '', str(text), flags=re.I | re.A)  # Remove non-alphabetic characters
    text = text.lower()  # Lowercase
    tokens = simple_preprocess(text, deacc=True, min_len=3)  # Tokenize
    stop_words = set(stopwords.words('english'))  # Remove stop words
    tokens = [token for token in tokens if token not in stop_words]
    return tokens

# Training corpus: one token list per row of the 'Commentary' column.
file_path = '/content/English.csv'
data = load_data(file_path)
data['tokens'] = data['Commentary'].astype(str).map(preprocess_text)
sentences = list(data['tokens'])

# Word2Vec hyper-parameters
vector_size, window = 300, 5   # embedding width, context window
min_count = 1                  # passed as min_count; 1 keeps words seen only once
workers = 4                    # training threads
sg = 0                         # 0 selects CBOW in gensim (1 would be skip-gram)
epochs = 10                    # passes over the corpus

model = Word2Vec(
    sentences,
    vector_size=vector_size,
    window=window,
    min_count=min_count,
    workers=workers,
    sg=sg,
    epochs=epochs,
)

# Full model (can be reloaded to continue training).
model.save("bhagavad_gita_word2vec.model")

# Vectors only, in binary word2vec format.
model.wv.save_word2vec_format('w2v.bin', binary=True)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
