# Library importation

In [95]:
import pdb
#import tensorflow as tf
import re
import string
import numpy as np
import matplotlib.pyplot as plt
import os
import pandas as pd
import PyPDF2
from pathlib import Path
import multiprocessing as ml
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

# Function definition 

In [4]:
def read_pdf(file_path):
    # Read and extract text from a PDF file
    with open(file_path, "rb") as file:
        pdf_reader = PyPDF2.PdfReader(file)
        text = ""
        for page in pdf_reader.pages:
            text += page.extract_text() if page.extract_text() else ""
    return text

In [5]:
def read_csv(file_path):
    # Read and extract text from a CSV file
    df = pd.read_csv(file_path)
    text = ' '. join(df['content'].astype(str))
    return text

In [6]:
def read_txt(file_path):
    with open(file_path, 'r') as file:
        text= file.read()
        return text

In [7]:
def get_path(folder_path):
    mpath = Path(folder_path)
    files_to_process = []
    for subfolder in mpath.iterdir():
        if subfolder.is_dir():
            files_to_process.extend(subfolder.glob('*.csv'))
            files_to_process.extend(subfolder.glob('*.pdf'))
            files_to_process.extend(subfolder.glob('*.txt'))
    return files_to_process

In [8]:
def get_text(texts):
    total_text = ''
    for path in texts:
        if path.suffix == '.csv':
            total_text += read_csv(path)
        elif path.suffix == '.pdf':
            total_text += read_pdf(path)
        elif path.suffix == '.txt':
            total_text += read_txt(path)    
    return total_text

In [9]:
def clean_sentences(sentences):
    clean_sentences = []
    for st in sentences:
        tokens = st.translate(str.maketrans('', '', string.punctuation)).split()
        tokens = [word.lower() for word in tokens if word.isalpha()]
        if tokens:
            clean_sentences.append(tokens)
    return clean_sentences

In [22]:
def model_word2vector(sentences):
    model500 = Word2Vec(sentences, vector_size=500, window=5, min_count= 4, workers=ml.cpu_count())
    model3 = Word2Vec(sentences, vector_size=2, window=5, min_count= 4, workers=ml.cpu_count())
    return model500, model3

In [23]:
def clean_sentences(sentences):
    clean_sentences = []
    for st in sentences:
        tokens = st.translate(str.maketrans('', '', string.punctuation)).split()
        tokens = [word.lower() for word in tokens if word.isalpha()]
        if tokens:
            clean_sentences.append(tokens)
    return clean_sentences

# Data preparation and model Training

In [24]:
  main_folder_path = 'data' 

In [25]:
    all_texts = get_path(main_folder_path)
    text = get_text(all_texts)
    sentences = text.split('.')

    sentences = clean_sentences(sentences)
    model500, model3 = model_word2vector(sentences)


# Models and Embeding save

In [26]:
model500.save('model500.model')
model3.save('model3.model')

In [28]:
model500 = Word2Vec.load('model500.model')
model3 = Word2Vec.load('model3.model')

In [30]:
model500.wv.save_word2vec_format('model500.txt', binary=False)
model500.wv.save_word2vec_format('model500.bin', binary=True)
model3.wv.save_word2vec_format('model3.txt', binary=False)
model3.wv.save_word2vec_format('model3.bin', binary=True)

# Results test

In [31]:
sentences[:100]

[['april', 'april', 'apr'],
 ['is',
  'the',
  'fourth',
  'month',
  'of',
  'the',
  'year',
  'in',
  'the',
  'julian',
  'and',
  'gregorian',
  'calendars',
  'and',
  'comes',
  'between',
  'march',
  'and',
  'may'],
 ['it', 'is', 'one', 'of', 'four', 'months', 'to', 'have', 'days'],
 ['april',
  'always',
  'begins',
  'on',
  'the',
  'same',
  'day',
  'of',
  'the',
  'week',
  'as',
  'july',
  'and',
  'additionally',
  'january',
  'in',
  'leap',
  'years'],
 ['april',
  'always',
  'ends',
  'on',
  'the',
  'same',
  'day',
  'of',
  'the',
  'week',
  'as',
  'december'],
 ['april',
  'comes',
  'between',
  'march',
  'and',
  'may',
  'making',
  'it',
  'the',
  'fourth',
  'month',
  'of',
  'the',
  'year'],
 ['it',
  'also',
  'comes',
  'first',
  'in',
  'the',
  'year',
  'out',
  'of',
  'the',
  'four',
  'months',
  'that',
  'have',
  'days',
  'as',
  'june',
  'september',
  'and',
  'november',
  'are',
  'later',
  'in',
  'the',
  'year'],
 ['april

In [32]:
vect = model500.wv['game']

In [33]:
vect

array([ 2.10725784e+00,  1.89680839e+00, -2.65838957e+00,  1.95327914e+00,
       -1.66984773e+00, -1.44440961e+00, -4.31468159e-01,  8.39326143e-01,
       -9.25114453e-01,  8.50797594e-01, -9.43068922e-01,  9.31364000e-01,
       -7.87166119e-01, -6.72083795e-01, -1.44624019e+00, -3.33258599e-01,
       -5.25735497e-01, -1.44950235e+00,  3.22843581e-01, -1.10283709e+00,
        2.03726745e+00, -9.27105770e-02,  4.51772392e-01, -3.11496615e-01,
        1.14290297e+00, -4.88826632e-01,  9.07555521e-01,  1.51509392e+00,
        7.73398578e-01,  1.51512980e+00,  5.86390615e-01,  4.60843533e-01,
       -1.83446482e-02, -7.05435514e-01, -4.74932909e-01, -1.07805125e-01,
        4.80949432e-01, -2.46366858e-01, -2.80223433e-02,  1.01796508e+00,
        4.35613394e-01,  1.24707770e+00,  1.81853676e+00, -2.32930839e-01,
       -2.23781653e-02,  5.64266086e-01, -2.60105824e+00, -4.95550781e-01,
        1.25903738e+00, -6.81383014e-02, -4.52734649e-01,  2.31259808e-01,
        1.31629169e+00, -

In [34]:
game_nw = model500.wv.most_similar('game', topn=20)
game_nw

[('games', 0.7180423140525818),
 ('gameplay', 0.5921841263771057),
 ('multiplayer', 0.5827704668045044),
 ('mode', 0.5642474889755249),
 ('nintendo', 0.5506405234336853),
 ('console', 0.541068434715271),
 ('ds', 0.5315804481506348),
 ('firstperson', 0.5241313576698303),
 ('pokémon', 0.518710196018219),
 ('rpg', 0.5171893239021301),
 ('puzzle', 0.5162062644958496),
 ('playstation', 0.5125581622123718),
 ('sega', 0.5011301636695862),
 ('dlc', 0.5001646280288696),
 ('minecraft', 0.4977705478668213),
 ('shooter', 0.49577319622039795),
 ('singleplayer', 0.4943183958530426),
 ('roleplaying', 0.4856371283531189),
 ('videogame', 0.48436230421066284),
 ('sonic', 0.4838115870952606)]

In [36]:
king_nw = model500.wv.most_similar('king', topn=20)
king_nw

[('prince', 0.6080267429351807),
 ('ruler', 0.6053209900856018),
 ('queen', 0.6035491824150085),
 ('emperor', 0.5890554189682007),
 ('pharaoh', 0.57773756980896),
 ('kings', 0.5705621242523193),
 ('vi', 0.5657829642295837),
 ('throne', 0.5585757493972778),
 ('iii', 0.5448864102363586),
 ('lord', 0.5194017291069031),
 ('vii', 0.5147203207015991),
 ('fulk', 0.5093024373054504),
 ('monarch', 0.508232593536377),
 ('sultan', 0.5065828561782837),
 ('regent', 0.5059827566146851),
 ('wessex', 0.5055989027023315),
 ('northumbria', 0.5041124820709229),
 ('mercia', 0.5002672076225281),
 ('duke', 0.49940136075019836),
 ('kingship', 0.49407893419265747)]

In [37]:
queen_nw = model500.wv.most_similar('queen', topn=20)
queen_nw

[('princess', 0.6842767596244812),
 ('consort', 0.6407124400138855),
 ('king', 0.6035492420196533),
 ('elizabeth', 0.5711863040924072),
 ('empress', 0.5549060106277466),
 ('margrethe', 0.5422132611274719),
 ('countess', 0.5337541699409485),
 ('crown', 0.5255757570266724),
 ('monarch', 0.5162689685821533),
 ('duchess', 0.5094940066337585),
 ('prince', 0.5072727203369141),
 ('spades', 0.5020029544830322),
 ('lady', 0.48446476459503174),
 ('jubilee', 0.48352178931236267),
 ('regnant', 0.47923967242240906),
 ('mary', 0.47914919257164),
 ('regent', 0.4766712188720703),
 ('orleansbraganza', 0.4759351313114166),
 ('dowager', 0.46881017088890076),
 ('bride', 0.4663589596748352)]

In [40]:
man_nw = model500.wv.most_similar('man', topn=20)
man_nw

[('woman', 0.6998342871665955),
 ('girl', 0.5921055674552917),
 ('boy', 0.572533905506134),
 ('person', 0.5443896055221558),
 ('men', 0.5394736528396606),
 ('policeman', 0.497673898935318),
 ('creature', 0.4876663386821747),
 ('dog', 0.46195554733276367),
 ('soldier', 0.4587596654891968),
 ('someone', 0.4513213336467743),
 ('thief', 0.45065006613731384),
 ('mask', 0.4500892460346222),
 ('horse', 0.446686327457428),
 ('cat', 0.44589564204216003),
 ('fisherman', 0.4397120773792267),
 ('teenager', 0.4351902902126312),
 ('wizard', 0.43498751521110535),
 ('himself', 0.4339897930622101),
 ('sorcerer', 0.43274062871932983),
 ('somebody', 0.4200808107852936)]

# Word Generation

In [64]:
queen_vec = model500.wv['king'] - model500.wv['man'] + model500.wv['women']

In [65]:
queen_vec

array([ 2.97140479e-01,  1.09050286e+00, -3.17724586e+00, -1.23579693e+00,
        6.16001725e-01,  8.35163414e-01,  1.13798499e+00, -6.73759282e-01,
       -1.58801079e-01, -1.92316926e+00, -1.21838975e+00, -1.60541606e+00,
        8.81904244e-01, -1.92605495e+00,  1.87655365e+00,  1.26584768e-01,
        3.06267977e+00,  8.58463466e-01,  4.87583339e-01,  1.26731873e+00,
       -2.18453932e+00,  1.76515925e+00,  8.50111067e-01,  5.92115402e-01,
        5.67491412e-01,  1.66743886e+00, -3.62432146e+00,  4.55023110e-01,
       -1.93100631e-01,  1.64911747e+00,  1.28614354e+00,  2.04928970e+00,
        1.13108754e+00, -2.41194904e-01,  1.62983716e-01, -1.01127684e-01,
       -3.63498235e+00,  1.19219947e+00, -6.64474666e-01, -5.79971910e-01,
       -9.55317438e-01, -4.45659220e-01,  1.05945992e+00,  5.78917503e-01,
       -1.55063361e-01, -4.93622422e-01, -1.76791525e+00,  1.55182004e+00,
       -1.79064453e+00,  8.42220485e-01, -4.65382218e-01,  1.81804872e+00,
        1.25967398e-01,  

In [57]:
queen_real_vec = model500.wv['queen']

In [58]:
queen_real_vec

array([ 6.56349599e-01, -9.45434034e-01, -1.57498717e+00, -6.99356794e-01,
       -1.12084627e+00, -5.90598226e-01,  1.52649772e+00, -1.19427168e+00,
        4.21832293e-01, -1.08342671e+00,  1.16573222e-01, -6.92027390e-01,
        8.30630660e-01, -5.92471838e-01, -9.40282226e-01, -8.00029516e-01,
        1.48993039e+00,  9.38598931e-01, -2.23967150e-01,  1.31676340e+00,
       -5.85213304e-01,  2.27925867e-01,  4.44263697e-01, -9.32893515e-01,
        6.77061856e-01,  1.80085063e+00, -1.23680568e+00,  9.02125061e-01,
       -1.74181312e-02,  4.28734660e-01,  9.87404406e-01,  7.63870776e-01,
        1.15787971e+00,  4.92443264e-01, -5.35351574e-01, -2.19941884e-01,
       -1.71931815e+00,  1.13790430e-01, -7.12246895e-01,  4.31459010e-01,
        2.04940110e-01, -1.89498997e+00,  5.66967607e-01, -7.65978158e-01,
        4.24899131e-01,  1.00798691e-02, -5.09240210e-01,  4.37689386e-02,
       -1.01622570e+00, -9.51126933e-01, -1.58162582e+00,  1.29499507e+00,
       -1.10882080e+00,  

In [72]:
#similarity = model500.wv.similarity(quee_vec, quee_real_vec)
queens_dot = np.dot(queen_real_vec, queen_vec)
queen_mag = np.linalg.norm(queen_vec)
queen_real_mag = np.linalg.norm(queen_real_vec)
queens_cosine = queens_dot / (queen_mag*queen_real_mag)

In [73]:
queens_cosine

0.5063322

# Basic interaction with the model (Analogy)

In [87]:

def analogy(vec1, vec2, vec3):
    similarity = model500.wv.most_similar(positive=[vec1, vec3], negative=[vec2])
    print(f"{vec1} is to {vec2} as {similarity[0][0]} is a {vec3}")

In [88]:
nlgy = analogy('king', 'man', 'woman') 

king is to man as queen is a woman


# 2D model view

In [97]:
load_embedig = KeyedVectors.load_word2vec_format('model3.txt', binary=False)

In [99]:
words = list(load_embedig.index_to_key)

In [101]:
vectors = [load_embedig[word] for word in words]

In [None]:
plt.figure(figsize=(20, 12))
for i, label in enumerate(words):
    x, y = vectors[i]
    plt.scatter(x, y, marker='x')
    plt.annotate(label, (x, y), textcoords = 'offset points', xytext=(0,5), ha='center', fontsize=8)

plt.title("word vectos Visualization")
plt.xlabel('x')
plt.ylabel('y')
plt.grid(True)
plt.show()