In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from datasets import load_dataset

ds = load_dataset("lucadiliello/newsqa")

In [None]:
ds['train'][5]

In [None]:
!pip uninstall -y numpy
!pip install numpy==1.26.4
!pip install datasets gensim nltk

In [None]:
import nltk
from datasets import load_dataset
from gensim.models import Word2Vec, FastText
import string
import re
import csv

nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
from datasets import load_dataset

dataset = load_dataset("lucadiliello/newsqa")

In [None]:

stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    tokens = word_tokenize(text)
    cleaned_tokens = []
    for token in tokens:
        if token not in string.punctuation and token not in stop_words and len(token) > 1:
            cleaned_tokens.append(token)
    return cleaned_tokens

print("Extracting and preprocessing corpus...")
train_contexts = list(dataset['train']['context'])
validation_contexts = list(dataset['validation']['context'])
all_contexts = set(train_contexts + validation_contexts)
sentences = [preprocess_text(context) for context in all_contexts]
sentences = [s for s in sentences if s]

print(f"Total unique articles processed: {len(sentences)}")

if sentences:
    print("Example of a processed article (first 20 tokens):")
    print(sentences[0][:20])
else:
    print("No valid sentences found after preprocessing.")

Extracting and preprocessing corpus...
Total unique articles processed: 11475
Example of a processed article (first 20 tokens):
['aol', 'autos', '--', 'magazines', 'full', 'beauty', 'health', 'secrets', 'tips', 'improve', 'inner', 'health', 'polish', 'paint', 'external', 'persona', 'designed', 'keep', 'feeling', 'young']


In [None]:
print("\n--- Experiment 1: Training Word2Vec (Skip-gram) ---")
model_w2v = Word2Vec(sentences, 
                     vector_size=100, 
                     window=5, 
                     min_count=5, 
                     workers=4, 
                     sg=1)

model_w2v.save("word2vec_sg.model")
print("Word2Vec model trained and saved.")

try:
    print("W2V similar to 'india':", model_w2v.wv.most_similar('india', topn=5))
except KeyError:
    print("'india' not in vocabulary (or filtered by min_count).")


--- Experiment 1: Training Word2Vec (Skip-gram) ---
Word2Vec model trained and saved.
W2V similar to 'india': [('mumbai', 0.7410497665405273), ('delhi', 0.736895740032196), ('indian', 0.6777184009552002), ('pakistan', 0.6498389840126038), ('bangladesh', 0.6376136541366577)]


In [None]:
print("\n--- Experiment 2: Training FastText (Skip-gram) ---")
model_ft = FastText(sentences, 
                    vector_size=100, 
                    window=5, 
                    min_count=5, 
                    workers=4, 
                    sg=1)

model_ft.save("fasttext_sg.model")
print("FastText model trained and saved.")




--- Experiment 2: Training FastText (Skip-gram) ---
FastText model trained and saved.
FastText similar to 'india': [('indian', 0.7934386134147644), ('indira', 0.7849792242050171), ('delhi', 0.750071108341217), ('mumbai', 0.7243711948394775), ('indigo', 0.6882209777832031)]
FastText vector for OOV word 'indiaaaaaa' (shape): (100,)


In [None]:
def save_embeddings_to_csv(model_wv, filename):
    with open(filename, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(["word", "embedding"])
        
        for word in model_wv.index_to_key:
            vector = model_wv[word]
            
            vector_str = '[' + ','.join(map(str, vector)) + ']'
            
            writer.writerow([word, vector_str])


save_embeddings_to_csv(model_w2v.wv, "word2vec_sg_embeddings.csv")

save_embeddings_to_csv(model_ft.wv, "fasttext_sg_embeddings.csv")
