# Practical 2

## Section 1: Data Preparation

**Import Pandas library**

In [5]:
import pandas as pd

In [1]:
ROOT = "https://drive.google.com/drive/folders/1zyrm2nH9kFNAsQ3Gh3RvmW9uZydTt3VP" # Your Working Directory
import os
os.chdir(ROOT)

FileNotFoundError: [Errno 2] No such file or directory: 'https://drive.google.com/drive/folders/1zyrm2nH9kFNAsQ3Gh3RvmW9uZydTt3VP'

In [14]:
# Opens the text.txt file in read mode

with open('text.txt', 'r', encoding='utf-8') as f: # Remember your data set path should be specified if not in same working directory
    texts = f.readlines()

In [15]:
texts

['When life gives you lemons, make lemonade! 🙂\n',
 'She bought 2 lemons for $1 at Maven Market.\n',
 'A dozen lemons will make a gallon of lemonade. [AllRecipes]\n',
 'lemon, lemon, lemons, lemon, lemon, lemons\n',
 "He's running to the market to get a lemon — there's a great sale today.\n",
 'Does Maven Market carry Eureka lemons or Meyer lemons?\n',
 'An Arnold Palmer is half lemonade, half iced tea. [Wikipedia]\n',
 'iced tea is my favorite\n']

In [20]:
# Remove the line breaker

with open("text.txt", "r", encoding="utf-8") as f:
    data = [line.strip() for line in f if line.strip()]


In [22]:
# Convert list to DataFrame

data_df = pd.DataFrame(data, columns=['sentence'])
data_df


Unnamed: 0,sentence
0,"When life gives you lemons, make lemonade! 🙂"
1,She bought 2 lemons for $1 at Maven Market.
2,A dozen lemons will make a gallon of lemonade....
3,"lemon, lemon, lemons, lemon, lemon, lemons"
4,He's running to the market to get a lemon — th...
5,Does Maven Market carry Eureka lemons or Meyer...
6,"An Arnold Palmer is half lemonade, half iced t..."
7,iced tea is my favorite


In [23]:
texts[:10]

['When life gives you lemons, make lemonade! 🙂',
 'She bought 2 lemons for $1 at Maven Market.',
 'A dozen lemons will make a gallon of lemonade. [AllRecipes]',
 'lemon, lemon, lemons, lemon, lemon, lemons',
 "He's running to the market to get a lemon — there's a great sale today.",
 'Does Maven Market carry Eureka lemons or Meyer lemons?',
 'An Arnold Palmer is half lemonade, half iced tea. [Wikipedia]',
 'iced tea is my favorite']

`Data Quality Assessment` by 
- Counting total documents and total words.

- Collecting unique words and their frequencies.

- Measuring sentence lengths and average sentence length.

- Calculating vocabulary size (unique words count).

- Finding the top 20 most frequent words.

- Computing quality scores like vocabulary diversity, average word frequency, and rare word ratio.

In [32]:
def assess_data_quality(data):
    """Analyze text data quality for Word2Vec training"""

    stats = {
        'total_documents': len(texts),
        'total_words': 0,
        'unique_words': set(),
        'sentence_lengths': [],
        'word_frequencies': {}
    }

    for text in texts:
        words = text.lower().split()
        stats['total_words'] += len(words)
        stats['sentence_lengths'].append(len(words))
        stats['unique_words'].update(words)

        for word in words:
            stats['word_frequencies'][word] = stats['word_frequencies'].get(word, 0) + 1

    stats['vocabulary_size'] = len(stats['unique_words'])
    stats['avg_sentence_length'] = sum(stats['sentence_lengths']) / len(stats['sentence_lengths'])

    # Find most common words
    sorted_words = sorted(stats['word_frequencies'].items(), key=lambda x: x[1], reverse=True)
    stats['top_words'] = sorted_words[:20]

    # Quality indicators
    stats['quality_score'] = {
        'vocabulary_diversity': stats['vocabulary_size'] / stats['total_words'],
        'avg_word_frequency': stats['total_words'] / stats['vocabulary_size'],
        'rare_words_ratio': sum(1 for count in stats['word_frequencies'].values() if count == 1) / stats['vocabulary_size']
    }

    return stats

# Example usage
quality_report = assess_data_quality(texts)
print(f"Total documents: {quality_report['total_documents']:,}")
print(f"Vocabulary size: {quality_report['vocabulary_size']:,}")
print(f"Unique Words: {quality_report['unique_words']}")
print(f"Average sentence length: {quality_report['avg_sentence_length']:.1f}")
print(f"Vocabulary diversity: {quality_report['quality_score']['vocabulary_diversity']:.4f}")

Total documents: 8
Vocabulary size: 55
Unique Words: {'lemonade!', '[allrecipes]', 'lemonade,', 'get', 'the', 'maven', 'at', 'will', 'market', '$1', 'for', 'tea.', 'life', 'arnold', 'a', '2', 'gallon', 'lemon', 'favorite', "there's", 'make', 'lemonade.', 'you', '[wikipedia]', 'when', 'running', 'she', 'meyer', 'carry', 'or', 'bought', 'of', '🙂', 'iced', 'sale', 'to', 'is', 'great', "he's", 'my', 'market.', 'lemons?', 'an', 'lemons', 'lemons,', 'tea', 'eureka', 'gives', 'does', '—', 'palmer', 'lemon,', 'today.', 'dozen', 'half'}
Average sentence length: 9.0
Vocabulary diversity: 0.7639


## Section 2: Text Preprocessing Pipeline

In [29]:
!pip install nltk


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [33]:
#Import Packages
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
import nltk

In [35]:
# Download required NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/chimigyeltshen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/chimigyeltshen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/chimigyeltshen/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/chimigyeltshen/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/chimigyeltshen/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [36]:
class AdvancedTextPreprocessor:
    """Comprehensive text preprocessing for Word2Vec training"""

    def __init__(self,
                 lowercase=True,
                 remove_punctuation=True,
                 remove_numbers=False,
                 remove_stopwords=False,
                 min_word_length=2,
                 max_word_length=50,
                 lemmatize=False,
                 remove_urls=True,
                 remove_emails=True,
                 keep_sentences=True):

        self.lowercase = lowercase
        self.remove_punctuation = remove_punctuation
        self.remove_numbers = remove_numbers
        self.remove_stopwords = remove_stopwords
        self.min_word_length = min_word_length
        self.max_word_length = max_word_length
        self.lemmatize = lemmatize
        self.remove_urls = remove_urls
        self.remove_emails = remove_emails
        self.keep_sentences = keep_sentences

        if remove_stopwords:
            self.stop_words = set(stopwords.words('english'))

        if lemmatize:
            self.lemmatizer = WordNetLemmatizer()

    def clean_text(self, text):
        """Clean individual text string"""

        # Remove URLs
        if self.remove_urls:
            text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

        # Remove email addresses
        if self.remove_emails:
            text = re.sub(r'\S+@\S+', '', text)

        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text).strip()

        #Combined
         #(r'https?://\S+|www\.\S+|<.*?>|\S+@\S+\.\S+|@\w+|#\w+|[^A-Za-z0-9\s])

        return text

    def tokenize_text(self, text):
        """Tokenize text into sentences or words"""

        if self.keep_sentences:
            # Tokenize into sentences first
            sentences = sent_tokenize(text)
            processed_sentences = []

            for sentence in sentences:
                words = self.process_sentence(sentence)
                if len(words) >= 3:  # Keep sentences with at least 3 words
                    processed_sentences.append(words)

            return processed_sentences
        else:
            # Return single list of words
            return self.process_sentence(text)

    def process_sentence(self, sentence):
        """Process individual sentence"""

        # Lowercase
        if self.lowercase:
            sentence = sentence.lower()

        # Tokenize into words
        words = word_tokenize(sentence)

        processed_words = []
        for word in words:

            # Remove punctuation
            if self.remove_punctuation:
                word = word.translate(str.maketrans('', '', string.punctuation))

            # Skip if empty after punctuation removal
            if not word:
                continue

            # Remove numbers
            if self.remove_numbers and word.isdigit():
                continue

            # Check word length
            if len(word) < self.min_word_length or len(word) > self.max_word_length:
                continue

            # Remove stopwords
            if self.remove_stopwords and word in self.stop_words:
                continue

            # Lemmatize
            if self.lemmatize:
                word = self.lemmatizer.lemmatize(word)

            processed_words.append(word)

        return processed_words

    def preprocess_corpus(self, texts):
        """Preprocess entire corpus"""

        all_sentences = []

        for text in texts:
            if not isinstance(text, str):
                continue

            # Clean text
            cleaned_text = self.clean_text(text)

            # Tokenize and process
            processed = self.tokenize_text(cleaned_text)

            if self.keep_sentences:
                all_sentences.extend(processed)
            else:
                all_sentences.append(processed)

        return all_sentences

In [37]:
# Example usage
preprocessor = AdvancedTextPreprocessor(
    lowercase=True,
    remove_punctuation = True,
    remove_numbers=True,
    remove_stopwords=False,  # Keep stopwords for Word2Vec
    lemmatize=False,  # Usually not needed for Word2Vec
    keep_sentences=True
)

# Processing corpus
processed_sentences = preprocessor.preprocess_corpus(texts)
print(f"Processed {len(processed_sentences)} sentences")
print(f"Sample sentence: {processed_sentences[0]}")

Processed 8 sentences
Sample sentence: ['when', 'life', 'gives', 'you', 'lemons', 'make', 'lemonade']


In [38]:
processed_sentences[:3]

[['when', 'life', 'gives', 'you', 'lemons', 'make', 'lemonade'],
 ['she', 'bought', 'lemons', 'for', 'at', 'maven', 'market'],
 ['dozen', 'lemons', 'will', 'make', 'gallon', 'of', 'lemonade']]

## Training Parameters

Parameters Selection Guidelines

In [39]:
def recommend_parameters(corpus_size, vocab_size, domain_type, computing_resources):
    """
    Recommend Word2Vec parameters based on corpus characteristics

    Args:
        corpus_size: Number of sentences/documents
        vocab_size: Unique words in vocabulary
        domain_type: 'general', 'technical', 'social_media', 'academic'
        computing_resources: 'limited', 'moderate', 'high'
    """

    recommendations = {}

    # Vector size based on corpus and vocab size
    if corpus_size < 10000:
        recommendations['vector_size'] = 50
    elif corpus_size < 100000:
        recommendations['vector_size'] = 100
    elif corpus_size < 1000000:
        recommendations['vector_size'] = 200
    else:
        recommendations['vector_size'] = 300

    # Window size based on domain
    domain_windows = {
        'general': 5,
        'technical': 3,  # More syntactic focus
        'social_media': 4,
        'academic': 6    # More semantic focus
    }
    recommendations['window'] = domain_windows.get(domain_type, 5)

    # Min count based on corpus size
    if corpus_size < 10000:
        recommendations['min_count'] = 1
    elif corpus_size < 100000:
        recommendations['min_count'] = 2
    elif corpus_size < 1000000:
        recommendations['min_count'] = 5
    else:
        recommendations['min_count'] = 10

    # Algorithm selection
    if domain_type in ['technical', 'academic']:
        recommendations['sg'] = 1  # Skip-gram for rare technical terms
    else:
        recommendations['sg'] = 0  # CBOW for general text

    # Epochs based on corpus size and resources
    if computing_resources == 'limited':
        recommendations['epochs'] = 5
    elif corpus_size < 100000:
        recommendations['epochs'] = 15
    else:
        recommendations['epochs'] = 10

    # Hierarchical softmax vs negative sampling
    if vocab_size > 100000:
        recommendations['hs'] = 1
        recommendations['negative'] = 0
    else:
        recommendations['hs'] = 0
        recommendations['negative'] = 10

    return recommendations

In [40]:
corpus_size = len(processed_sentences)
print(f"Corpus Size: {corpus_size}")

# Calculate vocabulary size (unique words in vocabulary)
vocab = set(word for sentence in processed_sentences for word in sentence)
vocab_size = len(vocab)
print(f"Vocabulary Size: {vocab_size}")

Corpus Size: 8
Vocabulary Size: 41


In [41]:
# For this task
params = recommend_parameters(
    corpus_size=corpus_size,
    vocab_size=vocab_size,
    domain_type='general',
    computing_resources='moderate'
)
print("Recommended parameters:", params)

Recommended parameters: {'vector_size': 50, 'window': 5, 'min_count': 1, 'sg': 0, 'epochs': 15, 'hs': 0, 'negative': 10}


## Step-by-Step Implementation

Basic Training Implementation

In [None]:
pip install gensim

SyntaxError: invalid syntax (4286653784.py, line 1)

In [59]:
from gensim.models import Word2Vec
from gensim.models.callbacks import CallbackAny2Vec
import time
import multiprocessing

class EpochLogger(CallbackAny2Vec):
    """Callback to log information about training progress"""

    def __init__(self):
        self.epoch = 0
        self.start_time = time.time()

    def on_epoch_begin(self, model):
        print(f"Epoch #{self.epoch} start")

    def on_epoch_end(self, model):
        elapsed = time.time() - self.start_time
        print(f"Epoch #{self.epoch} end - Time elapsed: {elapsed:.2f}s")
        self.epoch += 1

def train_word2vec_model(sentences, save_path=None, **params):
    """
    Train Word2Vec model with given parameters

    Args:
        sentences: List of tokenized sentences
        save_path: Path to save the model
        **params: Word2Vec parameters
    """

    # Set default parameters
    default_params = {
        'vector_size': 100,
        'window': 5,
        'min_count': 5,
        'workers': multiprocessing.cpu_count() - 1,
        'sg': 0,  # CBOW
        'epochs': 10,
        'alpha': 0.025,
        'min_alpha': 0.0001,
        'hs': 0,
        'negative': 10
    }

    # Update with provided parameters
    default_params.update(params)

    print("Training Word2Vec model with parameters:")
    for key, value in default_params.items():
        print(f"  {key}: {value}")

    # Add callback for progress monitoring
    epoch_logger = EpochLogger()

    # Train the model
    print(f"\nTraining on {len(sentences)} sentences...")
    start_time = time.time()

    model = Word2Vec(
        sentences=sentences,
        callbacks=[epoch_logger],
        **default_params
    )

    training_time = time.time() - start_time
    print(f"\nTraining completed in {training_time:.2f} seconds")
    print(f"Vocabulary size: {len(model.wv)} words")

    # Save model if path provided
    if save_path:
        model.save(save_path)
        print(f"Model saved to {save_path}")

    return model

ModuleNotFoundError: No module named 'gensim'