In [1]:
import sys
# !{sys.executable} -m pip install [modulename]

In [1]:
import os
import subprocess
import collections

### 1. Data preprocessing

**Download the dataset**

Use the provided download corpus.py script to download the Text8 corpus.

In [5]:
WORKING_DIR = os.getcwd()
DATA_DIR = WORKING_DIR + "/data"
DATA_PATH = DATA_DIR + "/text8_20m.txt"
DOWNLOAD_CORPUS_SCRIPT = DATA_DIR +"/download_corpus.py"

if not os.path.exists(DATA_DIR):
    os.makedirs(DATA_DIR)

if os.path.exists(DATA_PATH):
    print("Data file exists")
else:
    print("Data file does not exist. Attempting to download...")
    
    os.chdir(DATA_DIR)
    try:
        result = subprocess.run(
            ["python", DOWNLOAD_CORPUS_SCRIPT], # Check if you need to use python3 instead of python
            capture_output=True, text=True
        )
    finally:
        os.chdir(WORKING_DIR)

    print("STDOUT:", result.stdout)
    print("STDERR:", result.stderr)

    if os.path.exists(DATA_PATH):
        print("Data file downloaded successfully")
    else:
        print("Failed to download the data file")

# Finally, load the dataset
print("Loading the dataset...")
with open(DATA_PATH, "r") as f:
    text = f.read()
print("Dataset loaded successfully")

Data file exists
Loading the dataset...
Dataset loaded successfully


**Text processing**

Split the text into words using whitespace as the delimiter

In [27]:
words = text.split()
print("Number of words in the dataset:", len(words))
print("First 10 words in the dataset:", words[:10])


Number of words in the dataset: 2000000
First 10 words in the dataset: ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against']


**Building the Vocabulary**

Count the frequency of each word in the dataset and keep the top 60000 most frequent, the rest 'UNK' value


In [31]:

def build_vocabulary(words, vocabulary_size):
    vocabulary = collections.Counter(words).most_common(vocabulary_size)
    vocabulary_set = set(word for word, _ in vocabulary)
    words_sanitized = [word if word in vocabulary_set else "UNK" for word in words]

    vocabulary.append(("UNK", words_sanitized.count("UNK")))
    vocabulary = {index: { 'word': word, 'count': count } for index, (word, count) in enumerate(vocabulary)}

    word_to_index = {word['word']: i for i, word in vocabulary.items()}
    words_index = [word_to_index[word] for word in words_sanitized]
    

    return vocabulary, words_index

VOCABULARY_SIZE = 60000
vocabulary, words_index = build_vocabulary(words, VOCABULARY_SIZE)

print("words: ", words[:80])
print("words_c:", words_index[:80])
# print("Vocabulary:", vocabulary)

words:  ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against', 'early', 'working', 'class', 'radicals', 'including', 'the', 'diggers', 'of', 'the', 'english', 'revolution', 'and', 'the', 'sans', 'culottes', 'of', 'the', 'french', 'revolution', 'whilst', 'the', 'term', 'is', 'still', 'used', 'in', 'a', 'pejorative', 'way', 'to', 'describe', 'any', 'act', 'that', 'used', 'violent', 'means', 'to', 'destroy', 'the', 'organization', 'of', 'society', 'it', 'has', 'also', 'been', 'taken', 'up', 'as', 'a', 'positive', 'label', 'by', 'self', 'defined', 'anarchists', 'the', 'word', 'anarchism', 'is', 'derived', 'from', 'the', 'greek', 'without', 'archons', 'ruler', 'chief', 'king']
words_c: [1335, 2861, 12, 6, 193, 1, 4066, 48, 59, 136, 128, 864, 594, 7222, 161, 0, 21681, 1, 0, 88, 851, 3, 0, 14533, 41947, 1, 0, 154, 851, 3549, 0, 193, 10, 181, 59, 4, 6, 11299, 216, 5, 1336, 104, 360, 21, 59, 2795, 372, 5, 2862, 0, 811, 1, 391, 28, 41, 37, 51, 416, 102, 12, 6, 

**Generating Training Data for the Skip-Gram Model**

Create the Skp-Gram Pairs for each element of the words in the indexed dataset

In [32]:
C = 2 # Window size

def generate_training_data(words_index, C):
    training_pair = []
    for i, word_index in enumerate(words_index):
        context_word_indices = [words_index[j] for j in range(i - C, i + C + 1) if j != i and 0 <= j < len(words_index)]
        training_pair.append((word_index, context_word_indices))

    return training_pair

t_data = generate_training_data(words_index, C)
print("Training data:", t_data[:10])

Training data: [(1335, [2861, 12]), (2861, [1335, 12, 6]), (12, [1335, 2861, 6, 193]), (6, [2861, 12, 193, 1]), (193, [12, 6, 1, 4066]), (1, [6, 193, 4066, 48]), (4066, [193, 1, 48, 59]), (48, [1, 4066, 59, 136]), (59, [4066, 48, 136, 128]), (136, [48, 59, 128, 864])]


**Preparing for Negative Sampling**

Compute the Unigram Distribution

In [None]:
def generate_unigram_distribution(words_index, vocabulary):
    freqs_w = [vocabulary[word_index]['count'] for word_index in words_index]
    N = sum(freqs_w)
    unigram_distribution = [freq_w / N for freq_w in freqs_w]
    return unigram_distribution

unigram_distribution = generate_unigram_distribution(words_index, vocabulary)
print("Unigram distribution:", unigram_distribution[:10])

Unigram distribution: [4.171022419034171e-09, 1.9674634052047974e-09, 4.00260755154864e-07, 9.70169321529187e-07, 2.0645249331949006e-08, 1.9081771745946262e-06, 1.285409424733801e-09, 9.157886330093264e-08, 6.76545082936423e-08, 2.8357705880351812e-08]


Smooth the Unigram Distribution

In [43]:
def smooth_unigram_distribution(unigram_distribution, alpha):
  sum_unigram_distribution = sum(Uw ** alpha for Uw in unigram_distribution)
  smoothed_unigram_distribution = [ Uw ** alpha / sum_unigram_distribution for Uw in unigram_distribution]
  return smoothed_unigram_distribution

ALPHA = 0.75
smoothed_unigram_distribution = smooth_unigram_distribution(unigram_distribution, ALPHA)
print("Smoothed unigram distribution:", smoothed_unigram_distribution[:10])

Smoothed unigram distribution: [1.777254096045828e-08, 1.0115739295913317e-08, 5.449101386466149e-07, 1.0585302748307073e-06, 5.8977007123427317e-08, 1.7580517431391346e-06, 7.3510392327936466e-09, 1.8026610173557644e-07, 1.4364489201883038e-07, 7.48291739562492e-08]


### 2. Implement the Skip-Gram Model

### 3. Training the Model

### 4. Evaluation

### 5. Write a report