# Import Necessary Packages

In [81]:
import tensorflow as tf

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Lambda
from tensorflow.keras.preprocessing.text import Tokenizer

from sklearn.model_selection import train_test_split

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [82]:
with open('./CBOW/CBOW.txt', 'r', encoding='utf-8') as file:
    text = file.read()
    
text

'The speed of transmission is an important point of difference between the two viruses. Influenza has a shorter median incubation period (the time from infection to appearance of symptoms) and a shorter serial interval (the time between successive cases) than COVID-19 virus. The serial interval for COVID-19 virus is estimated to be 5-6 days, while for influenza virus, the serial interval is 3 days. This means that influenza can spread faster than COVID-19. \n\nFurther, transmission in the first 3-5 days of illness, or potentially pre-symptomatic transmission –transmission of the virus before the appearance of symptoms – is a major driver of transmission for influenza. In contrast, while we are learning that there are people who can shed COVID-19 virus 24-48 hours prior to symptom onset, at present, this does not appear to be a major driver of transmission. \n\nThe reproductive number – the number of secondary infections generated from one infected individual – is understood to be betwe

# Preprocessing text

In [68]:
# convert text to lowercase

text = text.lower()

text = text.replace('\n','')
text = text.replace('(','')
text = text.replace(')','')
text = text.replace(',','')
text = text.replace('-','')

text

'the speed of transmission is an important point of difference between the two viruses. influenza has a shorter median incubation period the time from infection to appearance of symptoms and a shorter serial interval the time between successive cases than covid19 virus. the serial interval for covid19 virus is estimated to be 56 days while for influenza virus the serial interval is 3 days. this means that influenza can spread faster than covid19. further transmission in the first 35 days of illness or potentially presymptomatic transmission –transmission of the virus before the appearance of symptoms – is a major driver of transmission for influenza. in contrast while we are learning that there are people who can shed covid19 virus 2448 hours prior to symptom onset at present this does not appear to be a major driver of transmission. the reproductive number – the number of secondary infections generated from one infected individual – is understood to be between 2 and 2.5 for covid19 vi

In [69]:
cleaned_text = ''

for word in text:
    if not word.isdigit():
        cleaned_text += word


cleaned_text

'the speed of transmission is an important point of difference between the two viruses. influenza has a shorter median incubation period the time from infection to appearance of symptoms and a shorter serial interval the time between successive cases than covid virus. the serial interval for covid virus is estimated to be  days while for influenza virus the serial interval is  days. this means that influenza can spread faster than covid. further transmission in the first  days of illness or potentially presymptomatic transmission –transmission of the virus before the appearance of symptoms – is a major driver of transmission for influenza. in contrast while we are learning that there are people who can shed covid virus  hours prior to symptom onset at present this does not appear to be a major driver of transmission. the reproductive number – the number of secondary infections generated from one infected individual – is understood to be between  and . for covid virus higher than for in

In [70]:
# Splitting the text into sentences
sentences = cleaned_text.split('.')
sentences

['the speed of transmission is an important point of difference between the two viruses',
 ' influenza has a shorter median incubation period the time from infection to appearance of symptoms and a shorter serial interval the time between successive cases than covid virus',
 ' the serial interval for covid virus is estimated to be  days while for influenza virus the serial interval is  days',
 ' this means that influenza can spread faster than covid',
 ' further transmission in the first  days of illness or potentially presymptomatic transmission –transmission of the virus before the appearance of symptoms – is a major driver of transmission for influenza',
 ' in contrast while we are learning that there are people who can shed covid virus  hours prior to symptom onset at present this does not appear to be a major driver of transmission',
 ' the reproductive number – the number of secondary infections generated from one infected individual – is understood to be between  and ',
 ' for c

In [71]:
# Tokenizing the sentences

tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)
print(sequences)

[[1, 34, 2, 7, 8, 35, 36, 37, 2, 38, 12, 1, 39, 20], [3, 40, 10, 21, 41, 42, 43, 1, 22, 23, 44, 9, 24, 2, 25, 11, 10, 21, 13, 14, 1, 22, 12, 45, 46, 15, 4, 5], [1, 13, 14, 6, 4, 5, 8, 47, 9, 16, 17, 26, 6, 3, 5, 1, 13, 14, 8, 17], [27, 48, 28, 3, 29, 49, 50, 15, 4], [51, 7, 30, 1, 52, 17, 2, 53, 54, 55, 56, 7, 57, 2, 1, 5, 58, 1, 24, 2, 25, 18, 8, 10, 31, 32, 2, 7, 6, 3], [30, 59, 26, 60, 19, 61, 28, 62, 19, 63, 64, 29, 65, 4, 5, 66, 67, 9, 68, 69, 70, 71, 27, 72, 73, 74, 9, 16, 10, 31, 32, 2, 7], [1, 75, 33, 18, 1, 33, 2, 76, 77, 78, 23, 79, 80, 81, 18, 8, 82, 9, 16, 12, 11], [6, 4, 5, 83, 15, 6, 3], [84, 85, 6, 86, 4, 11, 3, 20, 19, 87, 88, 11, 89, 90, 91, 92, 93, 94], []]


In [72]:
idx2word = tokenizer.index_word
word2idx = tokenizer.word_index

print(idx2word, '\n')
print(word2idx)

{1: 'the', 2: 'of', 3: 'influenza', 4: 'covid', 5: 'virus', 6: 'for', 7: 'transmission', 8: 'is', 9: 'to', 10: 'a', 11: 'and', 12: 'between', 13: 'serial', 14: 'interval', 15: 'than', 16: 'be', 17: 'days', 18: '–', 19: 'are', 20: 'viruses', 21: 'shorter', 22: 'time', 23: 'from', 24: 'appearance', 25: 'symptoms', 26: 'while', 27: 'this', 28: 'that', 29: 'can', 30: 'in', 31: 'major', 32: 'driver', 33: 'number', 34: 'speed', 35: 'an', 36: 'important', 37: 'point', 38: 'difference', 39: 'two', 40: 'has', 41: 'median', 42: 'incubation', 43: 'period', 44: 'infection', 45: 'successive', 46: 'cases', 47: 'estimated', 48: 'means', 49: 'spread', 50: 'faster', 51: 'further', 52: 'first', 53: 'illness', 54: 'or', 55: 'potentially', 56: 'presymptomatic', 57: '–transmission', 58: 'before', 59: 'contrast', 60: 'we', 61: 'learning', 62: 'there', 63: 'people', 64: 'who', 65: 'shed', 66: 'hours', 67: 'prior', 68: 'symptom', 69: 'onset', 70: 'at', 71: 'present', 72: 'does', 73: 'not', 74: 'appear', 75: '

# Creating contexts and targets

In [73]:
vocab_size = len(tokenizer.word_index) + 1
emb_size = 100
context_size = 3

contexts = []
targets = []

for sequence in sequences:
    if len(sequence) >= 2* context_size + 1:
        for i in range(context_size, len(sequence) - context_size):
            target = sequence[i]
            context = sequence[i-context_size:i] + sequence[i+1:i+context_size+1]
            contexts.append(context)
            targets.append(target)
            
print(contexts, "\n")
print(targets)

[[1, 34, 2, 8, 35, 36], [34, 2, 7, 35, 36, 37], [2, 7, 8, 36, 37, 2], [7, 8, 35, 37, 2, 38], [8, 35, 36, 2, 38, 12], [35, 36, 37, 38, 12, 1], [36, 37, 2, 12, 1, 39], [37, 2, 38, 1, 39, 20], [3, 40, 10, 41, 42, 43], [40, 10, 21, 42, 43, 1], [10, 21, 41, 43, 1, 22], [21, 41, 42, 1, 22, 23], [41, 42, 43, 22, 23, 44], [42, 43, 1, 23, 44, 9], [43, 1, 22, 44, 9, 24], [1, 22, 23, 9, 24, 2], [22, 23, 44, 24, 2, 25], [23, 44, 9, 2, 25, 11], [44, 9, 24, 25, 11, 10], [9, 24, 2, 11, 10, 21], [24, 2, 25, 10, 21, 13], [2, 25, 11, 21, 13, 14], [25, 11, 10, 13, 14, 1], [11, 10, 21, 14, 1, 22], [10, 21, 13, 1, 22, 12], [21, 13, 14, 22, 12, 45], [13, 14, 1, 12, 45, 46], [14, 1, 22, 45, 46, 15], [1, 22, 12, 46, 15, 4], [22, 12, 45, 15, 4, 5], [1, 13, 14, 4, 5, 8], [13, 14, 6, 5, 8, 47], [14, 6, 4, 8, 47, 9], [6, 4, 5, 47, 9, 16], [4, 5, 8, 9, 16, 17], [5, 8, 47, 16, 17, 26], [8, 47, 9, 17, 26, 6], [47, 9, 16, 26, 6, 3], [9, 16, 17, 6, 3, 5], [16, 17, 26, 3, 5, 1], [17, 26, 6, 5, 1, 13], [26, 6, 3, 1, 13,

# Creating training and testing data

In [74]:
X = np.array(contexts)
y = np.array(targets)

In [75]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [76]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((94, 6), (32, 6), (94,), (32,))

# Defining model

In [77]:
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=emb_size, input_length=2 * context_size),
    Lambda(lambda x: tf.reduce_mean(x, axis=1)),
    Dense(128, activation='relu'),
    Dense(vocab_size, activation='softmax')
])



# Train the model

In [78]:
model.compile(optimizer='Adam', loss='sparse_categorical_crossentropy')

In [79]:
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=30, verbose=0)

# Making predictions

In [80]:
test_words = []

for idx in X_test[5]:
    test_words.append(idx2word.get(idx))

# Reshape X_test[0] to have a batch dimension
input_data = np.expand_dims(X_test[5], axis=0)

pred = model.predict(input_data)
pred = np.argmax(pred[0])

print("pred ", test_words, "\n=", idx2word.get(pred), "\n\n")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 85ms/step
pred  ['that', 'there', 'are', 'who', 'can', 'shed'] 
= are 


