## Preprocessing..

In [8]:
import glob
import os
import pandas as pd
import re
import torch
# Read the CSV file
folder = 'inaugural/'
# Get a list of all text files in the folder
text_files = glob.glob(os.path.join(folder, '*.txt'))

# Initialize an empty string to hold the contents of all files
all_text = ''

# Loop through the list of files and read each one
for file in text_files:
    with open(file, 'r', encoding='latin-1') as f:
        all_text += f.read()


In [9]:
import re
import nltk

wss_truc = [] # this is the list that will contain the text without the empty lines
for line in all_text.split('\n'):
    if not re.match(r'^\s*$', line):
        # Split the line into sentences and add them to wss_truc
        wss_truc.extend(nltk.tokenize.sent_tokenize(line))

# Now wss_truc is a list of sentences from all_text but without the empty or whitespace-only lines


In [10]:
import numpy as np

# Calculate the length of each sentence
lengths = [len(s) for s in wss_truc]

# Calculate the first and third quartiles
Q1, Q3 = np.percentile(lengths, [25, 75])

# Calculate the interquartile range
IQR = Q3 - Q1

# Define the lower and upper bounds for outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR


print('Q1:', Q1)
print('Q3:', Q3)
print('IQR:', IQR)
print('Lower bound:', lower_bound)
print('Upper bound:', upper_bound)
print("Before: ",len(wss_truc))
# Filter out sentences that are too short or too long
wss_truc = [s for s in wss_truc if lower_bound <= len(s) <= upper_bound]
print("After: ",len(wss_truc))


# Now filtered_sentences contains the sentences from wss_truc without outliers

Q1: 75.0
Q3: 194.0
IQR: 119.0
Lower bound: -103.5
Upper bound: 372.5
Before:  5245
After:  5014


In [11]:
from nltk.tokenize import sent_tokenize, word_tokenize


# Initialize variables
longest_sentence = ""
max_words = 0

# Iterate over each sentence
for sentence in wss_truc:
    sentence = sentence.lower()
    # Split the sentence into words
    words = word_tokenize(sentence.lower())
    
    # Calculate the length of the sentence in terms of words
    num_words = len(words)
    
    # Check if the current sentence is longer than the previous longest sentence
    if num_words > max_words:
        longest_sentence = sentence
        max_words = num_words

# Print the longest sentence
print("Longest Sentence (in terms of words):")
print(longest_sentence)
print("Number of Words:", max_words)


Longest Sentence (in terms of words):
from this day forward, let each of us make a solemn commitment in his own heart: to bear his responsibility, to do his part, to live his ideals -- so that together, we can see the dawn of a new age of progress for america, and together, as we celebrate our 200th anniversary as a nation, we can do so proud in the fulfillment of our promise to ourselves and to the world.
Number of Words: 82


In [12]:
print(wss_truc[123])
len(wss_truc)

It has given new inspiration to the power of self-help in both races by making labor more honorable to the one and more necessary to the other.


5014

In [13]:
from gensim.models import Word2Vec
# Create a list of sentences
sentences = wss_truc
sentences = (([word_tokenize(sentence) for sentence in sentences]))
print(sentences[9])
# Train Word2Vec model
model = Word2Vec(sentences, min_count=1, vector_size=100)


# Get the vector representation of a word
vector = model.wv['freedom']

print(vector)


['And', 'so', 'shall', 'America', '--', 'in', 'the', 'sight', 'of', 'all', 'men', 'of', 'good', 'will', '--', 'prove', 'true', 'to', 'the', 'honorable', 'purposes', 'that', 'bind', 'and', 'rule', 'us', 'as', 'a', 'people', 'in', 'all', 'this', 'time', 'of', 'trial', 'through', 'which', 'we', 'pass', '.']
[-0.28405246  0.4119024   0.10824949  0.16703472 -0.13946846 -0.66802764
  0.6123879   1.2793777  -0.3515644  -0.5937289  -0.07727722 -0.8684774
 -0.08548225  0.29978842  0.32530463 -0.5317908   0.54499584 -0.3512206
 -0.16638929 -1.185823    0.38119495  0.14687824  0.6630655  -0.41976824
  0.01379933  0.15328032 -0.34413353 -0.09186853 -0.6370527   0.15738194
  0.7906737  -0.0321282   0.30916205 -0.51986784 -0.18198511  0.55623424
  0.18361825 -0.32970887 -0.08519449 -0.6310024   0.08940866 -0.49410096
 -0.5017738  -0.1898621   0.83101463 -0.23273668 -0.37591234 -0.12444883
  0.19622968  0.12744991  0.15044947 -0.47755298 -0.23138498 -0.00989754
 -0.21064231  0.29296628  0.2589443  -0

In [25]:
from torch.nn.utils.rnn import pad_sequence


# Convert words to indices
# simply a sentence made of indices of the words, not the words themselves
# embedded words.
sentences_indices = [[model.wv.key_to_index[word] for word in sentence] for sentence in sentences]
print ("Sentence Indices length: ", len(sentences_indices), "\nFirst sentence: ", sentences_indices[0])
# Convert lists to tensors
sentences_tensors = [torch.tensor(sentence) for sentence in sentences_indices]
print ("Sentence Tensors length: ", len(sentences_tensors), "\nFirst sentence: ", sentences_tensors[0])

# Pad sequences
padded_sentences = pad_sequence(sentences_tensors, batch_first=True, padding_value=0)
print("Padded sentences size: ",padded_sentences.size())
print("Padded 999th sentence: \n",padded_sentences[999])
print ("index to key: \n",model.wv.index_to_key[sentences_tensors[999][0]])
print (model.wv.get_vector(model.wv.index_to_key[sentences_tensors[999][0]]))
print (len(sentences_tensors)) 


Sentence Indices length:  5014 
First sentence:  [26, 6471, 2, 1719]
Sentence Tensors length:  5014 
First sentence:  tensor([  26, 6471,    2, 1719])
Padded sentences size:  torch.Size([5014, 82])
Padded 999th sentence: 
 tensor([ 145,   12,  925,    8,  337,    5, 3080,    4, 2551,    1,    0, 3068,
          17,  961,  101,    3,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0])
index to key: 
 If
[-0.14499563  0.22462133  0.0657555   0.07042288 -0.0598119  -0.44081974
  0.3427372   0.7950321  -0.17258732 -0.3573246  -0.08286595 -0.5512895
 -0.04575876  0.20186509  0.17450239 -0.32679158  0.31442824 -0.

In [26]:
def itv(index):
    return model.wv.get_vector(model.wv.index_to_key[index])

In [27]:
def vti(vector):
    return model.wv.key_to_index[model.wv.similar_by_vector(vector)[0][0]] 

In [29]:

embdedded_sentences = torch.zeros((len(sentences_tensors), max_words, 100))
total_words = 0
for i in range(len(sentences_tensors)):
    for j in range(len(sentences_tensors[i])):
        vector = torch.tensor(itv(sentences_tensors[i][j]), dtype=torch.float)
        for k in range(100):
            embdedded_sentences[i][j][k] = vector[k]
        total_words += 1
print("Total words processed: ", total_words)

Total words:  129754


In [None]:
embdedded_sentences[999][0]

tensor([-0.1219,  0.1976,  0.0597,  0.0494, -0.0550, -0.4029,  0.3109,  0.7377,
        -0.1582, -0.3444, -0.0861, -0.5294, -0.0469,  0.1828,  0.1634, -0.3110,
         0.3070, -0.1975, -0.0727, -0.6460,  0.2329,  0.0934,  0.3645, -0.2665,
         0.0218,  0.0956, -0.2143, -0.0535, -0.3105,  0.1085,  0.4380, -0.0138,
         0.1978, -0.3154, -0.1243,  0.3496,  0.0967, -0.2195, -0.0612, -0.3483,
         0.0698, -0.2885, -0.2461, -0.0734,  0.4502, -0.1692, -0.2203, -0.1232,
         0.0825,  0.0777,  0.1409, -0.2501, -0.0866,  0.0113, -0.1511,  0.1809,
         0.1712, -0.0750, -0.1969,  0.0489,  0.0992, -0.3028,  0.2615,  0.0447,
        -0.3551,  0.4530, -0.0941,  0.3319, -0.4229,  0.3616, -0.2558,  0.1492,
         0.5296,  0.0683,  0.2779,  0.1013,  0.0280, -0.1226, -0.3087, -0.0450,
        -0.2077, -0.1520, -0.1887,  0.4424, -0.1546, -0.1546,  0.1604,  0.0649,
         0.2432,  0.1539,  0.3663,  0.2740,  0.1383, -0.0531,  0.5977,  0.2444,
         0.0865, -0.2631,  0.0374, -0.10

In [None]:
emb_x, emb_y = [], []
for sentence in embdedded_sentences:
    for char_index in range(len(sentence)-1):
        emb_x.append(sentence[char_index]) 
        emb_y.append(sentence[char_index+1])

In [None]:
train_size = int(0.8 * len(emb_x))
train_x, test_x = emb_x[:train_size], emb_x[train_size:]
train_y, test_y = emb_y[:train_size], emb_y[train_size:]
train_x = torch.stack(train_x)
train_y = torch.stack(train_y)
test_x = torch.stack(test_x)
test_y = torch.stack(test_y)



In [None]:
# Define the RNN architecture
import torch.nn as nn
rnn = nn.RNN(input_size=100, hidden_size=50, num_layers=3, batch_first=True)
# Define dense layer
dense_layer = nn.Linear(50, 9490)


In [None]:
optimizer = torch.optim.Adam(list(rnn.parameters()) + list(dense_layer.parameters()), lr=0.1)
hidden = None
criterion = nn.CrossEntropyLoss()


In [None]:
## get vocabulary size from sentences
vocab_size = len(model.wv.key_to_index)
print(vocab_size)

9490


In [None]:
# class RNNModel(nn.Module):
#     def __init__(self, rnn_units, vocab_size):
#         super(RNNModel, self).__init__()
#         self.rnn = nn.RNN(input_size=rnn_units, hidden_size=rnn_units, batch_first=True)
#         self.fc = nn.Linear(rnn_units, vocab_size)

#     def forward(self, x):
#         output, hidden = self.rnn(x)
#         output = self.fc(output[:, -1, :])  # Use the last timestep
#         return output
    
# rnn = RNNModel(vocab_size=9490, rnn_units=256)

In [None]:
import torch
import numpy as np
import torch.nn as nn

# Assume rnn, optimizer, criterion, train_x, train_y have been defined

batch_size = 10
iters = 300
hidden = None  # Initial hidden state

for epoch in range(iters):
    batch_indices = np.random.choice(len(train_x), batch_size, replace=False)
    batch_x = train_x[batch_indices]
    batch_y = train_y[batch_indices]

    optimizer.zero_grad()

    # Forward pass
    output, hidden = rnn(batch_x,hidden)  # output shape: (batch_size, sequence_length, hidden_size)
    output = dense_layer(output)  # output shape now: (batch_size, sequence_length, vocab_size)

    # Reshape for loss calculation
    print(output.shape)

    output = output.reshape(-1, vocab_size)  # Flatten output: (batch_size * sequence_length, vocab_size)
    batch_y = batch_y.view(-1)  # Flatten targets to match output: (batch_size * sequence_length)

    # Compute loss, backpropagate, and update weights
    # Assume sequence_length is defined
    sequence_length = batch_x.shape[1]
    output = output.reshape(batch_size * sequence_length, vocab_size)
    batch_y = batch_y.view(batch_size * sequence_length)
    loss = criterion(output, batch_y)
    loss.backward()
    optimizer.step()

    if epoch % 10 == 0:
        print('Epoch:', epoch, 'Loss:', loss.item())


torch.Size([10, 9490])


RuntimeError: shape '[94900, 9490]' is invalid for input of size 94900