## Preprocessing..

In [31]:
import glob
import os
import pandas as pd
import re
import torch
# Read the CSV file
folder = 'inaugural/'
# Get a list of all text files in the folder
text_files = glob.glob(os.path.join(folder, '*.txt'))

# Initialize an empty string to hold the contents of all files
all_text = ''

# Loop through the list of files and read each one
for file in text_files:
    with open(file, 'r', encoding='latin-1') as f:
        all_text += f.read()


In [32]:
import re
import nltk

wss_truc = [] # this is the list that will contain the text without the empty lines
for line in all_text.split('\n'):
    if not re.match(r'^\s*$', line):
        # Split the line into sentences and add them to wss_truc
        wss_truc.extend(nltk.tokenize.sent_tokenize(line))

# Now wss_truc is a list of sentences from all_text but without the empty or whitespace-only lines


In [33]:
import numpy as np

# Calculate the length of each sentence
lengths = [len(s) for s in wss_truc]

# Calculate the first and third quartiles
Q1, Q3 = np.percentile(lengths, [25, 75])

# Calculate the interquartile range
IQR = Q3 - Q1

# Define the lower and upper bounds for outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR


print('Q1:', Q1)
print('Q3:', Q3)
print('IQR:', IQR)
print('Lower bound:', lower_bound)
print('Upper bound:', upper_bound)
print("Before: ",len(wss_truc))
# Filter out sentences that are too short or too long
wss_truc = [s for s in wss_truc if lower_bound <= len(s) <= upper_bound]
print("After: ",len(wss_truc))


# Now filtered_sentences contains the sentences from wss_truc without outliers

Q1: 75.0
Q3: 194.0
IQR: 119.0
Lower bound: -103.5
Upper bound: 372.5
Before:  5245
After:  5014


In [34]:
from nltk.tokenize import sent_tokenize, word_tokenize


# Initialize variables
longest_sentence = ""
max_words = 0

# Iterate over each sentence
for sentence in wss_truc:
    sentence = sentence.lower()
    # Split the sentence into words
    words = word_tokenize(sentence.lower())
    
    # Calculate the length of the sentence in terms of words
    num_words = len(words)
    
    # Check if the current sentence is longer than the previous longest sentence
    if num_words > max_words:
        longest_sentence = sentence
        max_words = num_words

# Print the longest sentence
print("Longest Sentence (in terms of words):")
print(longest_sentence)
print("Number of Words:", max_words)


Longest Sentence (in terms of words):
from this day forward, let each of us make a solemn commitment in his own heart: to bear his responsibility, to do his part, to live his ideals -- so that together, we can see the dawn of a new age of progress for america, and together, as we celebrate our 200th anniversary as a nation, we can do so proud in the fulfillment of our promise to ourselves and to the world.
Number of Words: 82


In [35]:
print(wss_truc[123])
len(wss_truc)

It has given new inspiration to the power of self-help in both races by making labor more honorable to the one and more necessary to the other.


5014

In [36]:
from gensim.models import Word2Vec
# Create a list of sentences
sentences = wss_truc
sentences = (([word_tokenize(sentence) for sentence in sentences]))
print(sentences[9])
# Train Word2Vec model
model = Word2Vec(sentences, min_count=1, vector_size=100)


# Get the vector representation of a word
vector = model.wv['freedom']

print(vector)


['And', 'so', 'shall', 'America', '--', 'in', 'the', 'sight', 'of', 'all', 'men', 'of', 'good', 'will', '--', 'prove', 'true', 'to', 'the', 'honorable', 'purposes', 'that', 'bind', 'and', 'rule', 'us', 'as', 'a', 'people', 'in', 'all', 'this', 'time', 'of', 'trial', 'through', 'which', 'we', 'pass', '.']
[-0.28473315  0.44451743  0.10804686  0.16759656 -0.16301374 -0.69708246
  0.64475054  1.37534    -0.38064182 -0.63269645 -0.07735743 -0.91599035
 -0.09009782  0.28997844  0.33454528 -0.55048513  0.54270643 -0.34319982
 -0.16251072 -1.1748464   0.376412    0.1347105   0.6570753  -0.41065648
 -0.00812421  0.16120565 -0.3598451  -0.09833887 -0.6427525   0.14379099
  0.78957313 -0.02207439  0.3437416  -0.5434185  -0.19112729  0.5663944
  0.21047543 -0.3409867  -0.10815226 -0.64986265  0.10797101 -0.49992904
 -0.50319135 -0.19659083  0.8656051  -0.2389644  -0.38818136 -0.13980746
  0.18878916  0.12746745  0.13509993 -0.4933366  -0.25579247  0.00315018
 -0.20153457  0.2993274   0.26391265 -

In [37]:
from torch.nn.utils.rnn import pad_sequence


# Convert words to indices
# simply a sentence made of indices of the words, not the words themselves
# embedded words.
sentences_indices = [[model.wv.key_to_index[word] for word in sentence] for sentence in sentences]
print ("Sentence Indices length: ", len(sentences_indices), "\nFirst sentence: ", sentences_indices[0])
# Convert lists to tensors
sentences_tensors = [torch.tensor(sentence) for sentence in sentences_indices]
print ("Sentence Tensors length: ", len(sentences_tensors), "\nFirst sentence: ", sentences_tensors[0])

# Pad sequences
padded_sentences = pad_sequence(sentences_tensors, batch_first=True, padding_value=0)
print("Padded sentences size: ",padded_sentences.size())
print("Padded 999th sentence: \n",padded_sentences[999])
print ("index to key: \n",model.wv.index_to_key[sentences_tensors[999][0]])
print (model.wv.get_vector(model.wv.index_to_key[sentences_tensors[999][0]]))
print (len(sentences_tensors)) 


Sentence Indices length:  5014 
First sentence:  [26, 6471, 2, 1719]
Sentence Tensors length:  5014 
First sentence:  tensor([  26, 6471,    2, 1719])
Padded sentences size:  torch.Size([5014, 82])
Padded 999th sentence: 
 tensor([ 145,   12,  925,    8,  337,    5, 3080,    4, 2551,    1,    0, 3068,
          17,  961,  101,    3,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0])
index to key: 
 If
[-0.11634699  0.2099309   0.06340787  0.05908985 -0.06434754 -0.42074615
  0.32349724  0.7728614  -0.16076288 -0.360425   -0.080445   -0.5496361
 -0.04884283  0.18747592  0.16605605 -0.3207983   0.33070526 -0.

In [38]:
def itv(index):
    return model.wv.get_vector(model.wv.index_to_key[index])

In [39]:
def vti(vector):
    return model.wv.key_to_index[model.wv.similar_by_vector(vector)[0][0]] 

In [40]:

embdedded_sentences = torch.zeros((len(sentences_tensors), max_words, 100))
total_words = 0
for i in range(len(sentences_tensors)):
    for j in range(len(sentences_tensors[i])):
        vector = torch.tensor(itv(sentences_tensors[i][j]), dtype=torch.float)
        for k in range(100):
            embdedded_sentences[i][j][k] = vector[k]
        total_words += 1
print("Total words processed: ", total_words)

Total words processed:  129754


In [41]:
embdedded_sentences[999][0]

tensor([-0.1163,  0.2099,  0.0634,  0.0591, -0.0643, -0.4207,  0.3235,  0.7729,
        -0.1608, -0.3604, -0.0804, -0.5496, -0.0488,  0.1875,  0.1661, -0.3208,
         0.3307, -0.2135, -0.0811, -0.6934,  0.2463,  0.1060,  0.3848, -0.2795,
         0.0169,  0.1056, -0.2390, -0.0557, -0.3365,  0.1126,  0.4702, -0.0196,
         0.2063, -0.3018, -0.1287,  0.3470,  0.0979, -0.2114, -0.0673, -0.3402,
         0.0690, -0.2659, -0.2284, -0.0716,  0.4203, -0.1636, -0.2095, -0.1249,
         0.0741,  0.0837,  0.1428, -0.2554, -0.0900,  0.0170, -0.1484,  0.1913,
         0.1671, -0.0684, -0.1963,  0.0501,  0.0938, -0.2987,  0.2485,  0.0448,
        -0.3371,  0.4224, -0.0957,  0.3081, -0.4059,  0.3437, -0.2440,  0.1398,
         0.4981,  0.0631,  0.2702,  0.0854,  0.0253, -0.1159, -0.2997, -0.0307,
        -0.1978, -0.1391, -0.1789,  0.4226, -0.1585, -0.1463,  0.1589,  0.0642,
         0.2334,  0.1517,  0.3650,  0.2714,  0.1445, -0.0447,  0.5874,  0.2398,
         0.0810, -0.2527,  0.0234, -0.10

In [42]:
emb_x, emb_y = [], []
for sentence in embdedded_sentences:
    for char_index in range(len(sentence)-1):
        emb_x.append(sentence[char_index]) 
        emb_y.append(sentence[char_index+1])

In [43]:
train_size = int(0.8 * len(emb_x))
train_x, test_x = emb_x[:train_size], emb_x[train_size:]
train_y, test_y = emb_y[:train_size], emb_y[train_size:]
train_x = torch.stack(train_x)
train_y = torch.stack(train_y)
test_x = torch.stack(test_x)
test_y = torch.stack(test_y)



In [44]:
# Define the RNN architecture
import torch.nn as nn
rnn = nn.RNN(input_size=100, hidden_size=50, num_layers=3, batch_first=True)
# Define dense layer
dense_layer = nn.Linear(50, 9490)


In [45]:
optimizer = torch.optim.Adam(list(rnn.parameters()) + list(dense_layer.parameters()), lr=0.1)
hidden = None
criterion = nn.CrossEntropyLoss()


In [46]:
## get vocabulary size from sentences
vocab_size = len(model.wv.key_to_index)
print(vocab_size)

9490


In [47]:
# class RNNModel(nn.Module):
#     def __init__(self, rnn_units, vocab_size):
#         super(RNNModel, self).__init__()
#         self.rnn = nn.RNN(input_size=rnn_units, hidden_size=rnn_units, batch_first=True)
#         self.fc = nn.Linear(rnn_units, vocab_size)

#     def forward(self, x):
#         output, hidden = self.rnn(x)
#         output = self.fc(output[:, -1, :])  # Use the last timestep
#         return output
    
# rnn = RNNModel(vocab_size=9490, rnn_units=256)

In [48]:
import torch
import numpy as np
import torch.nn as nn

# Assume rnn, optimizer, criterion, train_x, train_y have been defined

batch_size = 10
iters = 300
hidden = None  # Initial hidden state

for epoch in range(iters):
    batch_indices = np.random.choice(len(train_x), batch_size, replace=False)
    batch_x = train_x[batch_indices]
    batch_y = train_y[batch_indices]

    optimizer.zero_grad()

    # Forward pass
    output, hidden = rnn(batch_x,hidden)  # output shape: (batch_size, sequence_length, hidden_size)
    output = dense_layer(output)  # output shape now: (batch_size, sequence_length, vocab_size)

    # Reshape for loss calculation
    print(output.shape)

    output = output.reshape(-1, vocab_size)  # Flatten output: (batch_size * sequence_length, vocab_size)
    batch_y = batch_y.view(-1)  # Flatten targets to match output: (batch_size * sequence_length)

    # Compute loss, backpropagate, and update weights
    # Assume sequence_length is defined
    sequence_length = batch_x.shape[1]
    output = output.reshape(batch_size * sequence_length, vocab_size)
    batch_y = batch_y.view(batch_size * sequence_length)
    loss = criterion(output, batch_y)
    loss.backward()
    optimizer.step()

    if epoch % 10 == 0:
        print('Epoch:', epoch, 'Loss:', loss.item())


torch.Size([10, 9490])


RuntimeError: shape '[100, 9490]' is invalid for input of size 94900