## Preprocessing..

In [199]:
import glob
import os
import pandas as pd
import re
import torch
# Read the CSV file
folder = 'inaugural/'
# Get a list of all text files in the folder
text_files = glob.glob(os.path.join(folder, '*.txt'))

# Initialize an empty string to hold the contents of all files
all_text = ''

# Loop through the list of files and read each one
for file in text_files:
    with open(file, 'r', encoding='latin-1') as f:
        all_text += f.read()


In [282]:
import re
import nltk

wss_truc = [] # this is the list that will contain the text without the empty lines
for line in all_text.split('\n'):
    if not re.match(r'^\s*$', line):
        # Split the line into sentences and add them to wss_truc
        wss_truc.extend(nltk.tokenize.sent_tokenize(line))

# Now wss_truc is a list of sentences from all_text but without the empty or whitespace-only lines


In [202]:
import numpy as np

# Calculate the length of each sentence
lengths = [len(s) for s in wss_truc]

# Calculate the first and third quartiles
Q1, Q3 = np.percentile(lengths, [25, 75])

# Calculate the interquartile range
IQR = Q3 - Q1

# Define the lower and upper bounds for outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR


print('Q1:', Q1)
print('Q3:', Q3)
print('IQR:', IQR)
print('Lower bound:', lower_bound)
print('Upper bound:', upper_bound)
print("Before: ",len(wss_truc))
# Filter out sentences that are too short or too long
wss_truc = [s for s in wss_truc if lower_bound <= len(s) <= upper_bound]
print("After: ",len(wss_truc))


# Now filtered_sentences contains the sentences from wss_truc without outliers

Q1: 75.0
Q3: 194.0
IQR: 119.0
Lower bound: -103.5
Upper bound: 372.5
Before:  5245
After:  5014


In [203]:
from nltk.tokenize import sent_tokenize, word_tokenize


# Initialize variables
longest_sentence = ""
max_words = 0

# Iterate over each sentence
for sentence in wss_truc:
    # Split the sentence into words
    words = word_tokenize(sentence)
    
    # Calculate the length of the sentence in terms of words
    num_words = len(words)
    
    # Check if the current sentence is longer than the previous longest sentence
    if num_words > max_words:
        longest_sentence = sentence
        max_words = num_words

# Print the longest sentence
print("Longest Sentence (in terms of words):")
print(longest_sentence)
print("Number of Words:", max_words)


Longest Sentence (in terms of words):
From this day forward, let each of us make a solemn commitment in his own heart: to bear his responsibility, to do his part, to live his ideals -- so that together, we can see the dawn of a new age of progress for America, and together, as we celebrate our 200th anniversary as a nation, we can do so proud in the fulfillment of our promise to ourselves and to the world.
Number of Words: 82


In [204]:
print(wss_truc[123])
len(wss_truc)

It has given new inspiration to the power of self-help in both races by making labor more honorable to the one and more necessary to the other.


5014

In [205]:
from gensim.models import Word2Vec
# Create a list of sentences
sentences = wss_truc
sentences = (([word_tokenize(sentence) for sentence in sentences]))
print(sentences[0])
# Train Word2Vec model
model = Word2Vec(sentences, min_count=1, vector_size=100)


# Get the vector representation of a word
vector = model.wv['freedom']

print(vector)


['The', 'Price', 'of', 'Peace']
[-0.2525971   0.42700943  0.12013165  0.12152048 -0.21124654 -0.63928825
  0.58684766  1.2170982  -0.3955623  -0.6425838  -0.03964337 -0.8533821
 -0.05216161  0.24543197  0.29338062 -0.5560536   0.54698694 -0.35781538
 -0.16653399 -1.1446062   0.380421    0.17019045  0.6375555  -0.4100209
 -0.04102421  0.17423284 -0.3612731  -0.09456202 -0.6529709   0.14056787
  0.74061733 -0.03772156  0.3182855  -0.52975136 -0.170171    0.51363254
  0.1628626  -0.28580457 -0.12979947 -0.6265941   0.10117768 -0.5048628
 -0.47761884 -0.1641562   0.75065684 -0.22618842 -0.3606702  -0.13749631
  0.19925246  0.15306814  0.10901764 -0.46054494 -0.23847114  0.00395064
 -0.20914641  0.34535694  0.23675218 -0.16146532 -0.45600274  0.08340442
  0.12827031 -0.5198007   0.40631774  0.04078281 -0.5866992   0.76675445
 -0.1001404   0.62999004 -0.7631723   0.6220139  -0.38637537  0.23489694
  0.9435234   0.03456828  0.36894706  0.20237617 -0.02883388 -0.19564489
 -0.50460345 -0.141569

In [206]:
# Convert words to indices
sentences_indices = [[model.wv.key_to_index[word] for word in sentence] for sentence in sentences]

# Convert lists to tensors
sentences_tensors = [torch.tensor(sentence) for sentence in sentences_indices]

# Pad sequences
padded_sentences = pad_sequence(sentences_tensors, batch_first=True, padding_value=0)

print(padded_sentences[999])
print (model.wv.index_to_key[sentences_tensors[999][0]])
print (model.wv.get_vector(model.wv.index_to_key[sentences_tensors[999][0]]))
print (len(sentences_tensors))


tensor([ 145,   12,  925,    8,  337,    5, 3080,    4, 2551,    1,    0, 3068,
          17,  961,  101,    3,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0])
If
[-0.12183109  0.2202061   0.06920862  0.03178521 -0.10502056 -0.4375917
  0.33363536  0.7743538  -0.18722387 -0.3823133  -0.07506983 -0.5408861
 -0.03004371  0.1675987   0.1504226  -0.3310754   0.3228592  -0.21258025
 -0.08066671 -0.6511852   0.2444657   0.1138318   0.36982757 -0.27240878
 -0.00189271  0.1106787  -0.22765437 -0.0620686  -0.32771394  0.10808066
  0.42848128 -0.02413391  0.20846245 -0.34507936 -0.12417553  0.3508459
  0.08127075 

In [207]:
def itv(index):
    return model.wv.get_vector(model.wv.index_to_key[index])

In [208]:

embdedded_sentences = torch.zeros((len(sentences_tensors), max_words, 100))
for i in range(len(sentences_tensors)):
    for j in range(len(sentences_tensors[i])):
        vector = torch.tensor(itv(sentences_tensors[i][j]), dtype=torch.float)
        for k in range(100):
            embdedded_sentences[i][j][k] = vector[k]


In [209]:
embdedded_sentences[999]

tensor([[-0.1218,  0.2202,  0.0692,  ..., -0.2644,  0.0227, -0.1102],
        [-0.3499,  0.4991,  0.2468,  ..., -0.6421,  0.0628, -0.1999],
        [-0.0396,  0.0790,  0.0319,  ..., -0.0759,  0.0069, -0.0372],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])

In [210]:
emb_x, emb_y = [], []
for sentence in embdedded_sentences:
    for char_index in range(len(sentence)-1):
        emb_x.append(sentence[char_index]) 
        emb_y.append(sentence[char_index+1])

In [240]:
train_size = int(0.8 * len(emb_x))
train_x, test_x = emb_x[:train_size], emb_x[train_size:]
train_y, test_y = emb_y[:train_size], emb_y[train_size:]
train_x = torch.stack(train_x)
train_y = torch.stack(train_y)
test_x = torch.stack(test_x)
test_y = torch.stack(test_y)



In [263]:
# Define the RNN architecture
rnn = nn.RNN(input_size=100, hidden_size=50, num_layers=3, batch_first=True)
# Define dense layer
dense_layer = nn.Linear(50, 9490)


In [264]:
optimizer = torch.optim.Adam(list(rnn.parameters()) + list(dense_layer.parameters()), lr=0.1)
hidden = None
criterion = nn.CrossEntropyLoss()


In [260]:
## get vocabulary size from sentences
vocab_size = len(model.wv.key_to_index)
print(vocab_size)

9490


In [248]:
# class RNNModel(nn.Module):
#     def __init__(self, rnn_units, vocab_size):
#         super(RNNModel, self).__init__()
#         self.rnn = nn.RNN(input_size=rnn_units, hidden_size=rnn_units, batch_first=True)
#         self.fc = nn.Linear(rnn_units, vocab_size)

#     def forward(self, x):
#         output, hidden = self.rnn(x)
#         output = self.fc(output[:, -1, :])  # Use the last timestep
#         return output
    
# rnn = RNNModel(vocab_size=9490, rnn_units=256)

In [281]:
import torch
import numpy as np
import torch.nn as nn

# Assume rnn, optimizer, criterion, train_x, train_y have been defined

batch_size = 32
iters = 300
hidden = None  # Initial hidden state

for epoch in range(iters):
    batch_indices = np.random.choice(len(train_x), batch_size, replace=False)
    batch_x = train_x[batch_indices]
    batch_y = train_y[batch_indices]

    optimizer.zero_grad()

    # Forward pass
    output, hidden = rnn(batch_x,hidden)  # output shape: (batch_size, sequence_length, hidden_size)
    output = dense_layer(output)  # output shape now: (batch_size, sequence_length, vocab_size)

    # Reshape for loss calculation
    print(output.shape)

    output = output.reshape(-1, vocab_size)  # Flatten output: (batch_size * sequence_length, vocab_size)
    batch_y = batch_y.view(-1)  # Flatten targets to match output: (batch_size * sequence_length)

    # Compute loss, backpropagate, and update weights
    loss = criterion(output, batch_y)
    loss.backward()
    optimizer.step()

    if epoch % 10 == 0:
        print('Epoch:', epoch, 'Loss:', loss.item())


ValueError: Expected input batch_size (32) to match target batch_size (3200).