In [31]:
import re

# Read the whole poem corpus into memory as one string.
with open("combine_poems.txt", encoding='utf-8') as corpus_file:
    text = corpus_file.read()

# Strip every run of digits from the corpus before building the vocabulary.
text = re.compile(r'\d+').sub('', text)


In [32]:
# Number of characters in the corpus string `text`.
print("length of text: ", len(text))

length of text:  2151083


In [33]:
# Vocabulary: every distinct character that occurs in the corpus, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)
print("number of unique characters: ", vocab_size)
print("characters: ", "".join(chars))

number of unique characters:  94
characters:  
 !"&'()*,-./:;>?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]^`abcdefghijklmnopqrstuvwxyz{}~£³´ÆÔäæèéëïöŒ–—‘’‹


In [77]:
# Lookup tables between characters and integer ids; ids follow the order of `chars`.
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}


def encode(x):
    """Return the list of integer ids for the characters of string `x`."""
    return [stoi[ch] for ch in x]


def decode(x):
    """Return the string spelled by the iterable of integer ids `x`."""
    return ''.join(itos[i] for i in x)


print("encoded: ", encode("hello"))
print("decoded: ", decode(encode("hello")))

encoded:  [54, 51, 58, 58, 61]
decoded:  hello


In [35]:
import torch
# Encode the whole corpus into a single tensor of int64 ids (one id per character).
data=torch.tensor(encode(text),dtype=torch.long)
print("shape of the data: ", data.shape)
# NOTE: despite the label, this shows the first 10 ids, not the characters themselves.
print("first 10 characters: ", data[:10])

shape of the data:  torch.Size([2151083])
first 10 characters:  tensor([ 0, 32, 61, 51, 59,  1, 13,  1, 17,  1])


In [36]:
# The first 90% of the id stream is used for training, the remainder for validation.
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

In [37]:
# can not train the model on the entire data at once
block_size=8
train_data[:block_size+1] 

tensor([ 0, 32, 61, 51, 59,  1, 13,  1, 17])

In [38]:
# Inputs are the first block_size ids; targets are the same window shifted one step right.
x, y = train_data[:block_size], train_data[1:block_size + 1]
print(x)
print(y)

tensor([ 0, 32, 61, 51, 59,  1, 13,  1])
tensor([32, 61, 51, 59,  1, 13,  1, 17])


In [39]:
# Spell out every (context -> next id) training pair packed into this one window.
for end in range(1, block_size + 1):
    # y is x shifted by one, so y[end - 1] is the id that follows the context x[:end].
    print(x[:end], "->", y[end - 1])

tensor([0]) -> tensor(32)
tensor([ 0, 32]) -> tensor(61)
tensor([ 0, 32, 61]) -> tensor(51)
tensor([ 0, 32, 61, 51]) -> tensor(59)
tensor([ 0, 32, 61, 51, 59]) -> tensor(1)
tensor([ 0, 32, 61, 51, 59,  1]) -> tensor(13)
tensor([ 0, 32, 61, 51, 59,  1, 13]) -> tensor(1)
tensor([ 0, 32, 61, 51, 59,  1, 13,  1]) -> tensor(17)


In [100]:
torch.manual_seed(1337)
batch_size=4 # number of independent sequences processed in parallel
block_size=8 # sequence length of each sample, i.e. the maximum context length used for a prediction

def get_batch(split):
    """Sample a random batch of (input, target) windows from one data split.

    `split` equal to 'train' selects `train_data`; any other value selects
    `val_data`. The module-level `batch_size` and `block_size` decide how many
    windows are drawn and how long each one is.

    Returns a pair `(x, y)` of stacked tensors with one row per window; each
    row of `y` is the matching row of `x` shifted one position to the right
    in the source stream.
    """
    source = val_data if split != 'train' else train_data
    # Random window starts; the exclusive upper bound leaves room for the shifted target.
    starts = torch.randint(0, len(source) - block_size, (batch_size,))

    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + block_size])
        targets.append(source[start + 1:start + block_size + 1])
    return torch.stack(inputs), torch.stack(targets)

# Draw one training batch and show both halves with their shapes.
x, y = get_batch('train')
for label, batch in (("input:", x), ("target:", y)):
    print(label)
    print(batch.shape)
    print(batch)


input:
torch.Size([4, 8])
tensor([[59, 14,  1, 71, 61, 60,  1,  5],
        [55, 60, 66, 61,  1, 52, 58, 47],
        [ 1, 69, 55, 66, 54,  1, 54, 55],
        [54, 51, 71,  1, 65, 51, 51,  1]])
target:
torch.Size([4, 8])
tensor([[14,  1, 71, 61, 60,  1,  5, 65],
        [60, 66, 61,  1, 52, 58, 47, 57],
        [69, 55, 66, 54,  1, 54, 55, 65],
        [51, 71,  1, 65, 51, 51,  1, 71]])


In [56]:
# Look up the id stored at one hard-coded position of the training split
# (looks like a leftover debugging probe — confirm before keeping).
train_data[808595]

tensor(54)

In [101]:
## implement the bigram model
import torch
import torch.nn as nn
from torch.nn import functional as F
# Re-seed the global RNG so random draws made after this point (e.g. parameter
# initialisation and sampling) are repeatable.
torch.manual_seed(1337)

class Bigram(nn.Module):
    """Bigram language model backed by a single embedding table.

    The table has shape ``(vocab_size, vocab_size)``: the row looked up for a
    token id is used directly as the unnormalised scores (logits) for the
    token that follows it.
    """

    def __init__(self,vocab_size):
        """Create the ``vocab_size x vocab_size`` logit lookup table."""
        super().__init__()  # call the parent class constructor
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx,targets=None):
        """Look up next-token logits and, if targets are given, the loss.

        Args:
            idx: Integer tensor of token ids with shape (B, T).
            targets: Optional integer tensor holding the same number of
                elements as ``idx``; the id expected after each position.

        Returns:
            ``(logits, loss)``.
            Without ``targets``: logits has shape (B, T, C) and loss is None.
            With ``targets``: logits is returned flattened to (B*T, C) and
            loss is the scalar cross-entropy over all B*T positions.
        """
        logits = self.token_embedding_table(idx)  # (B, T, C)

        if targets is None:
            return logits, None

        # F.cross_entropy wants the class dimension second, so batch and time
        # are merged and every position is treated as an independent sample.
        B, T, C = logits.shape
        logits = logits.view(B * T, C)
        targets = targets.view(B * T)
        loss = F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()  # sampling never back-propagates, so skip autograd bookkeeping
    def generate(self,idx,max_new_token):
        """Extend each sequence in ``idx`` by ``max_new_token`` sampled ids.

        Args:
            idx: Integer tensor of shape (B, T) holding the starting context.
            max_new_token: Number of ids to sample and append.

        Returns:
            Integer tensor of shape (B, T + max_new_token): the original
            context followed by the sampled ids.
        """
        for _ in range(max_new_token):
            logits, _loss = self(idx)
            last_logits = logits[:, -1, :]  # (B, C): scores at the final position only
            probs = F.softmax(last_logits, dim=-1)  # (B, C)
            idx_next = torch.multinomial(probs, 1)  # (B, 1)
            idx = torch.cat([idx, idx_next], dim=1)

        return idx


In [102]:
# Untrained model: check the forward pass on one batch, then sample from it.
m = Bigram(vocab_size)
print("shape of input", x.shape)
logit, loss = m(x, y)
print(logit.shape)
print(loss)

# Generation starts from a single token with id 0.
idx = torch.zeros((1, 1), dtype=torch.long)
print("idx: ", idx)
sample_ids = m.generate(idx=idx, max_new_token=100)
print(decode(sample_ids[0].tolist()))  # [0] picks the only sequence in the batch

shape of input torch.Size([4, 8])
torch.Size([32, 94])
tensor(4.7964, grad_fn=<NllLossBackward0>)
idx:  tensor([[0]])

³""L?/:W,)`XW’~Œswë[- dæèé{pP—G*A[M*æejQ–£DVèx‹äoŒMRr?dD–T`£kWD/ær) jfU};z–tqdbDzi‘vbDZoiga^éKw&^,{a


In [112]:
# Train the bigram table with Adam on random training batches.
batch_size = 32  # read by get_batch through the module-level name
optimizer = torch.optim.Adam(m.parameters(), lr=1e-3)

for step in range(1000):
    xb, yb = get_batch('train')
    logits, loss = m(xb, yb)

    # Clear stale gradients, back-propagate this batch's loss, update the weights.
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print("loss: ", loss.item())

loss:  2.516305685043335


In [115]:
# Sample 100 new ids from `m`, starting from a single id-0 token, and decode them to text.
print(decode(m.generate(idx=torch.zeros((1,1),dtype=torch.long),max_new_token=100)[0].tolist()))




 "Slacoorowoubure pleve

Thalis de wisand  omatht;PYï‘ht g pls n athe wor



Arrratchj,  ndars

Au


In [12]:
#gpt tokenization 
import tiktoken
# The GPT-2 BPE tokenizer gets its own name so that the character-level
# `encode` function of this notebook stays callable after this cell runs.
gpt2_encoding = tiktoken.get_encoding('gpt2')
gpt2_encoding.n_vocab

50257

In [116]:
## self attention
import torch
# Toy input for the averaging experiments: batch, time and channel sizes.
B, T, C = 4, 8, 2
x = torch.randn(B, T, C)
x.shape

torch.Size([4, 8, 2])

In [117]:
# Running average over time: xbow[b, t] is the mean of x[b, 0..t].
xbow = torch.zeros((B, T, C))
for b in range(B):
    for t in range(T):
        history = x[b, :t + 1]  # (t+1, C): position t and everything before it
        xbow[b, t] = history.mean(dim=0)

In [118]:
# First sequence of the batch, shown for comparison with its running average.
x[0]

tensor([[ 1.3646, -0.3629],
        [ 0.3509,  0.0096],
        [-0.3019,  0.8705],
        [ 0.5402,  0.3682],
        [ 0.0183, -0.0471],
        [-0.1879, -0.2887],
        [ 2.0596, -1.2550],
        [ 0.6449, -0.8334]])

In [119]:
xbow[0] # running average down the time axis: row t is the mean of x[0, :t+1]

tensor([[ 1.3646, -0.3629],
        [ 0.8578, -0.1767],
        [ 0.4712,  0.1724],
        [ 0.4885,  0.2214],
        [ 0.3944,  0.1677],
        [ 0.2974,  0.0916],
        [ 0.5491, -0.1008],
        [ 0.5611, -0.1923]])

In [122]:
torch.manual_seed(1337)
# Row-normalised lower-triangular matrix: row i weights rows 0..i equally,
# so multiplying by it yields running averages.
a = torch.ones(3, 3).tril()
a = a / a.sum(dim=1, keepdim=True)
b = torch.randint(0, 10, (3, 2)).float()
c = torch.matmul(a, b)

for label, matrix in (('a=', a), ('b=', b), ('c=', c)):
    print(label)
    print(matrix)

a=
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
b=
tensor([[5., 7.],
        [2., 0.],
        [5., 3.]])
c=
tensor([[5.0000, 7.0000],
        [3.5000, 3.5000],
        [4.0000, 3.3333]])


In [125]:
# Same running average written as a masked softmax: positions after t are
# set to -inf, so softmax spreads equal weight over positions 0..t.
totril = torch.ones(T, T).tril()
wei = torch.zeros((T, T)).masked_fill(totril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)
xbow = wei @ x
xbow[0]

tensor([[ 1.3646, -0.3629],
        [ 0.8578, -0.1767],
        [ 0.4712,  0.1724],
        [ 0.4885,  0.2214],
        [ 0.3944,  0.1677],
        [ 0.2974,  0.0916],
        [ 0.5491, -0.1008],
        [ 0.5611, -0.1923]])

In [146]:
# self attention improved version: the averaging weights now come from the data
torch.manual_seed(1337)
B, T, C = 4, 8, 32
x = torch.randn(B, T, C)

head_size = 16
# The three projections are created in the order key, query, value so the
# seeded random initialisation of each one stays the same.
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)

k, q, v = key(x), query(x), value(x)  # each (B, T, 16)

# Affinity of every query with every key: (B, T, 16) @ (B, 16, T) -> (B, T, T)
wei = torch.matmul(q, k.transpose(-2, -1))

# Mask out later positions, then normalise each row into weights.
tril = torch.ones(T, T).tril()
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = wei.softmax(dim=-1)

out = torch.matmul(wei, v)

out.shape



torch.Size([4, 8, 16])

In [147]:
# Attention weights of the first sequence; each row is a softmax in which
# the positions after the current one were masked to -inf beforehand.
wei[0]

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1574, 0.8426, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2088, 0.1646, 0.6266, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5792, 0.1187, 0.1889, 0.1131, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0294, 0.1052, 0.0469, 0.0276, 0.7909, 0.0000, 0.0000, 0.0000],
        [0.0176, 0.2689, 0.0215, 0.0089, 0.6812, 0.0019, 0.0000, 0.0000],
        [0.1691, 0.4066, 0.0438, 0.0416, 0.1048, 0.2012, 0.0329, 0.0000],
        [0.0210, 0.0843, 0.0555, 0.2297, 0.0573, 0.0709, 0.2423, 0.2391]],
       grad_fn=<SelectBackward0>)

In [9]:
from bs4 import BeautifulSoup
import requests
import csv
import time

def scrape_poems(url, output_path='combine_poems1.txt', append=True, timeout=30):
    """Download every poem linked from a listing page into a text file.

    The page at ``url`` is searched for the
    ``<ul class="list-poems nuevolist orderprimero">`` container; every
    ``<a rel="nofollow">`` inside it is treated as a link to one poem page.
    For each poem the title (``h1.title-poem``) and body (``div.poem-entry``)
    are written to ``output_path`` between ``=`` separator lines.

    Args:
        url: Address of the listing page.
        output_path: Text file that receives the poems.
        append: When true (the default) poems are added to the end of
            ``output_path``, so calling this function once per poet
            accumulates all of them. Pass ``False`` to truncate the file
            first.
        timeout: Seconds given to each HTTP request before it is abandoned.

    Returns:
        None. Failures are reported with ``print`` instead of being raised:
        a failing poem page is skipped, a failing listing page ends the call.
    """
    # Local import so the definition carries its own dependency.
    from urllib.parse import urljoin

    # Headers to mimic a browser request
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        # Without a timeout a stalled connection would block the call forever.
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()  # Raise an exception for bad status codes

        soup = BeautifulSoup(response.text, 'html.parser')

        # Find the poem list container
        poem_list = soup.find('ul', class_='list-poems nuevolist orderprimero')
        if not poem_list:
            print("Could not find the poem list container")
            return

        poem_links = poem_list.find_all('a', rel='nofollow')

        file_mode = 'a' if append else 'w'
        with open(output_path, file_mode, encoding='utf-8') as file:
            for i, link in enumerate(poem_links, 1):
                href = link.get('href')
                if not href:
                    # An anchor without a target must not abort the whole page.
                    continue

                if href.startswith('//'):
                    # Protocol-relative link: force https.
                    poem_url = 'https:' + href
                else:
                    # Absolute links pass through; site-relative ones are
                    # resolved against the listing page.
                    poem_url = urljoin(url, href)

                try:
                    # Add delay between requests to be polite
                    time.sleep(2)

                    poem_response = requests.get(poem_url, headers=headers, timeout=timeout)
                    poem_response.raise_for_status()

                    poem_soup = BeautifulSoup(poem_response.text, 'html.parser')

                    title = poem_soup.find('h1', class_='title-poem')
                    title_text = title.text.strip() if title else "Untitled"

                    poem_content = poem_soup.find('div', class_='poem-entry')
                    poem_text = poem_content.text.strip() if poem_content else "No content found"

                    # Write to file with formatting
                    file.write(f"{'='*50}\n")
                    file.write(f"Poem {i}: {title_text}\n")
                    file.write(f"{'='*50}\n\n")
                    file.write(poem_text + "\n\n\n")

                    print(f"Successfully scraped: {title_text}")

                except Exception as e:
                    # Best effort: report the failing poem and move on to the next.
                    print(f"Error scraping poem at {poem_url}: {str(e)}")
                    continue

    except Exception as e:
        print(f"An error occurred: {str(e)}")

# if __name__ == "__main__":
#     # urls=["https://mypoeticside.com/poets/charles-bukowski-poems","https://mypoeticside.com/poets/william-shakespeare-poems",
#         #   "https://mypoeticside.com/poets/sylvia-plath-poems","https://mypoeticside.com/poets/john-milton-poems","https://mypoeticside.com/poets/t-s-eliot-poems","https://mypoeticside.com/poets/emily-dickinson-poems"
#         #   ,"https://mypoeticside.com/poets/john-donne-poems","https://mypoeticside.com/poets/alfred-lord-tennyson-poems","https://mypoeticside.com/poets/rabindranath-tagore-poems","https://mypoeticside.com/poets/rudyard-kipling-poems"]
#     urls=["https://mypoeticside.com/poets/rudyard-kipling-poems"]
#     for url in urls:  
#         scrape_poems(url)

In [10]:
# Lowercase ASCII letters 'a'..'z', one list entry per letter.
import string
char = list(string.ascii_lowercase)


In [11]:
import requests
from bs4 import BeautifulSoup
from typing import List

def get_poets(url: str, timeout: float = 30.0) -> List[dict]:
    """
    Scrapes poet links from the given URL.

    The page is searched for the first ``<ul class="list-poems">``; every
    ``<a>`` inside it that carries an ``href`` becomes one result entry.

    Args:
        url (str): The URL to scrape poet links from
        timeout (float): Seconds to wait for the HTTP request before giving up

    Returns:
        List[dict]: List of dictionaries with the keys ``'name'`` (stripped
        link text) and ``'url'`` (absolute link target). An empty list is
        returned when the container is missing or when fetching/parsing fails;
        the failure is reported with ``print``.
    """
    # Local import so the definition carries its own dependency.
    from urllib.parse import urljoin

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        # Without a timeout a stalled connection would block the call forever.
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()

        # Create BeautifulSoup object
        soup = BeautifulSoup(response.text, 'html.parser')

        poem_list = soup.find('ul', class_='list-poems')
        if not poem_list:
            return []

        # Extract poet information
        poets = []
        for link in poem_list.find_all('a'):
            href = link.get('href')
            if not href:
                # One anchor without a target must not discard the whole page.
                continue
            if href.startswith('//'):
                # Protocol-relative link: force https.
                poet_url = 'https:' + href
            else:
                # Absolute links pass through; site-relative ones are
                # resolved against the page they were found on.
                poet_url = urljoin(url, href)
            poets.append({
                'name': link.text.strip(),
                'url': poet_url
            })

        return poets

    except requests.RequestException as e:
        print(f"Error fetching the URL: {e}")
        return []
    except Exception as e:
        print(f"Error parsing the content: {e}")
        return []

# Example usage
# Example usage
if __name__ == "__main__":
    # Collect the poets listed on each per-letter browse page (a-browse ... z-browse).
    poets_list = []
    for letter in char:
        poets_list.extend(get_poets(f"https://mypoeticside.com/{letter}-browse"))

    # Then scrape the poems of each collected poet in turn.
    for poet in poets_list:
        scrape_poems(poet['url'])
        
        
        
        
        
        
    

Successfully scraped: 1887
Successfully scraped: Along the field as we came by
Successfully scraped: As Through the Wild Green Hills of Wyre
Successfully scraped: As Through the Wild Green Hills of Wyre
Successfully scraped: Be Still, My Soul, Be Still
Successfully scraped: Bredon Hill
Successfully scraped: Bring, In This Timeless Grave to Throw
Successfully scraped: Could Man Be Drunk Forever
Successfully scraped: Diffugere Nives
Successfully scraped: Eight O'Clock
Successfully scraped: Epitaph On An Army of Mercenaries
Successfully scraped: Far In a Western Brookland
Successfully scraped: Farewell to Barn and Stack and Tree
Successfully scraped: Fragment of a Greek Tragedy
Successfully scraped: From Far, From Eve and Morning
Successfully scraped: Ho, everyone that thirsteth
Successfully scraped: Hughley Steeple
Successfully scraped: I Hoed and Trenched and Weeded
Successfully scraped: If By Chance Your Eye Offend You
Successfully scraped: If Truth in Hearts That Perish


KeyboardInterrupt: 