# Machine Learning with PyTorch and Scikit-Learn  
# -- Code Examples

## Package version checks

Add folder to path in order to load from the check_packages.py script:

In [1]:
import sys
sys.path.insert(0, '..')

Check recommended package versions:

Chapter 15: Modeling Sequential Data Using Recurrent Neural Networks (part 3/3)
========



**Outline**

- Implementing RNNs for sequence modeling in PyTorch
  - [Project two -- character-level language modeling in PyTorch](#Project-two----character-level-language-modeling-in-PyTorch)
    - [Preprocessing the dataset](#Preprocessing-the-dataset)
    - [Evaluation phase -- generating new text passages](#Evaluation-phase----generating-new-text-passages)
- [Summary](#Summary)

Note that the optional watermark extension is a small IPython notebook plugin that I developed to make the code reproducible. You can just skip the following line(s).

In [2]:
from IPython.display import Image
%matplotlib inline

## Project two: character-level language modeling in PyTorch


### Preprocessing the dataset

In [3]:
import sys
import torch
import torch.nn as nn
from torch.utils.data.dataset import random_split
import pandas as pd
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torch.utils.data.dataset import random_split
from sklearn.preprocessing import StandardScaler
from collections import Counter
import csv

df = pd.read_csv('./data/Fake.csv')
df2 = pd.read_csv('./data/True.csv')
df.append(df2)
df = df.drop(['title','subject','date'], axis=1)

## Reading and processing text


sa = df[:50]
print(sa)
text = ' '.join([str(elem) for elem in sa.values])
char_set = set(text)
print('Total Length:', len(text))
print('Unique Characters:', len(char_set))

                                                 text
0   Donald Trump just couldn t wish all Americans ...
1   House Intelligence Committee Chairman Devin Nu...
2   On Friday, it was revealed that former Milwauk...
3   On Christmas day, Donald Trump announced that ...
4   Pope Francis used his annual Christmas Day mes...
5   The number of cases of cops brutalizing and ki...
6   Donald Trump spent a good portion of his day a...
7   In the wake of yet another court decision that...
8   Many people have raised the alarm regarding th...
9   Just when you might have thought we d get a br...
10  A centerpiece of Donald Trump s campaign, and ...
11  Republicans are working overtime trying to sel...
12  Republicans have had seven years to come up wi...
13  The media has been talking all day about Trump...
14  Abigail Disney is an heiress with brass ovarie...
15  Donald Trump just signed the GOP tax scam into...
16  A new animatronic figure in the Hall of Presid...
17  Trump supporters and the

  df.append(df2)


In [4]:
chars_sorted = sorted(char_set)
char2int = {ch:i for i,ch in enumerate(chars_sorted)}
char_array = np.array(chars_sorted)

text_encoded = np.array(
    [char2int[ch] for ch in text],
    dtype=np.int32)

print('Text encoded shape: ', text_encoded.shape)

print(text[:15], '     == Encoding ==> ', text_encoded[:15])
print(text_encoded[15:21], ' == Reverse  ==> ', ''.join(char_array[text_encoded[15:21]]))

Text encoded shape:  (115542,)
['Donald Trump       == Encoding ==>  [57  7 34 75 74 61 72 64  0 50 78 81 73 76  0]
[70 81 79 80  0 63]  == Reverse  ==>  just c


In [5]:
for ex in text_encoded[:5]:
    print('{} -> {}'.format(ex, char_array[ex]))

57 -> [
7 -> '
34 -> D
75 -> o
74 -> n


In [6]:
seq_length = 40
chunk_size = seq_length + 1

text_chunks = [text_encoded[i:i+chunk_size] 
               for i in range(len(text_encoded)-chunk_size+1)] 

## inspection:
for seq in text_chunks[:1]:
    input_seq = seq[:seq_length]
    target = seq[seq_length] 
    print(input_seq, ' -> ', target)
    print(repr(''.join(char_array[input_seq])), 
          ' -> ', repr(''.join(char_array[target])))

[57  7 34 75 74 61 72 64  0 50 78 81 73 76  0 70 81 79 80  0 63 75 81 72
 64 74  0 80  0 83 69 79 68  0 61 72 72  0 31 73]  ->  65
"['Donald Trump just couldn t wish all Am"  ->  'e'


In [7]:
import torch
from torch.utils.data import Dataset

class TextDataset(Dataset):
    def __init__(self, text_chunks):
        self.text_chunks = text_chunks

    def __len__(self):
        return len(self.text_chunks)
    
    def __getitem__(self, idx):
        text_chunk = self.text_chunks[idx]
        return text_chunk[:-1].long(), text_chunk[1:].long()
    
seq_dataset = TextDataset(torch.tensor(text_chunks))

  seq_dataset = TextDataset(torch.tensor(text_chunks))


In [8]:
for i, (seq, target) in enumerate(seq_dataset):
    print(' Input (x):', repr(''.join(char_array[seq])))
    print('Target (y):', repr(''.join(char_array[target])))
    print()
    if i == 1:
        break
    

 Input (x): "['Donald Trump just couldn t wish all Am"
Target (y): "'Donald Trump just couldn t wish all Ame"

 Input (x): "'Donald Trump just couldn t wish all Ame"
Target (y): 'Donald Trump just couldn t wish all Amer'



In [9]:
device = torch.device("cuda:0")
# device = 'cpu'

In [10]:
from torch.utils.data import DataLoader
 
batch_size = 64

torch.manual_seed(1)
seq_dl = DataLoader(seq_dataset, batch_size=batch_size, shuffle=True, drop_last=True)


### Building a character-level RNN model

In [11]:
import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim) 
        self.rnn_hidden_size = rnn_hidden_size
        self.rnn = nn.LSTM(embed_dim, rnn_hidden_size, 
                           batch_first=True)
        self.fc = nn.Linear(rnn_hidden_size, vocab_size)

    def forward(self, x, hidden, cell):
        out = self.embedding(x).unsqueeze(1)
        out, (hidden, cell) = self.rnn(out, (hidden, cell))
        out = self.fc(out).reshape(out.size(0), -1)
        return out, hidden, cell

    def init_hidden(self, batch_size):
        hidden = torch.zeros(1, batch_size, self.rnn_hidden_size)
        cell = torch.zeros(1, batch_size, self.rnn_hidden_size)
        return hidden.to(device), cell.to(device)
    
vocab_size = len(char_array)
embed_dim = 256
rnn_hidden_size = 512

torch.manual_seed(1)
model = RNN(vocab_size, embed_dim, rnn_hidden_size) 
model = model.to(device)
model

RNN(
  (embedding): Embedding(87, 256)
  (rnn): LSTM(256, 512, batch_first=True)
  (fc): Linear(in_features=512, out_features=87, bias=True)
)

In [None]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

num_epochs = 100 

torch.manual_seed(1)

for epoch in range(num_epochs):
    hidden, cell = model.init_hidden(batch_size)
    seq_batch, target_batch = next(iter(seq_dl))
    seq_batch = seq_batch.to(device)
    target_batch = target_batch.to(device)
    optimizer.zero_grad()
    loss = 0
    for c in range(seq_length):
        pred, hidden, cell = model(seq_batch[:, c], hidden, cell) 
        loss += loss_fn(pred, target_batch[:, c])
    loss.backward()
    optimizer.step()
    loss = loss.item()/seq_length
    if epoch % 10 == 0:
        print(f'Epoch {epoch} loss: {loss:.4f}')
 

Epoch 0 loss: 4.4679
Epoch 10 loss: 2.6445
Epoch 20 loss: 2.4505
Epoch 30 loss: 2.2646


### Evaluation phase: generating new text passages

In [None]:
from torch.distributions.categorical import Categorical

torch.manual_seed(1)

logits = torch.tensor([[1.0, 1.0, 1.0]])

print('Probabilities:', nn.functional.softmax(logits, dim=1).numpy()[0])

m = Categorical(logits=logits)
samples = m.sample((10,))
 
print(samples.numpy())

In [None]:
torch.manual_seed(1)

logits = torch.tensor([[1.0, 1.0, 3.0]])

print('Probabilities:', nn.functional.softmax(logits, dim=1).numpy()[0])

m = Categorical(logits=logits)
samples = m.sample((10,))
 
print(samples.numpy())

In [None]:
def sample(model, starting_str, 
           len_generated_text=500, 
           scale_factor=1.0):

    encoded_input = torch.tensor([char2int[s] for s in starting_str])
    encoded_input = torch.reshape(encoded_input, (1, -1))

    generated_str = starting_str

    model.eval()
    hidden, cell = model.init_hidden(1)
    hidden = hidden.to('cpu')
    cell = cell.to('cpu')
    for c in range(len(starting_str)-1):
        _, hidden, cell = model(encoded_input[:, c].view(1), hidden, cell) 
    
    last_char = encoded_input[:, -1]
    for i in range(len_generated_text):
        logits, hidden, cell = model(last_char.view(1), hidden, cell) 
        logits = torch.squeeze(logits, 0)
        scaled_logits = logits * scale_factor
        m = Categorical(logits=scaled_logits)
        last_char = m.sample()
        generated_str += str(char_array[last_char])
        
    return generated_str

torch.manual_seed(1)
model.to('cpu')
print(sample(model, starting_str='The island'))

* **Predictability vs. randomness**

In [None]:
logits = torch.tensor([[1.0, 1.0, 3.0]])

print('Probabilities before scaling:        ', nn.functional.softmax(logits, dim=1).numpy()[0])

print('Probabilities after scaling with 0.5:', nn.functional.softmax(0.5*logits, dim=1).numpy()[0])

print('Probabilities after scaling with 0.1:', nn.functional.softmax(0.1*logits, dim=1).numpy()[0])


In [None]:
torch.manual_seed(1)
print(sample(model, starting_str='The body', 
             scale_factor=3))

In [None]:
torch.manual_seed(1)
print(sample(model, starting_str='A man', 
             scale_factor=1))


...


# Summary

...




Readers may ignore the next cell.


In [None]:
! python ../.convert_notebook_to_script.py --input ch15_part3.ipynb --output ch15_part3.py