In [None]:
%pip install kaggle
%pip install  --pre torch torchvision  torchaudio --extra-index-url https://download.pytorch.org/whl/cu121
%pip install numpy pandas fastai  transformers datasets sentencepiece

In [None]:
from pathlib import Path  # imported here because this cell runs before the main import cell

# Directory that holds the Titanic CSV files (train.csv is read from it in later cells).
path = Path('/notebooks/Titanic')  # Update this path

In [None]:
import torch
import numpy as np
import pandas as pd
from torch import tensor
import torch.nn.functional as F
from fastai.data.transforms import RandomSplitter
from pathlib import Path

# Read the Titanic training table from the configured data directory.
df = pd.read_csv(path / 'train.csv')

# Fill every missing value with the most frequent value of its column.
modes = df.mode().iloc[0]
df = df.fillna(modes)

# Add a log-scaled fare (the +1 keeps a fare of zero finite) and one-hot
# encode the categorical columns so every predictor is numeric.
df = df.assign(LogFare=np.log(df['Fare'] + 1))
df = pd.get_dummies(df, columns=["Sex", "Pclass", "Embarked"])

# Predictors: continuous features followed by the one-hot indicator columns.
indep_cols = (
    ['Age', 'SibSp', 'Parch', 'LogFare']
    + ['Sex_male', 'Sex_female']
    + ['Pclass_1', 'Pclass_2', 'Pclass_3']
    + ['Embarked_C', 'Embarked_Q', 'Embarked_S']
)
t_dep = tensor(df['Survived'].values)
t_indep = tensor(df[indep_cols].values.astype(np.float32))  # cast so every feature is float32

# Scale each feature column by its maximum value.
vals, _ = torch.max(t_indep, dim=0)
t_indep = t_indep / vals

# Seeded random train/validation split, applied to features and labels alike.
trn_split, val_split = RandomSplitter(seed=42)(df)
trn_indep, trn_dep = t_indep[trn_split], t_dep[trn_split]
val_indep, val_dep = t_indep[val_split], t_dep[val_split]

# Function definitions
def calc_preds(coeffs, indeps):
    """Return one linear prediction per row of ``indeps``.

    ``indeps`` is multiplied by the coefficient matrix ``coeffs`` and the
    resulting columns are summed, giving a 1-D tensor with one value per row.
    """
    weighted = torch.matmul(indeps, coeffs)
    return weighted.sum(dim=1)

def calc_loss(coeffs, indeps, deps):
    """Mean absolute error between the linear predictions and ``deps``."""
    residuals = calc_preds(coeffs, indeps) - deps
    return residuals.abs().mean()

def update_coeffs(coeffs, lr):
    """Apply one in-place gradient-descent step to ``coeffs`` and clear its gradient.

    Both operations mutate ``coeffs`` in place, so the caller wraps this in
    ``torch.no_grad()`` (see ``one_epoch``).
    """
    grad = coeffs.grad
    coeffs.sub_(grad * lr)
    grad.zero_()

def one_epoch(coeffs, lr):
    """Run one full-batch gradient step on the training split.

    The loss on ``trn_indep`` / ``trn_dep`` is back-propagated into
    ``coeffs.grad`` and the coefficients are then updated in place.
    """
    calc_loss(coeffs, trn_indep, trn_dep).backward()
    # The update itself must not be recorded by autograd.
    with torch.no_grad():
        update_coeffs(coeffs, lr)

def init_coeffs():
    """Create a gradient-tracking column of coefficients drawn from [-0.5, 0.5).

    One coefficient is created per feature column of ``t_indep``.
    """
    n_features = t_indep.shape[1]
    start = torch.rand(n_features, 1) - 0.5
    return start.requires_grad_()

# Training the model
def train_model(epochs=30, lr=0.01):
    """Train freshly initialised coefficients for ``epochs`` steps and return them.

    Args:
        epochs: number of full-batch gradient steps to run.
        lr: learning rate passed to every step.
    """
    weights = init_coeffs()
    for _step in range(epochs):
        one_epoch(weights, lr=lr)
    return weights

coeffs = train_model(lr=0.2)

# Calculating accuracy
def acc(coeffs):
    """Fraction of validation rows whose thresholded prediction matches the label.

    A prediction above 0.5 counts as "survived".
    """
    predicted_survived = calc_preds(coeffs, val_indep) > 0.5
    hits = val_dep.bool() == predicted_survived
    return hits.float().mean()

print("Accuracy:", acc(coeffs).item())

In [None]:
import torch
import numpy as np
import pandas as pd
from torch import nn, tensor
import torch.nn.functional as F
from fastai.data.transforms import RandomSplitter
from pathlib import Path

# Load the Titanic training table.
df = pd.read_csv(path / 'train.csv')

# Data preparation: impute with column modes, add a log-scaled fare,
# then one-hot encode the categorical columns.
modes = df.mode().iloc[0]
df = df.fillna(modes)
df['LogFare'] = np.log(df['Fare'] + 1)
df = pd.get_dummies(df, columns=["Sex", "Pclass", "Embarked"])

# Independent (feature) columns and the dependent (label) column.
indep_cols = [
    'Age', 'SibSp', 'Parch', 'LogFare',
    'Sex_male', 'Sex_female',
    'Pclass_1', 'Pclass_2', 'Pclass_3',
    'Embarked_C', 'Embarked_Q', 'Embarked_S',
]
# Labels are stored as float32 because BCELoss expects floating-point targets.
t_dep = tensor(df['Survived'].values, dtype=torch.float32)
t_indep = tensor(df[indep_cols].values.astype(np.float32))

# Normalise every feature column by its maximum.
vals, _ = t_indep.max(dim=0)
t_indep = t_indep.div(vals)

# Seeded random split into training and validation sets.
trn_split, val_split = RandomSplitter(seed=42)(df)
trn_indep, trn_dep = t_indep[trn_split], t_dep[trn_split]
val_indep, val_dep = t_indep[val_split], t_dep[val_split]

# Neural Network
class TitanicNN(nn.Module):
    """Two-layer feed-forward classifier: Linear -> GELU -> Linear -> sigmoid."""

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.layer1 = nn.Linear(input_size, hidden_size)
        self.layer2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Map a batch of feature rows to values in (0, 1)."""
        hidden = F.gelu(self.layer1(x))
        return torch.sigmoid(self.layer2(hidden))

# Build the network: one input per feature column, a small hidden layer
# (adjustable) and a single output unit.
input_size = len(indep_cols)
hidden_size = 10  # This is adjustable
output_size = 1
model = TitanicNN(input_size, hidden_size, output_size)

# Binary cross-entropy on the sigmoid output, optimised with plain SGD.
criterion = nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

# Full-batch training.
epochs = 100
for epoch in range(epochs):
    # Clear gradients left over from the previous step.
    optimizer.zero_grad()

    # Forward pass; labels are reshaped to a column to match the (N, 1) output.
    outputs = model(trn_indep)
    loss = criterion(outputs, trn_dep.reshape(-1, 1))

    # Backward pass and parameter update.
    loss.backward()
    optimizer.step()

    # Report the loss every tenth epoch.
    if (epoch+1) % 10 == 0:
        print ('Epoch [{}/{}], Loss: {:.4f}'.format(epoch+1, epochs, loss.item()))

# Validation accuracy: threshold the outputs at 0.5 and compare with the labels.
with torch.no_grad():
    val_outputs = (model(val_indep) > 0.5).float()
    accuracy = (val_outputs.reshape(-1) == val_dep).float().mean()
    print(f'Accuracy: {accuracy.item()}')

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from datasets import load_dataset
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from transformers import AutoTokenizer

# Training split of WikiText-2 (raw) from the Hugging Face hub.
dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")

# Encode the first 1000 rows with the pretrained BERT tokenizer,
# producing one 1-D tensor of token ids per row.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
first_rows = dataset.select(range(1000))
tokenized_texts = [torch.tensor(tokenizer.encode(row["text"])) for row in first_rows]

# Padding sequences and creating dataloader
def collate_batch(batch):
    """Pad a list of 1-D token tensors to a common length with the pad token.

    ``pad_sequence`` is called with its default layout, so the result is
    time-major: (max_len, batch_size).
    """
    pad_id = tokenizer.pad_token_id
    return pad_sequence(batch, padding_value=pad_id)

dataloader = DataLoader(tokenized_texts, collate_fn=collate_batch, batch_size=32)

# Define a simple RNN model
class RNNModel(nn.Module):
    """Token-level language model: embedding -> multi-layer RNN -> vocabulary logits.

    The RNN is built with PyTorch's default (time-major) layout.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.RNN(embed_size, hidden_size, num_layers)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        """Return vocabulary logits for every position of ``x``."""
        embedded = self.embedding(x)
        rnn_out, _last_hidden = self.rnn(embedded)
        return self.fc(rnn_out)

# Hyperparameters
vocab_size = tokenizer.vocab_size
embed_size = 256
hidden_size = 512
num_layers = 2

# Initialize the model, loss function, and optimizer
model = RNNModel(vocab_size, embed_size, hidden_size, num_layers)
# Padding positions carry no information, so they are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
optimizer = optim.Adam(model.parameters(), lr=0.005)

# Training loop (next-token prediction)
num_epochs = 5
for epoch in range(num_epochs):
    for batch in dataloader:
        # batch is time-major (seq_len, batch_size). The model must predict
        # token t+1 from tokens <= t, so the inputs drop the last position and
        # the targets drop the first. Using the same tensor for both would only
        # teach the model to copy its input.
        inputs, targets = batch[:-1], batch[1:]

        optimizer.zero_grad()
        output = model(inputs)
        # reshape (not view): the shifted slices are not guaranteed contiguous.
        loss = criterion(output.reshape(-1, vocab_size), targets.reshape(-1))
        loss.backward()
        optimizer.step()
    # Loss of the last batch of the epoch.
    print(f"Epoch {epoch + 1}, Loss: {loss.item()}")

In [None]:
import torch
import numpy as np
import pandas as pd
from torch import nn, tensor
import torch.nn.functional as F
from fastai.data.transforms import RandomSplitter
from pathlib import Path

# Function to generate text
def generate_text(model, start_str, length, temperature=1.0):
    """Sample ``length`` tokens from a time-major language model, seeded with ``start_str``.

    Args:
        model: module mapping a (seq_len, batch) tensor of token ids to
            (seq_len, batch, vocab) logits, e.g. ``RNNModel``.
        start_str: prompt text, encoded with the module-level ``tokenizer``.
        length: number of tokens to sample.
        temperature: softmax temperature; must be positive. Values below 1
            make sampling more conservative.

    Returns:
        The decoded prompt followed by the sampled continuation.

    Raises:
        ValueError: if ``temperature`` is not positive.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    model.eval()  # Set the model to evaluation mode

    # encode(..., return_tensors='pt') yields (1, seq_len). The model is
    # time-major, so transpose to (seq_len, 1); otherwise the prompt is read as
    # a batch of one-step sequences and no context is carried between tokens.
    input_ids = tokenizer.encode(start_str, return_tensors='pt').t()

    with torch.no_grad():
        for _ in range(length):
            outputs = model(input_ids)        # (seq_len, 1, vocab)
            predictions = outputs[-1]         # logits at the last time step: (1, vocab)

            # Apply temperature and sample one token id: (1, 1)
            predictions = predictions / temperature
            predicted_id = torch.multinomial(F.softmax(predictions, dim=-1), num_samples=1)

            # Append along the time dimension and continue
            input_ids = torch.cat([input_ids, predicted_id], dim=0)

    # Decode the single sequence in the batch
    return tokenizer.decode(input_ids[:, 0])

# Example usage
start_str = "The weather today"
generated_text = generate_text(model, start_str, length=50, temperature=0.7)
print(generated_text)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from datasets import load_dataset
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
import sentencepiece as spm


# Load a dataset
dataset = load_dataset("wikitext", "wikitext-103-raw-v1", split="train")

# Slice the rows first and then pick the column, so only 1000 rows are
# materialised instead of the whole text column.
sample_texts = dataset[:1000]["text"]
text = " ".join(sample_texts)  # Using a larger subset for better training

# Save the text to a file
with open("text_data.txt", "w", encoding="utf-8") as file:
    file.write(text)

# Train a SentencePiece tokenizer.
# SentencePiece disables the <pad> piece by default (pad_id=-1); an explicit id
# is required so padded batches contain valid embedding indices.
spm.SentencePieceTrainer.train(
    input='text_data.txt',
    model_prefix='sp',
    vocab_size=5000,
    model_type='bpe',
    pad_id=3,
)

# Load the trained model
sp = spm.SentencePieceProcessor(model_file='sp.model')

# Tokenize the dataset. dtype is explicit so empty lines give empty integer
# tensors rather than empty float tensors.
tokenized_texts = [torch.tensor(sp.encode(line), dtype=torch.long) for line in sample_texts]

# Padding sequences and creating dataloader
def collate_batch(batch):
    """Pad a list of 1-D token tensors into one (batch_size, max_len) LongTensor.

    The layout is batch-first to match the LSTM, which is constructed with
    ``batch_first=True``.
    """
    pad_id = sp.pad_id()
    if pad_id < 0:
        # SentencePiece reports -1 when the model has no <pad> piece. -1 is not
        # a valid embedding index, so fall back to id 0 to keep indices in range.
        pad_id = 0
    return pad_sequence(batch, batch_first=True, padding_value=pad_id).long()  # Convert to LongTensor

dataloader = DataLoader(tokenized_texts, batch_size=32, collate_fn=collate_batch)

# Define the LSTM model
class LSTMTitanic(nn.Module):
    """Token-level language model: embedding -> batch-first LSTM -> vocabulary logits."""

    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hidden):
        """Return ``(logits, hidden)`` for a (batch, seq_len) tensor of token ids.

        ``hidden`` is the ``(h, c)`` state pair passed to, and returned by, the LSTM.
        """
        x = self.embedding(x)
        output, hidden = self.lstm(x, hidden)
        output = self.fc(output)
        return output, hidden

    def init_hidden(self, batch_size):
        """Return a zero ``(h0, c0)`` pair sized for ``batch_size`` sequences.

        Shapes come from this model's own LSTM configuration rather than from
        module-level globals, so the state always matches the instance. The
        tensors share the dtype and device of the model's parameters.
        """
        ref = next(self.parameters())
        shape = (self.lstm.num_layers, batch_size, self.lstm.hidden_size)
        return (torch.zeros(shape, dtype=ref.dtype, device=ref.device),
                torch.zeros(shape, dtype=ref.dtype, device=ref.device))
        
        
# Largest token id that actually occurs; empty lines are skipped because
# max() of an empty tensor is undefined.
max_token_id = max(
    (token.max().item() for token in tokenized_texts if len(token) > 0),
    default=-1,
)
print(f"Maximum token ID in the data: {max_token_id}")

# Hyperparameters
# The vocabulary size comes from the SentencePiece model trained in this cell,
# not from a `vocab_size` left behind by an earlier cell.
vocab_size = max(max_token_id + 1, sp.get_piece_size())
print(f"Vocabulary size: {vocab_size}")
embed_size = 256
hidden_size = 512
num_layers = 2
# Initialize the model, loss function, and optimizer
model = LSTMTitanic(vocab_size, embed_size, hidden_size, num_layers)  # Use this vocab size
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.005)

# Training loop (next-token prediction)
num_epochs = 5
for epoch in range(num_epochs):
    for batch in dataloader:
        # At least two positions along dim 1 are needed to form an
        # (input, next-token) pair.
        if batch.size(1) < 2:
            continue

        # The model must predict token t+1 from tokens <= t: inputs drop the
        # last position, targets drop the first.
        inputs, targets = batch[:, :-1], batch[:, 1:]

        # Fresh zero state sized to this batch. Dim 0 is what the batch-first
        # LSTM treats as the batch dimension, and a fixed size of 32 does not
        # fit a shorter final batch.
        hidden = model.init_hidden(inputs.size(0))

        optimizer.zero_grad()
        output, hidden = model(inputs, hidden)
        # reshape (not view): the shifted slices are not contiguous.
        loss = criterion(output.reshape(-1, vocab_size), targets.reshape(-1))
        loss.backward()
        optimizer.step()
    # Loss of the last processed batch of the epoch.
    print(f"Epoch {epoch + 1}, Loss: {loss.item()}")

In [None]:
# Raw WikiText-2 training split, used below to train a tokenizer from scratch.
raw_datasets = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")
# Preview the first few rows only; displaying the whole column floods the output.
raw_datasets[:5]["text"]

In [None]:

def get_training_corpus(batch_size=1000):
    """Return a generator over ``raw_datasets`` text in lists of up to ``batch_size`` lines.

    Rows are sliced before the ``"text"`` column is selected, so each step
    loads only one batch instead of re-reading the whole column.
    The returned generator can be consumed only once.
    """
    return (
        raw_datasets[start : start + batch_size]["text"]
        for start in range(0, len(raw_datasets), batch_size)
    )

In [None]:
# Create the batch generator over the raw text.
# NOTE: get_training_corpus() returns a generator, which can be consumed only
# once; re-run this cell before iterating over the corpus again.
training_corpus = get_training_corpus()

In [None]:
from tokenizers import SentencePieceBPETokenizer
import transformers
tk_tokenizer = SentencePieceBPETokenizer()

def batch_iterator(batch_size=1000):
    """Yield the text column of ``raw_datasets`` in lists of up to ``batch_size`` lines."""
    for i in range(0, len(raw_datasets), batch_size):
        yield raw_datasets[i : i + batch_size]["text"]

special_tokens = ["<s>", "<pad>", "</s>", "<unk>", "<cls>", "<sep>", "<mask>"]

# Train from a freshly created iterator, so the cell can be re-run on its own.
# A generator built in another cell would already be exhausted on a second run
# and the tokenizer would be trained on nothing.
tk_tokenizer.train_from_iterator(
    batch_iterator(),
    vocab_size=30_000,
    min_frequency=5,
    show_progress=True,
    limit_alphabet=500,
    special_tokens=special_tokens
)
tk_tokenizer.save("./tk-tokenizer-test")

# Wrap the trained tokenizer in the transformers fast-tokenizer interface.
tokenizer = transformers.PreTrainedTokenizerFast(tokenizer_object=tk_tokenizer, model_max_length=2048, special_tokens=special_tokens)

# Register each special token and its id (looked up in the trained vocabulary).
special_token_roles = [
    ("bos_token", "<s>"),
    ("pad_token", "<pad>"),
    ("eos_token", "</s>"),
    ("unk_token", "<unk>"),
    ("cls_token", "<cls>"),
    ("sep_token", "<sep>"),
    ("mask_token", "<mask>"),
]
for role, token in special_token_roles:
    setattr(tokenizer, role, token)
    setattr(tokenizer, f"{role}_id", tk_tokenizer.token_to_id(token))

# and save for later!
tokenizer.save_pretrained("./tokenizer-test")

In [None]:
from transformers import PreTrainedTokenizerFast

# Reload the tokenizer from the "./tokenizer-test" directory, the same path the
# tokenizer-training cell writes with save_pretrained().
tokenizer = PreTrainedTokenizerFast.from_pretrained("./tokenizer-test")


In [None]:
from datasets import load_dataset

# WikiText-103 training split; only the first 1000 rows are tokenized.
dataset = load_dataset("wikitext", "wikitext-103-raw-v1", split="train")

# Slice the rows before selecting the column, so the full text column is never
# materialised. dtype is explicit so empty lines give empty integer tensors
# rather than empty float tensors.
tokenized_texts = [
    torch.tensor(tokenizer.encode(line, truncation=True, max_length=2048), dtype=torch.long)
    for line in dataset[:1000]["text"]
]


In [None]:
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

def collate_batch(batch):
    """Pad a list of 1-D token tensors into one (batch_size, max_len) LongTensor.

    The layout is batch-first because the LSTM model is constructed with
    ``batch_first=True``. With the default time-major layout, the model would
    treat sequence positions as batch entries.
    """
    return pad_sequence(batch, batch_first=True, padding_value=tokenizer.pad_token_id).long()

dataloader = DataLoader(tokenized_texts, batch_size=32, collate_fn=collate_batch)


In [None]:
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

# NOTE: this cell repeats the collate/dataloader definition of the previous
# cell; whichever runs last is the one in effect.
def collate_batch(batch):
    """Pad a list of 1-D token tensors into one (batch_size, max_len) LongTensor.

    The layout is batch-first because the LSTM model is constructed with
    ``batch_first=True``. With the default time-major layout, the model would
    treat sequence positions as batch entries.
    """
    return pad_sequence(batch, batch_first=True, padding_value=tokenizer.pad_token_id).long()

dataloader = DataLoader(tokenized_texts, batch_size=32, collate_fn=collate_batch)


In [None]:
# Size the embedding and output layers to the trained tokenizer's vocabulary
# and build a fresh LSTM model with the existing hyperparameters.
vocab_size = tokenizer.vocab_size
model = LSTMTitanic(
    vocab_size=vocab_size,
    embed_size=embed_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
)


In [None]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.005)

# Training loop (next-token prediction)
num_epochs = 5
for epoch in range(num_epochs):
    for batch in dataloader:
        # At least two positions along dim 1 are needed to form an
        # (input, next-token) pair.
        if batch.size(1) < 2:
            continue

        # The model must predict token t+1 from tokens <= t: inputs drop the
        # last position, targets drop the first. Using the same tensor for both
        # would only teach the model to copy its input.
        inputs, targets = batch[:, :-1], batch[:, 1:]

        # Initialize the hidden state for the current batch size (dim 0 is the
        # batch dimension of the batch-first LSTM).
        hidden = model.init_hidden(inputs.size(0))

        optimizer.zero_grad()
        output, hidden = model(inputs, hidden)
        # reshape (not view): the shifted slices are not contiguous.
        loss = criterion(output.reshape(-1, vocab_size), targets.reshape(-1))
        loss.backward()
        optimizer.step()
    # Loss of the last processed batch of the epoch.
    print(f"Epoch {epoch + 1}, Loss: {loss.item()}")


In [None]:
import torch.nn.functional as F
def generate_text(model, start_str, length, temperature=1.0):
    """Sample ``length`` tokens from a batch-first LSTM language model.

    Args:
        model: module exposing ``init_hidden(batch_size)`` and
            ``model(ids, hidden) -> (logits, hidden)`` with batch-first ids,
            e.g. ``LSTMTitanic``.
        start_str: prompt text, encoded with the module-level ``tokenizer``.
        length: number of tokens to sample.
        temperature: softmax temperature; must be positive.

    Returns:
        The decoded sampled continuation (the prompt itself is not included).

    Raises:
        ValueError: if ``temperature`` is not positive or the prompt encodes
            to no tokens.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    model.eval()  # Set the model to evaluation mode

    # Tokenize and encode the start string
    prompt_ids = tokenizer.encode(start_str)
    if len(prompt_ids) == 0:
        raise ValueError("start_str must encode to at least one token")
    step_input = torch.tensor(prompt_ids).unsqueeze(0)  # Add batch dimension

    # Initialize hidden state
    hidden = model.init_hidden(1)  # Batch size is 1 for single sequence generation

    # Generate text
    model_output = []
    with torch.no_grad():
        for _ in range(length):
            output, hidden = model(step_input, hidden)
            # Apply temperature to the last token's logits and sample a token
            token_logits = output[0, -1, :] / temperature
            predicted_token_id = torch.multinomial(F.softmax(token_logits, dim=-1), num_samples=1).item()

            model_output.append(predicted_token_id)
            # `hidden` already summarises everything fed so far, so only the new
            # token is fed next. Re-feeding the whole sequence together with the
            # carried state would process the earlier context again.
            step_input = torch.tensor([[predicted_token_id]])

    # Decode the tokens and return the text
    return tokenizer.decode(model_output)

# Example usage
start_str = "The weather today"
generated_text = generate_text(model, start_str, length=50, temperature=0.7)
print(generated_text)