# Импорты

In [None]:
import sys

# Detect Google Colab by checking whether its package has already been loaded.
IN_COLAB = "google.colab" in sys.modules
if IN_COLAB:
    # Install the third-party packages that later cells import.
    !pip install pytorch_lightning
    !pip install gensim
    !pip install catboost

In [None]:
import pandas as pd
from matplotlib import pyplot as plt
import torch
from torch import nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl

# Render inline figures at a higher resolution for sharper plots.
%config InlineBackend.figure_format='retina'
plt.rcParams["figure.dpi"] = 150

# Нейронные сети - база

In [None]:
# A single input vector with three values, converted to a float tensor.
X = torch.tensor([10, 5, 20]).float()

num_inputs = 3
num_outputs = 1
# One linear layer mapping 3 inputs to 1 output, i.e. a single neuron.
neuron = nn.Linear(num_inputs, num_outputs)
# Trailing expression: the notebook displays the neuron's output for X.
neuron(X)

In [None]:
# Build a fresh single-output layer (num_inputs and X come from an earlier
# cell) and apply it to the same input.
num_outputs = 1
neuron = nn.Linear(num_inputs, num_outputs)
neuron(X)

In [None]:
# ReLU activation, evaluated on one positive and one negative input.
act = torch.nn.ReLU()
act(torch.tensor([12.0])), act(torch.tensor([-11.0]))

In [None]:
# A layer with 10 outputs over the same input, followed by the ReLU activation.
num_outputs = 10
neuron = torch.nn.Linear(num_inputs, num_outputs)
act(neuron(X))

In [None]:
# Two linear layers (3 -> 10 -> 5), each followed by ReLU, chained in order.
network = nn.Sequential(
    nn.Linear(3, 10),
    nn.ReLU(),
    nn.Linear(10, 5),
    nn.ReLU(),
)
# Trailing expression: the notebook displays the network output for X.
network(X)

Определим нейронную сеть и обучим её на простом датасете.

In [None]:
class SinDataset(Dataset):
    """Dataset of evenly spaced points ``x`` paired with ``sin(x)``.

    Args:
        start: First ``x`` value passed to ``torch.linspace``.
        end: Last ``x`` value passed to ``torch.linspace``.
        num_samples: Number of points generated between ``start`` and ``end``.
    """

    def __init__(self, start, end, num_samples):
        grid = torch.linspace(start, end, num_samples)
        # Add a trailing axis so that every sample has shape (1,).
        self.x = grid.unsqueeze(1)
        self.y = self.x.sin()

    def __len__(self):
        """Return the number of generated samples."""
        return self.x.shape[0]

    def __getitem__(self, idx):
        """Return the ``(x, sin(x))`` pair stored at position ``idx``."""
        sample = self.x[idx]
        target = self.y[idx]
        return sample, target


class SinModel(pl.LightningModule):
    """Small fully connected regressor trained with MSE loss.

    The network maps one input value to one output value through two hidden
    layers of 16 units with ReLU activations.
    """

    def __init__(self):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(1, 16),
            nn.ReLU(),
            nn.Linear(16, 16),
            nn.ReLU(),
            nn.Linear(16, 1),
        )
        self.criterion = nn.MSELoss()

    def forward(self, x):
        """Return the network output for input ``x``."""
        return self.model(x)

    def training_step(self, batch, batch_idx=None):
        """Compute the MSE loss for one batch.

        Args:
            batch: Pair ``(x, y)`` of inputs and targets.
            batch_idx: Index of the batch; accepted for the trainer, unused.

        Returns:
            The loss tensor.
        """
        x, y = batch
        pred = self(x)
        return self.criterion(pred, y)

    # Called form of the decorator: the bare ``@torch.no_grad`` spelling is
    # not accepted by every PyTorch release.
    @torch.no_grad()
    def validation_step(self, batch, batch_idx=None):
        """Plot the predictions against the targets for one batch.

        Args:
            batch: Pair ``(x, y)`` of inputs and targets.
            batch_idx: Index of the batch; accepted for the trainer, unused.
        """
        x, y = batch
        pred = self(x)

        # Flatten and move to CPU so matplotlib can plot the values.
        # No ``detach`` is needed: gradients are disabled by the decorator.
        x = x.flatten().cpu()
        y = y.flatten().cpu()
        pred = pred.flatten().cpu()

        plt.plot(x, pred, label="Предсказание")
        plt.plot(x, y, linestyle="--", label="Таргет")
        plt.legend()
        plt.show()

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters with lr=0.01."""
        return optim.Adam(self.parameters(), lr=0.01)


# Sample sin(x) on [-20, 20]; the same points serve for training and validation.
dataset = SinDataset(start=-20, end=20, num_samples=5000)
# The samples are stored in increasing-x order, so shuffle the training
# batches; otherwise every batch covers only one contiguous slice of the
# interval.
dataloader = DataLoader(dataset, batch_size=1000, shuffle=True)
# One unshuffled batch with all points keeps x sorted for the line plot drawn
# in SinModel.validation_step.
val_dataloader = DataLoader(dataset, batch_size=5000, shuffle=False)

model = SinModel()
# Train for 300 epochs and run validation (the plot) every 50 epochs.
trainer = pl.Trainer(max_epochs=300, check_val_every_n_epoch=50)
trainer.fit(model, dataloader, val_dataloader)

__Задача__: попробуйте изменить архитектуру нейронной сети так, чтобы она правильно предсказывала два периода синуса (например, значения на интервале $[-5, 7]$).

# Эмбеддинги для слов

In [None]:
import gensim.downloader as api
import numpy as np
import nltk

nltk.download("punkt_tab", quiet=True)
from nltk.tokenize import word_tokenize


def get_embeddings(model, text):
    """Map every token of ``text`` to its embedding in ``model``.

    The text is split with ``word_tokenize``. Tokens missing from ``model``
    stay in the result with ``None`` as their value. The result is keyed by
    token, so a token that occurs several times appears only once.

    Args:
        model: Embedding lookup supporting ``token in model`` and
            ``model[token]``.
        text: String to tokenize.

    Returns:
        Dict from token to its embedding, or to ``None`` for unknown tokens.
    """
    return {
        token: (model[token] if token in model else None)
        for token in word_tokenize(text)
    }


# Sample sentence; the last token looks made up, presumably to demonstrate
# the out-of-vocabulary case.
text = "did not see you there dfdfdf"
# Load the "glove-wiki-gigaword-50" vectors through the gensim downloader.
embed_model = api.load("glove-wiki-gigaword-50")

word_embed_dict = get_embeddings(embed_model, text)

# Print each token next to its embedding (None when the model lacks the token).
for word, embedding in word_embed_dict.items():
    print(f"{word} - {embedding}")

In [None]:
import gensim.downloader as api

# Names of the models listed in the gensim downloader catalogue.
available_models = api.info()["models"].keys()

# Print the header, then one model name per line.
print("Available embedding models:\n", *available_models, sep="\n")

__Задача__: найдите слово, у которого эмбеддинг ближе всего к значению: $v_{man} + v_{queen} - v_{king}$, где $v_{word}$ - эмбеддинг слова `word`

## Twitter sentiment classification

[Ссылка на датасет](https://www.kaggle.com/datasets/saurabhshahane/twitter-sentiment-dataset?select=Twitter_Data.csv)

In [None]:
# Load the tweets and keep a reproducible random 25% sample of the rows.
df = pd.read_csv("Twitter_Data.csv")
df = df.sample(frac=0.25, random_state=42)
# Trailing expression: the notebook displays the sampled frame.
df

In [None]:
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm


def preprocess(df, embed_model):
    """Turn every text in ``df["clean_text"]`` into one fixed-size vector.

    Each text is embedded with ``get_embeddings`` and the embeddings of its
    known tokens are averaged. A text without any known token becomes a zero
    vector.

    NOTE: ``get_embeddings`` returns a dict keyed by token, so a token that
    is repeated in a text contributes to the average only once.

    Args:
        df: DataFrame with a ``clean_text`` column of strings.
        embed_model: Embedding model accepted by ``get_embeddings`` that
            exposes ``vector_size``.

    Returns:
        2-D array with one row per row of ``df`` and
        ``embed_model.vector_size`` columns.
    """
    # np.stack cannot stack an empty list, so handle an empty frame up front.
    if len(df) == 0:
        return np.empty((0, embed_model.vector_size))

    embeds = []
    # Iterate over the text column directly: DataFrame.iterrows builds a
    # Series for every row, which is needlessly slow when one column is read.
    for text in tqdm(df["clean_text"], total=len(df)):
        word_embed_dict = get_embeddings(embed_model, text)

        row_embeds = [e for e in word_embed_dict.values() if e is not None]
        if row_embeds:
            # Average of the token embeddings found in the text.
            embeds.append(np.stack(row_embeds).mean(0))
        else:
            embeds.append(np.zeros(embed_model.vector_size))

    return np.stack(embeds)


# Drop rows with missing values before building features and labels.
df = df.dropna()
X = preprocess(df, embed_model)
y = df["category"]
# Hold out 33% of the rows for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Baseline: CatBoost with default settings on the averaged embeddings.
model = CatBoostClassifier()
model.fit(X_train, y_train)

pred = model.predict(X_test)

In [None]:
# Micro-averaged F1 of the CatBoost predictions on the held-out split.
f1_score(y_true=y_test, y_pred=pred, average="micro")

In [None]:
class TwitterDataset(Dataset):
    """Tensor-backed dataset of feature rows and integer class targets.

    Args:
        x: Array-like of features; stored as a float32 tensor.
        y: Labels that support ``+ 1`` and ``.to_numpy()`` (for example a
            pandas Series). Every label is shifted up by one before being
            stored as an int64 tensor.
    """

    def __init__(self, x, y):
        features = torch.tensor(x, dtype=torch.float32)
        shifted_labels = (y + 1).to_numpy()
        self.x = features
        self.y = torch.tensor(shifted_labels, dtype=torch.int64)

    def __len__(self):
        """Return the number of stored samples."""
        return self.x.shape[0]

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at position ``idx``."""
        features = self.x[idx]
        label = self.y[idx]
        return features, label


class TwitterModel(pl.LightningModule):
    """Fully connected classifier trained with cross-entropy loss.

    Args:
        input_dim: Size of each input feature vector.
        hidden_dim: Width of every hidden layer.
        num_classes: Number of output classes (logits per sample).
    """

    def __init__(self, input_dim=50, hidden_dim=16, num_classes=3):
        super().__init__()
        # Every hidden Linear layer is followed by a ReLU: two Linear layers
        # placed back to back without an activation are equivalent to a
        # single linear map and add no capacity.
        self.model = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, num_classes),
        )
        self.criterion = nn.CrossEntropyLoss()

    def forward(self, x):
        """Return the class logits for input ``x``."""
        return self.model(x)

    def predict(self, x):
        """Return the index of the highest logit for every row of ``x``."""
        logits = self.forward(x)
        return logits.argmax(dim=1)

    def training_step(self, batch, batch_idx=None):
        """Compute the cross-entropy loss for one batch.

        Args:
            batch: Pair ``(x, y)`` of features and class indices.
            batch_idx: Index of the batch; accepted for the trainer, unused.

        Returns:
            The loss tensor.
        """
        x, y = batch
        pred = self(x)
        return self.criterion(pred, y)

    # Called form of the decorator: the bare ``@torch.no_grad`` spelling is
    # not accepted by every PyTorch release.
    @torch.no_grad()
    def validation_step(self, batch, batch_idx=None):
        """Print the micro-averaged F1 score of the predictions on one batch.

        Args:
            batch: Pair ``(x, y)`` of features and class indices.
            batch_idx: Index of the batch; accepted for the trainer, unused.
        """
        x, y = batch
        pred = self.predict(x)

        # Move labels and predictions to CPU before passing them to f1_score.
        # No ``detach`` is needed: gradients are disabled by the decorator.
        y = y.flatten().cpu()
        pred = pred.flatten().cpu()

        f1 = f1_score(y_true=y, y_pred=pred, average="micro")
        print(f"f1: {f1}")

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters with lr=0.01."""
        return optim.Adam(self.parameters(), lr=0.01)


train_dataset = TwitterDataset(X_train, y_train)
test_dataset = TwitterDataset(X_test, y_test)
# Reshuffle the training rows every epoch so that the batches differ between
# epochs instead of repeating in the same fixed order.
train_loader = DataLoader(train_dataset, batch_size=1000, shuffle=True)
# Evaluation order does not matter, so the held-out split stays unshuffled.
test_loader = DataLoader(test_dataset, batch_size=12000, shuffle=False)

model = TwitterModel()
# Train for 300 epochs and run validation (the F1 printout) every 50 epochs.
trainer = pl.Trainer(max_epochs=300, check_val_every_n_epoch=50)
trainer.fit(model, train_loader, test_loader)