In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file below the Kaggle input directory and print its
# full path, one per line, in os.walk order.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

🧠 NLP — Transformer sentence embeddings + Random Forest

In [None]:
from transformers import AutoTokenizer, AutoModel
from sklearn.ensemble import RandomForestClassifier
import numpy as np, torch

# Placeholder corpus: replace both lists with real data (texts and their labels)
# before running the rest of this cell.
texts, labels = [...], [...]  # your dataset

# Tokenizer and encoder are loaded from the same checkpoint id.
checkpoint = 'sentence-transformers/all-MiniLM-L6-v2'
tok = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)
model.eval()  # switch the encoder to evaluation mode (returns the same module)

def get_emb(text):
    """Embed ``text`` with the module-level ``tok`` / ``model`` pair.

    The token vectors of the last hidden layer are averaged, weighted by the
    attention mask so that padding positions do not contribute. Mean pooling
    is used instead of the first-token vector because the
    sentence-transformers MiniLM checkpoint loaded in this cell documents
    mean pooling as its sentence-embedding recipe.

    Args:
        text: Input passed straight to the tokenizer (a single string in this
            notebook).

    Returns:
        A NumPy array holding the pooled embedding; size-1 dimensions are
        squeezed away, so a single string yields a 1-D vector.
    """
    enc = tok(text, return_tensors='pt', truncation=True, padding=True)
    with torch.no_grad():
        hidden = model(**enc).last_hidden_state
    # Broadcast the mask over the hidden dimension and zero out padded tokens.
    mask = enc['attention_mask'].unsqueeze(-1).to(hidden.dtype)
    summed = (hidden * mask).sum(dim=1)
    # clamp guards against division by zero for an all-padding row.
    counts = mask.sum(dim=1).clamp(min=1e-9)
    return (summed / counts).squeeze().numpy()

# Stack the per-text embeddings row-wise into one feature matrix.
X = np.vstack([get_emb(t) for t in texts])
# Seed the forest so a fresh "Restart & Run All" reproduces the same model.
clf = RandomForestClassifier(random_state=42).fit(X, labels)


🧠 NLP Pipeline #1: TF-IDF + Logistic Regression

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Text features: lower-cased uni-/bi-gram TF-IDF with English stop words
# removed, using document-frequency bounds max_df=0.8 and min_df=5.
vectorizer = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    ngram_range=(1, 2),
    max_df=0.8,
    min_df=5,
)
classifier = LogisticRegression(max_iter=1000)

tfidf_lr = Pipeline(steps=[('tfidf', vectorizer), ('clf', classifier)])

# `texts` / `labels` come from an earlier cell.
tfidf_lr.fit(texts, labels)


🧠 NLP Pipeline #2: Transformers (BERT [CLS] embeddings + Random Forest)

In [None]:
import pandas as pd

# Build a small in-memory example that stands in for a CSV file.
# The data has two columns: "text" and "label".
labelled_reviews = [
    ("This product is amazing!", 1),
    ("Terrible experience, will not buy again.", 0),
    ("Not bad, but not great either.", 1),
    ("Absolutely fantastic! Exceeded expectations.", 1),
    ("Waste of money, very disappointed.", 0),
]
sample_data = {
    "text": [review for review, _ in labelled_reviews],
    "label": [target for _, target in labelled_reviews],
}

df = pd.DataFrame(sample_data)
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Use BERT: tokenizer and encoder are loaded from the same checkpoint id.
bert_checkpoint = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(bert_checkpoint)
bert = AutoModel.from_pretrained(bert_checkpoint)
bert.eval()  # evaluation mode; eval() returns the same module

# Extract the [CLS] embedding
def get_cls_emb(text):
    """Return the first-token ([CLS]) vector of ``text`` as a NumPy array.

    Uses the module-level ``tokenizer`` and ``bert`` objects. Size-1
    dimensions are squeezed away before conversion to NumPy.
    """
    encoded = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        hidden_states = bert(**encoded).last_hidden_state
    cls_vector = hidden_states[:, 0]  # position 0 holds the [CLS] token
    return cls_vector.squeeze().numpy()

# Embed every text and stack the vectors into one feature matrix
text_vectors = [get_cls_emb(t) for t in df["text"]]
X_emb = np.vstack(text_vectors)
y = df["label"].values

# Hold out 20% of the rows as a test set
X_train, X_test, y_train, y_test = train_test_split(
    X_emb, y, random_state=42, test_size=0.2
)

# Train the classifier
clf = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Evaluate on the held-out rows
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
accuracy


🖼️ CV — Image Classification (ResNet + torchvision), for images organized in per-class folders

In [None]:
import torch, torchvision
from torchvision import datasets, transforms, models
from torch import nn, optim

# Per-channel ImageNet statistics. The backbone used later in this cell is
# loaded with ImageNet-pretrained weights, which expect inputs normalised
# with these values.
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]

train_tf = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
])

# ImageFolder reads images from 'train_dir/'; class names are exposed as
# `train_data.classes`.
train_data = datasets.ImageFolder('train_dir/', transform=train_tf)
train_loader = torch.utils.data.DataLoader(train_data, batch_size=32, shuffle=True)

# Train on the GPU when one is visible, otherwise fall back to the CPU so the
# cell does not crash on CPU-only runtimes.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# NOTE(review): newer torchvision releases prefer the `weights=` argument over
# `pretrained=True`; confirm against the installed version before switching.
model = models.resnet18(pretrained=True)
model.fc = nn.Linear(model.fc.in_features, len(train_data.classes))  # new head
model = model.to(device)

criterion = nn.CrossEntropyLoss()
opt = optim.Adam(model.parameters(), lr=1e-3)

model.train()  # make the training mode explicit
for epoch in range(5):
    # Batch-specific names avoid clobbering any dataset-level `X` / `y`.
    for images, targets in train_loader:
        images, targets = images.to(device), targets.to(device)
        opt.zero_grad()
        loss = criterion(model(images), targets)
        loss.backward()
        opt.step()


CV - Image classification with csv format

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader

# 1) Load and preprocess the CSV
df = pd.read_csv('train.csv')  # file with columns: label, pixel0...pixel783
pixel_values = df.drop(columns='label').to_numpy() / 255.0  # normalise to [0, 1] (assumes 0-255 pixels)
class_ids = df['label'].to_numpy()

# 2) Cast and reshape to [N, 1, 28, 28]
X = pixel_values.astype(np.float32).reshape(-1, 1, 28, 28)
y = class_ids.astype(np.int64)

# 3) Split into train / validation (80% / 20%)
X_train, X_val, y_train, y_val = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# 4) Datasets and DataLoaders (only the training loader shuffles)
train_ds = TensorDataset(torch.tensor(X_train), torch.tensor(y_train))
train_dl = DataLoader(train_ds, shuffle=True, batch_size=64)
val_ds = TensorDataset(torch.tensor(X_val), torch.tensor(y_val))
val_dl = DataLoader(val_ds, batch_size=64)

# 5️⃣ Простая CNN
class CNN(nn.Module):
    """Small convolutional classifier for single-channel 28x28 images.

    Two conv + ReLU + max-pool stages reduce the 28x28 input to 32 feature
    maps of 7x7, followed by a 128-unit hidden layer and a linear output
    layer.

    Args:
        num_classes: Number of output logits. Defaults to 10, the value that
            was previously hard-coded, so ``CNN()`` behaves as before.

    Raises:
        ValueError: If ``num_classes`` is smaller than 1.
    """

    def __init__(self, num_classes=10):
        super().__init__()
        if num_classes < 1:
            raise ValueError(f'num_classes must be >= 1, got {num_classes}')
        # Layer order (and therefore the state_dict keys) is kept unchanged.
        self.net = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=3, padding=1), nn.ReLU(),
            nn.MaxPool2d(2),  # 28x28 -> 14x14
            nn.Conv2d(16, 32, kernel_size=3, padding=1), nn.ReLU(),
            nn.MaxPool2d(2),  # 14x14 -> 7x7
            nn.Flatten(),
            nn.Linear(32 * 7 * 7, 128), nn.ReLU(),
            nn.Linear(128, num_classes),  # one logit per class
        )

    def forward(self, x):
        """Return the raw logits produced by the sequential stack for ``x``."""
        return self.net(x)

# Train on the GPU when one is visible, otherwise fall back to the CPU so the
# cell does not crash on CPU-only runtimes.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = CNN().to(device)
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()

# 6) Training
for epoch in range(5):
    model.train()
    for Xb, yb in train_dl:
        Xb, yb = Xb.to(device), yb.to(device)
        opt.zero_grad()
        loss = loss_fn(model(Xb), yb)
        loss.backward()
        opt.step()

    # Validation: count correct arg-max predictions with gradients disabled.
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for Xb, yb in val_dl:
            Xb, yb = Xb.to(device), yb.to(device)
            pred = model(Xb).argmax(1)
            correct += (pred == yb).sum().item()
            total += yb.size(0)
    acc = correct / total
    print(f'Epoch {epoch+1}: val acc = {acc:.4f}')


In [None]:
📊 Tabular — XGBoost with Optuna (binary classification)

In [None]:
import xgboost as xgb, optuna
from sklearn.model_selection import train_test_split

# Placeholders: assign your feature matrix and target vector here before running.
X = ...  # your features
y = ...  # your targets

# Keep 20% of the rows aside as a validation split (fixed seed).
X_tr, X_val, y_tr, y_val = train_test_split(
    X, y, random_state=42, test_size=0.2
)

def objective(trial):
    """Optuna objective: cross-validated AUC of one XGBoost configuration.

    Samples ``eta``, ``max_depth``, ``subsample`` and ``colsample_bytree``,
    runs 3-fold ``xgb.cv`` on the module-level training split (``X_tr``,
    ``y_tr``) with early stopping, and stores the 1-based boosting round that
    achieved the best mean test AUC in the trial's user attributes under
    ``'n_rounds'`` so the final model can be refit with the same round count.

    Args:
        trial: The Optuna trial used to sample hyper-parameters.

    Returns:
        The highest mean test AUC observed across boosting rounds.
    """
    params = {
        'objective': 'binary:logistic', 'eval_metric': 'auc', 'tree_method': 'hist',
        'eta': trial.suggest_float('eta', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1)
    }
    dtrain = xgb.DMatrix(X_tr, label=y_tr)
    cv = xgb.cv(params, dtrain, nfold=3, num_boost_round=200,
                early_stopping_rounds=20, seed=42)
    test_auc = cv['test-auc-mean']
    # Position of the best round (0-based) + 1 = number of trees to keep.
    trial.set_user_attr('n_rounds', int(test_auc.to_numpy().argmax()) + 1)
    return test_auc.max()

# Seed the sampler so the search is reproducible across kernel restarts.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=20)

# `best_params` holds only the sampled values, so the fixed settings used
# during CV and the tuned number of boosting rounds are passed explicitly.
best_params = dict(study.best_params)
best_params['learning_rate'] = best_params.pop('eta')  # sklearn-API name for eta
final_model = xgb.XGBClassifier(
    objective='binary:logistic',
    tree_method='hist',
    n_estimators=study.best_trial.user_attrs['n_rounds'],
    random_state=42,
    **best_params,
)
final_model.fit(X_tr, y_tr)


Basic -> EDA