In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Загрузка данных
data = pd.read_csv('test_data_en.csv')

# Предварительная обработка данных
data.drop(columns=['Unnamed: 0'], inplace=True)  # Удаление ненужного столбца
X = data[['Задача en', 'Обстановка en', 'Оптимальный план en']]  # Выбор текстовых признаков
y = data['Успех предсказанного плана']  # Целевая переменная

# Преобразование текстовых данных в числовой формат
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(X.values.astype('U'))

# Разделение данных на обучающий и тестовый наборы
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Обучение модели
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Предсказание на тестовом наборе
y_pred = model.predict(X_test)

# Оценка модели
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


AttributeError: 'numpy.ndarray' object has no attribute 'lower'

In [7]:
!pip3 install scikit-learn

Looking in indexes: https://nexus.services.mts.ru/repository/pip/simple, https://artifactory.mts.ru/artifactory/api/pypi/own-onetl-pypi-local/simple, https://artifactory.mts.ru/artifactory/api/pypi/python-libs-iga-sso-pypi-local/simple/, https://artifactory.mts.ru/artifactory/mts-integration-layer-mcc-pypi-local/simple/, https://sa0000adpran:****@artifactory.mts.ru/artifactory/api/pypi/adp_ran_repo-adp-ran-pypi-local/simple/, https://minio-pre-prod.msk.bd-cloud.mts.ru
Collecting scikit-learn
  Downloading https://nexus.services.mts.ru/repository/pip/packages/scikit-learn/1.4.1.post1/scikit_learn-1.4.1.post1-cp39-cp39-macosx_12_0_arm64.whl (10.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.5/10.5 MB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading https://nexus.services.mts.ru/repository/pip/packages/joblib/1.3.2/joblib-1.3.2-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

In [18]:
X

Unnamed: 0,Задача en,Обстановка en,Оптимальный план en
0,Conduct a therapy session through drawing by n...,"table, paint-by-numbers kit, child, water, pap...",Find a table. Come to the table. Find the chil...
1,Prepare a salad for dinner,"Kitchen, table, vegetables, meat (optional), s...",Open the door. Walk to the kitchen. Close the ...
2,Cut the melon,"Kitchen, table, plate, melon, knife. The melon...",Open the door. Enter the kitchen. Close the do...
3,Production of textile products at a textile in...,"Production workshop, textile materials, sewing...",Open the door. Walk to the production workshop...
4,Cutting glass for window frames at a glass fac...,"Glass factory production workshop, workplace, ...",Walk to the workplace. Find a sheet of glass. ...
...,...,...,...
1446,Pour white tea into a mug.,"gazebo, table, box, mug, kettle with white tea...",Find the box. Open the box. Find the mug in th...
1447,Clean the living room before guests arrive.,"living room with a sofa, armchairs, coffee tab...",Open the door. Enter the living room. Close th...
1448,Move the sofa to the window.,"living room, sofa, table, window, floor. The s...",Find a sofa. Walk to the sofa. Grab the sofa. ...
1449,Organize the sorting and storage of vegetables...,"Warehouse, vegetables, boxes, shelves, paper, ...",Come to the table. Find paper. Grab the paper....


In [14]:
from transformers import BertTokenizer, BertModel
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Загрузка предварительно обученной модели BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

# Подготовка данных
max_length = 128  # Максимальная длина последовательности
X_tokenized = [tokenizer.encode(text, add_special_tokens=True, max_length=max_length, truncation=True) for text in X['Оптимальный план en']]
X_padded = torch.tensor([text + [0]*(max_length - len(text)) for text in X_tokenized])
y_tensor = torch.tensor(y.values)

# Разделение данных на обучающий и тестовый наборы
X_train, X_test, y_train, y_test = train_test_split(X_padded, y_tensor, test_size=0.2, random_state=42)

# Создание DataLoader для обучающего и тестового наборов
train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataset = TensorDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=16)

# Создание и обучение модели классификации на основе BERT
class BERTClassifier(nn.Module):
    def __init__(self):
        super(BERTClassifier, self).__init__()
        self.bert = bert_model
        self.dropout = nn.Dropout(0.1)
        self.linear = nn.Linear(768, 2)  # Выходной слой для двух классов: успешно/неуспешно

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs[1]  # Берем выходной вектор [CLS]
        pooled_output = self.dropout(pooled_output)
        logits = self.linear(pooled_output)
        return logits

model = BERTClassifier()
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()

# Обучение модели
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
for epoch in range(5):  # Пример: 5 эпох
    model.train()
    for batch in train_loader:
        batch = tuple(t.to(device) for t in batch)
        input_ids, labels = batch
        attention_mask = (input_ids != 0).float()
        optimizer.zero_grad()
        logits = model(input_ids, attention_mask)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

# Оценка модели
model.eval()
predictions = []
true_labels = []
for batch in test_loader:
    batch = tuple(t.to(device) for t in batch)
    input_ids, labels = batch
    attention_mask = (input_ids != 0).float()
    with torch.no_grad():
        logits = model(input_ids, attention_mask)
    predictions.extend(torch.argmax(logits, axis=1).cpu().numpy())
    true_labels.extend(labels.cpu().numpy())

accuracy = accuracy_score(true_labels, predictions)
print("Accuracy:", accuracy)
# алгоритмические сравнения, catboost's, 



tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Accuracy: 0.6769759450171822


In [13]:
!pip3 install transformers torch

Looking in indexes: https://nexus.services.mts.ru/repository/pip/simple, https://artifactory.mts.ru/artifactory/api/pypi/own-onetl-pypi-local/simple, https://artifactory.mts.ru/artifactory/api/pypi/python-libs-iga-sso-pypi-local/simple/, https://artifactory.mts.ru/artifactory/mts-integration-layer-mcc-pypi-local/simple/, https://sa0000adpran:****@artifactory.mts.ru/artifactory/api/pypi/adp_ran_repo-adp-ran-pypi-local/simple/, https://minio-pre-prod.msk.bd-cloud.mts.ru
Collecting torch
  Downloading https://nexus.services.mts.ru/repository/pip/packages/torch/2.2.2/torch-2.2.2-cp39-none-macosx_11_0_arm64.whl (59.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.7/59.7 MB[0m [31m893.8 kB/s[0m eta [36m0:00:00[0m00:01[0m00:03[0m
Collecting sympy (from torch)
  Downloading https://nexus.services.mts.ru/repository/pip/packages/sympy/1.12/sympy-1.12-py3-none-any.whl (5.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.7/5.7 MB[0m [31m1.2 MB/