# Загрузка библиотек

In [None]:
# Colab-only setup: mount Google Drive so files under /content/drive become readable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install optuna

Collecting optuna
  Downloading optuna-3.5.0-py3-none-any.whl (413 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m413.4/413.4 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.1-py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.4/233.4 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorlog (from optuna)
  Downloading colorlog-6.8.0-py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.0-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, alembic, optuna
Successfully installed Mako-1.3.0 alembic-1.13.1 colorlog-6.8.0 optuna-3.5.0


# Импорты

In [None]:
import pandas as pd
import numpy as np
import torch

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss

import optuna
from functools import partial
from collections import OrderedDict
import scipy

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Select the GPU when CUDA is available, otherwise fall back to the CPU.
# NOTE(review): `device` is not referenced by any later cell visible in this notebook.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Загрузка данных

In [None]:
# Single place to repoint the notebook at a different data location.
DATA_DIR = '/content/drive/MyDrive/automl'

# Pre-split train / test CSVs; both are read with pandas defaults.
train = pd.read_csv(f'{DATA_DIR}/train_p.csv')
test = pd.read_csv(f'{DATA_DIR}/test_p.csv')

In [None]:
# Preview the first three rows of the training frame as loaded.
train.head(3)

Unnamed: 0.1,Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness,discourse_effectiveness_map
0,4546,61ba01f9c332,51634BBDF223,kn the planet but he also comes up with idea o...,Claim,Adequate,1
1,21829,ea75b33c79ed,65991D1DA389,When your phone is turned on someone can call ...,Evidence,Adequate,1
2,16662,18a514017021,20D0120E0F48,or else there will be people saying that the c...,Evidence,Adequate,1


In [None]:
# Preview the first three rows of the test frame as loaded.
test.head(3)

Unnamed: 0.1,Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness,discourse_effectiveness_map
0,36023,9b1e3093f8c0,D282E21C2DB1,avoid bias,Claim,Effective,2
1,34198,714ae42d8d54,4C6B05BD5A81,Some students have a slower learning abilities...,Evidence,Adequate,1
2,15341,b35015e761cd,0CFDDC1CF75A,I think they should not let them to bring cell...,Position,Adequate,1


In [None]:
# Remove the 'Unnamed: 0' column (presumably a row index saved into the CSV - confirm).
# errors='ignore' keeps the cell safe to re-run once the column is already gone.
train = train.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Remove the 'Unnamed: 0' column (presumably a row index saved into the CSV - confirm).
# errors='ignore' keeps the cell safe to re-run once the column is already gone.
test = test.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Preview the training frame again to confirm the 'Unnamed: 0' column is gone.
train.head(3)

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness,discourse_effectiveness_map
0,61ba01f9c332,51634BBDF223,kn the planet but he also comes up with idea o...,Claim,Adequate,1
1,ea75b33c79ed,65991D1DA389,When your phone is turned on someone can call ...,Evidence,Adequate,1
2,18a514017021,20D0120E0F48,or else there will be people saying that the c...,Evidence,Adequate,1


In [None]:
# Preview the test frame again to confirm the 'Unnamed: 0' column is gone.
test.head(3)

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness,discourse_effectiveness_map
0,9b1e3093f8c0,D282E21C2DB1,avoid bias,Claim,Effective,2
1,714ae42d8d54,4C6B05BD5A81,Some students have a slower learning abilities...,Evidence,Adequate,1
2,b35015e761cd,0CFDDC1CF75A,I think they should not let them to bring cell...,Position,Adequate,1


# Модель и оптимизация

* TF-IDF vectorizer
* Logistic regression model
* Hyperparameter optimization with Optuna

In [None]:
# Column holding the raw text that gets vectorized.
text_col = "discourse_text"
# Column holding the class label to predict.
target_col = "discourse_effectiveness"
# Encoder mapping the string labels to integer class ids; fitted on the training labels only.
target_enc = LabelEncoder()
target_enc.fit(train[target_col])

In [None]:
# Replace the training labels with their integer codes.
# NOTE(review): this overwrites the column in place, so the cell is not idempotent -
# reload `train` before running it a second time.
# NOTE(review): test[target_col] is not encoded anywhere in the visible cells and
# keeps its original values.
train[target_col] = target_enc.transform(train[target_col])

In [None]:
# Integer-encode the discourse type. The mapping is learned from the training
# frame only and then applied, unchanged, to both frames.
lenc = LabelEncoder()
lenc.fit(train["discourse_type"])

for frame in (train, test):
    frame["discourse_type"] = lenc.transform(frame["discourse_type"])

In [None]:
# Sanity check: every training row must carry its own discourse_id.
train["discourse_id"].is_unique

True

In [None]:
def objective(trial, dset: pd.DataFrame, text_col: str = "discourse_text", target_col: str = "discourse_effectiveness", alpha: float = 0.1):
    """Optuna objective: cross-validated log loss of a TF-IDF + logistic-regression model.

    For each trial a set of vectorizer and model hyperparameters is sampled and
    scored with 3-fold stratified cross-validation. The vectorizer is re-fitted
    on the training part of every fold, and the ``discourse_type`` column is
    appended to the TF-IDF matrix as one extra feature column.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        dset: Frame containing ``text_col``, ``target_col`` and a
            ``discourse_type`` column (it is placed into a sparse matrix as-is,
            so it is expected to be numeric already).
        text_col: Name of the column with the raw text.
        target_col: Name of the column with the class labels.
        alpha: Weight of the across-fold standard deviation in the returned value.

    Returns:
        ``mean(fold log losses) + alpha * std(fold log losses)``; lower is better.
    """
    # Parameter names follow "<group>_<component>_<hyperparam>" so that
    # parse_logs can later split them back into keyword arguments.
    ngram = trial.suggest_categorical("preprocess_vect_ngram_range", [(1, 2), (1, 3), (2, 4), (2, 3), (1, 1)])
    # Integer bounds: suggest_int is an integer distribution, not a float one.
    max_feats = trial.suggest_int("preprocess_vect_max_features", 10000, 1000000)
    lowercase = trial.suggest_categorical("preprocess_vect_lowercase", [True, False])
    stopwords = trial.suggest_categorical("preprocess_vect_stop_words", [None, "english"])

    # suggest_float(log=True) is the non-deprecated spelling of suggest_loguniform.
    C = trial.suggest_float("model_lr_C", 0.1234, 1, log=True)
    class_weight = trial.suggest_categorical("model_lr_class_weight", ["balanced", None])

    scores = []
    splitter = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    for tr_ix, vl_ix in splitter.split(dset[target_col], dset[target_col]):
        # Fold-local names; avoids shadowing the notebook-level `train` frame.
        tr_df, vl_df = dset.iloc[tr_ix], dset.iloc[vl_ix]

        # stop_words must be applied here as well: the sampled value ends up in
        # study.best_params, so the tuned pipeline has to be the one evaluated.
        vect = TfidfVectorizer(ngram_range=ngram,
                               max_features=max_feats,
                               lowercase=lowercase,
                               stop_words=stopwords)
        vect.fit(tr_df[text_col])

        train_vecs = vect.transform(tr_df[text_col])
        valid_vecs = vect.transform(vl_df[text_col])

        # Append discourse_type as a single extra (sparse) feature column.
        train_vecs = scipy.sparse.hstack([train_vecs, scipy.sparse.csr_matrix(tr_df.discourse_type.values).T])
        valid_vecs = scipy.sparse.hstack([valid_vecs, scipy.sparse.csr_matrix(vl_df.discourse_type.values).T])

        train_y, valid_y = tr_df[target_col], vl_df[target_col]

        model = LogisticRegression(C=C,
                                   class_weight=class_weight,
                                   n_jobs=-1,
                                   max_iter=5000)
        model.fit(train_vecs, train_y)
        probas = model.predict_proba(valid_vecs)
        scores.append(log_loss(valid_y, probas))

    mean = np.mean(scores)
    # Spread across ALL folds. Taking the std of the last fold's scalar score
    # is always 0 and would silently disable the alpha penalty.
    std = np.std(scores)

    return mean + alpha * std

def parse_logs(params: dict):
    """Regroup flat "<group>_<component>_<hyperparam>" names into per-group kwargs.

    Each key is split on its first two underscores: the first piece selects the
    group ("model" or "preprocess"), the last piece is the hyperparameter name
    (which may itself contain underscores, e.g. "ngram_range").

    Args:
        params: Flat mapping such as ``study.best_params``.

    Returns:
        ``{"model": {...}, "preprocess": {...}}``; both keys are always present.

    Raises:
        KeyError: If a key starts with a group other than "model" or "preprocess".
    """
    # Exactly the two groups that get filled below; no placeholder keys.
    p = {"model": {}, "preprocess": {}}
    for key, value in params.items():
        # maxsplit=2 keeps underscores inside the hyperparameter name intact.
        identifiers = key.split("_", 2)
        hyperparam = identifiers[-1]
        p[identifiers[0]][hyperparam] = value
    return p

In [None]:
# TPE is Optuna's default sampler; seeding it makes the sequence of suggested
# hyperparameters repeatable across kernel restarts.
study = optuna.create_study(direction="minimize", sampler=optuna.samplers.TPESampler(seed=42))
# Bind the dataset under a separate name so `objective` keeps referring to the
# original function when this cell is re-run.
objective_fn = partial(objective, dset=train)
study.optimize(objective_fn, n_trials=30)

[I 2023-12-28 08:36:17,839] A new study created in memory with name: no-name-14aecf24-a573-4968-a29f-dfdb092cea88
[I 2023-12-28 08:37:21,946] Trial 0 finished with value: 0.8466979394405981 and parameters: {'preprocess_vect_ngram_range': (1, 2), 'preprocess_vect_max_features': 991935, 'preprocess_vect_lowercase': False, 'preprocess_vect_stop_words': None, 'model_lr_C': 0.13232973324173522, 'model_lr_class_weight': None}. Best is trial 0 with value: 0.8466979394405981.
[I 2023-12-28 08:38:23,679] Trial 1 finished with value: 0.7966563506059474 and parameters: {'preprocess_vect_ngram_range': (1, 2), 'preprocess_vect_max_features': 134092, 'preprocess_vect_lowercase': True, 'preprocess_vect_stop_words': 'english', 'model_lr_C': 0.5929386451329182, 'model_lr_class_weight': None}. Best is trial 1 with value: 0.7966563506059474.
[I 2023-12-28 08:39:16,829] Trial 2 finished with value: 0.836791498845694 and parameters: {'preprocess_vect_ngram_range': (1, 2), 'preprocess_vect_max_features': 38

In [None]:
# Turn the flat best-trial parameters into keyword arguments for the
# estimator ("model") and for the vectorizer ("preprocess").
opt_params = parse_logs(study.best_params)
model_params, preprocess_params = opt_params["model"], opt_params["preprocess"]

In [None]:
# Refit the tuned vectorizer + model on the training part of each CV fold and
# score the full test set every time; one probability matrix is kept per fold.
fold_splitter = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# The test-side discourse_type column does not depend on the fold: build it once.
test_type_col = scipy.sparse.csr_matrix(test.discourse_type.values).T

test_preds = []
for tr_ix, _ in fold_splitter.split(train[target_col], train[target_col]):
    train_df = train.iloc[tr_ix]

    vect = TfidfVectorizer(**preprocess_params)
    vect.fit(train_df[text_col])

    # TF-IDF features with discourse_type appended as one extra column.
    train_vecs = scipy.sparse.hstack([
        vect.transform(train_df[text_col]),
        scipy.sparse.csr_matrix(train_df.discourse_type.values).T,
    ])
    test_vecs = scipy.sparse.hstack([vect.transform(test[text_col]), test_type_col])

    train_y = train_df[target_col]

    model = LogisticRegression(**model_params, max_iter=5000)
    model.fit(train_vecs, train_y)

    probas = model.predict_proba(test_vecs)
    test_preds.append(probas)

In [None]:
# Average the per-fold probability matrices element-wise (over the fold axis).
prob_preds = np.stack(test_preds).mean(axis=0)

In [None]:
# Inspect the fold-averaged class probabilities.
prob_preds

array([[0.53206178, 0.29974147, 0.16819675],
       [0.41286943, 0.47995766, 0.10717291],
       [0.8829141 , 0.01571764, 0.10136826],
       ...,
       [0.40040439, 0.29407352, 0.30552209],
       [0.53159752, 0.07298454, 0.39541793],
       [0.57353934, 0.22936366, 0.19709701]])

In [None]:
# One probability column per original label name, plus the id of each test row
# (the id column is attached by index alignment with `test`).
sub = pd.DataFrame(prob_preds, columns=target_enc.classes_).assign(
    discourse_id=test["discourse_id"]
)

In [None]:
# Display the prediction frame (class probabilities plus discourse_id).
sub

Unnamed: 0,Adequate,Effective,Ineffective,discourse_id
0,0.532062,0.299741,0.168197,9b1e3093f8c0
1,0.412869,0.479958,0.107173,714ae42d8d54
2,0.882914,0.015718,0.101368,b35015e761cd
3,0.489417,0.224186,0.286398,32245eafbe92
4,0.242814,0.722376,0.034810,7e946ef44860
...,...,...,...,...
7334,0.134792,0.058736,0.806471,d863083c5f22
7335,0.567083,0.172085,0.260832,57d92e1dddb3
7336,0.400404,0.294074,0.305522,02aca137ea9e
7337,0.531598,0.072985,0.395418,325a46d9f889


In [None]:
# Hold-out log loss of the fold-averaged probabilities.
# `labels` states the column order of prob_preds explicitly instead of having it
# inferred from the labels that happen to occur in `test`, so the call also
# works if a class is missing from the test split.
logloss = log_loss(test[target_col], prob_preds, labels=target_enc.classes_)
logloss

0.7832725916621214