In [25]:
from transformers import BertTokenizer, BertModel
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from catboost.utils import get_gpu_device_count

import warnings

In [30]:
RANDOM_STATE = 42
GSCV_LOG_REGRESSION_CONFIG = {
    'C': [.1, 1, 10, 50, 100, 250, 500, 1000],
    'solver': ['lbfgs', 'liblinear'],
    'penalty': ['l1', 'l2', 'elasticnet'],
}
GSCV_RAND_FOREST_CONFIG = {
    'max_depth': range(1, 11),
    'n_estimators': range(1, 202, 5),
}
GSCV_CAT_BOOSRT_CONFIG = {
    'learning_rate':[.1, .5, .8],
    'max_depth': range(1, 6),
    'n_estimators': range(50, 401, 50),
    'min_data_in_leaf': [50, 100, 150]
}
IS_TRAIN = False # Проводить ли заново подбор лучших параметров моделей

In [28]:
def research_model(model, params, features, target, metric, cv=5):
    gscv = GridSearchCV(
        model, params, cv=cv,
        scoring=metric,
        refit=False,
        return_train_score=True,
        verbose=2
    )
    gscv.fit(features, target)
    return pd.DataFrame(gscv.cv_results_).sort_values(
        by='mean_test_score', ascending=False
    )

In [6]:
data = pd.read_csv('data/toxic_comments.csv')

In [4]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")



In [20]:
tokenized = data['text'].apply(
    lambda x: tokenizer.encode(x, add_special_tokens=True, max_length=512, truncation=True)
)

max_len = 0
for i in tokenized.values:
    max_len = max(max_len, len(i))

padded = np.array([i + [0] * (max_len - len(i)) for i in tokenized.values])
attention_mask = np.where(padded != 0, 1, 0)

In [23]:
model = BertModel.from_pretrained('bert-base-uncased')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

batch_size = 300
embeddings = []
for i in tqdm(range(padded.shape[0] // batch_size)):
    batch = torch.LongTensor(padded[i * batch_size : (i + 1) * batch_size])
    attention_mask_batch = torch.LongTensor(attention_mask[i * batch_size : (i + 1) * batch_size])
    
    with torch.no_grad():
        batch_embeddings = model(batch, attention_mask=attention_mask_batch)
        embeddings.append(batch_embeddings[0][:,0,:].numpy())

  0%|          | 2/53097 [00:15<116:36:23,  7.91s/it]


KeyboardInterrupt: 

In [None]:
features = np.concatenate(embeddings)
target = data['toxic']
features.shape

In [18]:
features = pd.DataFrame(np.random.randint(0, 512, size=(data.shape[0], 100)))
target = data['toxic']
features.shape

(159292, 100)

In [8]:
data['toxic'].value_counts(normalize=True)

0    0.898388
1    0.101612
Name: toxic, dtype: float64

In [9]:
def upsample(features, target, repeat):
    features_zeros = features[target == 0]
    features_one = features[target == 1]
    target_zeros = target[target == 0]
    target_one = target[target == 1]
    
    features_upsample = pd.concat([features_zeros] + [features_one] * repeat)
    target_upsample = pd.concat([target_zeros] + [target_one] * repeat)
    features_upsample, target_upsample = shuffle(
        features_upsample, target_upsample, random_state=RANDOM_STATE)
    
    return features_upsample, target_upsample

In [38]:
features_upsample, target_upsample = upsample(features, data['toxic'], 8)

In [41]:
target_upsample.value_counts(normalize=True)

0    0.524979
1    0.475021
Name: toxic, dtype: float64

In [45]:
features_train, features_valid, target_train, target_valid = train_test_split(
    features, target, test_size=0.2, stratify=target, random_state=RANDOM_STATE
)
features_train_upsample, features_valid_upsample, target_train_upsample, target_valid_upsample = train_test_split(
    features_upsample, target_upsample, test_size=0.2, stratify=target_upsample, random_state=RANDOM_STATE
)

## LogisticRegression

In [31]:
model_log_regression = LogisticRegression(
    max_iter=1000, random_state=RANDOM_STATE, n_jobs=-1, class_weight='balanced'
)

with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    log_reg_best_param = research_model(
        model_log_regression, GSCV_LOG_REGRESSION_CONFIG,
        features_train, target_train, 'f1'
    ).head(10)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV] END ....................C=0.1, penalty=l1, solver=lbfgs; total time=   0.3s
[CV] END ....................C=0.1, penalty=l1, solver=lbfgs; total time=   0.0s
[CV] END ....................C=0.1, penalty=l1, solver=lbfgs; total time=   0.0s
[CV] END ....................C=0.1, penalty=l1, solver=lbfgs; total time=   0.0s
[CV] END ....................C=0.1, penalty=l1, solver=lbfgs; total time=   0.0s
[CV] END ................C=0.1, penalty=l1, solver=liblinear; total time=  22.7s



KeyboardInterrupt



In [None]:
log_reg_best_param

In [48]:
model_log_regression = LogisticRegression(
    random_state=RANDOM_STATE, n_jobs=-1
)

with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    log_reg_upsample_best_param = research_model(
        model_log_regression, GSCV_LOG_REGRESSION_CONFIG,
        features_train_upsample, target_train_upsample, 'f1'
    ).head(10)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV] END ....................C=0.1, penalty=l1, solver=lbfgs; total time=   0.0s
[CV] END ....................C=0.1, penalty=l1, solver=lbfgs; total time=   0.0s
[CV] END ....................C=0.1, penalty=l1, solver=lbfgs; total time=   0.0s
[CV] END ....................C=0.1, penalty=l1, solver=lbfgs; total time=   0.0s
[CV] END ....................C=0.1, penalty=l1, solver=lbfgs; total time=   0.0s
[CV] END ................C=0.1, penalty=l1, solver=liblinear; total time=  11.7s



KeyboardInterrupt



In [None]:
log_reg_upsample_best_param

## RanfomForestClassifier

In [49]:
model_random_forest = RandomForestClassifier(
    random_state=RANDOM_STATE, n_jobs=-1, class_weight='balanced'
)

with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    random_forest_best_param = research_model(
        model_random_forest, GSCV_RAND_FOREST_CONFIG,
        features_train, target_train, 'f1'
    ).head(10)

Fitting 5 folds for each of 410 candidates, totalling 2050 fits
[CV] END ........................max_depth=1, n_estimators=1; total time=   1.2s
[CV] END ........................max_depth=1, n_estimators=1; total time=   0.3s
[CV] END ........................max_depth=1, n_estimators=1; total time=   0.2s
[CV] END ........................max_depth=1, n_estimators=1; total time=   0.2s
[CV] END ........................max_depth=1, n_estimators=1; total time=   0.3s
[CV] END ........................max_depth=1, n_estimators=6; total time=   0.3s


KeyboardInterrupt: 

In [None]:
random_forest_best_param

In [50]:
model_random_forest = RandomForestClassifier(
    random_state=RANDOM_STATE, n_jobs=-1, class_weight='balanced'
)

with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    random_forest_upsample_best_param = research_model(
        model_random_forest, GSCV_RAND_FOREST_CONFIG,
        features_train_upsample, target_train_upsample, 'f1'
    ).head(10)

Fitting 5 folds for each of 410 candidates, totalling 2050 fits
[CV] END ........................max_depth=1, n_estimators=1; total time=   0.9s
[CV] END ........................max_depth=1, n_estimators=1; total time=   0.5s
[CV] END ........................max_depth=1, n_estimators=1; total time=   0.4s


KeyboardInterrupt: 

In [None]:
random_forest_upsample_best_param

## CatBoostClassifier

In [None]:
task_type = 'GPU' if get_gpu_device_count() else 'CPU'
model_cat_boost = CatBoostClassifier(
    random_state=RANDOM_STATE, task_type=task_type, auto_class_weights='Balanced'
)

with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    cat_boost_best_param = research_model(
        model_cat_boost, GSCV_CAT_BOOSRT_CONFIG,
        features_train, target_train, 'f1'
    ).head(10)

Fitting 5 folds for each of 410 candidates, totalling 2050 fits
[CV] END ........................max_depth=1, n_estimators=1; total time=   1.2s
[CV] END ........................max_depth=1, n_estimators=1; total time=   0.3s
[CV] END ........................max_depth=1, n_estimators=1; total time=   0.2s
[CV] END ........................max_depth=1, n_estimators=1; total time=   0.2s
[CV] END ........................max_depth=1, n_estimators=1; total time=   0.3s
[CV] END ........................max_depth=1, n_estimators=6; total time=   0.3s


KeyboardInterrupt: 

In [None]:
cat_boost_best_param

In [None]:
task_type = 'GPU' if get_gpu_device_count() else 'CPU'
model_cat_boost = CatBoostClassifier(
    random_state=RANDOM_STATE, task_type=task_type
)

with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    cat_boost_upsample_best_param = research_model(
        model_cat_boost, GSCV_CAT_BOOSRT_CONFIG,
        features_train_upsample, target_train_upsample, 'f1'
    ).head(10)

Fitting 5 folds for each of 410 candidates, totalling 2050 fits
[CV] END ........................max_depth=1, n_estimators=1; total time=   0.9s
[CV] END ........................max_depth=1, n_estimators=1; total time=   0.5s
[CV] END ........................max_depth=1, n_estimators=1; total time=   0.4s


KeyboardInterrupt: 

In [None]:
cat_boost_upsample_best_param