In [74]:
from google.colab import drive
drive.mount('/content/drive')
import pandas as pd
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/teknofest_train_final.csv", sep="|")
df = df.drop(df.loc[df.text.apply(lambda x: len(x) == 1)].index)
df.loc[(df.is_offensive == 1) & (df.target == "OTHER"), "is_offensive"] = 0

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [75]:
from sklearn.model_selection import StratifiedKFold

In [76]:
df["labels"] = df["is_offensive"].astype(str)+"__"+df["target"]
df.drop(["is_offensive", "id","target"], axis=1, inplace=True)

In [77]:
df.labels=df.labels.map({'1__INSULT':1, '1__RACIST':2, '1__SEXIST':3, '1__PROFANITY':4, '0__OTHER':5})

In [78]:
FOLDS=5
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

In [79]:
from IPython.display import clear_output

!pip install transformers
!pip install wordcloud

clear_output()

In [80]:
from transformers import AutoModel,AutoTokenizer
import torch
import torch.nn.functional as F

In [81]:
MODEL_NM = "dbmdz/bert-base-turkish-cased"
MAX_LEN  = 500
DEVICE="cuda"
BATCH_SIZE = 128
model = AutoModel.from_pretrained(MODEL_NM )
tokenizer = AutoTokenizer.from_pretrained(MODEL_NM )

Some weights of the model checkpoint at dbmdz/bert-base-turkish-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [82]:
from tqdm import tqdm

def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output.last_hidden_state.detach().cpu()
    input_mask_expanded = (
        attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    )
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(
        input_mask_expanded.sum(1), min=1e-9
    )

In [83]:
dftr = df.iloc[:10000]
dfte = df.iloc[10000:]

In [84]:
class EmbedDataset(torch.utils.data.Dataset):
    def __init__(self,df):
        self.df = df.reset_index(drop=True)
    def __len__(self):
        return len(self.df)
    def __getitem__(self,idx):
        text = self.df.loc[idx,"text"]
        tokens = tokenizer(
                text,
                None,
                add_special_tokens=True,
                padding='max_length',
                truncation=True,
                max_length=MAX_LEN,
                return_tensors="pt"
                )
        tokens = {k:v.squeeze(0) for k,v in tokens.items()}
        return tokens
# ds = EmbedDataset(dftr.head())
# ds[0]
ds_tr = EmbedDataset(dftr)
embed_dataloader_tr = torch.utils.data.DataLoader(ds_tr,\
                        batch_size=BATCH_SIZE,\
                        shuffle=False)
ds_te = EmbedDataset(dfte)
embed_dataloader_te = torch.utils.data.DataLoader(ds_te,\
                        batch_size=BATCH_SIZE,\
                        shuffle=False)

In [85]:
import numpy as np

In [86]:
model = model.to(DEVICE)
model.eval()
all_train_text_feats = []
for batch in tqdm(embed_dataloader_tr,total=len(embed_dataloader_tr)):
    input_ids = batch["input_ids"].to(DEVICE)
    attention_mask = batch["attention_mask"].to(DEVICE)
    with torch.no_grad():
        model_output = model(input_ids=input_ids, attention_mask=attention_mask)
    sentence_embeddings = mean_pooling(model_output, attention_mask.detach().cpu())
    # Normalize the embeddings
    sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
    sentence_embeddings =  sentence_embeddings.squeeze(0).detach().cpu().numpy()
    all_train_text_feats.extend(sentence_embeddings)
all_train_text_feats = np.array(all_train_text_feats)
all_train_text_feats.shape

100%|██████████| 79/79 [01:16<00:00,  1.03it/s]


(10000, 768)

In [87]:
te_text_feats = []
for batch in tqdm(embed_dataloader_te,total=len(embed_dataloader_te)):
    input_ids = batch["input_ids"].to(DEVICE)
    attention_mask = batch["attention_mask"].to(DEVICE)
    with torch.no_grad():
        model_output = model(input_ids=input_ids,attention_mask=attention_mask)
    sentence_embeddings = mean_pooling(model_output, attention_mask.detach().cpu())
    # Normalize the embeddings
    sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
    sentence_embeddings =  sentence_embeddings.squeeze(0).detach().cpu().numpy()
    te_text_feats.extend(sentence_embeddings)
te_text_feats = np.array(te_text_feats)
te_text_feats.shape

100%|██████████| 20/20 [00:18<00:00,  1.06it/s]


(2467, 768)

In [88]:
!pip install catboost

clear_output()

In [89]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder,LabelEncoder

from sklearn.metrics import roc_auc_score

from catboost import CatBoostClassifier


In [90]:
from sklearn.metrics import f1_score

In [91]:
preds = []
scores = []
def comp_score(y_true,y_pred):
    scores = []

    scores.append(f1_score(y_true,y_pred,  average="macro"))
    return np.mean(scores)



In [92]:
for i,(train_index, val_index) in enumerate(skf.split(dftr,dftr["labels"])):
    dftr.loc[val_index,'FOLD'] = i

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dftr.loc[val_index,'FOLD'] = i


In [93]:

for fold in tqdm(range(FOLDS),total=FOLDS):
    dftr_ = dftr[dftr["FOLD"]!=fold]
    dfev_ = dftr[dftr["FOLD"]==fold]
    
    tr_text_feats = all_train_text_feats[list(dftr_.index),:]
    ev_text_feats = all_train_text_feats[list(dfev_.index),:]
    
    clf = CatBoostClassifier(verbose=50, n_estimators=1000)
    clf.fit(tr_text_feats, dftr_["labels"].values)
    ev_preds = clf.predict(ev_text_feats)
    score = comp_score(dfev_["labels"].values,ev_preds)
    scores.append(score)
#     break
    print("Fold : {} EV score: {}".format(fold,score))
    preds.append(clf.predict(te_text_feats))
print(np.mean(scores))

  0%|          | 0/5 [00:00<?, ?it/s]

Learning rate set to 0.087979
0:	learn: 1.5371296	total: 200ms	remaining: 3m 19s
50:	learn: 0.6405569	total: 7.99s	remaining: 2m 28s
100:	learn: 0.4432160	total: 16.3s	remaining: 2m 25s
150:	learn: 0.3439092	total: 24.6s	remaining: 2m 18s
200:	learn: 0.2877209	total: 33s	remaining: 2m 11s
250:	learn: 0.2515121	total: 40.2s	remaining: 2m
300:	learn: 0.2273342	total: 47.5s	remaining: 1m 50s
350:	learn: 0.2070940	total: 54.6s	remaining: 1m 40s
400:	learn: 0.1913347	total: 1m 2s	remaining: 1m 33s
450:	learn: 0.1781241	total: 1m 10s	remaining: 1m 25s
500:	learn: 0.1662732	total: 1m 17s	remaining: 1m 17s
550:	learn: 0.1557439	total: 1m 24s	remaining: 1m 9s
600:	learn: 0.1469947	total: 1m 32s	remaining: 1m 1s
650:	learn: 0.1387122	total: 1m 39s	remaining: 53.1s
700:	learn: 0.1311419	total: 1m 46s	remaining: 45.3s
750:	learn: 0.1243203	total: 1m 53s	remaining: 37.6s
800:	learn: 0.1179122	total: 2m	remaining: 29.9s
850:	learn: 0.1118934	total: 2m 7s	remaining: 22.3s
900:	learn: 0.1061195	total:

 20%|██        | 1/5 [02:30<10:00, 150.04s/it]

999:	learn: 0.0964402	total: 2m 29s	remaining: 0us
Fold : 0 EV score: 0.8788310893758038
Learning rate set to 0.087979
0:	learn: 1.5348746	total: 195ms	remaining: 3m 14s
50:	learn: 0.6428295	total: 8.13s	remaining: 2m 31s
100:	learn: 0.4474185	total: 15.7s	remaining: 2m 19s
150:	learn: 0.3446912	total: 23.4s	remaining: 2m 11s
200:	learn: 0.2884217	total: 30.6s	remaining: 2m 1s
250:	learn: 0.2533573	total: 38.1s	remaining: 1m 53s
300:	learn: 0.2289251	total: 45.2s	remaining: 1m 44s
350:	learn: 0.2088733	total: 52.5s	remaining: 1m 37s
400:	learn: 0.1918798	total: 59.6s	remaining: 1m 29s
450:	learn: 0.1787359	total: 1m 6s	remaining: 1m 21s
500:	learn: 0.1672718	total: 1m 13s	remaining: 1m 13s
550:	learn: 0.1570542	total: 1m 20s	remaining: 1m 5s
600:	learn: 0.1474334	total: 1m 28s	remaining: 58.6s
650:	learn: 0.1392319	total: 1m 35s	remaining: 51s
700:	learn: 0.1318737	total: 1m 42s	remaining: 43.7s
750:	learn: 0.1253219	total: 1m 49s	remaining: 36.3s
800:	learn: 0.1185975	total: 1m 56s	re

 40%|████      | 2/5 [04:55<07:22, 147.47s/it]

999:	learn: 0.0977754	total: 2m 25s	remaining: 0us
Fold : 1 EV score: 0.8830180501428326
Learning rate set to 0.087979
0:	learn: 1.5369131	total: 218ms	remaining: 3m 38s
50:	learn: 0.6458848	total: 8.03s	remaining: 2m 29s
100:	learn: 0.4480996	total: 15.8s	remaining: 2m 20s
150:	learn: 0.3449178	total: 23.9s	remaining: 2m 14s
200:	learn: 0.2929895	total: 31.2s	remaining: 2m 4s
250:	learn: 0.2546988	total: 38.6s	remaining: 1m 55s
300:	learn: 0.2292836	total: 45.9s	remaining: 1m 46s
350:	learn: 0.2094275	total: 53.6s	remaining: 1m 39s
400:	learn: 0.1939415	total: 1m	remaining: 1m 30s
450:	learn: 0.1799204	total: 1m 8s	remaining: 1m 23s
500:	learn: 0.1673378	total: 1m 15s	remaining: 1m 15s
550:	learn: 0.1566969	total: 1m 22s	remaining: 1m 7s
600:	learn: 0.1468333	total: 1m 29s	remaining: 59.5s
650:	learn: 0.1393195	total: 1m 36s	remaining: 52s
700:	learn: 0.1317232	total: 1m 44s	remaining: 44.5s
750:	learn: 0.1248665	total: 1m 51s	remaining: 36.9s
800:	learn: 0.1179499	total: 1m 58s	remai

 60%|██████    | 3/5 [07:23<04:55, 147.53s/it]

999:	learn: 0.0965572	total: 2m 26s	remaining: 0us
Fold : 2 EV score: 0.8870832201131631
Learning rate set to 0.087979
0:	learn: 1.5382107	total: 193ms	remaining: 3m 13s
50:	learn: 0.6311672	total: 8.06s	remaining: 2m 30s
100:	learn: 0.4400602	total: 16s	remaining: 2m 22s
150:	learn: 0.3383885	total: 23.4s	remaining: 2m 11s
200:	learn: 0.2812949	total: 31s	remaining: 2m 3s
250:	learn: 0.2464717	total: 38.1s	remaining: 1m 53s
300:	learn: 0.2228739	total: 45.5s	remaining: 1m 45s
350:	learn: 0.2035315	total: 52.6s	remaining: 1m 37s
400:	learn: 0.1882127	total: 59.8s	remaining: 1m 29s
450:	learn: 0.1741956	total: 1m 6s	remaining: 1m 21s
500:	learn: 0.1622968	total: 1m 14s	remaining: 1m 13s
550:	learn: 0.1526710	total: 1m 21s	remaining: 1m 6s
600:	learn: 0.1435363	total: 1m 28s	remaining: 58.6s
650:	learn: 0.1359100	total: 1m 35s	remaining: 51.2s
700:	learn: 0.1281030	total: 1m 42s	remaining: 43.7s
750:	learn: 0.1206336	total: 1m 49s	remaining: 36.4s
800:	learn: 0.1137990	total: 1m 56s	rema

 80%|████████  | 4/5 [09:49<02:26, 146.82s/it]

999:	learn: 0.0930643	total: 2m 25s	remaining: 0us
Fold : 3 EV score: 0.8804176779030687
Learning rate set to 0.087979
0:	learn: 1.5382219	total: 197ms	remaining: 3m 16s
50:	learn: 0.6375495	total: 8.55s	remaining: 2m 39s
100:	learn: 0.4423508	total: 16.9s	remaining: 2m 30s
150:	learn: 0.3422550	total: 25s	remaining: 2m 20s
200:	learn: 0.2886034	total: 32.6s	remaining: 2m 9s
250:	learn: 0.2537803	total: 39.8s	remaining: 1m 58s
300:	learn: 0.2298773	total: 47.1s	remaining: 1m 49s
350:	learn: 0.2102518	total: 54.2s	remaining: 1m 40s
400:	learn: 0.1933707	total: 1m 1s	remaining: 1m 31s
450:	learn: 0.1789913	total: 1m 8s	remaining: 1m 23s
500:	learn: 0.1663340	total: 1m 15s	remaining: 1m 15s
550:	learn: 0.1555634	total: 1m 22s	remaining: 1m 7s
600:	learn: 0.1462420	total: 1m 30s	remaining: 59.8s
650:	learn: 0.1375832	total: 1m 37s	remaining: 52.1s
700:	learn: 0.1302463	total: 1m 44s	remaining: 44.5s
750:	learn: 0.1232009	total: 1m 51s	remaining: 37s
800:	learn: 0.1163200	total: 1m 58s	rema

100%|██████████| 5/5 [12:16<00:00, 147.30s/it]

999:	learn: 0.0949667	total: 2m 26s	remaining: 0us
Fold : 4 EV score: 0.8813340934908936
0.8821368262051523





In [94]:
dfte["preds"] = np.average(np.array(preds),axis=0,weights=[1/s for s in scores])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfte["preds"] = np.average(np.array(preds),axis=0,weights=[1/s for s in scores])


In [95]:
dfte["preds"] = dfte["preds"].apply(lambda x: round(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfte["preds"] = dfte["preds"].apply(lambda x: round(x))


In [96]:
f1_score(dfte["labels"], dfte["preds"], average="macro")

0.8374135740371728