In [1]:
from IPython.display import clear_output

!pip install nb_black

clear_output()

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import numpy as np
import pandas as pd

%load_ext lab_black

In [4]:
df = pd.read_csv(
    "/content/drive/MyDrive/Colab Notebooks/Teknofest/NLP/teknofest_train_final.csv", sep="|"
)

In [5]:
df

Unnamed: 0,id,text,is_offensive,target
0,81c11060-a240-4d54-841b-9e2916039e85,çürük dişli,1,INSULT
1,be80ebbf-b322-4c3b-afa1-94932ea80731,Bu adamın islama ve müslümanlara verdiği zarar...,1,RACIST
2,f99e2513-83ed-4076-ac72-b9e2cff3f049,erkekler zora gelmez,1,SEXIST
3,83ed2b2e-b815-4f36-9fc4-80a9050cf2d0,Utanmazın götüne kazık sokmuşlar bu tıkırtı ne...,1,PROFANITY
4,d93e05f7-bfdd-4cdb-99d8-3048761b30ff,otomasyon< sistemlerine= doğrudan bağlanabilir,0,OTHER
...,...,...,...,...
12612,71eedfa1-8fa6-425c-b982-258c3b29c003,uyuma taklidi yapan tehlikeli bir hayvanın göz...,0,OTHER
12613,b38eed16-6501-4563-8b33-ff2e634bb8e5,yolda at kavga eden üç oğlan çocuğu görür,0,OTHER
12614,c8a051a8-94ef-4b64-a48e-54d0fa4f8323,sizin köpeklerinizin burnu bile daha iyi koku ...,0,OTHER
12615,513a7e6d-4207-4a16-9b47-972f26e23cfe,hayalleri gerçek etmek için birisinin delilik ...,0,OTHER


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12617 entries, 0 to 12616
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            12617 non-null  object
 1   text          12617 non-null  object
 2   is_offensive  12617 non-null  int64 
 3   target        12617 non-null  object
dtypes: int64(1), object(3)
memory usage: 394.4+ KB


In [7]:
len(df.id.unique())

12617

In [8]:
df.is_offensive.value_counts()

1    9018
0    3599
Name: is_offensive, dtype: int64

In [9]:
df.target.value_counts()

OTHER        3616
INSULT       2419
PROFANITY    2398
SEXIST       2112
RACIST       2072
Name: target, dtype: int64

## Preprocessing

#### Punctuation

In [10]:
import string

In [11]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [12]:
def remove_punctuation(text):
    punctuationfree = "".join([i for i in text if i not in string.punctuation])
    return punctuationfree

In [13]:
df.text = df.text.apply(lambda x: remove_punctuation(x))

#### Lower

In [14]:
df.text = df.text.str.lower()

#### Tokenization

In [15]:
import re


def tokenization(text):
    tokens = re.split(r"\s+", text)
    return tokens

In [16]:
df.text = df.text.apply(lambda x: tokenization(x))
df.text

0                                           [çürük, dişli]
1        [bu, adamın, islama, ve, müslümanlara, verdiği...
2                                 [erkekler, zora, gelmez]
3        [utanmazın, götüne, kazık, sokmuşlar, bu, tıkı...
4        [otomasyon, sistemlerine, doğrudan, bağlanabilir]
                               ...                        
12612    [uyuma, taklidi, yapan, tehlikeli, bir, hayvan...
12613    [yolda, at, kavga, eden, üç, oğlan, çocuğu, gö...
12614    [sizin, köpeklerinizin, burnu, bile, daha, iyi...
12615    [hayalleri, gerçek, etmek, için, birisinin, de...
12616    [deliklerden, birini, bulsan, diğerini, bulamı...
Name: text, Length: 12617, dtype: object

#### Stop Words

In [17]:
with open("/content/drive/MyDrive/Colab Notebooks/Teknofest/NLP/tr-stop-words.txt", "r") as f:
    words = f.readlines()
f.close()

In [18]:
stopwords = [word[:-1] for word in words]

In [19]:
def remove_stopwords(text):
    output = [i for i in text if i not in stopwords]
    return output

In [20]:
df.text = df.text.apply(lambda x: remove_stopwords(x))
df.text

0                                           [çürük, dişli]
1        [adamın, islama, müslümanlara, verdiği, zararı...
2                                 [erkekler, zora, gelmez]
3        [utanmazın, götüne, kazık, sokmuşlar, tıkırtı,...
4        [otomasyon, sistemlerine, doğrudan, bağlanabilir]
                               ...                        
12612    [uyuma, taklidi, yapan, tehlikeli, hayvanın, g...
12613             [yolda, at, kavga, oğlan, çocuğu, görür]
12614    [köpeklerinizin, burnu, iyi, koku, alıyor, biz...
12615    [hayalleri, gerçek, etmek, birisinin, delilik,...
12616    [deliklerden, birini, bulsan, diğerini, bulamı...
Name: text, Length: 12617, dtype: object

In [21]:
df

Unnamed: 0,id,text,is_offensive,target
0,81c11060-a240-4d54-841b-9e2916039e85,"[çürük, dişli]",1,INSULT
1,be80ebbf-b322-4c3b-afa1-94932ea80731,"[adamın, islama, müslümanlara, verdiği, zararı...",1,RACIST
2,f99e2513-83ed-4076-ac72-b9e2cff3f049,"[erkekler, zora, gelmez]",1,SEXIST
3,83ed2b2e-b815-4f36-9fc4-80a9050cf2d0,"[utanmazın, götüne, kazık, sokmuşlar, tıkırtı,...",1,PROFANITY
4,d93e05f7-bfdd-4cdb-99d8-3048761b30ff,"[otomasyon, sistemlerine, doğrudan, bağlanabilir]",0,OTHER
...,...,...,...,...
12612,71eedfa1-8fa6-425c-b982-258c3b29c003,"[uyuma, taklidi, yapan, tehlikeli, hayvanın, g...",0,OTHER
12613,b38eed16-6501-4563-8b33-ff2e634bb8e5,"[yolda, at, kavga, oğlan, çocuğu, görür]",0,OTHER
12614,c8a051a8-94ef-4b64-a48e-54d0fa4f8323,"[köpeklerinizin, burnu, iyi, koku, alıyor, biz...",0,OTHER
12615,513a7e6d-4207-4a16-9b47-972f26e23cfe,"[hayalleri, gerçek, etmek, birisinin, delilik,...",0,OTHER


## Feature Extraction

In [22]:
import torch
device = "cuda:0" if torch.cuda.is_available() else "cpu"

In [23]:
device

'cuda:0'

In [24]:
!pip install datasets
!pip install transformers

clear_output()

In [25]:
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer

In [26]:
df = df.drop(["id", "target"], axis=1).rename(columns={"is_offensive": "label"})
df.text = df.text.apply(lambda x: " ".join(x))

In [27]:
df_train = df.loc[:11000]
df_test = df.loc[11000:]

In [28]:
ds = DatasetDict(
    {"train": Dataset.from_pandas(df_train), "test": Dataset.from_pandas(df_test)}
)

In [29]:
tokenizer = AutoTokenizer.from_pretrained("savasy/bert-base-turkish-sentiment-cased")

Downloading (…)okenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/263k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [30]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

In [31]:
tokenized_datasets = ds.map(tokenize_function, batched=True)

Map:   0%|          | 0/11001 [00:00<?, ? examples/s]

Map:   0%|          | 0/1617 [00:00<?, ? examples/s]

## Model

In [32]:
!pip install evaluate

clear_output()

In [33]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
import evaluate

In [34]:
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 11001
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1617
    })
})

In [35]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 11001
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1617
    })
})

In [36]:
model = AutoModelForSequenceClassification.from_pretrained(
    "savasy/bert-base-turkish-sentiment-cased"
).to(device)

Downloading pytorch_model.bin:   0%|          | 0.00/443M [00:00<?, ?B/s]

In [37]:
metric = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [38]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [39]:
training_args = TrainingArguments(
    output_dir="test_trainer", evaluation_strategy="epoch", num_train_epochs=3 
)

In [40]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
)

In [41]:
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,0.2294,0.479262,0.860853
2,0.1048,0.712219,0.852814
3,0.0364,0.856042,0.859617


TrainOutput(global_step=4128, training_loss=0.1421904355980629, metrics={'train_runtime': 3229.4362, 'train_samples_per_second': 10.219, 'train_steps_per_second': 1.278, 'total_flos': 8683454160046080.0, 'train_loss': 0.1421904355980629, 'epoch': 3.0})