In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nlp-txt-classification/sample_submission.csv
/kaggle/input/nlp-txt-classification/train.csv
/kaggle/input/nlp-txt-classification/test.csv


In [2]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: evaluate
Successfully installed evaluate-0.4.0
[0m

In [3]:
import numpy as np
import pandas as pd
import os
import re
import torch
import evaluate
from sklearn import preprocessing

import transformers
from transformers import AutoTokenizer
from transformers import Trainer
from transformers import TrainingArguments
from transformers import (
    AutoConfig,
    AutoModel,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    PretrainedConfig,
    default_data_collator   
)

import datasets
from datasets import load_dataset
from datasets import load_dataset_builder
from datasets import Dataset

torch.cuda.is_available()
os.environ["WANDB_DISABLED"] = "true"

In [4]:
INPUT_PATH = '/kaggle/input/nlp-txt-classification'
OUTPUT_DICT = '/kaggle/working'

In [5]:
train_df = pd.read_csv(os.path.join(INPUT_PATH, 'train.csv')).dropna()
test_df = pd.read_csv(os.path.join(INPUT_PATH, 'test.csv'))
sample_submission = pd.read_csv(os.path.join(INPUT_PATH, 'sample_submission.csv'))
train_df.shape, test_df.shape, sample_submission.shape

((41155, 3), (3798, 2), (3798, 2))

In [6]:
train_df.head()

Unnamed: 0.1,Unnamed: 0,Text,Sentiment
0,0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,1,advice Talk to your neighbours family to excha...,Positive
2,2,Coronavirus Australia: Woolworths to give elde...,Positive
3,3,My food stock is not the only one which is emp...,Positive
4,4,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [7]:
test_df.head()

Unnamed: 0,id,Text
0,787bc85b-20d4-46d8-84a0-562a2527f684,TRENDING: New Yorkers encounter empty supermar...
1,17e934cd-ba94-4d4f-9ac0-ead202abe241,When I couldn't find hand sanitizer at Fred Me...
2,5914534b-2b0f-4de8-bb8a-e25587697e0d,Find out how you can protect yourself and love...
3,cdf06cfe-29ae-48ee-ac6d-be448103ba45,#Panic buying hits #NewYork City as anxious sh...
4,aff63979-0256-4fb9-a2d9-86a3d3ca5470,#toiletpaper #dunnypaper #coronavirus #coronav...


In [8]:
LABELS = train_df['Sentiment'].unique()
LABELS_COUNT = len(train_df['Sentiment'].unique())

# Data pre-processing

### Clean text from trash

In [9]:
def cleaner(text :str):
    '''will clean from unicode, url, hashtags, numbers, punctuation, emoji,
    nicknames, lower'''
    text = re.sub(r'[.,#!$%\^&\*;:{}=\-_`~()]',r'',text) # punct.
    
    text = re.sub(r'(\\u[0-9A-Fa-f]+)',r'', text)       
    text = re.sub(r'[^\x00-\x7f]',r'',text)# unicode
    
    text = re.sub(r'http\S+', r'', text)# url
    
    text = re.sub('@[A-Za-z0-9_-]+',r'',text)# nickname
    
    text = re.sub(r'#([^\s]+)', r'\1', text)# hashtag
    
    text = re.sub(':\)|;\)|:-\)|\(-:|:-D|=D|:P|xD|X-p|\^\^|:-*|\^\.\^|\^\-\^|\^\_\^|\,-\)|\)-:|:\'\(|:\(|:-\(|:\S|T\.T|\.\_\.|:<|:-\S|:-<|\*\-\*|:O|=O|=\-O|O\.o|XO|O\_O|:-\@|=/|:/|X\-\(|>\.<|>=\(|D:',
                  '', text) # emoji
    
    text = ''.join([i for i in text if not i.isdigit()])# int.
    
    text = text.lower()
    return text

In [10]:
train_df['text'] = train_df['Text'].apply(cleaner)
test_df['text'] = test_df['Text'].apply(cleaner)

In [11]:
train_df.head()

Unnamed: 0.1,Unnamed: 0,Text,Sentiment,text
0,0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral,and and
1,1,advice Talk to your neighbours family to excha...,Positive,advice talk to your neighbours family to excha...
2,2,Coronavirus Australia: Woolworths to give elde...,Positive,coronavirus australia woolworths to give elder...
3,3,My food stock is not the only one which is emp...,Positive,my food stock is not the only one which is emp...
4,4,"Me, ready to go at supermarket during the #COV...",Extremely Negative,me ready to go at supermarket during the covid...


In [12]:
# need to encode labels and change column name
le = preprocessing.LabelEncoder()
le.fit(LABELS)

train_df['Sentiment'] = le.transform(train_df['Sentiment'])
train_df.rename(columns={'Sentiment': 'label'}, inplace=True)

### Load model and tokinizer

In [13]:
# tokenizer
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# model
model = AutoModelForSequenceClassification.from_pretrained(checkpoint,
                                                           num_labels=LABELS_COUNT,
                                                           ignore_mismatched_sizes=True)


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [14]:
#example
tokenizer.tokenize("This here's an example of using the BERT tokenizer")

['this',
 'here',
 "'",
 's',
 'an',
 'example',
 'of',
 'using',
 'the',
 'bert',
 'token',
 '##izer']

In [15]:
# tokenazing function
def tokenize_function(example):
    return tokenizer(example["text"], truncation=True)

In [16]:
# tokenize all dataset and make padding for all rows
#train
train_dt = Dataset.from_pandas(train_df[['text','label']])
tokenized_train = train_dt.map(tokenize_function, batched=True)
tokenized_train = tokenized_train.remove_columns(["__index_level_0__"])

#test 
test_dt = Dataset.from_pandas(test_df)
tokenized_test = test_dt.map(tokenize_function, batched=True)
tokenized_test = tokenized_test.remove_columns(['id', 'Text'])


data_collator = DataCollatorWithPadding(tokenizer=tokenizer) #padding funct.

  0%|          | 0/42 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

In [17]:
# train test split
splitted_data= tokenized_train.train_test_split(test_size=0.2)
splitted_data

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 32924
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 8231
    })
})

# Model

In [18]:
# metrics count function
def compute_metrics(eval_preds):
    metric = evaluate.load('accuracy')
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [19]:
 # model parameters
training_args = TrainingArguments(
    "test-trainer",     
    evaluation_strategy="epoch",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    optim="adamw_torch",
    warmup_steps=600,
    weight_decay=0.01,
    logging_steps=1)

Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [20]:
# train model on our data
trainer = Trainer(
    model,
    training_args,
    train_dataset=splitted_data['train'],
    eval_dataset=splitted_data['test'],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [21]:
transformers.logging.set_verbosity('CRITICAL')
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,0.4088,0.59535,0.787754
2,0.2857,0.439442,0.848986
3,0.1401,0.467537,0.856761




Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]



TrainOutput(global_step=3087, training_loss=0.5614587400243733, metrics={'train_runtime': 1072.7863, 'train_samples_per_second': 92.071, 'train_steps_per_second': 2.878, 'total_flos': 3239249507484792.0, 'train_loss': 0.5614587400243733, 'epoch': 3.0})

# Test predict

In [22]:
def get_prediction(text):
    inputs = tokenizer(text, truncation=True,padding=True, return_tensors="pt").to("cuda")
    outputs = model(**inputs)
    proba = outputs[0].softmax(1)
    return proba.argmax().item()

In [23]:
trainer.evaluate(tokenized_test)

{'eval_runtime': 10.87,
 'eval_samples_per_second': 349.401,
 'eval_steps_per_second': 5.52,
 'epoch': 3.0}

In [24]:
predictions = test_df['text'].apply(lambda text: get_prediction(text))
sample_submission['Sentiment'] = le.inverse_transform(predictions)
sample_submission.head()

Unnamed: 0,id,Sentiment
0,787bc85b-20d4-46d8-84a0-562a2527f684,Negative
1,17e934cd-ba94-4d4f-9ac0-ead202abe241,Positive
2,5914534b-2b0f-4de8-bb8a-e25587697e0d,Extremely Positive
3,cdf06cfe-29ae-48ee-ac6d-be448103ba45,Extremely Negative
4,aff63979-0256-4fb9-a2d9-86a3d3ca5470,Neutral


## Submission

In [25]:
sample_submission.to_csv(os.path.join(OUTPUT_DICT, 'submission.csv'), index=False)