In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv


In [1]:
!pip install transformers
!pip install torch



In [3]:
# Read the competition's train and test CSV files into DataFrames,
# then preview the first rows of the training frame.
train_path = '/kaggle/input/nlp-getting-started/train.csv'
test_path = '/kaggle/input/nlp-getting-started/test.csv'
train_data, test_data = pd.read_csv(train_path), pd.read_csv(test_path)
train_data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
from sklearn.model_selection import train_test_split

# Extract the tweet texts and their target labels as plain Python lists.
texts_column, labels_column = train_data['text'], train_data['target']
train_texts = texts_column.to_list()
train_labels = labels_column.to_list()

# Hold out 20% of the examples for validation; the fixed random_state makes
# the split reproducible across runs.
split_parts = train_test_split(
    train_texts,
    train_labels,
    test_size=0.2,
    random_state=42,
)
X_train, X_Val, y_train, y_val = split_parts

In [6]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments

# Load the tokenizer and a two-label sequence classifier from the same
# pretrained checkpoint so vocabulary and weights match.
checkpoint = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(checkpoint)
model = DistilBertForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Tokenize Data

In [9]:
# Tokenize both splits with identical settings: truncate to at most 512 tokens
# and pad the sequences within each call.
tokenize_kwargs = dict(truncation=True, padding=True, max_length=512)
train_encodings = tokenizer(X_train, **tokenize_kwargs)
val_encodings = tokenizer(X_Val, **tokenize_kwargs)

In [10]:
# Create Dataset Class
import torch

class NewsDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with optional labels.

    Args:
        encodings: Mapping from feature name (e.g. ``input_ids``) to a sequence
            holding one entry per example. Must support ``.items()`` and
            ``.values()``.
        labels: Sequence with one label per example, or ``None`` for
            unlabelled data (such as a test set). When ``None``, returned
            items carry no ``labels`` key, so no dummy labels are needed.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors (plus ``labels`` if available)."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of examples.

        Uses the label count when labels were given (the original behaviour);
        otherwise falls back to the length of the first encoded feature.
        """
        if self.labels is not None:
            return len(self.labels)
        first_feature = next(iter(self.encodings.values()), ())
        return len(first_feature)
    
# Wrap each split's encodings and labels in a dataset object.
train_dataset = NewsDataset(encodings=train_encodings, labels=y_train)
val_dataset = NewsDataset(encodings=val_encodings, labels=y_val)

In [12]:
# Train Model

# Optimisation hyperparameters for the fine-tuning run.
hyperparameters = {
    'num_train_epochs': 3,
    'per_device_train_batch_size': 16,
    'per_device_eval_batch_size': 64,
    'warmup_steps': 500,
    'weight_decay': 0.01,
}

# Output and logging locations are kept apart from the tunable hyperparameters.
training_args = TrainingArguments(
    output_dir='./results',
    logging_dir='./logs',
    report_to='none',
    **hyperparameters,
)

# The Trainer is given the training split plus the held-out split for evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Last expression of the cell, so the returned training summary is displayed.
trainer.train()

Step,Training Loss
500,0.4628
1000,0.3033


TrainOutput(global_step=1143, training_loss=0.36367513835169407, metrics={'train_runtime': 3907.8598, 'train_samples_per_second': 4.675, 'train_steps_per_second': 0.292, 'total_flos': 397060678455840.0, 'train_loss': 0.36367513835169407, 'epoch': 3.0})

In [13]:
# Evaluate Model
# Run the trainer's evaluation and display the metrics it returns.
eval_metrics = trainer.evaluate()
eval_metrics

{'eval_loss': 0.5147328972816467,
 'eval_runtime': 77.8424,
 'eval_samples_per_second': 19.565,
 'eval_steps_per_second': 0.308,
 'epoch': 3.0}

In [16]:
from sklearn.metrics import f1_score

# Predict on the validation set and take the highest-scoring class per example.
val_predictions = trainer.predict(val_dataset)
val_scores = val_predictions.predictions
val_pred_labels = val_scores.argmax(axis=-1)

# Compare the predicted labels against the held-out targets with the F1 score.
f1 = f1_score(y_val, val_pred_labels)
print(f'F1 Score : {f1}')

F1 Score : 0.80030959752322


In [21]:
# Tokenize the test tweets with the same settings used for the training data.
test_texts = test_data['text'].tolist()
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=512)

# The dataset class requires labels, so supply one dummy 0 per test example.
placeholder_labels = [0] * len(test_encodings['input_ids'])  # Dummy labels
test_dataset = NewsDataset(test_encodings, placeholder_labels)

# Predict and keep the highest-scoring class for each test example.
predictions = trainer.predict(test_dataset)
test_scores = predictions.predictions
predicted_labels = test_scores.argmax(axis=-1)

In [22]:
# Pair each test id with its predicted label in a two-column frame (id, target).
submission_df = test_data[['id']].assign(target=predicted_labels)

# Write the submission file without the DataFrame index.
submission_df.to_csv('submission.csv', index=False)

In [None]:
import os
import shutil

working_dir='/kaggle/working'


def clear_working_dir(directory, keep=('submission.csv',)):
    """Delete every top-level entry of ``directory`` except the names in ``keep``.

    Args:
        directory: Path of the directory to clean.
        keep: Entry names (not paths) that must be left untouched.

    Returns:
        Sorted list of the entry names that were removed. Empty when
        ``directory`` does not exist, so a missing directory is a no-op.
    """
    if not os.path.isdir(directory):
        return []

    removed = []
    for entry in sorted(os.listdir(directory)):
        if entry in keep:
            continue
        entry_path = os.path.join(directory, entry)
        # Check for symlinks first: shutil.rmtree refuses to operate on a
        # symlink to a directory, and a broken symlink is neither a file nor a
        # directory. Unlinking removes the link without touching its target.
        if os.path.islink(entry_path) or os.path.isfile(entry_path):
            os.remove(entry_path)
        elif os.path.isdir(entry_path):
            shutil.rmtree(entry_path)
        else:
            # Anything else (e.g. a socket or FIFO) is left alone.
            continue
        removed.append(entry)
    return removed


# Remove everything from the working directory except the submission file.
clear_working_dir(working_dir)

print("Cleared all except submission file")