## Android-Ios Question Classifier

---

The Android-iOS Question Classification model takes a question as input.
It is a model that classifies whether the question is about Android or iOS.
<br>

- This model is fine-tuned from the bert-base-cased model
- Training is done on Ainize Workspace
- To try the demo web app that uses this model, go to the Ainize Endpoint

---

**Pretrained model**: [bert-base-cased](https://huggingface.co/bert-base-cased)
<br>
**Dataset**: [Kaggle](https://www.kaggle.com/xhlulu/question-classification-android-or-ios)
<br>
**Ainize Endpoint**: [Endpoint](https://main-android-ios-classification-east-h-shin.endpoint.ainize.ai/)
<br>

In [None]:
#!pip install -U transformers datasets scipy scikit-learn

In [None]:
import torch
from torch.utils.data import Dataset
import numpy as np
import pandas as pd
from transformers import  TrainingArguments, Trainer
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.preprocessing import LabelEncoder
from torch import cuda
from sklearn.metrics import accuracy_score

In [None]:
# Run on the first CUDA GPU when PyTorch can see one; otherwise use the CPU.
if cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'

In [None]:
# Load the training and evaluation splits from CSV files in the working directory.
train_df = pd.read_csv('./train.csv')
test_df = pd.read_csv('./test.csv')

In [None]:
# Keep only the question text and its platform label. The explicit copy makes
# `data` an independent frame, so assigning to data["Label"] afterwards does
# not raise pandas' SettingWithCopyWarning.
data = train_df[["Body", "Label"]].copy()

In [None]:
# Display the selected training columns for a quick visual check.
data

In [None]:
# Same two columns for the evaluation split. The explicit copy makes `data_dev`
# an independent frame, so assigning to data_dev["Label"] afterwards does not
# raise pandas' SettingWithCopyWarning.
data_dev = test_df[["Body", "Label"]].copy()

In [None]:
# Load the tokenizer of the cased BERT checkpoint (lower-casing disabled) and
# write its files to the current directory.
tokenizer = BertTokenizer.from_pretrained("bert-base-cased", do_lower_case=False)
tokenizer.save_pretrained(".")

In [None]:
# Fit the encoder on the training labels only, then reuse that fitted encoding
# for the evaluation split. Re-fitting on the evaluation labels could assign
# different integer ids to the same class (e.g. when a class is missing from
# that split), which would silently corrupt the reported accuracy.
label_encoder = LabelEncoder()
data["Label"] = label_encoder.fit_transform(data["Label"])
data_dev["Label"] = label_encoder.transform(data_dev["Label"])

In [None]:
# Lookup table from integer class id back to the original label value.
mapping = dict(enumerate(label_encoder.classes_))

In [None]:
# Display the id -> label lookup table.
mapping

In [None]:
# Training hyperparameters.
num_labels = 2        # number of output classes for the classification head
max_len = 512         # token limit passed to the tokenizer as max_length
batch_size = 8        # per-device batch size for training and evaluation
num_epochs = 5        # number of training epochs
log_interval = 200    # steps between log entries
learning_rate = 5e-5  # learning rate passed to TrainingArguments

In [None]:
# Pull the question texts and encoded labels out of the frames as plain lists.
X_train, Y_train = list(data["Body"]), list(data["Label"])
X_val, y_val = list(data_dev["Body"]), list(data_dev["Label"])

# Tokenize each split in one call: truncate to at most max_len tokens and pad
# every example to the longest sequence in that split.
X_train_tokenized = tokenizer(
    X_train,
    padding=True,
    truncation=True,
    max_length=max_len,
)
X_val_tokenized = tokenizer(
    X_val,
    padding=True,
    truncation=True,
    max_length=max_len,
)

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenizer output, with optional labels.

    Args:
        encodings: Mapping from feature name (e.g. ``input_ids``) to a
            per-example sequence, such as the object returned by the tokenizer.
        labels: Optional sequence of integer class ids aligned with the
            encodings. Leave as ``None`` for unlabeled (inference-only) data.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        The ``labels`` entry is included only when labels were provided.
        """
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels is not None:
            # Stored as a 1-element tensor per example.
            item["labels"] = torch.tensor([self.labels[idx]])
        return item

    def __len__(self):
        """Return the number of examples."""
        if self.labels is not None:
            return len(self.labels)
        # Without labels, take the size from the first encoded feature
        # (0 when there are no features at all).
        first_feature = next(iter(self.encodings.values()), ())
        return len(first_feature)

In [None]:
# Wrap the tokenized texts and their labels as datasets for the Trainer.
train_dataset = Dataset(X_train_tokenized, Y_train)
val_dataset = Dataset(X_val_tokenized, y_val)

In [None]:
# Load pretrained BERT with a classification head of `num_labels` outputs and
# place it on the detected device, so the notebook also runs on CPU-only machines.
model = BertForSequenceClassification.from_pretrained('bert-base-cased', num_labels=num_labels).to(device)

In [None]:
def compute_metrics(pred):
    """Compute accuracy for one evaluation pass.

    Args:
        pred: Prediction object exposing ``label_ids`` (reference labels) and
            ``predictions`` (per-class scores).

    Returns:
        A dict with a single ``'accuracy'`` entry.
    """
    references = pred.label_ids
    # The predicted class is the index of the largest score on the last axis.
    predicted_classes = pred.predictions.argmax(-1)
    return {'accuracy': accuracy_score(references, predicted_classes)}

In [None]:
# Trainer configuration. Evaluation and checkpointing both run on a step
# schedule. Because load_best_model_at_end=True, Transformers requires
# save_steps to be a round multiple of eval_steps, so both are set explicitly:
# evaluate at every logging interval and checkpoint at every second one.
args = TrainingArguments(
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    learning_rate=learning_rate,
    num_train_epochs=num_epochs,
    logging_steps=log_interval,
    output_dir="output",
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='log',
    load_best_model_at_end=True,
    evaluation_strategy="steps",
    eval_steps=log_interval,
    save_steps=2 * log_interval,
)

In [None]:
# Make sure the model sits on the selected device before it is handed to the Trainer.
model = model.to(device)

In [None]:
# Wire the model, the training arguments, both dataset splits and the accuracy
# metric into a Trainer.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

In [None]:
# Run fine-tuning with the configuration above.
trainer.train()

In [None]:
# Evaluate on the validation dataset; the result includes the accuracy from compute_metrics.
trainer.evaluate()

In [None]:
# Save the model and its tokenizer side by side in one directory.
model_path = "Android-Ios-Classfication-bert-base"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

In [None]:
def get_prediction(text):
    """Classify a question and return its predicted label.

    Args:
        text: Question text to classify.

    Returns:
        The entry of ``mapping`` for the highest-probability class.
    """
    # Put the inputs on the model's own device so this works on CPU and GPU alike.
    inputs = tokenizer(text, padding=True, truncation=True, max_length=max_len, return_tensors="pt").to(model.device)
    model.eval()  # disable dropout so repeated calls give the same answer
    with torch.no_grad():  # inference only; no autograd graph is needed
        outputs = model(**inputs)
    probs = outputs[0].softmax(1)
    return mapping[probs.argmax().item()]

In [None]:
# Sample input for a quick manual check of the classifier.
text = """
    I bought the Goodnote
"""

In [None]:
# Print the predicted label for the sample text.
print(get_prediction(text))