## Importing Libraries


In [1]:
!pip -q install simpletransformers

In [2]:
import pandas as pd
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import sklearn
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
import re
from sklearn.model_selection import train_test_split

In [3]:
!pip install -q openpyxl

## Loading Files From Dataset

In [4]:
# Read the labelled question dataset from its CSV file.
train_csv_path = "../input/sih-sanfoundary-questions/train_questions_pcmb.csv"
df = pd.read_csv(train_csv_path)

In [5]:
# Hold out 10% of the rows for testing; the fixed random_state keeps the split repeatable.
all_texts = df['text'].tolist()
all_labels = df['labels'].tolist()
X_train, X_test, y_train, y_test = train_test_split(
    all_texts,
    all_labels,
    test_size=0.10,
    random_state=42,
)

In [6]:
# Re-assemble each split into a two-column frame ('labels', 'text').
df_train = pd.DataFrame({'labels': y_train, 'text': X_train})
df_test = pd.DataFrame({'labels': y_test, 'text': X_test})

In [7]:
# Class letter -> integer id
# (presumably Physics / Chemistry / Maths / Biology, going by the dataset file name).
c2i = dict(P=0, C=1, M=2, B=3)

# Integer id -> class letter: the exact inverse of c2i.
i2c = {class_id: letter for letter, class_id in c2i.items()}

In [8]:
# Encode the class letters as integer ids using c2i.
# The result is assigned back to the column rather than calling
# `replace(..., inplace=True)` on the `df['labels']` selection: an in-place
# method on a column selection is chained assignment, which pandas warns about
# and which leaves the parent frame unchanged under Copy-on-Write.
df_train['labels'] = df_train['labels'].replace(c2i)
df_test['labels'] = df_test['labels'].replace(c2i)

In [9]:
# Sanity check: (rows, columns) of the train and test frames.
train_shape, test_shape = df_train.shape, df_test.shape
print(train_shape, test_shape)

## Model Training

In [10]:
# Number of distinct label values present in the training split
# (missing values, if any, count as one value, as with len(unique())).
labels_num = df_train['labels'].nunique(dropna=False)
print(labels_num)

In [11]:
# Start from the library's default classification arguments;
# individual options are overridden on this object afterwards.
model_args = ClassificationArgs()

In [12]:
# Training / evaluation options applied on top of the library defaults.
model_args.overwrite_output_dir = True  # allow training to write into an existing output directory
model_args.eval_batch_size = 8
model_args.train_batch_size = 8
model_args.learning_rate = 4e-5
# Fix the RNG seed so repeated runs of the training are reproducible
# (same value as the random_state used for the train/test split).
model_args.manual_seed = 42



In [13]:
# RoBERTa-base sequence classifier with one output per label seen in the training split.
# Model and tokenizer both come from the same 'roberta-base' checkpoint.
model = ClassificationModel(
    model_type='roberta',
    model_name='roberta-base',
    tokenizer_type="roberta",
    tokenizer_name='roberta-base',
    num_labels=labels_num,
    args=model_args,
)

In [14]:
for i in range(0,2):
    !rm -rf /kaggle/working/outputs
    model.train_model(df_train,acc=sklearn.metrics.classification_report)
    result, model_outputs, preds_list = model.eval_model(df_test,acc=sklearn.metrics.classification_report)
    for j in result.values():
        print(j)

In [15]:
# Run inference on the held-out texts; later cells read the predicted labels from preds[0].
test_texts = df_test['text'].tolist()
preds = model.predict(test_texts)

In [21]:
import torch

In [22]:
# Persist the fine-tuned weights.
# torch.save needs both the object and a destination, and the state dict lives on
# the underlying torch network (`model.model`), not on the ClassificationModel wrapper.
torch.save(model.model.state_dict(), "roberta_pcmb_state_dict.pt")

## Result Evaluation

In [16]:
from sklearn.metrics import accuracy_score

In [17]:
# Overall accuracy of the predicted label ids against the encoded test labels.
accuracy_score(y_true=df_test['labels'], y_pred=preds[0])

In [18]:
# Inverse of i2c (class letter -> integer id).
reverse_dict = dict(zip(i2c.values(), i2c.keys()))

# Decode predicted ids back to class letters.
predictions = pd.Series(preds[0]).map(lambda class_id: i2c[class_id])

# Results table: test rows with gold labels and predictions, both as class letters.
df_result = df_test.copy()
df_result['predictions'] = predictions
df_result['labels'] = df_result['labels'].map(lambda class_id: i2c[class_id])
df_result

In [19]:
# Write the results table (index included) to a CSV file in the working directory.
predictions_csv = "predictions_indic_bbc_art.csv"
df_result.to_csv(predictions_csv)

In [20]:
# Disabled: pickling the whole model wrapper to disk.
# Kept as real comments; the previous triple-quoted string had a stray fourth
# quote after the closing ''' and made this cell a SyntaxError.
# import pickle
# filename = 'roberta10_model.sav'
# pickle.dump(model, open(filename, 'wb'))