![AIcrowd-Logo](https://raw.githubusercontent.com/AIcrowd/AIcrowd/master/app/assets/images/misc/aicrowd-horizontal.png)

This dataset and notebook correspond to the SCRBL Challenge on AIcrowd.

Author: Team BlitzCA

# Downloads and Installations

In [3]:
# Install the modelling libraries at the exact versions pinned for this notebook.
!pip install transformers==2.11.0
!pip install simpletransformers==0.34.4
# memory footprint support libraries/code
# Symlink nvidia-smi into /usr/bin (-f replaces an existing link).
# NOTE(review): presumably so the GPU monitoring library can locate it — confirm.
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
# NOTE(review): the three packages below are not version-pinned; consider pinning them for reproducibility.
!pip install gputil
!pip install psutil
!pip install humanize



# Download Dataset

In [2]:
# Fetch the SCRBL v0.1 test / train / val archives; -q suppresses wget's progress output.
!wget -q https://s3.eu-central-1.wasabisys.com/aicrowd-practice-challenges/public/scrbl/v0.1/test.zip
!wget -q https://s3.eu-central-1.wasabisys.com/aicrowd-practice-challenges/public/scrbl/v0.1/train.zip
!wget -q https://s3.eu-central-1.wasabisys.com/aicrowd-practice-challenges/public/scrbl/v0.1/val.zip

In [4]:
# Extract the three downloaded archives into the current working directory (-q = quiet).
!unzip -q train.zip
!unzip -q test.zip
!unzip -q val.zip

# Imports

In [5]:
import psutil
import humanize
import os
import GPUtil as GPU

# Module-level snapshot, kept so any code reading `GPUs` / `gpu` still works.
# `gpu` is None (instead of raising IndexError) when no GPU is visible.
GPUs = GPU.getGPUs()
gpu = GPUs[0] if GPUs else None


def printm():
    """Print host RAM, this process's resident size, and first-GPU memory stats.

    The GPU list is re-queried on every call so the reported figures reflect
    the state at call time rather than the state when this cell first ran.
    When no GPU is reported, a short placeholder line is printed instead.
    """
    process = psutil.Process(os.getpid())
    print("Gen RAM Free: " + humanize.naturalsize(psutil.virtual_memory().available), " |     Proc size: " + humanize.naturalsize(process.memory_info().rss))

    current_gpus = GPU.getGPUs()
    if not current_gpus:
        print("GPU RAM Free: n/a (no GPU detected)")
        return

    first_gpu = current_gpus[0]
    print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total     {3:.0f}MB".format(first_gpu.memoryFree, first_gpu.memoryUsed, first_gpu.memoryUtil*100, first_gpu.memoryTotal))


printm()

Gen RAM Free: 12.7 GB  |     Proc size: 161.2 MB
GPU RAM Free: 15079MB | Used: 0MB | Util   0% | Total     15079MB


In [6]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import warnings
warnings.simplefilter('ignore')
import gc
from scipy.special import softmax
from simpletransformers.classification import ClassificationModel
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
import sklearn
from sklearn.metrics import log_loss
from sklearn.metrics import *
from sklearn.model_selection import *
import re
import random
import torch
pd.options.display.max_colwidth = 200

def set_seeds(seed=15):
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Args:
        seed: Integer seed applied to every generator. Defaults to 15, the
            value this notebook previously hard-coded.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Explicitly seed every CUDA device as well; this call is silently
    # ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    # Trade cuDNN autotuning for run-to-run repeatability.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Feature Engineering

In [13]:
# Read the three dataset splits from CSV files in the working directory.
train = pd.read_csv("train.csv" )
test = pd.read_csv("test.csv")
val = pd.read_csv("val.csv")

# (train and val are merged into full_train in the following cell.)

In [15]:
# Stack train and val row-wise and renumber the rows 0..n-1 in one step.
full_train = pd.concat([train, val], ignore_index=True)

In [16]:
# Flag rows whose text already appeared earlier, then report whether any exist.
text_is_repeated = full_train['text'].duplicated()
text_is_repeated.any()

True

In [17]:
# Keep only the first row for each distinct `text`. This modifies full_train
# in place; the index is not reset afterwards, so surviving rows keep their labels.
full_train.drop_duplicates(subset=['text'], keep='first', inplace=True)

In [18]:
# train1 / test1 are additional names for the same DataFrame objects (no copy
# is made), so the column assignment below also adds `label` to `test`.
train1 = full_train
test1 = test
# Constant placeholder label for every test row.
# NOTE(review): presumably only there to give test the same columns as train — confirm.
test1['label'] = 0

In [19]:
def labelconv(x):
  """Map a label to its integer class id.

  'scrambled' -> 1 and 'unscrambled' -> 0. A value that is already 0 or 1 is
  returned unchanged (as an int), so applying the conversion a second time
  does not erase the labels. Any other value returns None.
  """
  label_ids = {'scrambled': 1, 'unscrambled': 0}
  if isinstance(x, str):
    return label_ids.get(x)
  # Already-encoded labels pass straight through; NaN / None fail this check.
  if x in (0, 1):
    return int(x)
  return None

# Replace each value in train1's label column with the result of labelconv.
train1['label'] = train1['label'].apply(labelconv)

# Model and Prediction

In [20]:
def get_model(model_type, model_name, n_epochs = 1, train_batch_size = 112, eval_batch_size = 144, seq_len = 134, lr = 4e-5, manual_seed = 2):
  """Build a two-label ClassificationModel with this notebook's training settings.

  Args:
    model_type: Model family passed to ClassificationModel (e.g. 'roberta').
    model_name: Model name passed to ClassificationModel (e.g. 'roberta-base').
    n_epochs: Value for 'num_train_epochs'.
    train_batch_size: Value for 'train_batch_size'.
    eval_batch_size: Value for 'eval_batch_size'.
    seq_len: Value for 'max_seq_length'.
    lr: Value for 'learning_rate'.
    manual_seed: Value for 'manual_seed'. Defaults to 2, the value that was
      previously hard-coded.

  Returns:
    The constructed ClassificationModel.
  """
  model_args = {
      'train_batch_size': train_batch_size,
      'eval_batch_size': eval_batch_size,
      'reprocess_input_data': True,
      'overwrite_output_dir': True,
      'fp16': False,
      'do_lower_case': False,
      'num_train_epochs': n_epochs,
      'max_seq_length': seq_len,
      'regression': False,
      'manual_seed': manual_seed,
      'learning_rate': lr,
      'save_eval_checkpoints': False,
      'save_model_every_epoch': False,
  }
  return ClassificationModel(model_type, model_name, num_labels=2, args=model_args)

In [21]:
# Independent two-column frame (text, label) taken from train1.
tmp = train1[['text', 'label']].copy()

In [22]:
# NOTE(review): train_test_split is already imported by name in the main imports
# cell, so this wildcard import is redundant there.
from sklearn.model_selection import *
# Hold out 20% of the rows for validation; random_state fixes the shuffle.
tmp_trn, tmp_val = train_test_split(tmp, test_size=0.2, random_state=2)

In [None]:
# Build a roberta-base classifier and train it for one epoch on the training split.
model = get_model('roberta', 'roberta-base', n_epochs=1)
model.train_model(tmp_trn)
# Keep the second element of eval_model's return value.
# NOTE(review): presumably the raw model outputs for tmp_val — confirm against the
# simpletransformers API. preds_val is not used again in the cells shown here.
preds_val = model.eval_model(tmp_val)[1]

In [None]:
# Run the trained model on the test texts; element 0 of the result is what the
# submission cell turns into label names.
predic = model.predict(test1['text'])

In [None]:
# Class id -> label name expected in the submission file.
label_names = {0:'unscrambled', 1:'scrambled'}

# One-column frame (column 0) of predicted class ids, translated to label names.
submission = pd.DataFrame(predic[0])
submission[0] = submission[0].replace(label_names)

In [None]:
# Write the predictions to best.csv as a single column headed "label", without the index.
submission.to_csv('best.csv',header=['label'],index=False)