In [None]:
import pandas as pd

In [None]:
# Load the review corpus (first CSV column becomes the index) and cast the
# review text column to str in the same expression.
df = pd.read_csv('data/small_corpus.csv', index_col=0).astype({'reviews': str})

In [None]:
# Preview the first rows of the freshly loaded corpus.
df.head(n=5)

In [None]:
def score_to_Target(value):
    """Map a numeric review rating to a 3-class sentiment label.

    Args:
        value: Rating to bucket; it is compared numerically against 2 and 5.

    Returns:
        2 when ``value >= 5``, 1 when ``2 <= value < 5``, otherwise 0.
        Anything that fails both comparisons (ratings below 2, NaN) gets 0.
    """
    if value >= 5:
        return 2
    # A single lower bound: fractional ratings between 4 and 5 (e.g. 4.5)
    # belong to the middle class instead of falling through to 0.
    if value >= 2:
        return 1
    return 0

In [None]:
# Derive the class label for every row from its rating.
df['labels'] = df['ratings'].apply(score_to_Target)

In [None]:
from sklearn.utils import shuffle

# Fixed seed so the row order -- and the train/test split that is later derived
# from this frame -- is identical on every Restart & Run All.
df = shuffle(df, random_state=42)

In [None]:
# Preview the frame after labelling and shuffling.
df.head(n=5)

In [None]:
import re
import string
def clean_text(text):
    """Normalise raw review text before it is handed to the classifier.

    Steps, in order: lowercase, drop ``[...]`` spans, drop URLs, drop
    ``<...>`` tags, drop ASCII punctuation, turn newlines into spaces and
    drop every token that contains a digit.

    Args:
        text: Value to clean. It is coerced with ``str()`` first, so
            non-string input (e.g. ``None``) is cleaned as its string form.

    Returns:
        The cleaned string. Whitespace left behind by removed spans is kept.
    """
    # Raw-string patterns: sequences such as \[ or \S inside a plain literal are
    # invalid string escapes and produce warnings on recent Python versions.
    text = str(text).lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Replace newlines with a space (not nothing) so the last word of one line
    # is not fused with the first word of the next.
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

In [None]:
# Store the cleaned version of every review in a separate column.
df['text'] = df['reviews'].apply(clean_text)

In [None]:
# Preview the frame now that the cleaned `text` column exists.
df.head(n=5)

In [None]:
!pip install simpletransformers -q

In [None]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import pandas as pd
import logging

In [None]:
# Keep the `transformers` logger quiet (errors only) while everything else
# logs at INFO through the root configuration.
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.ERROR)
logging.basicConfig(level=logging.INFO)

In [None]:
# Single-epoch starting configuration. `model_args` is rebuilt with different
# settings further down, before any model is created from it.
model_args = ClassificationArgs()
model_args.num_train_epochs = 1

In [None]:
from sklearn.model_selection import train_test_split

# Stratify on the label column so both splits keep the same class mix; the
# fixed seed makes the split repeatable for a given row order.
train_df, test_df = train_test_split(df, random_state=42, stratify=df["labels"])

In [64]:
# wandb is otherwise only imported in the sweep section further down, so it is
# imported here to keep this cell runnable on a fresh kernel.
import wandb

wandb.init(project="sentiment")



In [65]:
# Import the submodule explicitly: `sklearn.metrics.accuracy_score` is passed to
# the train/eval calls, and a bare `import sklearn` is not guaranteed to make
# `sklearn.metrics` available on every scikit-learn version.
import sklearn.metrics

# Configuration for the single (non-sweep) training run.
model_args = ClassificationArgs(num_train_epochs=5)
model_args.learning_rate = 1e-5
model_args.max_seq_length = 128
model_args.sliding_window = True
model_args.overwrite_output_dir = True
model_args.no_cache = True

# NOTE(review): evaluation during training is not enabled in this cell and the
# train call passes no eval set, so the early-stopping options below probably
# have no effect as written -- confirm against the simpletransformers docs.
model_args.use_early_stopping = True
model_args.early_stopping_delta = 0.01
model_args.early_stopping_metric = "mcc"
model_args.early_stopping_metric_minimize = False
model_args.early_stopping_patience = 5
model_args.evaluate_during_training_steps = 1000

model_args.wandb_project = 'sentiment'

In [66]:
# Three-class RoBERTa classifier built from the configuration above.
model = ClassificationModel(
    model_type="roberta",
    model_name="roberta-base",
    args=model_args,
    num_labels=3,
)

In [67]:
# Fine-tune on the training split; the call's return value is the cell output.
model.train_model(
    train_df=train_df,
    acc=sklearn.metrics.accuracy_score,
)

INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_model: Sliding window enabled


HBox(children=(FloatProgress(value=0.0, max=3375.0), HTML(value='')))




INFO:simpletransformers.classification.classification_model: 6896 features created from 3375 samples.


HBox(children=(FloatProgress(value=0.0, description='Epoch', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 5', max=862.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 5', max=862.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 5', max=862.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 5', max=862.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 5', max=862.0, style=ProgressStyle(des…





INFO:simpletransformers.classification.classification_model: Training of roberta model complete. Saved to outputs/.


(4310, 0.5264778674545584)

In [68]:
# Score the held-out split; `acc` adds accuracy to the reported metrics.
result, model_outputs, wrong_predictions = model.eval_model(
    eval_df=test_df,
    acc=sklearn.metrics.accuracy_score,
)

INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_model: Sliding window enabled


HBox(children=(FloatProgress(value=0.0, max=1125.0), HTML(value='')))




INFO:simpletransformers.classification.classification_model: 1125 features created from 1125 samples.


HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=267.0, style=ProgressStyle(descr…




INFO:simpletransformers.classification.classification_model:{'mcc': 0.6335503732450465, 'acc': 0.7555555555555555, 'eval_loss': 1.0822037806148441}


# Sweep

In [69]:
# Bayesian sweep over epoch count and learning rate, minimising eval_loss,
# with hyperband early termination.
sweep_config = dict(
    name="vanilla-sweep-batch-16",
    method="bayes",
    metric=dict(name="eval_loss", goal="minimize"),
    parameters=dict(
        num_train_epochs=dict(min=1, max=10),
        learning_rate=dict(min=0, max=4e-4),
    ),
    early_terminate=dict(type="hyperband", min_iter=6),
)

In [None]:
import wandb

# Register the sweep definition under the "sentiment" project; the returned id
# is what the agent is started with.
sweep_id = wandb.sweep(sweep=sweep_config, project="sentiment")

In [None]:
# Base configuration shared by every sweep trial.
model_args = ClassificationArgs(
    # input handling
    reprocess_input_data=True,
    no_cache=True,
    use_multiprocessing=True,
    max_seq_length=128,
    sliding_window=True,
    # training / evaluation loop
    evaluate_during_training=True,
    manual_seed=4,
    train_batch_size=16,
    eval_batch_size=8,
    # outputs and tracking
    overwrite_output_dir=True,
    wandb_project='sentiment',
)

In [None]:
def train():
    """Run one sweep trial.

    Starts a W&B run, builds a 3-label RoBERTa classifier from the module-level
    ``model_args`` combined with the run's ``wandb.config``, trains it on
    ``train_df`` with ``test_df`` as the evaluation set, evaluates on
    ``test_df`` and finally calls ``wandb.join()``.
    """
    wandb.init()

    # Fresh classifier per trial, parameterised by the sweep's chosen values.
    sweep_model = ClassificationModel(
        model_type="roberta",
        model_name="roberta-base",
        num_labels=3,
        use_cuda=True,
        args=model_args,
        sweep_config=wandb.config,
    )

    sweep_model.train_model(train_df, eval_df=test_df)
    sweep_model.eval_model(test_df)

    wandb.join()

In [None]:
# Start an agent that runs `train` for each trial of the registered sweep.
wandb.agent(sweep_id, function=train)