# Setting up our environment
First, we have to install the packages we'll use. Before installing them, switch the runtime from the default setting to GPU: go to Runtime > Change runtime type and select GPU as the hardware accelerator.

We'll use Google Drive to store our input and output data. So, before installing the required packages, we have to connect our Colab notebook to our Google Drive account.

In [None]:
# Mount Google Drive under /content/drive so the notebook can read its inputs
# from, and write its outputs to, the Drive folder used in the paths below.
# force_remount=True requests a fresh mount even when this cell is re-run.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Now we are ready to install the packages listed in the requirements.txt file. The path to your folder starts with "/content/drive/My\ Drive/" (the backslash escapes the space in "My Drive" for the shell).

In [None]:
# Install the project's dependencies from the requirements.txt stored on the
# mounted Drive. The backslash escapes the space in "My Drive" for the shell.
!pip install -r /content/drive/My\ Drive/crowintelligence/projektek/manning/sentiment_analysis_project/Colab/requirements.txt

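Next, install NVIDIA Apex, a PyTorch extension for mixed-precision training. We write a small shell script that clones the repository and builds it with its C++ and CUDA extensions, then run that script; the build may take a while.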

In [None]:
%%writefile setup.sh

# Build and install NVIDIA apex with its C++ and CUDA extensions.

# Stop at the first failing command so a failed clone is not followed by a
# confusing pip error about a missing ./apex directory.
set -e

export CUDA_HOME=/usr/local/cuda-10.1

# Clone only when the checkout is missing, so the script can be re-run in the
# same session without `git clone` failing on the existing directory.
if [ ! -d apex ]; then
  git clone https://github.com/NVIDIA/apex
fi

pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./apex

In [None]:
# Run the script written by the previous cell: it clones NVIDIA apex and
# pip-installs it with the C++ and CUDA extensions enabled.
!sh setup.sh

# Using transformers' built-in sentiment analyzer pipeline

In [None]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
from sklearn.metrics import classification_report
from transformers import pipeline

# Ready-made sentiment classifier from transformers (default model).
nlp = pipeline("sentiment-analysis")

# Hand-annotated corpus: a tab-separated file from which the "reviews" and
# "rating_class" columns are used below.
annotation_path = "/content/drive/My Drive/crowintelligence/projektek/manning/sentiment_analysis_project/Colab/data/processed/annotation.tsv"
df = pd.read_csv(annotation_path, sep="\t")

# Texts to classify and their gold labels.
reviews, rating_classes = df["reviews"], df["rating_class"]


def classify_sentiment(text, threshold, classifier=None):
  """Map the two-class sentiment pipeline output onto three classes.

  The built-in pipeline only distinguishes POSITIVE and NEGATIVE. The score
  of the predicted label is used to carve out a third, neutral class: a
  prediction counts only when its score is strictly greater than `threshold`.

  Args:
    text: the review text to classify.
    threshold: score the predicted label has to exceed to be trusted.
    classifier: optional callable used instead of the notebook-level `nlp`
      pipeline. `classifier(text)` must return a sequence whose first item
      is a mapping with the keys "label" and "score".

  Returns:
    0 for negative, 2 for positive and 1 (neutral) in every other case.
  """
  if classifier is None:
    # Fall back to the pipeline created at notebook level.
    classifier = nlp

  prediction = classifier(text)[0]
  label, score = prediction["label"], prediction["score"]

  if score > threshold:
    if label == "NEGATIVE":
      return 0
    if label == "POSITIVE":
      return 2
  # Low-confidence prediction or an unexpected label: treat as neutral.
  return 1


# Class names in label order: 0 = negative, 1 = neutral, 2 = positive.
target_names = ["negative", "neutral", "positive"]

# Confidence cut-offs to try; swap in the longer list to compare several.
thresholds = [0.95]
# thresholds = [0.65, 0.75, 0.85, 0.95, 0.99]

# One classification report per cut-off: classify every review with that
# cut-off, then score the predictions against the gold rating classes.
reports = [
  classification_report(
      rating_classes,
      [classify_sentiment(review, cutoff) for review in reviews],
      target_names=target_names,
  )
  for cutoff in thresholds
]

for report_text in reports:
  print(report_text)


# Train your own classifier

First, let's make train and test corpora.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from simpletransformers.classification import ClassificationModel

# Folder on the mounted Drive that holds the processed corpora.
processed_dir = "/content/drive/My Drive/crowintelligence/projektek/manning/sentiment_analysis_project/Colab/data/processed"

# Read the labelled corpus (tab-separated, with a "rating_class" column) and
# report how many rows it has.
df = pd.read_csv(processed_dir + "/training.tsv", sep="\t")
print(len(df))

# Train-test split on the data frame. Stratifying on the label keeps the class
# proportions equal in both parts; the fixed random_state makes the split
# reproducible.
train_df, test_df = train_test_split(
    df,
    stratify=df["rating_class"],
    random_state=42,
)

# Save both parts as TSV. pandas writes the files directly, so the whole CSV
# text is never built as one string in memory and no manual open() is needed.
train_df.to_csv(processed_dir + "/train.tsv", sep="\t", index=False)
test_df.to_csv(processed_dir + "/test.tsv", sep="\t", index=False)

Now we can train our model, using DistilBERT to vectorize the reviews.

In [None]:
import os
# Three-class classifier built on the pretrained distilbert-base-uncased
# weights. Checkpoints are written to the "outputs" folder on the mounted Drive.
model = ClassificationModel(
    "distilbert",
    "distilbert-base-uncased",
    use_cuda=True,
    num_labels=3,
    args={
        "output_dir": "/content/drive/My Drive/crowintelligence/projektek/manning/sentiment_analysis_project/Colab/outputs/",
        "best_model_dir": "/content/drive/My Drive/crowintelligence/projektek/manning/sentiment_analysis_project/Colab/outputs/best_model/",
        "reprocess_input_data": True,
        "sliding_window": True,
        "overwrite_output_dir": True,
        "max_seq_length": 512,
        "num_train_epochs": 20,
        "train_batch_size": 20,
        "eval_batch_size": 20,
    },
)

# The held-out frame has to be passed by keyword: the second positional
# parameter of train_model() is `multi_label`, so a positional test_df would
# be taken for that flag instead of the evaluation data.
# NOTE(review): eval_df and "best_model_dir" presumably only take effect when
# "evaluate_during_training" is enabled in args, as the second classifier
# further down does - confirm whether that is wanted here too.
model.train_model(train_df, eval_df=test_df)


Let's evaluate our newly trained model.

In [None]:
from sklearn.metrics import classification_report

# Score the trained classifier on the held-out frame.
result, model_outputs, wrong_predictions = model.eval_model(test_df)

target_names = ["negative", "neutral", "positive"]

# For every example take element [0] of its output and use the position of
# the largest score as the predicted class.
# NOTE(review): with "sliding_window" enabled, [0] presumably selects only the
# first window of each example - confirm against the simpletransformers docs.
predicted_class = [
    scores.index(max(scores))
    for scores in (list(output[0]) for output in model_outputs)
]

gold_labels = list(test_df["rating_class"])
print(classification_report(gold_labels, predicted_class, target_names=target_names))


# Fine-tune distilbert

## Preprocessing

In [None]:
import random

import pandas as pd

# from transformers import DistilBertTokenizer

# Data folder on the mounted Drive ("processed" and "raw" live below it).
data_dir = "/content/drive/My Drive/crowintelligence/projektek/manning/sentiment_analysis_project/Colab/data"


def _normalize(texts):
    """Return the texts lower-cased and stripped of surrounding whitespace."""
    return [text.lower().strip() for text in texts]


def _write_corpus(path, corpus):
    """Write the already newline-joined corpus to `path` as UTF-8."""
    with open(path, "w", encoding="utf-8") as outfile:
        outfile.write(corpus)


# Labelled train and test splits saved by the train-test split cell.
df_train = pd.read_csv(data_dir + "/processed/train.tsv", sep="\t")
reviews_train = _normalize(df_train["reviews"])
ratings_train = df_train["rating_class"]

df_test = pd.read_csv(data_dir + "/processed/test.tsv", sep="\t")
reviews_test = _normalize(df_test["reviews"])
ratings_test = df_test["rating_class"]

# Unlabelled reviews, one per line. Blank lines (for example the empty string
# left behind by a trailing newline) are dropped so they cannot be sampled.
with open(data_dir + "/raw/reviews_without_ratings.txt", "r", encoding="utf-8") as f:
    reviews = [line for line in f.read().split("\n") if line.strip()]

# Draw the evaluation sample from a private, seeded generator: the sample is
# the same on every run and the global random state is left untouched. The
# size is capped at the corpus size so a short file does not raise ValueError.
sampler = random.Random(42)
evalset = _normalize(sampler.sample(reviews, min(500, len(reviews))))
# tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased',
#                                           unk_token='<unk>')

# tokenized_train = [" ".join(tokenizer.tokenize(r)) for r in reviews_train]
# tokenized_test = [" ".join(tokenizer.tokenize(r)) for r in reviews_test]

# One review per line, without a trailing newline.
all_train = "\n".join(reviews_train)
all_test = "\n".join(reviews_test)
all_eval = "\n".join(evalset)

_write_corpus(data_dir + "/processed/train.txt", all_train)
_write_corpus(data_dir + "/processed/test.txt", all_test)
_write_corpus(data_dir + "/processed/eval.txt", all_eval)


## Train your own language model based on distilbert

In [None]:
from simpletransformers.language_modeling import LanguageModelingModel


# Training options for the language model; checkpoints go to the "langmods"
# folder on the mounted Drive.
train_args = dict(
    output_dir="/content/drive/My Drive/crowintelligence/projektek/manning/sentiment_analysis_project/Colab/langmods/",
    best_model_dir="/content/drive/My Drive/crowintelligence/projektek/manning/sentiment_analysis_project/Colab/langmods/best_model/",
    reprocess_input_data=True,
    overwrite_output_dir=True,
    num_train_epochs=10,
    evaluate_during_training=True,
)

# Plain-text corpora written by the preprocessing cell (one review per line).
lm_train_file = "/content/drive/My Drive/crowintelligence/projektek/manning/sentiment_analysis_project/Colab/data/processed/train.txt"
lm_dev_file = "/content/drive/My Drive/crowintelligence/projektek/manning/sentiment_analysis_project/Colab/data/processed/test.txt"
lm_eval_file = "/content/drive/My Drive/crowintelligence/projektek/manning/sentiment_analysis_project/Colab/data/processed/eval.txt"

# Note: this rebinds `model`, replacing the classifier created in an earlier cell.
model = LanguageModelingModel(
    'distilbert',
    'distilbert-base-uncased',
    use_cuda=True,
    args=train_args,
)

# Train on the training reviews, evaluating on the test reviews along the way.
model.train_model(lm_train_file, eval_file=lm_dev_file)

# Final evaluation on the sample of unlabelled reviews; as the last expression
# of the cell, its result is displayed.
model.eval_model(lm_eval_file)


# Build a classifier using your own language model

In [None]:
# Best checkpoint of the language model trained in the previous section; the
# classifier starts from these weights instead of the stock DistilBERT ones.
langmod_dir = "/content/drive/My Drive/crowintelligence/projektek/manning/sentiment_analysis_project/Colab/langmods/best_model/"

# Training options for the second classifier; output goes to "outputs2".
classifier_args = {
    "output_dir": "/content/drive/My Drive/crowintelligence/projektek/manning/sentiment_analysis_project/Colab/outputs2/",
    "best_model_dir": "/content/drive/My Drive/crowintelligence/projektek/manning/sentiment_analysis_project/Colab/outputs2/best_model/",
    "evaluate_during_training": True,
    "reprocess_input_data": True,
    "sliding_window": True,
    "overwrite_output_dir": True,
    "max_seq_length": 512,
    "num_train_epochs": 20,
    "train_batch_size": 20,
    "eval_batch_size": 20,
}

model2 = ClassificationModel(
    model_type="distilbert",
    model_name=langmod_dir,
    use_cuda=True,
    num_labels=3,
    args=classifier_args,
)

# Takes a few epochs, i.e. about an hour.
model2.train_model(train_df, eval_df=test_df)


## Evaluate it

In [None]:
from sklearn.metrics import classification_report

# Score the classifier built on our own language model on the held-out frame.
result2, model_outputs2, wrong_predictions2 = model2.eval_model(test_df)

target_names = ["negative", "neutral", "positive"]

# For every example take element [0] of its output and use the position of
# the largest score as the predicted class.
# NOTE(review): with "sliding_window" enabled, [0] presumably selects only the
# first window of each example - confirm against the simpletransformers docs.
predicted_class2 = [
    scores.index(max(scores))
    for scores in (list(output[0]) for output in model_outputs2)
]

gold_labels2 = list(test_df["rating_class"])
print(classification_report(gold_labels2, predicted_class2, target_names=target_names))
