<a href="https://colab.research.google.com/github/DianaMoyano1/NLP-Sentiment_Extraction_Challenge/blob/master/Tutorial/Tutorial_SingleM_Template.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# SETUP


1. Click on Runtime --> Change runtime type
2. Select **GPU** under *Hardware accelerator*



Before running this notebook, make sure you have added the *'tweet-sentiment-extraction-tutorial'* shortcut to your Gdrive, inside the *'Colab Notebooks'* directory

![](https://serving.photos.photobox.com/817973499a3406a94b0556385350836dda810c16c1a20967bd25b50dd6e367b3dcb1e30b.jpg)

### Mount Your Own Gdrive

The command below will ask you to authorize access to your Google account and will give you a temporary access code to paste into the input field.

In [None]:
# Mount your local Google drive and show the models you have
from google.colab import drive
drive.mount('/gdrive')
%ls '/gdrive/My Drive/Colab Notebooks/tweet-sentiment-extraction-tutorial/models' 

In [None]:
# Install the following packages. --quiet is a pip flag (not a command) that reduces the output lines.
# transformers and tensorflow are pinned to exact versions; tensorboardX and simpletransformers are not.
!pip install transformers==2.11.0 --quiet
!pip install tensorflow==2.2.0 --quiet
!pip install tensorboardX --quiet
!pip install simpletransformers --quiet


# Extra packages (unpinned).
# NOTE(review): none of these are imported in the cells visible here - confirm they are still needed.
!pip install scattertext --quiet 
!pip install plotly --quiet
!pip install dash --quiet
!pip install wordcloud --quiet

### Setup NVIDIA APEX

APEX is a tool that enables mixed-precision training in PyTorch (the framework underlying SimpleTransformers). More info here: https://github.com/NVIDIA/apex

In [None]:
%%writefile setup.sh
git clone https://github.com/NVIDIA/apex
pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./apex

In [None]:
# Run and time the APEX install script written by the %%writefile cell (author's note: this will take 7-10 mins to run)
import timeit
start = timeit.default_timer()

# NOTE(review): "--quiet" reaches setup.sh as a positional argument; the script's two commands
# (git clone, pip install -v ...) never read their arguments, so it does not silence anything.
!sh setup.sh --quiet

stop = timeit.default_timer()
print('Time: ', stop - start)  

### Import Packages

In [None]:
#Import packages
from os.path import join
import numpy as np 
import pandas as pd 
from apex import amp
import json


use_cuda = True ##If True, GPU will be used

### Load the Data

In [None]:
train_df = pd.read_csv('/gdrive/My Drive/Colab Notebooks/tweet-sentiment-extraction-tutorial/train.csv')
test_df = pd.read_csv('/gdrive/My Drive/Colab Notebooks/tweet-sentiment-extraction-tutorial/test.csv')

In [None]:
train_df.head()

In [None]:
test_df.head()

# DATA PREP

Split into train and validation sets

In [None]:
from sklearn.model_selection import train_test_split

train_df, val_df = train_test_split(train_df, test_size=0.2, random_state = 42)

In [None]:
#drop selected_text column from the validation dataset (it will be added back once we are comparing it to our predictions)
val_df_new = val_df.drop('selected_text', axis=1)

In [None]:
print(train_df.shape)
print(val_df_new.shape)
print(test_df.shape)

In [None]:
train = np.array(train_df)
val = np.array(val_df_new)
test = np.array(test_df)

### Initiate the SimpleTransformers Task



The SimpleTransformers library supports numerous tasks:  


- Sequence Classification
- Token Classification (NER)
- Question Answering
- Language Model Fine-Tuning
- Language Model Training
- Language Generation
- T5 Model
- Seq2Seq Tasks
- Multi-Modal Classification
- Conversational AI

In this case, we are performing a <ins>Question Answering</ins> task.

Supported model types:

- ALBERT
- BERT
- DistilBERT
- ELECTRA
- XLM
- XLNet

In [None]:
# Import the Question Answering model
from simpletransformers.question_answering import QuestionAnsweringModel

### Format the data under the SimpleTransformer's *Question&Answer* schema 



To feed the dataset to the model, each column is mapped to one of the Question Answering inputs:
- Context: The entire tweet
- Question: The sentiment (positive, negative or neutral). In other words, we are asking *"What part of the entire tweet best represents this sentiment?"*
- Answer: The label, i.e. the extracted text (`selected_text`)

The formatted data is assigned to the variables *qa_train*, *qa_val* and *qa_test* respectively



In [None]:
#@title Create list for training

## Adapted from https://www.kaggle.com/cheongwoongkang/roberta-baseline-starter-simple-postprocessing
def find_all(input_str, search_str):
    """Return the start offsets of every occurrence of search_str in input_str.

    Occurrences may overlap, because the scan resumes one character after
    each hit. An empty list is returned when nothing matches.
    """
    positions = []
    total = len(input_str)
    cursor = 0
    while cursor < total:
        hit = input_str.find(search_str, cursor)
        if hit < 0:
            # No further occurrence to the right of the cursor.
            break
        positions.append(hit)
        # Step a single character forward so overlapping matches are kept.
        cursor = hit + 1
    return positions

def do_qa_train(train):
    """Convert labelled rows into the SimpleTransformers question-answering format.

    Each row is read positionally: ``line[0]`` is the question id, ``line[1]``
    the context, ``line[2]`` the answer and ``line[-1]`` the question.

    Rows are skipped (and reported with ``print``) when:
    - context, answer or question is not a ``str``;
    - the answer does not occur in the context, since such a row would otherwise
      produce a question flagged ``is_impossible: False`` with no answers.

    Returns a list of ``{'context': <lower-cased context>, 'qas': [...]}`` dicts,
    one per kept row, each holding a single question with a single answer.
    """
    output = []
    for line in train:
        qid = line[0]
        context = line[1]
        answer = line[2]
        question = line[-1]

        if not (isinstance(context, str) and isinstance(answer, str) and isinstance(question, str)):
            # Malformed row (for example a missing value): report it and move on.
            print(context, type(context))
            print(answer, type(answer))
            print(question, type(question))
            continue

        # Only the first occurrence is used as the label, so one str.find is enough.
        # An empty context is treated as "no match", as it was before.
        answer_start = context.find(answer) if context else -1
        if answer_start == -1:
            print('answer not found in context, skipping:', qid)
            continue

        # answer_start is measured on the original-case context; it stays valid for the
        # lower-cased text as long as lower() does not change the string length.
        answers = [{'answer_start': answer_start, 'text': answer.lower()}]
        qas = [{'question': question, 'id': qid, 'is_impossible': False, 'answers': answers}]
        output.append({'context': context.lower(), 'qas': qas})

    return output

qa_train = do_qa_train(train)


In [None]:
qa_train[1:3]


In [None]:
#@title Create val list
## Adapted from https://www.kaggle.com/cheongwoongkang/roberta-baseline-starter-simple-postprocessing
def do_qa_val(val):
    """Convert unlabelled validation rows into the SimpleTransformers question-answering format.

    Each row is read positionally: ``line[0]`` is the question id, ``line[1]``
    the context and ``line[-1]`` the question. Rows whose context or question
    is not a ``str`` are printed and skipped.

    Returns a list of ``{'context': <lower-cased context>, 'qas': [...]}`` dicts.
    Every question carries one placeholder answer because no label is supplied.
    """
    output = []
    for line in val:
        qid = line[0]
        context = line[1]
        question = line[-1]
        if not (isinstance(context, str) and isinstance(question, str)):
            # Report the malformed row and skip it. These rows have no answer
            # value, so only context and question are printed (printing the
            # undefined name `answer` used to raise NameError here).
            print(context, type(context))
            print(question, type(question))
            continue
        # Placeholder answer: same dummy offset/text for every row.
        answers = [{'answer_start': 1000000, 'text': '__None__'}]
        qas = [{'question': question, 'id': qid, 'is_impossible': False, 'answers': answers}]
        output.append({'context': context.lower(), 'qas': qas})
    return output

qa_val = do_qa_val(val)

In [None]:
#@title Create test list
## Adapted from https://www.kaggle.com/cheongwoongkang/roberta-baseline-starter-simple-postprocessing
def do_qa_test(test):
    """Convert unlabelled test rows into the SimpleTransformers question-answering format.

    Each row is read positionally: ``line[0]`` is the question id, ``line[1]``
    the context and ``line[-1]`` the question. Rows whose context or question
    is not a ``str`` are printed and skipped.

    Returns a list of ``{'context': <lower-cased context>, 'qas': [...]}`` dicts.
    Every question carries one placeholder answer because no label is supplied.
    """
    output = []
    for line in test:
        qid = line[0]
        context = line[1]
        question = line[-1]
        if not (isinstance(context, str) and isinstance(question, str)):
            # Report the malformed row and skip it. These rows have no answer
            # value, so only context and question are printed (printing the
            # undefined name `answer` used to raise NameError here).
            print(context, type(context))
            print(question, type(question))
            continue
        # Placeholder answer: same dummy offset/text for every row.
        answers = [{'answer_start': 1000000, 'text': '__None__'}]
        qas = [{'question': question, 'id': qid, 'is_impossible': False, 'answers': answers}]
        output.append({'context': context.lower(), 'qas': qas})
    return output

qa_test = do_qa_test(test)

### Create a Logging Module --> More info [here](https://realpython.com/python-logging/#:~:text=The%20Logging%20Module,-The%20logging%20module&text=It%20is%20used%20by%20most,homogeneous%20log%20for%20your%20application.&text=With%20the%20logging%20module%20imported,that%20you%20want%20to%20see.)


Logs provide developers with an extra set of eyes that are constantly looking at the flow that an application is going through. They can store information, like which user or IP accessed the application.  

With the logging module imported, you can use something called a “logger” to log messages that you want to see. By default, there are 5 standard levels indicating the severity of events.
- DEBUG
- INFO
- WARNING
- ERROR
- CRITICAL

In this case, the root logger is set to INFO, while the `transformers` logger is limited to WARNING and above.

In [27]:
import logging

logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

### Load, Train and Evaluate a SimpleTransformers' Pre-Trained Model **<ins>OR</ins>** Load and Evaluate a Richardson's Pre-Trained Model  

Follow the section that best applies to your case.  

# OPTION 1: Load, train and evaluate a SimpleTransformers' Pre-trained Model

Create a folder that will contain the new model's PyTorch and hyperparameter files. Follow the instructions below to assign a name to the *'NAME_OF_MODEL'* folder:


>>**Basic Structure:**

>>Name_Model_Version  

>>>Where:
- Name: Your name
- Model: Based on the model names used in the official Transformers site: https://huggingface.co/transformers/pretrained_models.html
- Version: For notebooks with same name and model but different hyperparameters, include the version (A, B, C...)
  
  >>>Examples:
  - Lucas_distilroberta-base_A
  - Lucas_distilroberta-base_B
  - Landis_bert_A  


Supported model types for Question&Answering:

- ALBERT
- BERT
- DistilBERT
- ELECTRA
- XLM
- XLNet

In [28]:
# Change this BEFORE RUNNING *********************************************************************************************
YOUR_NAME = 'test'
YOUR_LETTER = 'B'     # identify your model A,B,C,D,E...
MODEL_ARCHITECTURE = 'distilbert'
MODEL_NAME = 'distilbert-base-uncased-distilled-squad'
# ************************************************************************************************************************

#Don't change below lines:
FULL_NAME = YOUR_NAME + '_' + MODEL_NAME + '_' + YOUR_LETTER 


ROOT = '/gdrive/My Drive/Colab Notebooks/my-folder/models' 
FULL_PATH = join(ROOT, FULL_NAME)

The commands below create the folder where all the model's files will be stored

In [29]:
#Change directory to ROOT (the models folder set in the configuration cell)
%cd '{ROOT}'
#Create the folder where the model components will be saved. If you already have a folder with the same name, mkdir reports an error
%mkdir '{FULL_NAME}' 
#Change the workspace to the newly created folder (fails with "No such file or directory" when the folder was not created)
%cd '{FULL_PATH}' 

/gdrive/My Drive/Colab Notebooks/my-folder/models
[Errno 2] No such file or directory: '/gdrive/My Drive/Colab Notebooks/my-folder/models/test_distilbert-base-uncased-distilled-squad_B'
/gdrive/My Drive/Colab Notebooks/my-folder/models


In [30]:
#For more arguments, refer to this link --> https://simpletransformers.ai/docs/usage/#configuring-a-simple-transformers-model

# Training configuration handed to the model constructor below.
# NOTE(review): 'fp16' is False, so mixed precision looks disabled even though APEX is installed earlier - confirm this is intended.
args_train={'reprocess_input_data': True,
'overwrite_output_dir': True,
'learning_rate': 5e-5,
'num_train_epochs': 1,
'max_seq_length': 192,
'doc_stride': 64,
'fp16': False,
}

#Build the model object from the chosen architecture and pre-trained weights (no training happens here; that is done by train_model)
model = QuestionAnsweringModel(MODEL_ARCHITECTURE, MODEL_NAME, args=args_train, use_cuda=use_cuda)

In [None]:
#Train the model
import timeit
start = timeit.default_timer()

model.train_model(qa_train)

stop = timeit.default_timer()
print('Time: ', stop - start)  

INFO:simpletransformers.question_answering.question_answering_model: Converting to features started.
convert squad examples to features: 100%|██████████| 21983/21983 [00:19<00:00, 1126.36it/s]
add example index and unique id: 100%|██████████| 21983/21983 [00:00<00:00, 801524.62it/s]


HBox(children=(FloatProgress(value=0.0, description='Epoch', max=1.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=2748.0, style=ProgressStyle(descr…

Running loss: 2.037285



Running loss: 0.314654

In [None]:
#Predict the evaluation and test sets
predictions_val = model.predict(qa_val)
predictions_test = model.predict(qa_test)


Let's check the structure of the predictions

In [None]:
#It displays truncated long texts
pd.set_option('display.max_colwidth',100)

#Each ID contains multiple predicted extractions and their corresponding probabilities (prediction with highest probability is first)
pd.DataFrame.from_dict(predictions_val)[1]

The commands below select the extracted text with the highest likelihood (first item), as well as its corresponding probability

In [None]:
#@title Obtain output with the highest prob - Validation set

#Validation Set highest probability output
predictions_df_val = pd.DataFrame.from_dict(predictions_val)
text_val = pd.DataFrame(predictions_val[0])
prob_val = pd.DataFrame(predictions_val[1])
prop1_val = prob_val['probability'].tolist()
prop2_val = pd.DataFrame(prop1_val)
text1_val = text_val['answer'].tolist()
text2_val = pd.DataFrame(text1_val)

In [None]:
#@title Obtain output with the highest prob - Test set
predictions_df_test = pd.DataFrame.from_dict(predictions_test)
text_test = pd.DataFrame(predictions_test[0])
prob_test = pd.DataFrame(predictions_test[1])
prop1_test = prob_test['probability'].tolist()
prop2_test = pd.DataFrame(prop1_test)
text1_test = text_test['answer'].tolist()
text2_test = pd.DataFrame(text1_test)

In [None]:
# Make a copy of the validation and test sets so that we are not modifying the original sets
sub_val_df = val_df.copy()
sub_test_df = test_df.copy()

In [None]:
#Add the predicted result to the copied data frames 
sub_val_df['predicted_selected_text'] = text2_val[0].values
sub_test_df['predicted_selected_text'] = text2_test[0].values

In [None]:
#Add the probability of the prediction
sub_val_df['prob'] = prop2_val[0].values
sub_test_df['prob'] = prop2_test[0].values

## Evaluate Validation Set with Jaccard Score

In [None]:
# Check head of dataset
sub_val_df.head()

In [None]:
#Make a copy of the original validation set and reset indexes
df_js=sub_val_df.copy()
df_js=df_js.reset_index()

In [None]:
#Define the Jaccard Score function
def jaccard(str1, str2): 
    """Return the word-level Jaccard similarity of two strings.

    Both strings are lower-cased and split on whitespace; the score is
    ``|A & B| / |A | B|`` over the two resulting word sets, in ``[0.0, 1.0]``.

    When neither string contains a word the union is empty; the two (empty)
    sets are identical, so 1.0 is returned instead of dividing by zero.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    if not a and not b:
        # Empty union: avoid ZeroDivisionError and score the pair as identical.
        return 1.0
    c = a.intersection(b)
    # len(a) + len(b) - len(c) is the size of the union of the two sets.
    return float(len(c)) / (len(a) + len(b) - len(c))

In [None]:
# Score every validation row, then average the per-row Jaccard values
true_spans = df_js['selected_text']
pred_spans = df_js['predicted_selected_text']
results = [jaccard(expected, predicted) for expected, predicted in zip(true_spans, pred_spans)]

Jaccard_score = sum(results) / len(results)
Jaccard_score

## Prepare and Submit Test Set

In [None]:
# Check head of dataset
sub_test_df.head()

In [None]:
#Prepare file for submission
final_test=sub_test_df[['textID','predicted_selected_text']]
final_test.columns=['textID','selected_text']
final_test.head()

In [None]:
#Submit
final_test[['textID','selected_text']].to_csv('submission.csv', index=False)
print("Submission successful")

## Save trained model arguments and other files

In [None]:
#This line creates a JSON file that is required to load the model in the future
with open('args_train.json', 'w') as fp: 
    json.dump(args_train, fp)

In [None]:
#Additonal files if required
"""from google.colab import files
sub_val_df.to_csv('sub_val.csv') 
files.download('sub_val.csv')
sub_test_df.to_csv('sub_test.csv') 
files.download('sub_test.csv')
train_df.to_csv("new_train_df")"""

# OPTION 2: Load and Evaluate a Richardson's Pre-Trained Model

#### Distilbert --> A faster yet powerful version of BERT
https://arxiv.org/abs/1910.01108

#### SQuAD --> Stanford Question Answering Dataset
https://rajpurkar.github.io/SQuAD-explorer/

In [None]:

ROOT= '/gdrive/My Drive/Colab Notebooks/tweet-sentiment-extraction-tutorial/models'
FOLDER_NAME= 'richardson_distilbert-base-uncased-distilled-squad_A'

FULL_PATH = join(ROOT, FOLDER_NAME)

#Change the workspace to the model folder
%cd '{FULL_PATH}' 

#Load the model's arguments list (required to setup the existing model) 
with open('args_train.json') as json_file: 
    train_args = json.load(json_file) 

## Setup loaded model

In [None]:
#Load the model
# Option 2 is meant to run without the Option 1 cells, where MODEL_ARCHITECTURE is otherwise defined.
# Set it here so this cell does not raise NameError on a fresh kernel; the folder loaded above holds a DistilBERT model.
MODEL_ARCHITECTURE = 'distilbert'
loaded_model = QuestionAnsweringModel(MODEL_ARCHITECTURE, 'outputs/', args=train_args, use_cuda=use_cuda)

In [None]:
#Predict the evaluation and test sets
predictions_val = loaded_model.predict(qa_val)
predictions_test = loaded_model.predict(qa_test)

Let's check the structure of the predictions

In [None]:
#It displays truncated long texts
pd.set_option('display.max_colwidth',100)

#Each ID contains multiple predicted extractions and their corresponding probabilities (prediction with highest probability is first)
pd.DataFrame.from_dict(predictions_val)[1]

The commands below select the extracted text with the highest likelihood (first item), as well as its corresponding probability

In [None]:
#@title Obtain output with the highest prob - Validation set
predictions_df_val = pd.DataFrame.from_dict(predictions_val)
text_val = pd.DataFrame(predictions_val[0])
prob_val = pd.DataFrame(predictions_val[1])
prop1_val = prob_val['probability'].tolist()
prop2_val = pd.DataFrame(prop1_val)
text1_val = text_val['answer'].tolist()
text2_val = pd.DataFrame(text1_val)


In [None]:
#@title Obtain output with the highest prob - Test set
predictions_df_test = pd.DataFrame.from_dict(predictions_test)
text_test = pd.DataFrame(predictions_test[0])
prob_test = pd.DataFrame(predictions_test[1])
prop1_test = prob_test['probability'].tolist()
prop2_test = pd.DataFrame(prop1_test)
text1_test = text_test['answer'].tolist()
text2_test = pd.DataFrame(text1_test)

In [None]:
# Make a copy of the validation and test sets so that we are not modifying the original sets
sub_val_df = val_df.copy()
sub_test_df = test_df.copy()

In [None]:
#Add the predicted result to the copied data frames 
sub_val_df['predicted_selected_text'] = text2_val[0].values
sub_test_df['predicted_selected_text'] = text2_test[0].values

In [None]:
#Add the probability of the prediction
sub_val_df['prob'] = prop2_val[0].values
sub_test_df['prob'] = prop2_test[0].values

## Evaluate Validation Set with Jaccard Score

In [None]:
# Check head of dataset
sub_val_df.head()

In [None]:
#Make a copy of the original validation set and reset indexes
df_js=sub_val_df.copy()
df_js=df_js.reset_index()

In [None]:
#Define the Jaccard Score function
def jaccard(str1, str2): 
    """Return the word-level Jaccard similarity of two strings.

    Both strings are lower-cased and split on whitespace; the score is
    ``|A & B| / |A | B|`` over the two resulting word sets, in ``[0.0, 1.0]``.

    When neither string contains a word the union is empty; the two (empty)
    sets are identical, so 1.0 is returned instead of dividing by zero.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    if not a and not b:
        # Empty union: avoid ZeroDivisionError and score the pair as identical.
        return 1.0
    c = a.intersection(b)
    # len(a) + len(b) - len(c) is the size of the union of the two sets.
    return float(len(c)) / (len(a) + len(b) - len(c))

In [None]:
# Score every validation row, then average the per-row Jaccard values
true_spans = df_js['selected_text']
pred_spans = df_js['predicted_selected_text']
results = [jaccard(expected, predicted) for expected, predicted in zip(true_spans, pred_spans)]

Jaccard_score = sum(results) / len(results)
Jaccard_score

## Prepare and Submit Test Set

In [None]:
# Check head of dataset
sub_test_df.head()

In [None]:
#Prepare file for submission
final_test=sub_test_df[['textID','predicted_selected_text']]
final_test.columns=['textID','selected_text']
final_test.head()

In [None]:
#Submit
final_test[['textID','selected_text']].to_csv('submission.csv', index=False)
print("Submission successful")

In [None]:
#Additonal files if required
"""from google.colab import files
sub_val_df.to_csv('sub_val.csv') 
files.download('sub_val.csv')
sub_test_df.to_csv('sub_test.csv') 
files.download('sub_test.csv')
train_df.to_csv("new_train_df")"""