<a href="https://colab.research.google.com/github/DianaMoyano1/NLP-Sentiment_Extraction_Challenge/blob/master/Workstream_1/Landis_ensemble_B.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# SECTION 1: Setup


In [0]:
# Install the notebook's dependencies first (run once per fresh Colab runtime).
# transformers and tensorflow are pinned to exact versions; tensorboardX and
# simpletransformers are not, so pip picks whatever release is current.
# NOTE(review): consider pinning those two as well so re-runs are reproducible.
!pip install transformers==2.11.0 --quiet
!pip install tensorflow==2.2.0 --quiet
!pip install tensorboardX --quiet
!pip install simpletransformers --quiet

### Setup NVIDIA APEX

Tool to enable mixed precision training. More info here: https://github.com/NVIDIA/apex

In [0]:
%%writefile setup.sh
git clone https://github.com/NVIDIA/apex
pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./apex

In [0]:
# Runs the setup.sh script written by the %%writefile cell and reports how long
# it took (the build takes roughly 10 minutes).
import timeit
start = timeit.default_timer()

# NOTE(review): setup.sh never reads its arguments, so `--quiet` has no effect here.
!sh setup.sh --quiet

stop = timeit.default_timer()
# Elapsed time between the two timer readings.
print('Time: ', stop - start)  

### Import Packages

In [0]:
#Import packages
import numpy as np 
import pandas as pd 
from apex import amp
from glob import glob
import os
from random import random
from pathlib import Path
import json
import torch
from transformers import AutoModel, AutoTokenizer, BertTokenizer, AutoModelForQuestionAnswering
from transformers import TFBertModel, BertModel, DistilBertModel, XLNetModel, RobertaModel
from tensorboardX import SummaryWriter
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from transformers import AdamW, get_linear_schedule_with_warmup
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from os.path import join


use_cuda = True ##If True, GPU will be used

### Mount Your Own Gdrive

The command below asks you to authorise access to your Google account and then gives you a temporary access code to paste into the field that appears.

In [0]:
# Mount Google Drive at /gdrive, then list the mount point to confirm the mount succeeded.
from google.colab import drive
drive.mount('/gdrive')
%ls /gdrive

### Load the Data



Before running the command below, make sure you have:
- Created a *'tweet-sentiment-extraction'* folder inside the *'Colab Notebooks'* directory
- Uploaded the *train.csv* and *test.csv* files to the *'tweet-sentiment-extraction'* folder

Finally, make sure there is a folder called *'models'* inside the *'tweet-sentiment-extraction'* directory.

In [0]:
# Single place to configure where the competition CSVs live on the mounted Drive.
DATA_DIR = '/gdrive/My Drive/Colab Notebooks/tweet-sentiment-extraction'

# Labelled training data and unlabelled test data.
train_df = pd.read_csv(join(DATA_DIR, 'train.csv'))
test_df = pd.read_csv(join(DATA_DIR, 'test.csv'))

#sub_df = pd.read_csv(join(DATA_DIR, 'sample_submission.csv')) #Optional

### Prepare the Data

Split into train and validation sets

In [0]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for validation; random_state fixes the shuffle.
# NOTE: train_df is overwritten with the remaining 80%, so re-running this cell
# without re-loading the CSV splits the already-reduced frame again.
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state = 42)

In [0]:
# Validation inputs without the label column; val_df itself keeps 'selected_text'
# so predictions can be compared with the ground truth later.
val_df_new = val_df.drop(columns='selected_text')

In [0]:
# Sanity check: (rows, columns) of the training, validation-input and test frames.
print(train_df.shape)
print(val_df_new.shape)
print(test_df.shape)

In [0]:
# Convert the three frames to NumPy arrays, in the order train / validation / test.
train, val, test = (
    np.array(frame) for frame in (train_df, val_df_new, test_df)
)

### Initiate the SimpleTransformers Task

In [0]:
from simpletransformers.question_answering import QuestionAnsweringModel

### Create a Logging Module --> More info [here](https://realpython.com/python-logging/#:~:text=The%20Logging%20Module,-The%20logging%20module&text=It%20is%20used%20by%20most,homogeneous%20log%20for%20your%20application.&text=With%20the%20logging%20module%20imported,that%20you%20want%20to%20see.)


Logs provide developers with an extra set of eyes that are constantly looking at the flow that an application is going through. They can store information, like which user or IP accessed the application.  

With the logging module imported, you can use something called a “logger” to log messages that you want to see. By default, there are 5 standard levels indicating the severity of events.
- DEBUG
- INFO
- WARNING
- ERROR
- CRITICAL

In this case, the root logger is set to INFO and the `transformers` logger is set to WARNING.

In [0]:
import logging

# Root logger: show messages at INFO level and above.
logging.basicConfig(level=logging.INFO)
# The "transformers" logger is restricted to WARNING and above.
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

## Save trained model arguments and other files

In [0]:
"""from google.colab import files
sub_val_df.to_csv('sub_val.csv') 
files.download('sub_val.csv')
sub_test_df.to_csv('sub_test.csv') 
files.download('sub_test.csv')
train_df.to_csv("new_train_df")"""

In [0]:
# Write the training arguments to args_train.json (in the current working
# directory); the model-loading cell reads a file of this name back.
# NOTE(review): args_train is not assigned in any cell visible in this notebook;
# it must be defined before this cell runs.
with open('args_train.json', 'w') as fp: 
    json.dump(args_train, fp)

# SECTION 3: Load and Evaluate Richardson's Pre-Trained Model

In [0]:

# Folder on the mounted Drive that holds one sub-folder per saved model.
ROOT = '/gdrive/My Drive/Colab Notebooks/tweet-sentiment-extraction/models' #Don't change

# NOTE(review): NAME_OF_MODEL is not assigned in any cell visible here; set it to the
# model's sub-folder name under ROOT before running this cell.
FULL_PATH = join(ROOT, NAME_OF_MODEL)

#Change the workspace to the model folder
# (relative paths used afterwards, e.g. 'args_train.json', resolve against it)
%cd '{FULL_PATH}' 

#Load the model's arguments list (required to setup the existing model) 
with open('args_train.json') as json_file: 
    train_args = json.load(json_file) 

#### Setup loaded model

Supported model types for Question&Answering:

- ALBERT
- BERT
- DistilBERT
- ELECTRA
- RoBERTa
- XLM
- XLNet

Related link: https://huggingface.co/transformers/pretrained_models.html

In [0]:
# Build the three QuestionAnsweringModel instances (DistilBERT, ALBERT, RoBERTa).
# NOTE(review): all three are given the same relative checkpoint directory and the same
# train_args. A checkpoint saved for one architecture is presumably not valid for
# another — confirm that each model type points at its own checkpoint.
loaded_model = QuestionAnsweringModel('distilbert', 'outputs/checkpoint-2748-epoch-1', args=train_args, use_cuda=use_cuda)
loaded_model2 = QuestionAnsweringModel('albert', 'outputs/checkpoint-2748-epoch-1', args=train_args, use_cuda=use_cuda)
loaded_model3 = QuestionAnsweringModel('roberta', 'outputs/checkpoint-2748-epoch-1', args=train_args, use_cuda=use_cuda)

In [0]:
# Predict with the first model loaded above (loaded_model). The original cell called
# `model`, which no visible cell defines.
# NOTE(review): qa_val / qa_test are not created in any cell visible here; they must
# be prepared before this cell runs.
predictions_val = loaded_model.predict(qa_val)
predictions_test = loaded_model.predict(qa_test)

In [0]:
# Predict with the second model loaded above (loaded_model2). The original cell
# called `model2`, which no visible cell defines.
# NOTE(review): qa_val / qa_test are not created in any cell visible here.
predictions_val2 = loaded_model2.predict(qa_val)
predictions_test2 = loaded_model2.predict(qa_test)

In [0]:
# Predict with the third model loaded above (loaded_model3). The original cell
# called `model3`, which no visible cell defines.
# NOTE(review): qa_val / qa_test are not created in any cell visible here.
predictions_val3 = loaded_model3.predict(qa_val)
predictions_test3 = loaded_model3.predict(qa_test)

In [0]:
#@title Output with highest prob - Val and Test
# Unpack the first model's predictions into DataFrames.
# NOTE(review): assumes predictions_* is indexable as [0] = answer records and
# [1] = probability records, each record carrying an 'answer' / 'probability' field
# with a list of candidates — confirm against the installed simpletransformers version.
#val
predictions_df_val = pd.DataFrame.from_dict(predictions_val)
text_val = pd.DataFrame(predictions_val[0])
prob_val = pd.DataFrame(predictions_val[1])
prop1_val = prob_val['probability'].tolist()
prop2_val = pd.DataFrame(prop1_val)
text1_val = text_val['answer'].tolist()
# Column 0 of text2_val is what the export cell below stores as the model's answer.
text2_val = pd.DataFrame(text1_val)
#test
predictions_df_test = pd.DataFrame.from_dict(predictions_test)
text_test = pd.DataFrame(predictions_test[0])
prob_test = pd.DataFrame(predictions_test[1])
prop1_test = prob_test['probability'].tolist()
prop2_test = pd.DataFrame(prop1_test)
text1_test = text_test['answer'].tolist()
# Column 0 of text2_test is what the export cell below stores as the model's answer.
text2_test = pd.DataFrame(text1_test)

In [0]:
#@title Output with highest prob - Val and Test
# Unpack the second model's predictions into DataFrames (same steps as for the first model).
# NOTE(review): assumes predictions_* is indexable as [0] = answer records and
# [1] = probability records, each record carrying an 'answer' / 'probability' field
# with a list of candidates — confirm against the installed simpletransformers version.
#val
predictions_df_val2 = pd.DataFrame.from_dict(predictions_val2)
text_val2 = pd.DataFrame(predictions_val2[0])
prob_val2 = pd.DataFrame(predictions_val2[1])
prop1_val2 = prob_val2['probability'].tolist()
prop2_val2 = pd.DataFrame(prop1_val2)
text1_val2 = text_val2['answer'].tolist()
# Column 0 of text2_val2 is what the export cell below stores as the model's answer.
text2_val2 = pd.DataFrame(text1_val2)
#test
predictions_df_test2 = pd.DataFrame.from_dict(predictions_test2)
text_test2 = pd.DataFrame(predictions_test2[0])
prob_test2 = pd.DataFrame(predictions_test2[1])
prop1_test2 = prob_test2['probability'].tolist()
prop2_test2 = pd.DataFrame(prop1_test2)
text1_test2 = text_test2['answer'].tolist()
# Column 0 of text2_test2 is what the export cell below stores as the model's answer.
text2_test2 = pd.DataFrame(text1_test2)

In [0]:
#@title Output with highest prob - Val and Test
# Unpack the third model's predictions into DataFrames (same steps as for the first model).
# NOTE(review): assumes predictions_* is indexable as [0] = answer records and
# [1] = probability records, each record carrying an 'answer' / 'probability' field
# with a list of candidates — confirm against the installed simpletransformers version.
#val
predictions_df_val3 = pd.DataFrame.from_dict(predictions_val3)
text_val3 = pd.DataFrame(predictions_val3[0])
prob_val3 = pd.DataFrame(predictions_val3[1])
prop1_val3 = prob_val3['probability'].tolist()
prop2_val3 = pd.DataFrame(prop1_val3)
text1_val3 = text_val3['answer'].tolist()
# Column 0 of text2_val3 is what the export cell below stores as the model's answer.
text2_val3 = pd.DataFrame(text1_val3)
#test
predictions_df_test3 = pd.DataFrame.from_dict(predictions_test3)
text_test3 = pd.DataFrame(predictions_test3[0])
prob_test3 = pd.DataFrame(predictions_test3[1])
prop1_test3 = prob_test3['probability'].tolist()
prop2_test3 = pd.DataFrame(prop1_test3)
text1_test3 = text_test3['answer'].tolist()
# Column 0 of text2_test3 is what the export cell below stores as the model's answer.
text2_test3 = pd.DataFrame(text1_test3)

In [0]:
# One independent working copy per model, so each model's answers can be attached
# without modifying val_df / test_df themselves.
sub_val_df, sub_val_df2, sub_val_df3 = (val_df.copy() for _ in range(3))
sub_test_df, sub_test_df2, sub_test_df3 = (test_df.copy() for _ in range(3))

In [0]:
# Attach each model's answer (column 0 of the corresponding text2_* frame) to its
# working copy as 'selected_text_results'. `.values` hands over a plain array, so the
# assignment is positional and does not align on the frames' indexes.
sub_val_df['selected_text_results'] = text2_val[0].values
sub_test_df['selected_text_results'] = text2_test[0].values
sub_val_df2['selected_text_results'] = text2_val2[0].values
sub_test_df2['selected_text_results'] = text2_test2[0].values
sub_val_df3['selected_text_results'] = text2_val3[0].values
sub_test_df3['selected_text_results'] = text2_test3[0].values

In [0]:
# NOTE(review): this cell repeats the assignments of the preceding cell line for line.
# Re-running it overwrites the same column with the same values, so it is harmless
# but redundant.
sub_val_df['selected_text_results'] = text2_val[0].values
sub_test_df['selected_text_results'] = text2_test[0].values
sub_val_df2['selected_text_results'] = text2_val2[0].values
sub_test_df2['selected_text_results'] = text2_test2[0].values
sub_val_df3['selected_text_results'] = text2_val3[0].values
sub_test_df3['selected_text_results'] = text2_test3[0].values

In [0]:
# Reduce each validation frame to the id, the ground-truth span and the model's answer.
sub_val_df, sub_val_df2, sub_val_df3 = (
    frame[['textID','selected_text','selected_text_results']]
    for frame in (sub_val_df, sub_val_df2, sub_val_df3)
)

In [0]:
# Put the three models' answers side by side, one row per (textID, selected_text).
# The first merge suffixes the clashing answer columns with _x / _y; the third
# model's column has no clash and keeps the name 'selected_text_results'.
final = (
    sub_val_df
    .merge(sub_val_df2, on=['textID','selected_text'], how='left')
    .merge(sub_val_df3, on=['textID','selected_text'], how='left')
)

In [0]:
# Display the merged validation frame for inspection.
final

In [0]:
"""from google.colab import files
sub_val_df.to_csv('sub_val.csv') 
files.download('sub_val.csv')
sub_test_df.to_csv('sub_test.csv') 
files.download('sub_test.csv')
train_df.to_csv("new_train_df")"""

In [0]:
def votes_and_select(str1,str2,str3):
  """Combine three predicted text spans into a single span by voting.

  Step 1 - exact-match vote: if at least two of the three strings are identical,
  that string wins. If the winner is the placeholder 'not_applicable', the last
  candidate that is not the placeholder is returned instead (the placeholder
  itself when all three are the placeholder).

  Step 2 - word-level vote (no two strings agree): every word of every candidate
  adds one vote to the position of its first occurrence in the longest candidate.
  The positions returned by the external helper ``find_index(2, votes)`` delimit
  the slice of the longest candidate that is returned.
  NOTE(review): ``find_index`` is not defined in this function; it is presumably
  expected to return the ascending positions whose vote count reaches 2 — confirm.

  Args:
    str1, str2, str3: the three candidate spans (strings).

  Returns:
    The selected span. If anything goes wrong while voting (a non-string input,
    a word that does not occur in the longest candidate, no qualifying position,
    a missing helper, ...), ``str3`` is returned unchanged as a best-effort fallback.
  """
  try:
    candidates = [str1, str2, str3]
    token_lists = [text.split() for text in candidates]

    # Step 1: count identical full strings.
    vote_counts = {}
    for text in candidates:
      vote_counts[text] = vote_counts.get(text, 0) + 1
    top_votes = max(vote_counts.values())

    if top_votes >= 2:
      winner = next(text for text, count in vote_counts.items() if count == top_votes)
      if winner == 'not_applicable':
        # Prefer a real answer over the placeholder; the last non-placeholder wins.
        for text in candidates:
          if text != 'not_applicable':
            winner = text
      return winner

    # Step 2: word-level voting against the (first) longest candidate.
    longest_len = max(len(tokens) for tokens in token_lists)
    longest = next(tokens for tokens in token_lists if len(tokens) == longest_len)
    word_votes = [0] * longest_len
    for tokens in token_lists:
      for word in tokens:
        # list.index raises ValueError for a word missing from the longest
        # candidate, which triggers the str3 fallback below.
        word_votes[longest.index(word)] += 1

    positions = find_index(2, word_votes)
    start = positions[0]
    end = positions[-1] + 1
    if start == end:
      # Wrap the single word in a list so join() does not space out its characters.
      selected = [longest[start]]
    else:
      selected = longest[start:end]
    return ' '.join(selected)
  except Exception:
    # Best-effort fallback; unlike a bare `except:`, KeyboardInterrupt and
    # SystemExit are allowed to propagate.
    return str3
    

In [0]:
def jaccard(str1, str2):
    """Word-level Jaccard similarity between two strings.

    Both strings are lower-cased and split on whitespace; the score is the number
    of distinct shared words divided by the number of distinct words overall.

    Args:
        str1: first string (e.g. the ground-truth span).
        str2: second string (e.g. the predicted span).

    Returns:
        A float in [0.0, 1.0]. When neither string contains any word, 1.0 is
        returned (two empty selections are treated as identical) instead of
        raising ZeroDivisionError.
    """
    words_a = set(str1.lower().split())
    words_b = set(str2.lower().split())
    union_size = len(words_a | words_b)
    if union_size == 0:
        # Both inputs are empty or whitespace-only: avoid dividing by zero.
        return 1.0
    return float(len(words_a & words_b)) / union_size

In [0]:
# Ensemble the three models' answers row by row...
final['selected_text_results_final'] = final.apply(
    lambda row: votes_and_select(
        row.selected_text_results_x,
        row.selected_text_results_y,
        row.selected_text_results,
    ),
    axis=1,
)
# ...then score the ensembled answer against the ground-truth span.
final['j_score'] = final.apply(
    lambda row: jaccard(row.selected_text, row.selected_text_results_final),
    axis=1,
)

In [0]:
# Number of validation rows scored and their mean Jaccard score.
print(len(final),final['j_score'].mean())