<a href="https://colab.research.google.com/github/DianaMoyano1/NLP-Sentiment_Extraction_Challenge/blob/master/Ensemble_with_highest_prob_%26_words_count_Eman.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# SECTION 1: Setup


In [0]:
# Install the notebook's dependencies; run this cell before any of the imports below.
# transformers and tensorflow are pinned to exact versions.
!pip install transformers==2.11.0 --quiet
!pip install tensorflow==2.2.0 --quiet
# NOTE(review): tensorboardX and simpletransformers are not pinned, so a fresh install may
# pull releases that no longer match transformers==2.11.0 — confirm and pin.
!pip install tensorboardX --quiet
!pip install simpletransformers --quiet

### Setup NVIDIA APEX

Tool to enable mixed precision training. More info here: https://github.com/NVIDIA/apex

In [0]:
%%writefile setup.sh
# Clone NVIDIA Apex and install it with the C++ and CUDA extensions requested.
git clone https://github.com/NVIDIA/apex
pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./apex

In [0]:
# Run setup.sh (the Apex build) and print the elapsed time in seconds.
# The build takes roughly 10 minutes.
import timeit
start = timeit.default_timer()

# NOTE(review): `--quiet` is passed to setup.sh as a positional argument; the script
# written by the previous cell does not read its arguments, so the flag presumably has no effect.
!sh setup.sh --quiet

stop = timeit.default_timer()
print('Time: ', stop - start)  

### Import Packages

In [0]:
#Import packages
import numpy as np 
import pandas as pd 
from apex import amp
from glob import glob
import os
from random import random
from pathlib import Path
import json
import torch
from transformers import AutoModel, AutoTokenizer, BertTokenizer, AutoModelForQuestionAnswering
from transformers import TFBertModel, BertModel, DistilBertModel, XLNetModel, RobertaModel
from tensorboardX import SummaryWriter
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from transformers import AdamW, get_linear_schedule_with_warmup
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from os.path import join


use_cuda = True # Forwarded as use_cuda= to each QuestionAnsweringModel created below; True means the GPU is used

### Mount Your Own Gdrive

The command below will ask you to authorize your Google account and will then give you a temporary access code to paste into the required field.

In [0]:
from google.colab import drive
# Mount Google Drive at /gdrive, then list the mount point to confirm it is visible.
drive.mount('/gdrive')
%ls /gdrive

### Load the Data



Before running the command below, make sure you have:
- Created a *'tweet-sentiment-extraction'* folder inside the *'Colab Notebooks'* directory
- Uploaded the *train.csv* and *test.csv* files to the *'tweet-sentiment-extraction'* folder 

Finally, make sure you have a folder called *'models'* inside the *'tweet-sentiment-extraction'* directory

In [0]:
# Both competition files live in the same Drive folder; build each path from that folder.
data_dir = '/gdrive/My Drive/Colab Notebooks/tweet-sentiment-extraction'
train_df = pd.read_csv(join(data_dir, 'train.csv'))
test_df = pd.read_csv(join(data_dir, 'test.csv'))

### Prepare the Data

Split into train and validation sets

In [0]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows as a validation set; random_state fixes the shuffle.
# NOTE(review): train_df is rebound to the 80% portion, so running this cell twice without
# reloading the CSV splits the already-reduced training set again.
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state = 42)

In [0]:
# Validation inputs without the ground-truth column; val_df itself keeps 'selected_text'
# so predictions can be compared against it later.
val_df_new = val_df.drop(columns=['selected_text'])

In [0]:
# Print (rows, columns) for the train, validation-input and test frames, in that order.
for frame in (train_df, val_df_new, test_df):
    print(frame.shape)

In [0]:
# Convert each frame to a NumPy array so the do_qa_* builders can index rows by position.
train, val, test = (np.array(frame) for frame in (train_df, val_df_new, test_df))

In [0]:
#@title Create list for training

## Adapted from https://www.kaggle.com/cheongwoongkang/roberta-baseline-starter-simple-postprocessing
def find_all(input_str, search_str):
    """Return the start index of every occurrence of ``search_str`` in ``input_str``.

    Occurrences may overlap, because the search resumes one character after
    each hit. The result is a list of ints in ascending order; it is empty
    when nothing matches or when ``input_str`` is empty.
    """
    positions = []
    cursor = 0
    total = len(input_str)
    while cursor < total:
        hit = input_str.find(search_str, cursor)
        if hit < 0:
            break
        positions.append(hit)
        # Step a single character so overlapping matches are reported too.
        cursor = hit + 1
    return positions

def do_qa_train(train):
    """Convert training rows into nested context/qas records.

    Rows are read by position: ``row[0]`` is the id, ``row[1]`` the context,
    ``row[2]`` the answer text and ``row[-1]`` the question. A row is printed
    and skipped when any of those three text fields is not a plain ``str``.

    Returns a list of ``{'context': ..., 'qas': [...]}`` dicts. The context
    and the answer text are lower-cased, and at most one answer is recorded
    per row: the first position reported by ``find_all``.
    """
    records = []
    for row in train:
        qid, context, answer, question = row[0], row[1], row[2], row[-1]

        if any(type(field) is not str for field in (answer, context, question)):
            # Show the offending values, then leave the row out of the output.
            for field in (context, answer, question):
                print(field, type(field))
            continue

        # The offset is searched in the original-case text; only the first hit is kept.
        first_hit = find_all(context, answer)[:1]
        answers = [{'answer_start': start, 'text': answer.lower()} for start in first_hit]

        records.append({
            'context': context.lower(),
            'qas': [{'question': question, 'id': qid, 'is_impossible': False, 'answers': answers}],
        })

    return records

# Build the training records from the training array.
qa_train = do_qa_train(train)


In [0]:
#@title Create val list
## Adapted from https://www.kaggle.com/cheongwoongkang/roberta-baseline-starter-simple-postprocessing
def do_qa_val(val):
    """Convert validation rows into nested context/qas records for prediction.

    Rows are read by position: ``line[0]`` is the id, ``line[1]`` the context
    and ``line[-1]`` the question. Each record carries a single placeholder
    answer (``'__None__'`` at offset 1000000), since no ground truth is
    supplied here. The context is lower-cased.

    Rows whose context or question is not a plain ``str`` are printed and
    skipped, so the returned list can be shorter than ``val``. Callers that
    align predictions with the source rows by position should account for that.

    Returns a list of ``{'context': ..., 'qas': [...]}`` dicts.
    """
    output = []
    for line in val:
        context = line[1]
        question = line[-1]
        qid = line[0]
        if type(context) != str or type(question) != str:
            # Only context and question exist for these rows. The previous version also
            # printed an undefined `answer`, which raised NameError instead of skipping.
            print(context, type(context))
            print(question, type(question))
            continue
        answers = [{'answer_start': 1000000, 'text': '__None__'}]
        qas = [{'question': question, 'id': qid, 'is_impossible': False, 'answers': answers}]
        output.append({'context': context.lower(), 'qas': qas})
    return output

# Build the prediction inputs for the validation array.
qa_val = do_qa_val(val)

In [0]:
#@title Create test list
## Adapted from https://www.kaggle.com/cheongwoongkang/roberta-baseline-starter-simple-postprocessing
def do_qa_test(test):
    """Convert test rows into nested context/qas records for prediction.

    Rows are read by position: ``line[0]`` is the id, ``line[1]`` the context
    and ``line[-1]`` the question. Each record carries a single placeholder
    answer (``'__None__'`` at offset 1000000), since the test rows have no
    ground truth. The context is lower-cased.

    Rows whose context or question is not a plain ``str`` are printed and
    skipped, so the returned list can be shorter than ``test``. Callers that
    align predictions with the source rows by position should account for that.

    Returns a list of ``{'context': ..., 'qas': [...]}`` dicts.
    """
    output = []
    for line in test:
        context = line[1]
        question = line[-1]
        qid = line[0]
        if type(context) != str or type(question) != str:
            # Only context and question exist for these rows. The previous version also
            # printed an undefined `answer`, which raised NameError instead of skipping.
            print(context, type(context))
            print(question, type(question))
            continue
        answers = [{'answer_start': 1000000, 'text': '__None__'}]
        qas = [{'question': question, 'id': qid, 'is_impossible': False, 'answers': answers}]
        output.append({'context': context.lower(), 'qas': qas})
    return output

# Build the prediction inputs for the test array.
qa_test = do_qa_test(test)

### Initiate the SimpleTransformers Task

In [0]:
from simpletransformers.question_answering import QuestionAnsweringModel

### Create a Logging Module --> More info [here](https://realpython.com/python-logging/#:~:text=The%20Logging%20Module,-The%20logging%20module&text=It%20is%20used%20by%20most,homogeneous%20log%20for%20your%20application.&text=With%20the%20logging%20module%20imported,that%20you%20want%20to%20see.)


Logs provide developers with an extra set of eyes that are constantly looking at the flow that an application is going through. They can store information, like which user or IP accessed the application.  

With the logging module imported, you can use something called a “logger” to log messages that you want to see. By default, there are 5 standard levels indicating the severity of events.
- DEBUG
- INFO
- WARNING
- ERROR
- CRITICAL

In this case, we use two of them: INFO for the root logger and WARNING for the `transformers` logger.

In [0]:
import logging

# Root logger: show INFO and above.
logging.basicConfig(level=logging.INFO)

# The "transformers" logger is limited to WARNING and above.
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel("WARNING")

## Save trained model arguments and other files

In [0]:
"""from google.colab import files
sub_val_df.to_csv('sub_val.csv') 
files.download('sub_val.csv')
sub_test_df.to_csv('sub_test.csv') 
files.download('sub_test.csv')
train_df.to_csv("new_train_df")"""

In [0]:
#This line creates a JSON file that is required when loading the model
#with open('args_train.json', 'w') as fp: 
    #json.dump(args_train, fp)

# SECTION 3: Load and Evaluate Richardson's Pre-Trained Models for the Ensemble

In [0]:
# Folder name and architecture key of the first ensemble member.
ROOT = '/gdrive/My Drive/Colab Notebooks/Models For Ensemble' #Don't change
NAME_OF_MODEL1 = 'Diana_bert-base-cased_A' #change
MODEL_ARCHITECTURE1 = 'bert' #change

FULL_PATH1 = join(ROOT, NAME_OF_MODEL1)

#Change the workspace to the model folder
# NOTE(review): %cd changes the working directory for every later cell, not just this one;
# the relative 'args_train.json' below is resolved against it.
%cd '{FULL_PATH1}' 

#Load the model's arguments list (required to setup the existing model) 
with open('args_train.json') as json_file: 
    train_args1 = json.load(json_file) 

In [0]:
# Load model 1 from its own folder through an absolute path, so this cell no longer depends
# on the working directory left behind by whichever `%cd` cell ran last.
loaded_model1 = QuestionAnsweringModel(MODEL_ARCHITECTURE1, join(FULL_PATH1, 'outputs/'), args=train_args1, use_cuda=use_cuda)

In [0]:
# Folder name and architecture key of the second ensemble member.
ROOT = '/gdrive/My Drive/Colab Notebooks/Models For Ensemble' #Don't change
NAME_OF_MODEL2 = 'diana_distilbert-base-uncased-distilled-squad_A' # change
MODEL_ARCHITECTURE2 = 'distilbert' #change

FULL_PATH2 = join(ROOT, NAME_OF_MODEL2)

#Change the workspace to the model folder
# NOTE(review): %cd changes the working directory for every later cell, not just this one;
# the relative 'args_train.json' below is resolved against it.
%cd '{FULL_PATH2}' 

#Load the model's arguments list (required to setup the existing model) 
with open('args_train.json') as json_file: 
    train_args2 = json.load(json_file) 

In [0]:
# Load model 2 from its own folder through an absolute path, so this cell no longer depends
# on the working directory left behind by whichever `%cd` cell ran last.
loaded_model2 = QuestionAnsweringModel(MODEL_ARCHITECTURE2, join(FULL_PATH2, 'outputs/'), args=train_args2, use_cuda=use_cuda)

In [0]:
# Folder name and architecture key of the third ensemble member.
ROOT = '/gdrive/My Drive/Colab Notebooks/Models For Ensemble' #Don't change
NAME_OF_MODEL3 = 'lucas_roberta-large_B' #change
MODEL_ARCHITECTURE3 = 'roberta' #change

FULL_PATH3 = join(ROOT, NAME_OF_MODEL3)

#Change the workspace to the model folder
# NOTE(review): %cd changes the working directory for every later cell, not just this one;
# the relative 'args_train.json' below is resolved against it.
%cd '{FULL_PATH3}' 

#Load the model's arguments list (required to setup the existing model) 
with open('args_train.json') as json_file: 
    train_args3 = json.load(json_file) 

In [0]:
# Load model 3 from its own folder through an absolute path, so this cell no longer depends
# on the working directory left behind by whichever `%cd` cell ran last.
loaded_model3 = QuestionAnsweringModel(MODEL_ARCHITECTURE3, join(FULL_PATH3, 'outputs/'), args=train_args3, use_cuda=use_cuda)

#### Setup loaded model

Supported model types for Question&Answering:

- ALBERT
- BERT
- DistilBERT
- ELECTRA
- XLM
- XLNet
- RoBERTa (the architecture of the third model loaded above)

Related link: https://huggingface.co/transformers/pretrained_models.html

In [0]:
# Run model 1 on the validation and test inputs.
# The next cell reads element [0] of each result for the 'answer' entries and
# element [1] for the 'probability' entries.
predictions_val = loaded_model1.predict(qa_val)
predictions_test = loaded_model1.predict(qa_test)

In [0]:
#@title Output with highest prob - Val and Test
# Model 1. For each split: wrap the raw result, separate its [0] and [1] elements, then
# expand the 'answer' / 'probability' entries into frames so later cells can read column 0.

# --- validation ---
predictions_df_val = pd.DataFrame.from_dict(predictions_val)
text_val, prob_val = pd.DataFrame(predictions_val[0]), pd.DataFrame(predictions_val[1])
text1_val = text_val['answer'].tolist()
prop1_val = prob_val['probability'].tolist()
text2_val, prop2_val = pd.DataFrame(text1_val), pd.DataFrame(prop1_val)

# --- test ---
predictions_df_test = pd.DataFrame.from_dict(predictions_test)
text_test, prob_test = pd.DataFrame(predictions_test[0]), pd.DataFrame(predictions_test[1])
text1_test = text_test['answer'].tolist()
prop1_test = prob_test['probability'].tolist()
text2_test, prop2_test = pd.DataFrame(text1_test), pd.DataFrame(prop1_test)

In [0]:
# Run model 2 on the validation and test inputs.
# The next cell reads element [0] of each result for the 'answer' entries and
# element [1] for the 'probability' entries.
predictions_val2 = loaded_model2.predict(qa_val)
predictions_test2 = loaded_model2.predict(qa_test)

In [0]:
#@title Output with highest prob - Val and Test
# Model 2. For each split: wrap the raw result, separate its [0] and [1] elements, then
# expand the 'answer' / 'probability' entries into frames so later cells can read column 0.

# --- validation ---
predictions_df_val2 = pd.DataFrame.from_dict(predictions_val2)
text_val2, prob_val2 = pd.DataFrame(predictions_val2[0]), pd.DataFrame(predictions_val2[1])
text1_val2 = text_val2['answer'].tolist()
prop1_val2 = prob_val2['probability'].tolist()
text2_val2, prop2_val2 = pd.DataFrame(text1_val2), pd.DataFrame(prop1_val2)

# --- test ---
predictions_df_test2 = pd.DataFrame.from_dict(predictions_test2)
text_test2, prob_test2 = pd.DataFrame(predictions_test2[0]), pd.DataFrame(predictions_test2[1])
text1_test2 = text_test2['answer'].tolist()
prop1_test2 = prob_test2['probability'].tolist()
text2_test2, prop2_test2 = pd.DataFrame(text1_test2), pd.DataFrame(prop1_test2)

In [0]:
# Run model 3 on the validation and test inputs.
# The next cell reads element [0] of each result for the 'answer' entries and
# element [1] for the 'probability' entries.
predictions_val3 = loaded_model3.predict(qa_val)
predictions_test3 = loaded_model3.predict(qa_test)

In [0]:
#@title Output with highest prob - Val and Test
# Model 3. For each split: wrap the raw result, separate its [0] and [1] elements, then
# expand the 'answer' / 'probability' entries into frames so later cells can read column 0.

# --- validation ---
predictions_df_val3 = pd.DataFrame.from_dict(predictions_val3)
text_val3, prob_val3 = pd.DataFrame(predictions_val3[0]), pd.DataFrame(predictions_val3[1])
text1_val3 = text_val3['answer'].tolist()
prop1_val3 = prob_val3['probability'].tolist()
text2_val3, prop2_val3 = pd.DataFrame(text1_val3), pd.DataFrame(prop1_val3)

# --- test ---
predictions_df_test3 = pd.DataFrame.from_dict(predictions_test3)
text_test3, prob_test3 = pd.DataFrame(predictions_test3[0]), pd.DataFrame(predictions_test3[1])
text1_test3 = text_test3['answer'].tolist()
prop1_test3 = prob_test3['probability'].tolist()
text2_test3, prop2_test3 = pd.DataFrame(text1_test3), pd.DataFrame(prop1_test3)

In [0]:
# One independent copy of the validation and test frames per model, so each model's
# results can be attached without touching val_df / test_df.
sub_val_df, sub_val_df2, sub_val_df3 = (val_df.copy() for _ in range(3))
sub_test_df, sub_test_df2, sub_test_df3 = (test_df.copy() for _ in range(3))

In [0]:
#create files to export
# Copy column 0 of each answer frame into its per-model results column.
for frame, column, answers_frame in (
    (sub_val_df, 'selected_text_results', text2_val),
    (sub_test_df, 'selected_text_results', text2_test),
    (sub_val_df2, 'selected_text_results2', text2_val2),
    (sub_test_df2, 'selected_text_results2', text2_test2),
    (sub_val_df3, 'selected_text_results3', text2_val3),
    (sub_test_df3, 'selected_text_results3', text2_test3),
):
    frame[column] = answers_frame[0].values

In [0]:
# Copy column 0 of each probability frame into its per-model probability column.
for frame, column, probs_frame in (
    (sub_val_df, 'prob', prop2_val),
    (sub_test_df, 'prob', prop2_test),
    (sub_val_df2, 'prob2', prop2_val2),
    (sub_test_df2, 'prob2', prop2_test2),
    (sub_val_df3, 'prob3', prop2_val3),
    (sub_test_df3, 'prob3', prop2_test3),
):
    frame[column] = probs_frame[0].values

In [0]:
# Keep only the id, the ground truth, and each model's answer and probability.
key_columns = ['textID', 'selected_text']
sub_val_df = sub_val_df[key_columns + ['selected_text_results', 'prob']]
sub_val_df2 = sub_val_df2[key_columns + ['selected_text_results2', 'prob2']]
sub_val_df3 = sub_val_df3[key_columns + ['selected_text_results3', 'prob3']]

In [0]:
# Left-join the three per-model validation frames on id and ground truth into one table.
merge_keys = ['textID', 'selected_text']
final = (
    sub_val_df
    .merge(sub_val_df2, on=merge_keys, how='left')
    .merge(sub_val_df3, on=merge_keys, how='left')
)

In [0]:
# Preview the first 50 rows of the merged per-model predictions.
final.head(50)

In [0]:
def remove_stop_words(lines):
    """Blank out every space-separated token that contains an underscore.

    ``lines`` is an iterable of strings. Each string is split on single
    spaces; any token containing ``'_'`` is replaced by an empty string and
    the tokens are re-joined with single spaces, so a removed token leaves
    its surrounding separators in place.

    Returns a list with one cleaned string per input string.
    """
    markers = ['_']
    cleaned = []
    for sentence in lines:
        tokens = [
            '' if any(marker in token for marker in markers) else token
            for token in sentence.split(' ')
        ]
        cleaned.append(" ".join(tokens))
    return cleaned

In [0]:
# Demo of remove_stop_words on the tokens of one model-1 prediction. The tokens are read
# straight from `final`, so the cell works on a fresh top-to-bottom run instead of relying
# on a variable `a` that is only assigned in a later cell.
remove_stop_words(final['selected_text_results'].iloc[1].split())

In [0]:
#by prob
# For every row of `final`, keep the answer of the model that reported the largest
# probability. np.argmax returns the first maximum, so ties go to the earlier model.
prob_columns = ['prob', 'prob2', 'prob3']
answer_columns = ['selected_text_results', 'selected_text_results2', 'selected_text_results3']

selected_text_results_highest_prob = []
for row in range(len(final)):
    row_probs = [final[column].iloc[row] for column in prob_columns]
    row_answers = [final[column].iloc[row] for column in answer_columns]
    selected_text_results_highest_prob.append(row_answers[np.argmax(row_probs)])

In [0]:
# Whitespace tokens of the three models' answers for the row at position 1 (a worked example).
a, b, c = (
    final[column].iloc[1].split()
    for column in ('selected_text_results', 'selected_text_results2', 'selected_text_results3')
)

In [0]:
# Pool the tokens of the three sample answers into one list.
# NOTE(review): this rebinds `test`, which earlier held np.array(test_df) and was passed to
# do_qa_test; re-running the do_qa_test(test) cell after this one would receive the token list.
test = a+b+c

In [0]:
import collections
def top5_words(text, n=5):
    """Return the ``n`` most frequent items of ``text`` with their counts.

    ``text`` is any iterable of hashable items, e.g. a list of tokens.
    ``n`` defaults to 5, matching the function's name; the previous version
    returned every item regardless. Pass ``n=None`` for the full ranking.

    Returns a list of ``(item, count)`` tuples ordered from most to least
    frequent.
    """
    counts = collections.Counter(text)
    return counts.most_common(n)

In [0]:
from collections import Counter
# Count the pooled sample tokens and keep those seen more than once.
myDict = Counter(test)
res = Counter({token: seen for token, seen in myDict.items() if seen > 1})
list_test = list(res.keys())



In [0]:
#by word count
# For every row of `final`, pool the whitespace tokens of the three models' answers and
# keep the tokens that occur more than once in that pool, in order of first appearance.
answer_columns = ['selected_text_results', 'selected_text_results2', 'selected_text_results3']
selected_text_results_unique_words = []

for i in range(len(final)):
    pooled = [word for column in answer_columns for word in final[column].iloc[i].split()]
    repeated = [word for word, seen in Counter(pooled).items() if seen > 1]
    selected_text_results_unique_words.append(' '.join(repeated))

In [0]:
# Store both ensemble outputs next to the per-model predictions so they can be scored.
final['selected_text_results_highest_prob'] = selected_text_results_highest_prob
final['selected_text_results_unique_words'] = selected_text_results_unique_words

In [0]:
def jaccard(str1, str2):
    """Return the word-level Jaccard similarity of two strings.

    Both strings are lower-cased and split on whitespace; the score is the
    size of the intersection of the two token sets divided by the size of
    their union.

    When neither string contains any token the union is empty. The two sets
    are then identical, so 1.0 is returned instead of raising
    ZeroDivisionError.

    Returns a float between 0.0 and 1.0.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    if not a and not b:
        # Empty union: the plain formula would divide by zero.
        return 1.0
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

In [0]:
# Worked example: score one reference sentence against two variants.
a = 'are you good today'
b = 'are you good fine'
c = 'are you good today ok tomorrow'
for candidate in (b, c):
    print(jaccard(a, candidate))

In [0]:
#Obtain JS for the entire set
# Mean Jaccard score of the highest-probability ensemble against the ground truth.
results = [
    jaccard(truth, predicted)
    for truth, predicted in zip(final['selected_text'], final['selected_text_results_highest_prob'])
]

Jaccard_score = sum(results) / len(results)
Jaccard_score

In [0]:
#Obtain JS for the entire set
# Mean Jaccard score of the word-count ensemble against the ground truth.
results = [
    jaccard(truth, predicted)
    for truth, predicted in zip(final['selected_text'], final['selected_text_results_unique_words'])
]

Jaccard_score = sum(results) / len(results)
Jaccard_score