<a href="https://colab.research.google.com/github/DianaMoyano1/NLP-Sentiment_Extraction_Challenge/blob/master/Workstream_1/WS1_base.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup


In [0]:
#@title Install packages
# Install the dependencies before running any other cell; the import cell below needs them.
!pip install transformers==2.11.0 --quiet
!pip install tensorflow==2.2.0 --quiet
# NOTE(review): tensorboardX and simpletransformers are not version-pinned, unlike the two
# packages above, so a fresh runtime may resolve different releases over time — confirm
# compatible versions and pin them.
!pip install tensorboardX --quiet
!pip install simpletransformers --quiet

In [0]:
%%writefile setup.sh
# Fetch and build NVIDIA apex with its C++ and CUDA extensions.
# Skip the clone when ./apex is already present (e.g. the notebook is re-run in the same
# runtime): "git clone" refuses to write into an existing non-empty directory.
[ -d apex ] || git clone https://github.com/NVIDIA/apex
pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./apex

In [0]:
#this will take 10mins to run
# NOTE(review): "--quiet" is handed to setup.sh as a positional argument; the commands
# written to setup.sh do not reference script arguments, so it appears to have no effect — confirm.
!sh setup.sh --quiet

In [0]:
#@title Import packages
import numpy as np 
import pandas as pd 
from apex import amp
from glob import glob
import os
from random import random
from pathlib import Path
import json
import torch
from transformers import AutoModel, AutoTokenizer, BertTokenizer, AutoModelForQuestionAnswering
from transformers import TFBertModel, BertModel, DistilBertModel, XLNetModel, RobertaModel
from tensorboardX import SummaryWriter
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from transformers import AdamW, get_linear_schedule_with_warmup
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [0]:
# Mount Google Drive at /gdrive; the dataset and model paths used in later cells live under it.
from google.colab import drive
drive.mount('/gdrive')
# List the mount point as a quick visual check.
%ls /gdrive

In [0]:
from os.path import join

# Name of the folder that will hold this run's files (it is joined onto ROOT and created in a later cell).
NAME_OF_MODEL = 'YourName_S_distilbert-base-cased_JS60' ## TODO --> Change this


"""BASIC STRUCTURE
<TYPE>_<MODEL>_<JACCARD SCORE>
Where:
  TYPE: S for simple, E for Ensemble
  MODEL:  If simple model, use the name used in this site: https://huggingface.co/transformers/pretrained_models.html
          If ensemble, use the short name of each model, separated by a - (e.g. distilbert-albert-XLNet)
  JACCARD SCORE: Starts with 'JS' followed by the percentage (no decimals)
  
  Examples:
  -Diana_S_distilroberta-base_JS72
  -Landis_E_distilbert-albert-bert_JS80"""

In [0]:
#Don't change anything

# Parent folder on Google Drive under which each model gets its own sub-folder.
ROOT = '/gdrive/My Drive/Colab Notebooks/tweet-sentiment-extraction/models'

# Folder for this run: ROOT/NAME_OF_MODEL.
FULL_PATH = join(ROOT, NAME_OF_MODEL)

#Change the workspace to the "tweet-sentiment-extraction/models" folder
%cd '{ROOT}'
#It creates the folder where the model components will be saved. Make sure you don't have a folder with the same name
%mkdir '{NAME_OF_MODEL}' 
#Change the workspace to the recently created folder
# (relative paths used later, such as 'args_train.json' and 'outputs/', therefore resolve inside FULL_PATH)
%cd '{FULL_PATH}' 



In [0]:
# Load the train, test and sample-submission CSVs from Google Drive.
# NOTE(review): this cell was previously marked "To delete. Reason --> done in above cell with URL",
# but no such cell is visible in this notebook and later cells use train_df / test_df — confirm before removing.
train_df = pd.read_csv('/gdrive/My Drive/Colab Notebooks/tweet-sentiment-extraction/train.csv')
test_df = pd.read_csv('/gdrive/My Drive/Colab Notebooks/tweet-sentiment-extraction/test.csv')
sub_df = pd.read_csv('/gdrive/My Drive/Colab Notebooks/tweet-sentiment-extraction/sample_submission.csv')




"""
SETTINGS
"""
# Passed as use_cuda when the QuestionAnsweringModel instances are constructed.
use_cuda = True # whether to use GPU or not

In [0]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the loaded training rows as a validation set; random_state fixes the split.
# NOTE(review): train_df is overwritten by its own split, so re-running this cell without
# reloading the CSV splits the already-reduced training frame again.
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state = 42)

In [0]:
# Validation frame without the selected_text column; val_df itself is left unchanged
# (drop returns a new frame) and is copied later when building sub_val_df.
val_df_new = val_df.drop('selected_text', axis=1)

In [0]:
# Sanity check: print the (rows, columns) shape of the train, validation and test frames.
print(train_df.shape)
print(val_df_new.shape)
print(test_df.shape)

In [0]:
# Convert each frame to a NumPy array so the do_qa_* helpers can read row fields by
# position (line[0], line[1], line[2], line[-1]).
# NOTE(review): this relies on the CSV column order — verify against the data files.
train = np.array(train_df)
val = np.array(val_df_new)
test = np.array(test_df)

In [0]:
#@title Create list for training

## Adapted from https://www.kaggle.com/cheongwoongkang/roberta-baseline-starter-simple-postprocessing
def find_all(input_str, search_str):
    """Return the start index of every occurrence of search_str in input_str.

    Occurrences may overlap: after each hit the scan resumes one character
    past the hit's start. The result is in ascending order and is empty when
    there is no match or when input_str is empty.
    """
    positions = []
    cursor = 0
    total = len(input_str)
    while cursor < total:
        hit = input_str.find(search_str, cursor)
        if hit == -1:
            break
        positions.append(hit)
        # Advance by one (not by len(search_str)) so overlapping matches are found.
        cursor = hit + 1
    return positions

def do_qa_train(train):
    """Build question-answering training records from positional rows.

    Each row is read by position: row[0] is the id, row[1] the context text,
    row[2] the answer text and row[-1] the question. Rows where the context,
    answer or question is not exactly of type str are printed and skipped.

    Returns a list of dicts shaped as
    {'context': <lowercased context>, 'qas': [{'question', 'id',
    'is_impossible': False, 'answers': [...]}]}, where 'answers' holds at most
    one entry: the first position of the answer inside the context (located
    on the original-case strings) together with the lowercased answer text.
    """
    records = []
    for row in train:
        qid, context, answer, question = row[0], row[1], row[2], row[-1]

        all_text = type(context) == str and type(answer) == str and type(question) == str
        if not all_text:
            # Report the offending row and leave it out of the training set.
            print(context, type(context))
            print(answer, type(answer))
            print(question, type(question))
            continue

        # Only the first occurrence is kept; the list stays empty when the
        # answer does not occur in the context.
        first_hit = find_all(context, answer)[:1]
        answers = [{'answer_start': start, 'text': answer.lower()} for start in first_hit]

        qas = [{'question': question, 'id': qid, 'is_impossible': False, 'answers': answers}]
        records.append({'context': context.lower(), 'qas': qas})

    return records

# Build the training records that are later passed to model.train_model.
qa_train = do_qa_train(train)


In [0]:
#@title Create val list
def do_qa_val(val):
    """Build question-answering prediction records from validation rows.

    Each row is read by position: row[0] is the id, row[1] the context text
    and row[-1] the question. Rows whose context or question is not exactly
    of type str are printed and skipped, so the result can be shorter than
    the input; callers that line predictions up with the source frame need
    to account for that.

    Every record carries a single placeholder answer
    ({'answer_start': 1000000, 'text': '__None__'}) because no answer text is
    read from these rows.

    Returns a list of dicts shaped as
    {'context': <lowercased context>, 'qas': [{'question', 'id',
    'is_impossible': False, 'answers': [<placeholder>]}]}.
    """
    output = []
    for line in val:
        context = line[1]
        question = line[-1]
        qid = line[0]
        if type(context) != str or type(question) != str:
            # Report and skip the malformed row. Only context and question are
            # printed: this function has no `answer` variable, and printing one
            # (as the training helper does) raised NameError here.
            print(context, type(context))
            print(question, type(question))
            continue
        answers = [{'answer_start': 1000000, 'text': '__None__'}]
        qas = [{'question': question, 'id': qid, 'is_impossible': False, 'answers': answers}]
        output.append({'context': context.lower(), 'qas': qas})
    return output

# Build the validation records that are later passed to predict().
qa_val = do_qa_val(val)

In [0]:
#@title Create test list
def do_qa_test(test):
    """Build question-answering prediction records from test rows.

    Each row is read by position: row[0] is the id, row[1] the context text
    and row[-1] the question. Rows whose context or question is not exactly
    of type str are printed and skipped, so the result can be shorter than
    the input; callers that line predictions up with the source frame need
    to account for that.

    Every record carries a single placeholder answer
    ({'answer_start': 1000000, 'text': '__None__'}) because no answer text is
    read from these rows.

    Returns a list of dicts shaped as
    {'context': <lowercased context>, 'qas': [{'question', 'id',
    'is_impossible': False, 'answers': [<placeholder>]}]}.
    """
    output = []
    for line in test:
        context = line[1]
        question = line[-1]
        qid = line[0]
        if type(context) != str or type(question) != str:
            # Report and skip the malformed row. Only context and question are
            # printed: this function has no `answer` variable, and printing one
            # (as the training helper does) raised NameError here.
            print(context, type(context))
            print(question, type(question))
            continue
        answers = [{'answer_start': 1000000, 'text': '__None__'}]
        qas = [{'question': question, 'id': qid, 'is_impossible': False, 'answers': answers}]
        output.append({'context': context.lower(), 'qas': qas})
    return output

# Build the test records that are later passed to predict().
qa_test = do_qa_test(test)

In [0]:
import logging

# Show INFO-level messages in general, but raise the "transformers" logger to WARNING
# so that library's INFO messages are not emitted.
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

In [0]:
from simpletransformers.question_answering import QuestionAnsweringModel

In [0]:
#For more arguments, refer to this link --> https://simpletransformers.ai/docs/usage/#configuring-a-simple-transformers-model

# Options handed to QuestionAnsweringModel; the same dict is written to args_train.json
# further down and read back when the saved model is reloaded.
args_train={'reprocess_input_data': True,
'overwrite_output_dir': True,
'learning_rate': 5e-5,
'num_train_epochs': 1,
'max_seq_length': 192,
'doc_stride': 64,
'fp16': False,
}

# NOTE(review): the do_qa_* helpers lowercase every context (and the training answers),
# while the checkpoint selected here is the *cased* 'distilbert-base-cased' — confirm this is intended.
model = QuestionAnsweringModel('distilbert', 'distilbert-base-cased', args=args_train, use_cuda=use_cuda)

In [0]:
# Fine-tune the model on the records produced by do_qa_train.
model.train_model(qa_train)

In [0]:
# Run the freshly trained model on the validation and test records.
# The next cell indexes each result as [0] (rows with an 'answer' field) and
# [1] (rows with a 'probability' field).
predictions_val = model.predict(qa_val)
predictions_test = model.predict(qa_test)

In [0]:
#@title Output with highest prob - Val and Test
def _unpack_predictions(predictions):
    """Split a predict() result into answer/probability frames and lists.

    predictions[0] is expected to hold records with an 'answer' field and
    predictions[1] records with a 'probability' field. Returns, in order: the
    answers frame, the probabilities frame, the per-row probability lists, a
    frame built from those lists, the per-row answer lists, and a frame built
    from those lists (its column 0 is the first answer of each row).
    """
    answers_frame = pd.DataFrame(predictions[0])
    probs_frame = pd.DataFrame(predictions[1])
    prob_lists = probs_frame['probability'].tolist()
    answer_lists = answers_frame['answer'].tolist()
    return (answers_frame, probs_frame,
            prob_lists, pd.DataFrame(prob_lists),
            answer_lists, pd.DataFrame(answer_lists))

# Validation predictions (variable names are kept because later cells read text2_val).
predictions_df_val = pd.DataFrame.from_dict(predictions_val)
(text_val, prob_val,
 prop1_val, prop2_val,
 text1_val, text2_val) = _unpack_predictions(predictions_val)

# Test predictions (later cells read text2_test).
predictions_df_test = pd.DataFrame.from_dict(predictions_test)
(text_test, prob_test,
 prop1_test, prop2_test,
 text1_test, text2_test) = _unpack_predictions(predictions_test)

In [0]:
# Work on copies so the result column added below does not modify val_df / test_df.
sub_val_df = val_df.copy()
sub_test_df = test_df.copy()

In [0]:
# Display the validation frame for a visual check.
sub_val_df

In [0]:
#create files to export 
# Column 0 of text2_val / text2_test holds the first answer returned for each record.
# NOTE(review): the assignment is positional, so it assumes no row was skipped by
# do_qa_val / do_qa_test; a skipped row would make the lengths differ — verify.
sub_val_df['selected_text_results'] = text2_val[0].values
sub_test_df['selected_text_results'] = text2_test[0].values

In [0]:
# Display the test frame, now including the selected_text_results column.
sub_test_df

In [0]:
"""from google.colab import files
sub_val_df.to_csv('sub_val.csv') 
files.download('sub_val.csv')
sub_test_df.to_csv('sub_test.csv') 
files.download('sub_test.csv')"""

In [0]:
"""train_df.to_csv("new_train_df")"""

# Save model, arguments and other files

In [0]:
# Persist the training arguments as JSON in the current working directory;
# the "Load a pre-trained model" section reads this file back.
with open('args_train.json', mode='w') as fp:
    fp.write(json.dumps(args_train))

# Load a pre-trained model

In [0]:
# Read the saved training arguments back from args_train.json.
with open('args_train.json', mode='r') as json_file:
    test_args = json.loads(json_file.read())

In [0]:
# Re-create the model from the relative 'outputs/' directory using the arguments read from args_train.json.
# NOTE(review): presumably 'outputs/' is where train_model wrote the fine-tuned weights — verify.
loaded_model = QuestionAnsweringModel('distilbert', 'outputs/', args=test_args, use_cuda=use_cuda)

In [0]:
# Predict again with the reloaded model; this overwrites predictions_val / predictions_test
# from the first model (derived variables such as text2_val are not refreshed by this cell).
predictions_val = loaded_model.predict(qa_val)
predictions_test = loaded_model.predict(qa_test)

In [0]:
# Start again from fresh copies of the source frames for the reloaded model's results.
sub_val_df = val_df.copy()
sub_test_df = test_df.copy()

In [0]:
# Display the fresh validation copy for a visual check.
sub_val_df

In [0]:
#create files to export 
# Take the first answer of each record from the *reloaded* model's predictions.
# text2_val / text2_test were computed from the first model's predictions and are not
# refreshed by loaded_model.predict, so the answers are re-derived here from
# predictions_val / predictions_test instead.
reloaded_answers_val = pd.DataFrame(pd.DataFrame(predictions_val[0])['answer'].tolist())
reloaded_answers_test = pd.DataFrame(pd.DataFrame(predictions_test[0])['answer'].tolist())
sub_val_df['selected_text_results'] = reloaded_answers_val[0].values
sub_test_df['selected_text_results'] = reloaded_answers_test[0].values

In [0]:
# Print the first rows of the test frame with the attached results.
print(sub_test_df.head())