In [None]:
import numpy as np
import pandas as pd
import random
import os

# List every file available under the Kaggle input directory, one full path per line.
input_root = '/kaggle/input'
for folder, _, names in os.walk(input_root):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

In [None]:
import torch
from sklearn.model_selection import train_test_split
import re

# Seed every random number generator the notebook touches, in a fixed order:
# Python's `random`, NumPy, then PyTorch (CPU and current CUDA device).
seed = 1337
for _apply_seed in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    _apply_seed(seed)

# Ask cuDNN to pick deterministic algorithms.
torch.backends.cudnn.deterministic = True


In [None]:
print('Making questions')

# One question prompt per sentiment label; make_question selects among them.
question_p, question_n, question_neu = (
    "identify the positive sentence? ",
    "identify the negative sentence?",
    "identify the neutral sentence? ",
)

# Raw competition training data.
train_df = pd.read_csv("/kaggle/input/tweet-sentiment-extraction/train.csv")

In [None]:
def make_question(x):
    """Return the question prompt that corresponds to a sentiment label.

    "positive" and "negative" select their dedicated prompts; any other value
    falls back to the neutral prompt.
    """
    if x == "positive":
        return question_p
    if x == "negative":
        return question_n
    return question_neu

# Discard every training row that has a missing value in any column.
train_df = train_df.dropna(axis=0, how="any")

In [None]:
def answer_index(x):
    """Return the offset of the first occurrence of the selected text in the tweet.

    `x` is a row-like mapping with "text" and "selected_text" entries. The
    selected text is converted with `str()` before searching. The result is
    -1 when the selected text does not occur in the text.
    """
    haystack = x["text"]
    needle = str(x["selected_text"])
    return haystack.find(needle)

def answer_end(x):
    """Return the inclusive offset of the last character of the selected text.

    `x` is a row-like mapping with "text" and "selected_text" entries. The
    selected text is converted with `str()` before searching.

    Returns -1 when the selected text does not occur in the text, mirroring
    the "not found" value of `str.find`. Without this guard the arithmetic
    below would turn the -1 from `find` into an unrelated in-range offset.
    """
    needle = str(x["selected_text"])
    start = x["text"].find(needle)
    if start < 0:
        # Not found: report the sentinel instead of a bogus position.
        return -1
    return start + len(needle) - 1

print('Making answers')


def _squeeze_spaces(raw):
    """Collapse each run of 2+ whitespace characters to one space, then drop one leading space."""
    collapsed = re.sub(r"\s{2,}"," ", raw)
    return re.sub(r"^ ","", collapsed)


# Apply the same whitespace clean-up to the tweet and to its selected span so
# the span can be located inside the cleaned tweet.
for column in ("text", "selected_text"):
    train_df[column] = train_df[column].apply(_squeeze_spaces)

# Character offsets of the selected span within each cleaned tweet.
train_df["answer_index"] = train_df.apply(answer_index, axis=1)
train_df["answer_end"] = train_df.apply(answer_end, axis=1)

target = train_df["selected_text"]

In [None]:
# Kept so the module-level name still exists; make_train_data no longer appends to it.
train_data=[]

def make_train_data(df):
    """Convert a training DataFrame into a list of SQuAD-style records.

    Each row of `df` must provide the columns "textID", "text", "sentiment",
    "selected_text", "answer_index" and "answer_end". One record is produced
    per row:

        {"context": <lower-cased text>,
         "qas": [{"id": <textID>,
                  "question": <sentiment>,
                  "answers": [{"text": <lower-cased selected_text>,
                               "answer_start": int, "answer_end": int}],
                  "is_impossible": False}]}

    A fresh list is built and returned on every call. The previous version
    appended to the module-level `train_data` list, so calling the function
    (or re-running the cell that calls it) more than once silently duplicated
    every record.
    """
    records = []
    for _, row in df.iterrows():
        answer = {
            "text": row["selected_text"].lower(),
            "answer_start": int(row["answer_index"]),
            "answer_end": int(row["answer_end"]),
        }
        question = {
            "id": row["textID"],
            # The sentiment label itself is used as the question text.
            "question": row["sentiment"],
            "answers": [answer],
            "is_impossible": False,
        }
        records.append({"context": row["text"].lower(), "qas": [question]})
    return records

print('Making train data')

# Turn the cleaned training frame into the record list that is written to JSON.
train_data = make_train_data(df=train_df)

#X_train, X_test, y_train, y_test = train_test_split(train_data, target, test_size=0.2, random_state=seed)

In [None]:
import json

print('Making json file')

# Write the training records to the path that is later passed to model.train_model.
train_json_path = '/kaggle/working/train.json'
with open(train_json_path, 'w') as train_file:
    json.dump(train_data, train_file)

#with open('/kaggle/working/X_test.json', 'w') as outfile:
 #   json.dump(X_test, outfile)


In [None]:
# Install seqeval 0.0.12 and simpletransformers 0.22.1 from wheel files stored under
# /kaggle/input/simple-transformers-pypi; -q reduces pip's output.
!pip install '/kaggle/input/simple-transformers-pypi/seqeval-0.0.12-py3-none-any.whl' -q
!pip install '/kaggle/input/simple-transformers-pypi/simpletransformers-0.22.1-py3-none-any.whl' -q

In [None]:
from simpletransformers.question_answering import QuestionAnsweringModel

# Directory holding the pretrained DistilBERT (SQuAD-distilled) weights.
model_distilbert = '/kaggle/input/transformers-pretrained-distilbert/distilbert-base-uncased-distilled-squad/'

# Options handed to Simple Transformers.
# NOTE(review): learning_rate 5e-3 looks far larger than usual fine-tuning rates — confirm it is intended.
# NOTE(review): doc_stride (128) is larger than max_seq_length (64) — presumably it should be smaller; confirm.
training_args = {
    'reprocess_input_data': True,
    'overwrite_output_dir': True,
    "do_lower_case": True,
    'learning_rate': 5e-3,
    'num_train_epochs': 1,
    'max_seq_length': 64,
    'doc_stride': 128,
    'fp16': False,
}

model = QuestionAnsweringModel(
    'distilbert',
    model_distilbert,
    args=training_args,
    use_cuda=True,
)

# Fine-tune on the JSON file written earlier in the notebook.
model.train_model('/kaggle/working/train.json')

In [None]:
# Kept so the module-level name still exists; make_test_data no longer appends to it.
test_data=[]

def make_test_data(df):
    """Convert a test DataFrame into a list of SQuAD-style records without answers.

    Each row of `df` must provide the columns "textID", "text" and
    "sentiment". One record is produced per row:

        {"context": <lower-cased text>,
         "qas": [{"id": <textID>,
                  "question": <sentiment>,
                  "is_impossible": False}]}

    A fresh list is built and returned on every call. The previous version
    appended to the module-level `test_data` list, so calling the function
    (or re-running the cell that calls it) more than once silently duplicated
    every record.
    """
    records = []
    for _, row in df.iterrows():
        question = {
            "id": row["textID"],
            # The sentiment label itself is used as the question text.
            "question": row["sentiment"],
            "is_impossible": False,
        }
        records.append({"context": row["text"].lower(), "qas": [question]})
    return records

In [None]:
#'''
print('Making test data')

test_df=pd.read_csv("/kaggle/input/tweet-sentiment-extraction/test.csv")
# Apply the whitespace clean-up to the *test* tweets. These two lines previously
# re-cleaned train_df['text'], which left the test tweets unnormalised.
test_df['text'] = test_df['text'].apply(lambda x: re.sub(r"\s{2,}"," ", x))
test_df['text'] = test_df['text'].apply(lambda x: re.sub(r"^ ","", x))

# Build the answer-less records used for prediction.
test_data=make_test_data(test_df)

test_data_json=json.dumps(test_data)

# Also persist the test records next to the training JSON.
with open('/kaggle/working/test.json', 'w') as outfile:
    json.dump(test_data, outfile)
  #  '''

In [None]:
print('Making predictions')
# Run the trained QA model over the in-memory test records returned by make_test_data.
predictions=model.predict(test_data)

In [None]:
print('Making DF')

# Start from the sample submission so the output file keeps its columns and row order.
sub_df = pd.read_csv("/kaggle/input/tweet-sentiment-extraction/sample_submission.csv")

# NOTE(review): the column assignment below aligns on the default integer index, so it
# presumably relies on predictions being in the same row order as the sample submission — confirm.
predictions_df = pd.DataFrame.from_dict(predictions)
sub_df['selected_text'] = predictions_df['answer']

sub_df.to_csv('submission.csv', index=False)