In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input tree and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/transformers-pretrained-distilbert/distilbert-base-uncased-distilled-squad/config.json
/kaggle/input/transformers-pretrained-distilbert/distilbert-base-uncased-distilled-squad/vocab.txt
/kaggle/input/transformers-pretrained-distilbert/distilbert-base-uncased-distilled-squad/pytorch_model.bin
/kaggle/input/transformers-pretrained-distilbert/distilbert-base-uncased-distilled-squad/special_tokens_map.json
/kaggle/input/transformers-pretrained-distilbert/distilbert-base-uncased-distilled-squad/tokenizer_config.json
/kaggle/input/simple-transformers-pypi/simpletransformers-0.22.1-py3-none-any.whl
/kaggle/input/simple-transformers-pypi/seqeval-0.0.12-py3-none-any.whl
/kaggle/input/tweet-sentiment-extraction/sample_submission.csv
/kaggle/input/tweet-sentiment-extraction/test.csv
/kaggle/input/tweet-sentiment-extraction/train.csv


In [2]:
import torch
#!pip install transformers
# NOTE(review): no active code in the visible cells uses BertForQuestionAnswering
# -- confirm before removing this import.
from transformers import BertForQuestionAnswering
# Install seqeval and simpletransformers from the wheel files stored under
# /kaggle/input (-q keeps pip's output quiet).
!pip install '/kaggle/input/simple-transformers-pypi/seqeval-0.0.12-py3-none-any.whl' -q
!pip install '/kaggle/input/simple-transformers-pypi/simpletransformers-0.22.1-py3-none-any.whl' -q


# Must come after the pip installs above, which provide the simpletransformers package.
from simpletransformers.question_answering import QuestionAnsweringModel
# Directory of the pretrained DistilBERT (distilled on SQuAD) checkpoint; it is
# handed to QuestionAnsweringModel as the model location.
MODEL_PATH = '/kaggle/input/transformers-pretrained-distilbert/distilbert-base-uncased-distilled-squad/'




In [3]:
# Load the competition CSVs. Rows containing any missing value are removed
# from the training frame only; the test frame is kept as read.
train_df = pd.read_csv("/kaggle/input/tweet-sentiment-extraction/train.csv").dropna()
test_df = pd.read_csv("/kaggle/input/tweet-sentiment-extraction/test.csv")

# Question prompts, one per sentiment label. The exact strings matter
# (including the trailing spaces on two of them).
question_p="positive sentence? "
question_n="negative sentence?"
question_neu="neutral sentence? " 
def make_question(x):
    """Return the question prompt that corresponds to sentiment label ``x``.

    ``"positive"`` and ``"negative"`` map to their own prompts; every other
    value falls through to the neutral prompt.
    """
    if x=="positive":
        return question_p
    return question_n if x=="negative" else question_neu
    
# Replace each frame's sentiment label with its question prompt.
for labelled_df in (train_df, test_df):
    labelled_df["sentiment"] = labelled_df["sentiment"].apply(make_question)


In [4]:
def answer_index(x):
    """Return the offset of ``x["selected_text"]`` inside ``x["text"]``.

    The selected text is coerced with ``str`` before searching; the result is
    -1 when it does not occur in the text (``str.find`` semantics).
    """
    haystack = x["text"]
    needle = str(x["selected_text"])
    return haystack.find(needle)
# Row-wise (axis=1): record where each row's selected_text starts inside its text.
train_df["answer_index"]=train_df.apply(answer_index,axis=1)

def make_train_data(df):
    """Convert labelled tweets into SQuAD-style training records.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``text``, ``textID``, ``sentiment`` (used
        verbatim as the question), ``selected_text`` and ``answer_index``.
        Index labels must be unique so ``df.loc[label, column]`` is a scalar.

    Returns
    -------
    list of dict
        One ``{"context": ..., "qas": [...]}`` record per row. Context and
        answer text are lowercased; ``answer_start`` is a plain ``int``.
    """
    train_data=[]

    for ind in df.index:
        context=df.loc[ind,"text"].lower()
        answer=df.loc[ind,"selected_text"].lower()
        answer_start=int(df.loc[ind,"answer_index"])

        # answer_index is an offset into the ORIGINAL-case text, while the
        # record stores the lowercased context. lower() can change the length
        # of some characters (e.g. U+0130), shifting everything after them, and
        # the offset may also be -1. Keep the offset when it really points at
        # the answer; otherwise realign it against the lowercased context.
        if context[answer_start:answer_start+len(answer)]!=answer:
            realigned=context.find(answer)
            if realigned>=0:
                answer_start=realigned

        train_data.append({
            "context":context,
            "qas":[{
                "id":df.loc[ind,"textID"],
                "question":df.loc[ind,"sentiment"],
                "answers":[{"text":answer,"answer_start":answer_start}],
                "is_impossible":False,
            }],
        })
    return train_data


def make_test_data(df):
    """Build unlabelled SQuAD-style records, one per row of ``df``.

    Reads the ``text`` (lowercased into the context), ``textID`` and
    ``sentiment`` (used as the question) columns. No ``answers`` entry is
    produced.
    """
    return [
        {
            "context": df.loc[row, "text"].lower(),
            "qas": [
                {
                    "id": df.loc[row, "textID"],
                    "question": df.loc[row, "sentiment"],
                    "is_impossible": False,
                }
            ],
        }
        for row in df.index
    ]
        
def jaccard(str1, str2):
    """Word-level Jaccard similarity of two strings.

    Both strings are lowercased and split on whitespace into word sets. The
    score is ``|A & B| / |A | B|``; two strings with no words score 0.5.
    """
    words1 = set(str1.lower().split())
    words2 = set(str2.lower().split())
    if not words1 and not words2:
        return 0.5
    shared = len(words1 & words2)
    return float(shared) / (len(words1) + len(words2) - shared)
def get_jacc(m,n):
    """Mean Jaccard score between predicted answers and reference strings.

    ``m`` is read as ``m[i]['answer']`` and ``n`` as ``n[i]`` for every ``i``
    in ``range(len(m))``; the per-pair scores are averaged with ``np.mean``.
    """
    scores = [jaccard(m[i]['answer'], n[i]) for i in range(len(m))]
    return np.mean(scores)
        
def get_predlist(pred):
    """Collect the ``'answer'`` value of every prediction, preserving order."""
    return [entry['answer'] for entry in pred]

# Build the test-set records once; the same list is reused for prediction later on.
test_data=make_test_data(test_df)


In [5]:
from sklearn.model_selection import StratifiedKFold
import json
SEED=88888
np.random.seed(SEED)
# The model is trained through torch, so seed torch as well; seeding NumPy
# alone does not cover torch's random number generator.
torch.manual_seed(SEED)

num_splits=5
# Scratch file that is rewritten on every fold with that fold's training records.
TRAIN_JSON_PATH='/kaggle/working/trainn.json'
# Hyper-parameters are identical for every fold, so define them once.
MODEL_ARGS={'reprocess_input_data': True,
            'overwrite_output_dir': True,
            'learning_rate': 4e-5,
            'num_train_epochs': 3,
            'max_seq_length': 128,
            'doc_stride': 64,
            'fp16': False,
           }
# Use the GPU when one is present instead of assuming it unconditionally.
USE_CUDA=torch.cuda.is_available()

sss  = StratifiedKFold(n_splits=num_splits,shuffle=True,random_state=SEED)
# Stratify on the question column so each fold keeps the sentiment mix.
for fold,(train_index,val_index) in enumerate(sss.split(train_df.index,train_df.sentiment.values)):

    print('#'*25)
    print('### FOLD %i'%(fold+1))
    print('#'*25)

    # A fresh model per fold, always starting from the pretrained checkpoint.
    # A copy of the args dict is passed so folds cannot affect each other.
    model = QuestionAnsweringModel("distilbert",MODEL_PATH,
                                   args=dict(MODEL_ARGS),
                                   use_cuda=USE_CUDA)

    # reset_index returns a new frame; this avoids mutating an iloc slice in
    # place and gives make_train_data a unique 0..n-1 index.
    train_fold_df=train_df.iloc[train_index].reset_index(drop=True)
    train_data=make_train_data(train_fold_df)
    with open(TRAIN_JSON_PATH, 'w') as outfile:
        json.dump(train_data, outfile)

    val_df=train_df.iloc[val_index].reset_index(drop=True)
    val_data=make_train_data(val_df)

    model.train_model(TRAIN_JSON_PATH)
    predict_val=model.predict(val_data)
    print('JACCARD FOR FOLD %i : '%(fold+1),get_jacc(predict_val,list(val_df.selected_text)))

    print("*"*25,"making_test_predictions","*"*25)
    predictions=model.predict(test_data)
    # One column per fold; these columns are combined in a later cell.
    test_df['selected_text FOLD %i'%(fold+1)] = get_predlist(predictions)

#########################
### FOLD 1
#########################


100%|██████████| 21984/21984 [00:33<00:00, 659.77it/s]


HBox(children=(FloatProgress(value=0.0, description='Epoch', max=3.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=2748.0, style=ProgressStyle(descr…

Running loss: 3.997953



Running loss: 0.525864


HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=2748.0, style=ProgressStyle(descr…

Running loss: 1.157968



100%|██████████| 5496/5496 [00:06<00:00, 788.78it/s]


HBox(children=(FloatProgress(value=0.0, max=687.0), HTML(value='')))


JACCARD FOR FOLD 1 :  0.7030360940936244
************************* making_test_predictions *************************


100%|██████████| 3534/3534 [00:04<00:00, 790.95it/s]


HBox(children=(FloatProgress(value=0.0, max=442.0), HTML(value='')))


#########################
### FOLD 2
#########################


100%|██████████| 21984/21984 [00:33<00:00, 648.14it/s]


HBox(children=(FloatProgress(value=0.0, description='Epoch', max=3.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=2748.0, style=ProgressStyle(descr…

Running loss: 0.589872


HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=2748.0, style=ProgressStyle(descr…

Running loss: 0.449681



100%|██████████| 5496/5496 [00:07<00:00, 780.96it/s]


HBox(children=(FloatProgress(value=0.0, max=687.0), HTML(value='')))

In [6]:
import numpy as np

# Rows carrying this question get the whole tweet as their answer.
NEUTRAL_QUESTION='neutral sentence? '


def _cast_span_votes(lowered_text, fold_answer, start_votes, end_votes):
    """Add one fold's vote for the answer's start and end offsets.

    The start is located through the answer's first word and the end through
    its last word (first occurrence in ``lowered_text`` in both cases). A word
    that cannot be found casts no vote: ``str.find`` returns -1 in that case,
    which would otherwise be used as an index and wrap around to the tail of
    the vote arrays.
    """
    words = fold_answer.split(" ")
    last_word = words[-1]

    if len(words[0]) == 1 and len(words) > 1:
        # One-character first word: locate the SECOND word instead and step
        # back over "<char><space>" (presumably because a lone character
        # matches too many places -- confirm intent).
        anchor = lowered_text.find(words[1])
        start = anchor - 2 if anchor >= 0 else -1
    else:
        # Also covers a one-character, one-word answer, for which there is no
        # second word to anchor on.
        start = lowered_text.find(words[0])
    if start >= 0:
        start_votes[start] += 1

    last_at = lowered_text.find(last_word)
    if last_at >= 0:
        end_votes[last_at + len(last_word)] += 1


# One prediction column per fold, named the same way the fold loop names them.
pred_cols=['selected_text FOLD %i'%(k+1) for k in range(num_splits)]
pred_text=[]
for sentiment, text, fold_answers in zip(test_df['sentiment'], test_df['text'], test_df[pred_cols].values):
    if sentiment==NEUTRAL_QUESTION:
        pred_text.append(text)
        continue

    # Fold answers are lowercase, so vote on a lowercased copy of the tweet;
    # the leading space shifts every offset by one.
    lowered = " " + text.lower()
    start_votes = np.zeros(len(lowered)+1)
    end_votes = np.zeros(len(lowered)+1)
    for fold_answer in fold_answers:
        _cast_span_votes(lowered, fold_answer, start_votes, end_votes)

    # Most-voted offsets win; argmax resolves ties to the smallest offset.
    span = lowered[int(np.argmax(start_votes)):int(np.argmax(end_votes))]
    # An empty or blank span means the votes were unusable (nothing matched or
    # end <= start); fall back to the whole tweet rather than emit nothing.
    pred_text.append(span if span.strip() else text)
  

In [7]:
# Store the ensembled answers next to the per-fold columns and preview the first rows.
test_df['selected_text']=pred_text
test_df.head()

Unnamed: 0,textID,text,sentiment,selected_text FOLD 1,selected_text FOLD 2,selected_text FOLD 3,selected_text FOLD 4,selected_text FOLD 5,selected_text
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral sentence?,last session of the day http://twitpic.com/67ezh,last session of the day,last session of the day,last session of the day,last session of the day http://twitpic.com/67ezh,Last session of the day http://twitpic.com/67ezh
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive sentence?,exciting,exciting,exciting,exciting,exciting (precisely -- skyscrapers galore). go...,exciting
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative sentence?,such a shame!,such a shame!,shame!,such a shame!,shame!,such a shame!
3,01082688c6,happy bday!,positive sentence?,happy,happy bday!,happy,happy bday!,happy,happy
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive sentence?,i like it!!,i like,i like it!!,i like it!!,i like it!!,i like it!!


In [8]:
# Predictions from the most recent model.predict(test_data) call only; kept for
# inspection. The submission below is built from pred_text instead.
predictions_df = pd.DataFrame.from_dict(predictions)
sub_df=pd.read_csv("/kaggle/input/tweet-sentiment-extraction/sample_submission.csv")

# Join on textID instead of relying on sample_submission.csv and test.csv
# listing the tweets in the same order: a positional assignment would attach
# answers to the wrong tweets without any error if the orders ever differed.
if len(pred_text)!=len(test_df):
    raise ValueError("pred_text has %i entries but test_df has %i rows"%(len(pred_text),len(test_df)))
selected_by_id = dict(zip(test_df['textID'], pred_text))
unknown_ids = sub_df.loc[~sub_df['textID'].isin(set(selected_by_id)), 'textID']
if len(unknown_ids):
    raise ValueError("no prediction for %i submission textID(s), e.g. %s"%(len(unknown_ids), unknown_ids.iloc[0]))
sub_df['selected_text'] = sub_df['textID'].map(selected_by_id)

sub_df.to_csv('submission.csv', index=False)

In [9]:
# Show the submission frame as the cell output.
sub_df


Unnamed: 0,textID,selected_text
0,f87dea47db,Last session of the day http://twitpic.com/67ezh
1,96d74cb729,exciting
2,eee518ae67,such a shame!
3,01082688c6,happy
4,33987a8ee5,i like it!!
...,...,...
3529,e5f0e6ef4b,tired
3530,416863ce47,thanks
3531,6332da480c,. my little dog is sinking into depression..
3532,df1baec676,i love your videos!
