In [1]:
# # This Python 3 environment comes with many helpful analytics libraries installed
# # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# # For example, here's several helpful packages to load

# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# # Input data files are available in the read-only "../input/" directory
# # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# ! pip install datasets transformers
!pip uninstall fsspec -qq -y
!pip install --no-index --find-links ../input/hf-datasets/wheels datasets -qq

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-cloud 0.1.13 requires tensorflow<3.0,>=1.15.0, which is not installed.
dask-cudf 21.6.1+2.g101fc0fda4 requires cupy-cuda112, which is not installed.
cudf 21.6.1+2.g101fc0fda4 requires cupy-cuda110, which is not installed.
s3fs 2021.6.1 requires fsspec==2021.06.1, but you have fsspec 2021.6.0 which is incompatible.
pytorch-lightning 1.3.8 requires fsspec[http]!=2021.06.0,>=2021.05.0, but you have fsspec 2021.6.0 which is incompatible.
dask-cudf 21.6.1+2.g101fc0fda4 requires dask<=2021.5.1,>=2021.4.0, but you have dask 2021.6.2 which is incompatible.
dask-cudf 21.6.1+2.g101fc0fda4 requires distributed<=2021.5.1,>=2.22.0, but you have distributed 2021.6.2 which is incompatible.[0m


In [3]:
from datasets import load_dataset, Dataset
from pprint import pprint
import pandas as pd
import numpy as np

# Preprocessing

In [4]:
# model_checkpoint = "distilbert-base-uncased"
# model_checkpoint = "deepset/xlm-roberta-large-squad2"
# model_checkpoint = "../input/xlm-roberta-squad2/deepset/xlm-roberta-base-squad2"
# model_checkpoint = "../input/pipeline-for-qa-train/test-chaii-trained"
model_checkpoint = "../input/pipeline-for-qa-train-with-5-folds-1-epoch/chaii-trained-model-0"

from transformers import XLMTokenizer,AutoTokenizer
# tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [5]:
tokenizer("What is your name?", "My name is Sylvain.")

{'input_ids': [0, 4865, 83, 935, 9351, 32, 2, 2, 2646, 9351, 83, 100973, 21845, 5, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

# Doc_stride is used to handle large text: tokens>512

In [6]:
max_length = 384 # The maximum length of a feature (question and context)
doc_stride = 128 # The authorized overlap between two part of the context when splitting it is needed.

# Reading Data

In [7]:
from sklearn.model_selection import StratifiedKFold
folds = 5
kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)
df = pd.read_csv('../input/chaii-hindi-and-tamil-question-answering/train.csv')
for f, (t_, v_) in enumerate(kf.split(X=df, y=df.language.values)):
        df.loc[v_, 'kfold'] = f

In [8]:
# In some padding required on left side
pad_on_right = tokenizer.padding_side == "right"
def prepare_validation_features(examples):
    # Some of the questions have lots of whitespace on the left, which is not useful and will make the
    # truncation of the context fail (the tokenized question will take a lots of space). So we remove that
    # left whitespace
    examples["question"] = [q.lstrip() for q in examples["question"]]

    # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
    # in one example possible giving several features when a context is long, each of those features having a
    # context that overlaps a bit the context of the previous feature.
    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

    # We keep the example_id that gave us this feature and we will store the offset mappings.
    tokenized_examples["example_id"] = []

    for i in range(len(tokenized_examples["input_ids"])):
        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)
        context_index = 1 if pad_on_right else 0

        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        tokenized_examples["example_id"].append(examples["id"][sample_index])

        # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token
        # position is part of the context or not.
        tokenized_examples["offset_mapping"][i] = [
            (o if sequence_ids[k] == context_index else None)
            for k, o in enumerate(tokenized_examples["offset_mapping"][i])
        ]

    return tokenized_examples

In [9]:
val_dataset = Dataset.from_pandas(df[:5])

In [10]:
val_dataset

Dataset({
    features: ['id', 'context', 'question', 'answer_text', 'answer_start', 'language', 'kfold'],
    num_rows: 5
})

In [11]:
features = prepare_validation_features(val_dataset[:5])
print(features.keys())

dict_keys(['input_ids', 'attention_mask', 'offset_mapping', 'example_id'])


In [12]:
validation_features = val_dataset.map(
    prepare_validation_features,
    batched=True,
    remove_columns=val_dataset.column_names
)

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [13]:
len(validation_features)

45

In [14]:
validation_features

Dataset({
    features: ['attention_mask', 'example_id', 'input_ids', 'offset_mapping'],
    num_rows: 45
})

# Postprocess and jaccard for Top 1

In [15]:
from tqdm.auto import tqdm
import collections


def postprocess_qa_predictions(examples, features, raw_predictions, n_best_size = 20, max_answer_length = 30):
    all_start_logits, all_end_logits = raw_predictions
    # Build a map example to its corresponding features.
    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
        features_per_example[example_id_to_index[feature["example_id"]]].append(i)

    # The dictionaries we have to fill.
    predictions = collections.OrderedDict()

    # Logging.
    print(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")

    # Let's loop over all the examples!
    for example_index, example in enumerate(tqdm(examples)):
        # Those are the indices of the features associated to the current example.
        feature_indices = features_per_example[example_index]

        min_null_score = None # Only used if squad_v2 is True.
        valid_answers = []
        
        context = example["context"]
        # Looping through all the features associated to the current example.
        for feature_index in feature_indices:
            # We grab the predictions of the model for this feature.
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]
            # This is what will allow us to map some the positions in our logits to span of texts in the original
            # context.
            offset_mapping = features[feature_index]["offset_mapping"]

            # Update minimum null prediction.
            cls_index = features[feature_index]["input_ids"].index(tokenizer.cls_token_id)
            feature_null_score = start_logits[cls_index] + end_logits[cls_index]
            if min_null_score is None or min_null_score < feature_null_score:
                min_null_score = feature_null_score

            # Go through all possibilities for the `n_best_size` greater start and end logits.
            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Don't consider out-of-scope answers, either because the indices are out of bounds or correspond
                    # to part of the input_ids that are not in the context.
                    if (
                        start_index >= len(offset_mapping)
                        or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or offset_mapping[end_index] is None
                    ):
                        continue
                    # Don't consider answers with a length that is either < 0 or > max_answer_length.
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue

                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    valid_answers.append(
                        {
                            "score": start_logits[start_index] + end_logits[end_index],
                            "text": context[start_char: end_char]
                        }
                    )
        
        if len(valid_answers) > 0:
            best_answer = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[0]
        else:
            # In the very rare edge case we have not a single non-null prediction, we create a fake prediction to avoid
            # failure.
            best_answer = {"text": "", "score": 0.0}
        
        # Let's pick our final answer: the best one or the null answer (only for squad_v2)
#         if not squad_v2:
        predictions[example["id"]] = best_answer["text"]
#         else:
#             answer = best_answer["text"] if best_answer["score"] > min_null_score else ""
#             predictions[example["id"]] = answer

    return predictions

In [16]:
def jaccard(row): 
    str1 = row['PredictionString']
    str2 = row['answer_text']
    a = set(str1.lower().split()) 
    b = set(str2.lower().split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

In [17]:
def cal_jac(final_predictions, val, i):
    final_list = list(final_predictions.values())
#     print(len(final_list))
#     print(val.head())
    val['PredictionString']=final_list
    val['jaccard_score']=val.apply(jaccard,axis=1)
#     print('jaccard_score: ',val['jaccard_score'].mean())
    val.to_csv(f'jaccard_score_fold_{i}.csv',index=False)
    return val['jaccard_score'].mean()

# Postprocess and jaccard for Top 5

In [18]:
from tqdm.auto import tqdm
import collections


def postprocess_qa_predictions_topn(examples, features, raw_predictions, n_best_size = 20, max_answer_length = 30,top=1):
    all_start_logits, all_end_logits = raw_predictions
    # Build a map example to its corresponding features.
    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
        features_per_example[example_id_to_index[feature["example_id"]]].append(i)

    # The dictionaries we have to fill.
    predictions = collections.OrderedDict()

    # Logging.
    print(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")

    # Let's loop over all the examples!
    for example_index, example in enumerate(tqdm(examples)):
        # Those are the indices of the features associated to the current example.
        feature_indices = features_per_example[example_index]

        min_null_score = None # Only used if squad_v2 is True.
        valid_answers = []
        
        context = example["context"]
        # Looping through all the features associated to the current example.
        for feature_index in feature_indices:
            # We grab the predictions of the model for this feature.
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]
            # This is what will allow us to map some the positions in our logits to span of texts in the original
            # context.
            offset_mapping = features[feature_index]["offset_mapping"]

            # Update minimum null prediction.
            cls_index = features[feature_index]["input_ids"].index(tokenizer.cls_token_id)
            feature_null_score = start_logits[cls_index] + end_logits[cls_index]
            if min_null_score is None or min_null_score < feature_null_score:
                min_null_score = feature_null_score

            # Go through all possibilities for the `n_best_size` greater start and end logits.
            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Don't consider out-of-scope answers, either because the indices are out of bounds or correspond
                    # to part of the input_ids that are not in the context.
                    if (
                        start_index >= len(offset_mapping)
                        or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or offset_mapping[end_index] is None
                    ):
                        continue
                    # Don't consider answers with a length that is either < 0 or > max_answer_length.
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue

                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    valid_answers.append(
                        {
                            "score": start_logits[start_index] + end_logits[end_index],
                            "text": context[start_char: end_char]
                        }
                    )
        
        if len(valid_answers) > 0:
            best_answer = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[:top]
            best_answers = []
            for ans in best_answer:
                best_answers.append(ans['text'])
        else:
            # In the very rare edge case we have not a single non-null prediction, we create a fake prediction to avoid
            # failure.
            best_answers = [""]
        
        # Let's pick our final answer: the best one or the null answer (only for squad_v2)
#         if not squad_v2:
#         print(best_answers)
        predictions[example["id"]] = best_answers
#         else:
#             answer = best_answer["text"] if best_answer["score"] > min_null_score else ""
#             predictions[example["id"]] = answer

    return predictions

In [19]:
def topn_jaccard(row): 
    strs = row['topn_PredictionString']
    str2 = row['answer_text']
    mx_score=0.0
    for str1 in strs:
        a = set(str1.lower().split()) 
        b = set(str2.lower().split())
        c = a.intersection(b)
        mx_score = max(float(len(c)) / (len(a) + len(b) - len(c)),mx_score)
    return mx_score

In [20]:
def cal_jac_topn(final_predictions, val, i):
    topn_final_list = list(final_topn_predictions.values())
#     print(len(topn_final_list))
    val['topn_PredictionString']=topn_final_list
    val['topn_jaccard_score']=val.apply(topn_jaccard,axis=1)
#     print('topn_jaccard_score: ',val['topn_jaccard_score'].mean())
    val.to_csv(f'topn_jaccard_score_fold_{i}.csv',index=False)
    return val['topn_jaccard_score'].mean()

# Prediction

In [21]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

In [22]:
def get_model(i):
    model_checkpoint = f"../input/pipeline-for-qa-train-with-5-folds-1-epoch/chaii-trained-model-{i}"
#     model_checkpoint = f"../input/pipeline-for-qa-train-with-5-folds/chaii-trained-model-{i}"
    model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)
    return Trainer(model)

In [23]:
def get_data(i,df):
    valid = df[df['kfold']==i]
    val_dataset = Dataset.from_pandas(valid)
    validation_features = val_dataset.map(
                                    prepare_validation_features,
                                    batched=True,
                                    remove_columns=val_dataset.column_names
                                )
    validation_features.set_format(type=validation_features.format["type"], columns=list(validation_features.features.keys()))
    return validation_features, valid, val_dataset

In [24]:
folds = 5
jac_score = []
topn_jac_score = []
for i in range(folds):
    print(f'predicting using model {i} start')
    trainer = get_model(i)
    validation_features, val, val_dataset = get_data(i,df)
    raw_predictions = trainer.predict(validation_features)
    final_predictions = postprocess_qa_predictions(val_dataset, validation_features, raw_predictions.predictions)
    jac_score.append(cal_jac(final_predictions,val,i))
    final_topn_predictions = postprocess_qa_predictions_topn(val_dataset, validation_features, raw_predictions.predictions,
                                                   top=5)
    topn_jac_score.append(cal_jac_topn(final_predictions,val,i))
    print(f'prediction of model {i} complete')
    print('-----------------------------------')

predicting using model 0 start


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




Post-processing 223 example predictions split into 2866 features.


HBox(children=(FloatProgress(value=0.0, max=223.0), HTML(value='')))




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Post-processing 223 example predictions split into 2866 features.


HBox(children=(FloatProgress(value=0.0, max=223.0), HTML(value='')))


prediction of model 0 complete
-----------------------------------
predicting using model 1 start


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




Post-processing 223 example predictions split into 2953 features.


HBox(children=(FloatProgress(value=0.0, max=223.0), HTML(value='')))




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Post-processing 223 example predictions split into 2953 features.


HBox(children=(FloatProgress(value=0.0, max=223.0), HTML(value='')))


prediction of model 1 complete
-----------------------------------
predicting using model 2 start


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




Post-processing 223 example predictions split into 2841 features.


HBox(children=(FloatProgress(value=0.0, max=223.0), HTML(value='')))




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Post-processing 223 example predictions split into 2841 features.


HBox(children=(FloatProgress(value=0.0, max=223.0), HTML(value='')))


prediction of model 2 complete
-----------------------------------
predicting using model 3 start


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




Post-processing 223 example predictions split into 3259 features.


HBox(children=(FloatProgress(value=0.0, max=223.0), HTML(value='')))




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Post-processing 223 example predictions split into 3259 features.


HBox(children=(FloatProgress(value=0.0, max=223.0), HTML(value='')))


prediction of model 3 complete
-----------------------------------
predicting using model 4 start


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




Post-processing 222 example predictions split into 2822 features.


HBox(children=(FloatProgress(value=0.0, max=222.0), HTML(value='')))




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Post-processing 222 example predictions split into 2822 features.


HBox(children=(FloatProgress(value=0.0, max=222.0), HTML(value='')))


prediction of model 4 complete
-----------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [25]:
print(jac_score)
print(sum(jac_score)/len(jac_score))

[0.708398010808325, 0.6305227035047662, 0.7055456874963603, 0.6913410207132182, 0.6615079365079364]
0.6794630718061212


In [26]:
print(topn_jac_score)
print(sum(topn_jac_score)/len(topn_jac_score))

[0.8557361733931241, 0.8678678197736491, 0.8836364432776989, 0.8842267971191738, 0.8336342836342837]
0.865020303439586
