In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:

        #Columns

# id - a unique identifier
# context - the text of the Hindi/Tamil sample from which answers should be derived
# question - the question, in Hindi/Tamil
# answer_text (train only) - the answer to the question (manual annotation) (note: for test, this is what you are attempting to predict)
# answer_start (train only) - the starting character in context for the answer (determined using substring match during data preparation)
# language - whether the text in question is in Tamil or Hindi


      #Colonnes
    
# id - un identifiant unique
# context - le texte de l'échantillon en hindi/tamil à partir duquel les réponses doivent être dérivées
# question - la question, en hindi/tamoul
# answer_text (train only) - la réponse à la question (annotation manuelle) (note : pour le test, c'est ce que vous essayez de prédire)
# answer_start (train only) - le caractère de départ dans le contexte de la réponse (déterminé en utilisant la correspondance des sous-chaînes pendant la préparation des données)
# language - si le texte en question est en tamoul ou en hindi.


## Importation des librairies

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import re
import string
import nltk
from nltk.corpus import stopwords

from transformers import pipeline

import torch

## Importation des bases de données

#### Test cuda

In [4]:
import torch  

# Pick 'cuda' when torch reports a CUDA device as available, otherwise 'cpu'.
# NOTE(review): the question-answering pipeline cells visible in this notebook pass
# their own device argument rather than reading `device` - confirm this value is still needed.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

In [5]:
# Load the three competition tables, then report the (rows, columns) shape of each.
train = pd.read_csv("../input/alldata/train.csv")
test = pd.read_csv("../input/alldata/test.csv")
sample_submission = pd.read_csv("../input/alldata/sample_submission.csv")

for label, frame in (
    ('train set shape: ', train),
    ('Test set shape: ', test),
    ('Sample submission set shape: ', sample_submission),
):
    print(label, frame.shape)

In [6]:
train.head()

In [7]:
train.info()

In [8]:
test.head()

In [9]:
test.info()

In [10]:
sample_submission.head()

In [11]:
sample_submission.info()

In [12]:
# Preview the first training rows whose language is "hindi"

train[train.language == 'hindi'].head()

In [13]:
# Preview the first training rows whose language is "tamil"

train[train.language == "tamil"].head()

In [14]:
# Concatenate every training context of each language into one newline-separated string

is_tamil = train.language == 'tamil'
is_hindi = train.language == 'hindi'

tamil_context = train.loc[is_tamil, 'context'].str.cat(sep='\n')
hindi_context = train.loc[is_hindi, 'context'].str.cat(sep='\n')

In [15]:
# Report how many distinct characters each language's contexts use, and how the
# two alphabets overlap.

# Build each distinct-character set once: the context strings hold the whole
# training corpus, so rebuilding the set for every printed figure is wasteful.
tamil_chars = set(tamil_context)
hindi_chars = set(hindi_context)

print(
    '\nlength of tamil characters : ', len(tamil_chars),
    '\nlength of hindi characters : ', len(hindi_chars),
    '\nlength of hindi & tamil characters : ', len(tamil_chars & hindi_chars),
    '\nlength of only tamil characters : ', len(tamil_chars - hindi_chars),
    '\nlength of only hindi characters : ', len(hindi_chars - tamil_chars)
)

In [16]:
# Share of training rows per language (normalized counts), drawn as a pie chart with percentage labels.
train["language"].value_counts(normalize=True).plot(kind='pie', autopct='%1.1f%%')

In [17]:
# Scatter of the answer start offset against the row id.
# NOTE(review): the column notes describe `id` as a unique identifier - confirm it is a meaningful x axis.
train.plot.scatter(x='id',y='answer_start') 

## Tokenisation des mots

In [18]:
def text_cleaner(text):
    """Normalize raw text before tokenization.

    Steps, in order: lowercase; drop ``[...]`` bracketed spans; drop URLs;
    drop ``<...>`` tags; drop ASCII punctuation; turn newlines into spaces;
    drop every word that contains a digit.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. Whitespace is not collapsed, so removed spans can
        leave consecutive spaces behind.
    """
    text = text.lower()
    # Raw strings keep the regex escapes (\[, \S, \w, \d) away from Python's own
    # string-escape handling, which warns on these sequences.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Replace newlines with a space instead of deleting them: deleting glues the
    # last word of a line to the first word of the next one, and the digit
    # filter below would then remove the merged word as a whole.
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

In [19]:
def word_tokenizer(text):
    """Clean ``text`` and return its tokens joined by single spaces.

    Args:
        text: The string to clean and tokenize.

    Returns:
        The whitespace-separated tokens of ``text_cleaner(text)``, joined with
        one space between consecutive tokens.
    """
    cleaned = text_cleaner(text)
    # Split on whitespace rather than on the regex class \w+: Python's \w does
    # not match the combining vowel signs used by Hindi and Tamil, so a \w+
    # tokenizer breaks those words apart and drops the marks, which distorts
    # the length and word-count features computed from this output.
    return ' '.join(cleaned.split())

In [20]:
# Add the cleaned/tokenized context to `train`, with its character and word counts.
train['tokenized_text'] = train['context'].apply(str).apply(word_tokenizer)
train['text_len'] = train['tokenized_text'].astype(str).map(len)
train['text_word_count'] = train['tokenized_text'].map(lambda cell: len(str(cell).split()))

preview_columns = ['context', 'tokenized_text', 'text_len', 'text_word_count']
train[preview_columns].head(10)

In [21]:
# Add the cleaned/tokenized context to `test`, with its character and word counts.
test['tokenized_text'] = test['context'].apply(str).apply(word_tokenizer)
test['text_len'] = test['tokenized_text'].astype(str).map(len)
test['text_word_count'] = test['tokenized_text'].map(lambda cell: len(str(cell).split()))

preview_columns = ['context', 'tokenized_text', 'text_len', 'text_word_count']
test[preview_columns]

## Visualisation des données

In [22]:
# Count the non-null values of every column for each language.
train_groupby = train.groupby('language').count()

train_groupby

In [23]:
# Affichage de plage de colour
sns.color_palette('pastel')

In [24]:
# Text properties handed to plt.pie through its `textprops` argument
textprops = dict(horizontalalignment = 'center',
                 verticalalignment = 'top',
                 rotation = 0,
                 # rotation_mode = "default",
                 rotation_mode = 'anchor',
                 size = 14,
                 # color = "#81D8D0",
                 color = sns.color_palette('pastel')[-5],)





# Create the figure that will hold the pie chart
plt.figure(figsize=(6, 6))



# Draw the pie: one wedge per row of train_groupby, sized by its `id` count
plt.pie(x = train_groupby.id,
        labels = train_groupby.index,
        colors = sns.color_palette('pastel')[2 : 3] + sns.color_palette('pastel')[-1 : ],
        autopct = '%.2f%%',
        # explode = (0.02, 0.02),
        explode = [0.02] * 2,
        startangle = 90,
        pctdistance = 0.4,
        labeldistance = 1.2,
        textprops = textprops,)


# Configure the legend of the pie chart
# NOTE(review): the legend labels are hard-coded; this assumes train_groupby.index
# is ordered hindi, tamil - TODO confirm
legend = plt.legend(title = 'Séparation des textes par langue source',
                    title_fontsize = 'x-large',
                    #loc = 'lower center',
                    bbox_to_anchor = (0, -0.15, 0.5, 0.5),
                    labels = ['HINDI', 'TAMIL'],
                    labelcolor = sns.color_palette('pastel')[-5],
                    fontsize = 'large',
                    facecolor = '#F6F8ED',
                    edgecolor = sns.color_palette('pastel')[7],)


# Change the colour of the legend title
plt.setp(legend.get_title(), color = sns.color_palette('pastel')[5])

# Add a white circle at the centre of the axes so the pie renders as a ring
centre_circle = plt.Circle((0,0),0.70,fc = 'white')
fig = plt.gcf()
fig.gca().add_artist(centre_circle)

# Show the chart
plt.show()

In [25]:
sns.color_palette('Paired')

In [26]:
# Histogram (with KDE) of the cleaned-context character length, split by language.

# Set the style before the axes are created so it applies to them.
sns.set_style('whitegrid')

# Pin the hue order and give each language its own colour explicitly, so colours
# and legend entries stay tied to the right language whatever the row order of
# `train` is.
paired = sns.color_palette('Paired')
language_order = ['hindi', 'tamil']
language_palette = {'hindi': paired[9], 'tamil': paired[1]}

fig, ax = plt.subplots(figsize = (12, 6))

# A histogram does not depend on row order, so the column is passed by name
# instead of as a pre-sorted Series.
sns.histplot(data = train,
             x = 'text_len',
             hue = 'language',
             hue_order = language_order,
             bins = 100,
             kde = True,
             element = 'step',
             palette = language_palette,
             legend = True,
             ax = ax,)

ax.set(xlabel = 'longueur textes',
       ylabel = 'count',
       title = 'Distribution de la longueur du texte original',)

# Restyle the legend seaborn created rather than calling plt.legend(labels=...):
# the latter assigns labels to plot artists by position, which can pair a label
# with the wrong colour. Requires a seaborn version that provides move_legend.
sns.move_legend(ax,
                loc = 'upper right',
                title = 'Distinguer les échantillons par langue - Histogrammes',
                title_fontsize = 12,
                fontsize = 10,
                shadow = True,
                facecolor = 'white',
                labelcolor = paired[0],)

legend = ax.get_legend()

# Show the language names in upper case, as in the hand-written labels.
for entry in legend.get_texts():
    entry.set_text(entry.get_text().upper())

plt.setp(legend.get_title(),
         color = paired[-4],)

plt.show()

In [27]:
# Histogram (with KDE) of the cleaned-context word count, split by language.

# Set the style before the axes are created so it applies to them.
sns.set_style('darkgrid')

# Pin the hue order and give each language its own colour explicitly, so colours
# and legend entries stay tied to the right language whatever the row order of
# `train` is.
paired = sns.color_palette('Paired')
language_order = ['hindi', 'tamil']
language_palette = {'hindi': paired[5], 'tamil': paired[7]}

# Keep the Figure and the Axes under separate names.
fig, ax = plt.subplots(figsize = (12, 6))

# A histogram does not depend on row order, so the column is passed by name
# instead of as a pre-sorted Series.
sns.histplot(data = train,
             x = 'text_word_count',
             hue = 'language',
             hue_order = language_order,
             bins = 100,
             kde = True,
             element = 'step',
             palette = language_palette,
             ax = ax,)

ax.set(xlabel = 'longueur des textes tokenize',
       ylabel = 'count',
       title = 'Distribution de la longueur du texte Tokenize',)

# Restyle the legend seaborn created rather than calling plt.legend(labels=...):
# the latter assigns labels to plot artists by position, which can pair a label
# with the wrong colour. Requires a seaborn version that provides move_legend.
sns.move_legend(ax,
                loc = 'upper right',
                title = 'Distinguer les échantillons par langue - Histogrammes',
                title_fontsize = 12,
                fontsize = 10,
                shadow = True,
                facecolor = 'white',
                labelcolor = paired[4],)

legend = ax.get_legend()

# Show the language names in upper case, as in the hand-written labels.
for entry in legend.get_texts():
    entry.set_text(entry.get_text().upper())

plt.setp(legend.get_title(),
         color = paired[6],)

plt.show()

## Modeles 

### Modele1: Jaccard

In [28]:
def jaccard(str1, str2):
    """Word-level Jaccard similarity between two strings.

    Both strings are lowercased and split on whitespace; the score is the size
    of the intersection of the two token sets divided by the size of their union.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        A float in [0.0, 1.0]. When neither string contains a token the two
        sets are identical (both empty), so 1.0 is returned instead of
        dividing by zero.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    union_size = len(a | b)
    if union_size == 0:
        # Both token sets are empty: avoid 0 / 0.
        return 1.0
    return len(a & b) / union_size

In [29]:
def gen_answers(df):
    """Build a baseline answer for every row of ``df['context']``.

    For each context the answer is the first whitespace-separated token for
    which ``str.isnumeric()`` is true, together with the token that follows it.
    When no token is numeric, the first two tokens of the context are used.

    Args:
        df: Any object whose ``df['context']`` yields the context strings.

    Returns:
        A list with one answer string per context, in iteration order. A
        context without any token yields an empty string.
    """
    answers = []
    for text in df['context']:
        tokens = text.split()
        # Index of the first numeric token, or 0 when there is none. Unlike
        # np.argmax over a boolean array, this also works when `tokens` is empty.
        start = next((i for i, tok in enumerate(tokens) if tok.isnumeric()), 0)
        # Slicing clamps at the end of the list, so no explicit bound is needed.
        answers.append(' '.join(tokens[start:start + 2]))

    return answers

In [30]:
%%time
# Score the baseline on the training set: mean Jaccard between each annotated
# answer and the generated one.
train_ans = gen_answers(train)
pair_scores = [
    jaccard(truth, guess)
    for truth, guess in zip(train['answer_text'].values, train_ans)
]
score = np.mean(pair_scores)
print(f'Average Jaccard Score: {score}')

In [31]:
%%time
# Generate the baseline answers for the test set and save them as submission.csv.
test_ans = gen_answers(test)
# Take an explicit copy: adding a column to a bare slice of `test` triggers
# pandas' SettingWithCopyWarning and leaves it unclear which frame is modified.
submission = test[['id']].copy()
submission['PredictionString'] = test_ans
submission.to_csv('submission.csv', index=False)

In [32]:
submission.head()

### Modele 2 : bert-base-multilingual-cased-finetuned-squad

In [33]:
# Directory of the locally attached question-answering checkpoint.
model = "../input/bertbasemultilingualcasedfinetunedsquad/bert-base-multilingual-cased-finetuned-squad"

# The pipeline takes a device index: 0 is the first GPU, -1 the CPU. Falling
# back to -1 keeps the cell runnable on a session without a GPU.
pipeline_device = 0 if torch.cuda.is_available() else -1
qna = pipeline('question-answering', model = model, tokenizer = model, device = pipeline_device)

# Predict one answer span per test row, in row order.
predictions = []

for question, context in test[["question", "context"]].to_numpy():
    result = qna(context=context, question=question)
    predictions.append(result["answer"])

In [34]:
# Pair every test id with its predicted answer and save the result.
submission = pd.DataFrame({'id': test['id'], 'PredictionString': predictions})
submission.to_csv("submission2.csv", index=None)

submission.head()

### Modele 3 : xlm-roberta-squad2/deepset/xlm-roberta-base-squad2

In [35]:
# Directory of the locally attached question-answering checkpoint.
model = "../input/xlm-roberta-squad2/deepset/xlm-roberta-base-squad2"

# The pipeline takes a device index: 0 is the first GPU, -1 the CPU. Falling
# back to -1 keeps the cell runnable on a session without a GPU.
pipeline_device = 0 if torch.cuda.is_available() else -1
qna = pipeline('question-answering', model = model, tokenizer = model, device = pipeline_device)

# Predict one answer span per test row, in row order.
predictions = []

for question, context in test[["question", "context"]].to_numpy():
    result = qna(context=context, question=question)
    predictions.append(result["answer"])

In [36]:
# Pair every test id with its predicted answer and save the result.
submission = pd.DataFrame({'id': test['id'], 'PredictionString': predictions})
submission.to_csv("submission3.csv", index=None)

submission.head()

### Modele4 : xlm-roberta-squad2/deepset/xlm-roberta-large-squad2

In [37]:
# Model 4: directory of the locally attached question-answering checkpoint.
model = "../input/xlm-roberta-squad2/deepset/xlm-roberta-large-squad2"

# The pipeline takes a device index: 0 is the first GPU, -1 the CPU. Falling
# back to -1 keeps the cell runnable on a session without a GPU.
pipeline_device = 0 if torch.cuda.is_available() else -1
qna = pipeline('question-answering', model = model, tokenizer = model, device = pipeline_device)

# Predict one answer span per test row, in row order.
predictions = []

for question, context in test[["question", "context"]].to_numpy():
    result = qna(context=context, question=question)
    predictions.append(result["answer"])

In [38]:
# Pair every test id with its predicted answer and save the result.
submission = pd.DataFrame({'id': test['id'], 'PredictionString': predictions})
submission.to_csv("submission4.csv", index=None)

submission.head()

In [39]:
# NOTE(review): this cell builds the same frame from `test['id']` and `predictions`
# and writes it to "submission4.csv" again, exactly like the cell before it -
# confirm whether it is a leftover that can be removed.
submission = pd.DataFrame()
submission['id'] = test['id']
submission['PredictionString'] = predictions
submission.to_csv("submission4.csv", index=None)

submission.head()