In [None]:
import re

import numpy as np
import pandas as pd
import tensorflow as tf
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, DefaultDataCollator, TFAutoModel


In [None]:
# Read both CSV files from the current working directory in one pass.
train_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [None]:
# Preview the first rows of the raw training frame before any preprocessing.
train_data.head()

In [None]:
def add_Q_A_signs(data):
    """Return a copy of ``data`` with marker tokens prepended to both questions.

    ``'<Q>'`` is prepended to every value of ``question1`` and ``'<A>'`` to
    every value of ``question2``. All other columns are passed through
    unchanged.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``question1`` and ``question2`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is left untouched, so calling this
        function does not alter whatever the caller still holds.

    Raises
    ------
    KeyError
        If either question column is missing.
    """
    def _prefixed(column, marker):
        # Missing text would stay NaN after string concatenation and later
        # reach the tokenizer as a float, so treat it as an empty question.
        return marker + data[column].fillna('').astype(str)

    # `assign` builds a new frame instead of writing into the caller's one.
    return data.assign(
        question1=_prefixed('question1', '<Q>'),
        question2=_prefixed('question2', '<A>'),
    )

In [None]:
# Tag the question columns of both frames with the <Q>/<A> markers.
train_data = train_data.pipe(add_Q_A_signs)
test_data = test_data.pipe(add_Q_A_signs)


In [None]:
# Summarise the test frame (columns, dtypes, non-null counts) after the
# marker tokens were added.
test_data.info()

In [None]:
# Columns kept for the labelled splits; the test split has no label column.
labelled_columns = ['question1', 'question2', 'is_duplicate']
labelled_frame = train_data[labelled_columns]

# First 10000 rows are used for training, the following 500 for evaluation.
train_dataset = Dataset.from_pandas(labelled_frame.iloc[:10000])
eval_dataset = Dataset.from_pandas(labelled_frame.iloc[10000:10500])
test_dataset = Dataset.from_pandas(test_data[['question1', 'question2']])

In [None]:
# Checkpoint name for the tokenizer; `tock` is the tokenizer object that
# `tokenize_dataset` calls in a later cell.
tock_id = 'clips/mfaq'
tock = AutoTokenizer.from_pretrained(tock_id) 


In [None]:
def tokenize_dataset(dataset):
    """Tokenize the two question columns of ``dataset`` as text pairs.

    ``dataset`` is indexed by column name (``question1`` / ``question2``) and
    the result of calling the module-level tokenizer ``tock`` is returned.
    Every pair is padded to the tokenizer's maximum length and truncated
    when longer.
    """
    first_texts = dataset["question1"]
    second_texts = dataset['question2']
    encoding_options = {'padding': 'max_length', 'truncation': True}
    return tock(first_texts, second_texts, **encoding_options)


In [None]:
# Run the tokenizer over both labelled splits, one batch of rows at a time.
tokenized_train_dataset = train_dataset.map(
    function=tokenize_dataset,
    batched=True,
)
tokenized_eval_dataset = eval_dataset.map(
    function=tokenize_dataset,
    batched=True,
)


In [None]:
# Collator that returns TensorFlow tensors; it is handed to `to_tf_dataset`
# as `collate_fn` in a later cell. `DefaultDataCollator` is imported with the
# other `transformers` names in the notebook's top import cell.
data_collator = DefaultDataCollator(return_tensors="tf")


In [None]:
# Display the tokenized training split to check which columns are available
# before selecting them for the tf.data conversion.
tokenized_train_dataset

In [None]:
def _as_tf_dataset(split):
    """Convert a tokenized split into a shuffled, batched tf.data.Dataset."""
    # The argument lists are created inside the function so each call gets
    # its own fresh objects, exactly like two separately written calls.
    return split.to_tf_dataset(
        columns=['attention_mask', 'input_ids'],
        label_cols=['is_duplicate'],
        shuffle=True,
        collate_fn=data_collator,
        batch_size=16,
    )


tf_train_dataset = _as_tf_dataset(tokenized_train_dataset)
tf_eval_dataset = _as_tf_dataset(tokenized_eval_dataset)

In [None]:
# Display the converted training pipeline as a quick sanity check.
tf_train_dataset

In [None]:
# Checkpoint loaded by `TFAutoModel.from_pretrained` inside `MODEL.__init__`.
# It is the same string as `tock_id`, so tokenizer and encoder match.
model_id = 'clips/mfaq'

In [None]:
class MODEL(tf.keras.Model):
    """Binary classifier head on top of a pretrained transformer encoder.

    The encoder's first output (indexed as ``features[0]``) is mean-pooled
    over the sequence axis, passed through dropout and mapped to a single
    sigmoid unit.
    """

    def __init__(self):
        super().__init__()
        # `model_id` is the module-level checkpoint name defined in an
        # earlier cell.
        self.extractor = TFAutoModel.from_pretrained(model_id)
        self.AveragePool = tf.keras.layers.GlobalAveragePooling1D()
        self.drop = tf.keras.layers.Dropout(.5)
        self.out = tf.keras.layers.Dense(1, activation='sigmoid')

    def call(self, inputs, training=None):
        """Run the encoder and classification head.

        Parameters
        ----------
        inputs
            Whatever the encoder accepts. When it is a mapping that contains
            ``'attention_mask'`` (as produced by the tf.data pipeline in this
            notebook), that mask is also used for pooling.
        training : bool or None
            Forwarded to the encoder and the dropout layer. ``None`` lets
            Keras decide from the calling context (e.g. ``fit``/``predict``).

        Returns
        -------
        Output of the final sigmoid ``Dense(1)`` layer.
        """
        features = self.extractor(inputs, training=training)
        token_states = features[0]

        # Sequences are padded to a fixed length, so an unmasked mean would
        # average over padding positions as well. Use the attention mask to
        # restrict the mean to real tokens whenever it is available.
        pooling_mask = inputs.get('attention_mask') if hasattr(inputs, 'get') else None
        if pooling_mask is not None:
            pooling_mask = tf.cast(pooling_mask, tf.bool)

        X = self.AveragePool(token_states, mask=pooling_mask)
        X = self.drop(X, training=training)
        return self.out(X)

In [None]:
# Instantiate the classifier; `MODEL.__init__` loads the pretrained encoder
# named by `model_id`.
model = MODEL()

In [None]:
# Adam's default step size (1e-3) is tuned for training from scratch and is
# far too large for fine-tuning a pretrained transformer: it tends to wipe
# out the pretrained weights within the first steps. Use a small
# fine-tuning learning rate instead.
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
model.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',  # matches the single sigmoid output unit
    metrics=['accuracy'],
)


In [None]:
# Both datasets are already batched by `to_tf_dataset(batch_size=16)`, so
# `batch_size` is not passed here: Keras expects it to be omitted when the
# input is a tf.data.Dataset.
hist = model.fit(
    tf_train_dataset,
    validation_data=tf_eval_dataset,
    epochs=2,
)