In [1]:
import numpy as np 
import tensorflow as tf 
import os 
import pandas as pd 
import re
from transformers import AutoTokenizer , TFAutoModel 
from datasets  import Dataset

In [2]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [3]:
train_data.head()

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805
2,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676
3,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197,0.510845


## validate the text 
we remove some skip charchters and \n 

In [4]:
def remove_skip_chars(text) :
    text =  re.sub(r'[\n]' ,'' , text) 
    return re.sub(r'[^a-zA-Z0-9 ]' , '' , text)

In [5]:
train_data['excerpt']  = train_data.excerpt.apply(remove_skip_chars)
test_data['excerpt']  = test_data.excerpt.apply(remove_skip_chars)


In [6]:
train_data.head()

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009
1,85aa80a4c,,,All through dinner time Mrs Fayre was somewhat...,-0.315372,0.480805
2,b69ac6792,,,As Roger had predicted the snow departed as qu...,-0.580118,0.476676
3,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197,0.510845


In [7]:
train_dataset = Dataset.from_pandas(train_data[['id', 'excerpt' , 'target']])
test_dataset = Dataset.from_pandas(test_data[['id' , 'excerpt']])

In [8]:
tokinizer_id = "pretrained_tock_v1" 
model_id = 'pretrained_model_v1'

In [10]:
from transformers import AutoTokenizer , TFAutoModel 
tokenizer = AutoTokenizer.from_pretrained(tokinizer_id)

In [12]:
def tokenize_function(examples):
    return tokenizer(examples["excerpt"], padding='max_length', truncation=True)
tokenized_train_datasets = train_dataset.map(tokenize_function, batched=True)
tokenized_test_datasets = test_dataset.map(tokenize_function, batched=True)

100%|██████████| 3/3 [00:04<00:00,  1.66s/ba]
100%|██████████| 1/1 [00:00<00:00, 62.51ba/s]


In [14]:
small_train_dataset = tokenized_train_datasets.shuffle(seed=42).select(range(1000))

In [15]:
from transformers import DefaultDataCollator
data_collator = DefaultDataCollator(return_tensors="tf")

In [16]:
tf_train_dataset = small_train_dataset.to_tf_dataset(
    columns=["attention_mask", "input_ids"],
    label_cols=["target"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size = 8
)

tf_validation_dataset = tokenized_test_datasets.to_tf_dataset(
    columns=["attention_mask", "input_ids"],
    label_cols=None , 
    shuffle=False,
    collate_fn=data_collator,
    batch_size=8,
)

In [17]:
tf_train_dataset

<PrefetchDataset element_spec=({'input_ids': TensorSpec(shape=(2, None), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(2, None), dtype=tf.int64, name=None)}, TensorSpec(shape=(2,), dtype=tf.float32, name=None))>

In [18]:
class custom_model(tf.keras.Model): 
    def __init__(self) : 
        super(custom_model , self).__init__() 
        self.pretrained =  TFAutoModel.from_pretrained(model_id)
        self.average = tf.keras.layers.GlobalAveragePooling1D()
        # self.dense1 = tf.keras.layers.Dense(1024 , activation = 'relu')
        self.drop1 = tf.keras.layers.Dropout(.5)
        self.out = tf.keras.layers.Dense(1 )
    def call(self , inputs) : 
        features = self.pretrained(inputs) 
        X = self.average(features[0]) 
        # X=  self.dense1(X)
        X = self.drop1(X)
        return self.out(X)

In [19]:
model = custom_model()

All model checkpoint layers were used when initializing TFBartModel.

All the layers of TFBartModel were initialized from the model checkpoint at pretrained_model_v1.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBartModel for predictions without further training.


In [20]:
def rmse(y_true, y_pred):
    y_true, y_pred = tf.cast(y_true , tf.float32) , tf.cast(y_pred , tf.float32)
    squared_difference = tf.square(y_true - y_pred)
    return tf.sqrt(tf.reduce_mean(squared_difference, axis=-1))  # Note the `axis=-1`

In [21]:
model.compile(optimizer = 'adam' , loss = rmse)


In [22]:
tf_train_dataset

<PrefetchDataset element_spec=({'input_ids': TensorSpec(shape=(2, None), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(2, None), dtype=tf.int64, name=None)}, TensorSpec(shape=(2,), dtype=tf.float32, name=None))>

In [23]:
hist = model.fit(tf_train_dataset , epochs = 5 , batch_size = 2 )

Epoch 1/5
 19/500 [>.............................] - ETA: 15:54:35 - loss: 7.0588