In [None]:

import numpy as np 
import pandas as pd 



In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

sns.set(style='darkgrid', font_scale=1)

# Read the train and test CSVs once each. No train_test_split is performed
# here: the previous call split `data` against itself and its results were
# overwritten by these two reads before anything consumed them.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')


# Per-column null counts, shown together with the frame's (rows, columns) shape
train_df.isnull().sum(), train_df.shape

In [None]:
# Location of the pretrained BERT files; handed to `from_pretrained` when the
# tokenizer and the encoder are loaded.
# NOTE(review): absolute path — presumably a mounted dataset directory; confirm
# it exists in the environment this notebook runs in.
bert_path = '/huggingface-bert-variants/bert-base-uncased/bert-base-uncased'

In [None]:
# Score columns to visualise: one histogram panel per column
targets = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']

# Two rows of panels, with the columns split evenly between them
n_cols = len(targets) // 2
fig, axes = plt.subplots(2, n_cols, figsize=(15, 6))

for ax, target in zip(axes.flat, targets):
    sns.histplot(x=target, data=train_df, linewidth=1.25, alpha=1, ax=ax, zorder=2)
    ax.set_title(target)
    # The panel title already names the column, so drop the axis labels
    ax.set(xlabel=None, ylabel=None)

fig.suptitle('Output Variables')
plt.tight_layout()
plt.show()

In [None]:
k = len(train_df.columns) #number of variables for heatmap
f, ax = plt.subplots(figsize=(11, 11))

# Correlate numeric columns only. The frame still carries string columns, and
# DataFrame.corr() raises on those in pandas >= 2.0 (older versions silently
# skipped them), so select the numeric subset explicitly.
numeric_df = train_df.select_dtypes(include='number')

# Columns ordered by absolute correlation with 'syntax', strongest first.
# `k` is only an upper bound; nlargest returns every row when k exceeds the count.
cols = numeric_df.corr().abs().nlargest(k, 'syntax')['syntax'].index
cm = np.corrcoef(numeric_df[cols].values.T)
sns.set(font_scale=1.25)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10},
                 yticklabels=cols.values, xticklabels=cols.values, ax=ax)
plt.show()

# Show the ordered column index with 'cohesion' removed
cols = cols.drop('cohesion')
cols

In [None]:
# Stack the train and test rows into a single frame, remembering how many rows
# each part contributed so the two can be separated again by position.
ntrain, ntest = len(train_df), len(test_df)
all_data = (
    pd.concat((train_df, test_df))
    .reset_index(drop=True)
    .drop(columns=['text_id'])
)
print("all_data size is : {}".format(all_data.shape))

In [None]:
import re
import string

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Punctuation characters and the English stop-word list (as a set)
PUNCT_TO_REMOVE = string.punctuation
STOPWORDS = set(stopwords.words('english'))

# WordNet lemmatizer instance
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    """Normalise one essay string before tokenisation.

    Steps, in order: lower-case the text; delete URLs (``http(s)://...`` and
    ``www....``); replace ``@name`` mentions (together with the non-word
    characters that follow them), digit runs, ``#`` and apostrophes with a
    space; collapse all whitespace; keep only the first 512 words.

    Args:
        text: the raw string to clean.

    Returns:
        The cleaned, single-space-separated string of at most 512 words.
    """
    # (pattern, replacement) pairs, applied one after another in this order
    substitutions = (
        (r'https?://\S+|www\.\S+', ''),
        (r'@[0-9a-zA-Z]*\W+', ' '),
        (r'\d+', ' '),
        (r'\#', ' '),
        (r'\'', ' '),
    )

    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    # split() with no argument also collapses the whitespace runs left behind
    return ' '.join(cleaned.split()[:512])

In [None]:
# Quick manual check of preprocess() on a sample containing quotes, mixed case,
# an @mention and a hashtag; the cleaned string is printed for inspection.
test = 'Our \'fffff\'Deeds are the . Reason of @insta this #earthquake M'
print(preprocess(test))

In [None]:
# Clean every essay in place, one string at a time
all_data['full_text'] = all_data['full_text'].apply(preprocess)

In [None]:
# Undo the earlier concatenation by position: the first `ntrain` rows are the
# training rows, everything after them is the test set.
train_data = all_data.iloc[:ntrain].copy()
test_data = all_data.iloc[ntrain:]

train_data.shape, test_data.shape

In [None]:
BATCH_SIZE = 6

# Longest cleaned essay, measured in whitespace-separated words
word_counts = [len(text.split()) for text in all_data['full_text']]
MAX_LEN = max(word_counts)
print(MAX_LEN)

In [None]:
import tensorflow as tf
from transformers import BertTokenizer , TFBertModel 

# tf.data AUTOTUNE constant; passed to `prefetch` when the training dataset is built
AUTO = tf.data.experimental.AUTOTUNE

In [None]:
# Load the tokenizer from the files at `bert_path`
tokenizer = BertTokenizer.from_pretrained(bert_path)

In [None]:
def encode(input_text):
    """Tokenize a batch of texts with the module-level `tokenizer`.

    Every sequence is padded to, and truncated at, `MAX_LEN`.

    Args:
        input_text: the batch of strings to encode.

    Returns:
        The object returned by `tokenizer.batch_encode_plus`; callers in this
        notebook read its 'input_ids' entry.
    """
    return tokenizer.batch_encode_plus(
        input_text,
        max_length=MAX_LEN,
        padding='max_length',
        truncation=True,
    )

In [None]:
train_input = encode(train_data['full_text'].values.tolist())['input_ids']

# Every column other than the text is used as a regression target. Hand
# TensorFlow a plain float32 array rather than a DataFrame so the label
# dtype is explicit.
train_labels = train_data.drop('full_text', axis=1).to_numpy(dtype='float32')

train_data_ds = (
    tf.data.Dataset
    .from_tensor_slices((train_input, train_labels))
    # Keras ignores `shuffle=` in `fit` for tf.data inputs, so the shuffling
    # has to live in the pipeline. It sits before `repeat` so that each pass
    # still covers every example exactly once; the seed keeps runs repeatable.
    .shuffle(len(train_input), seed=50)
    .repeat()
    .batch(BATCH_SIZE)
    .prefetch(AUTO)
)

In [None]:
# Token ids for the test essays, batched in their existing order (no labels)
test_texts = test_data.full_text.values.tolist()
testing_input = encode(test_texts)['input_ids']

test_dataset = tf.data.Dataset.from_tensor_slices(testing_input).batch(BATCH_SIZE)

In [None]:
# Custom error function MCRMSE: mean column-wise root mean squared error
def MCRMSE(y_true, y_pred):
    """Mean column-wise RMSE over a batch.

    For inputs of shape (batch, n_targets): the squared error is averaged over
    the batch for each target column, the square root of each column's mean is
    taken, and the per-column RMSEs are then averaged.

    Args:
        y_true: ground-truth scores, shape (batch, n_targets).
        y_pred: predicted scores, same shape as `y_true`.

    Returns:
        A tensor of shape (1,) holding the batch MCRMSE.
    """
    y_pred = tf.convert_to_tensor(y_pred)
    # Align dtypes so the function also works when called outside Keras
    y_true = tf.cast(y_true, y_pred.dtype)

    # axis=0 averages over the samples, leaving one MSE per target column.
    # (Reducing over axis=1 would instead average the targets within each
    # sample, which is a per-sample RMSE rather than a column-wise one.)
    colwise_mse = tf.reduce_mean(tf.square(y_true - y_pred), axis=0)
    return tf.reduce_mean(tf.sqrt(colwise_mse), axis=-1, keepdims=True)

In [None]:
def create_model():
    """Build and compile the BERT-based regression model.

    The pretrained encoder loaded from `bert_path` receives `MAX_LEN` token
    ids; its first output is mean-pooled over the sequence axis,
    layer-normalised and mapped by a linear Dense layer to six outputs.

    Returns:
        The compiled Keras model (Adam with learning rate 1e-5, `MCRMSE` as
        both the loss and the tracked metric).
    """
    bert_encoder = TFBertModel.from_pretrained(bert_path)
    input_word_ids = tf.keras.Input(shape=(MAX_LEN,), dtype=tf.int32, name="input_word_ids")

    # NOTE(review): only token ids are fed in (no attention mask), so padded
    # positions presumably take part in attention and in the average below —
    # confirm whether that is intended.
    embedding = bert_encoder(input_word_ids)[0]
    x = tf.keras.layers.GlobalAveragePooling1D()(embedding)
    x = tf.keras.layers.LayerNormalization()(x)
    # Output layer without activation function because this is a regression task
    output = tf.keras.layers.Dense(6,)(x)

    model = tf.keras.models.Model(inputs=input_word_ids, outputs=output)
    # `metrics` is documented as a list; a bare callable is rejected by newer
    # Keras versions, so wrap it.
    model.compile(
        optimizer=tf.keras.optimizers.Adam(1e-5),
        loss=MCRMSE,
        metrics=[MCRMSE],
    )

    return model

In [None]:
# Build the compiled model and print its layer summary
model= create_model()
model.summary()

In [None]:
import  gc
# Run a full garbage-collection pass before training starts
gc.collect()

In [None]:
# Display the batch size the datasets were built with
BATCH_SIZE

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Stop once the training MCRMSE has failed to improve for two consecutive
# epochs and roll back to the best weights seen. The direction is given
# explicitly because it cannot be inferred from a custom metric's name.
callback = EarlyStopping(monitor='MCRMSE', mode='min', patience=2, restore_best_weights=True)

# `train_data_ds` is already batched and repeats indefinitely, so the epoch
# length comes from `steps_per_epoch`. `batch_size` and `shuffle` are not
# passed: Keras documents them as not applicable to tf.data inputs.
history = model.fit(
    train_data_ds,
    steps_per_epoch=train_data.shape[0] // BATCH_SIZE,
    epochs=3,
    verbose=1,
    callbacks=[callback],
)