# **Environment Setup**

In [None]:
!pip install --quiet ipython-autotime
%load_ext autotime

In [None]:
from zipfile import ZipFile
import pandas as pd

# **Import from Kaggle**

In [None]:
!mkdir /root/.kaggle
!cp /content/kaggle.json /root/.kaggle
!chmod 600 /root/.kaggle/kaggle.json

In [None]:
!kaggle competitions download feedback-prize-english-language-learning -p /content/datasets

In [None]:
# Unpack the downloaded competition archive into the datasets directory.
archive_path = "/content/datasets/feedback-prize-english-language-learning.zip"
with ZipFile(archive_path, "r") as archive:
  archive.extractall("/content/datasets")

# **Data Exploration**

In [None]:
df = pd.read_csv("/content/datasets/train.csv")

In [None]:
df.columns

Index(['text_id', 'full_text', 'cohesion', 'syntax', 'vocabulary',
       'phraseology', 'grammar', 'conventions'],
      dtype='object')

time: 4.33 ms (started: 2022-10-16 08:27:57 +00:00)


In [None]:
df.head()

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5
2,00299B378633,"Dear, Principal\n\nIf u change the school poli...",3.0,3.5,3.0,3.0,3.0,2.5
3,003885A45F42,The best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0
4,0049B1DF5CCC,Small act of kindness can impact in other peop...,2.5,3.0,3.0,3.0,2.5,2.5


time: 31.6 ms (started: 2022-10-16 08:27:57 +00:00)


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3911 entries, 0 to 3910
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   text_id      3911 non-null   object 
 1   full_text    3911 non-null   object 
 2   cohesion     3911 non-null   float64
 3   syntax       3911 non-null   float64
 4   vocabulary   3911 non-null   float64
 5   phraseology  3911 non-null   float64
 6   grammar      3911 non-null   float64
 7   conventions  3911 non-null   float64
dtypes: float64(6), object(2)
memory usage: 244.6+ KB
time: 23.5 ms (started: 2022-10-16 08:27:57 +00:00)


# **Approach with ML Models**

**Dependencies**

In [None]:
from statistics import mean
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV,cross_validate
from xgboost import XGBRegressor

**Utility Functions**

In [None]:
def regression_report(scoring_vector):
  """Print the averaged error metrics stored in a cross-validation score mapping.

  `scoring_vector` is indexed with the keys 'test_neg_mean_absolute_error',
  'test_neg_mean_squared_error' and 'test_neg_median_absolute_error'. The values
  under each key are averaged and the absolute value of that average is printed,
  one labelled line per metric. Nothing is returned.
  """
  report_rows = (
      ("Mean Absolute Error:", 'test_neg_mean_absolute_error'),
      ("Mean Squared Error:", 'test_neg_mean_squared_error'),
      ("Median Absolute Error:", 'test_neg_median_absolute_error'),
  )
  for label, score_key in report_rows:
    averaged_score = mean(scoring_vector[score_key])
    print(label + str(abs(averaged_score)))
  

**Preprocessing**

We use a TF-IDF vectorizer (`TfidfVectorizer`) to tokenize the essays and convert them into numeric feature vectors.

In [None]:
# Fit the vectorizer on every essay and build the TF-IDF feature matrix X.
# NOTE(review): the vectorizer is fitted on the full dataset before the
# cross-validation cells below, so every validation fold has already influenced
# the learned vocabulary/IDF weights. Consider wrapping TfidfVectorizer and the
# regressor in a sklearn Pipeline so it is re-fitted per fold.
tfidf = TfidfVectorizer()

X = tfidf.fit_transform(df["full_text"])

In [None]:
tfidf.get_feature_names_out()[:15]

In [None]:
X.shape   #21363 features

(3911, 21363)

time: 5.45 ms (started: 2022-10-07 14:37:30 +00:00)


In [None]:
y = df.drop(["text_id","full_text"],axis=1)

In [None]:
cross_val_scoring = ["neg_mean_squared_error","neg_mean_absolute_error","neg_median_absolute_error"]

**Training and Testing with Linear Regressor**

In [None]:
linear_regressor = MultiOutputRegressor(LinearRegression())
scores = cross_validate(linear_regressor,X,y,cv=5,scoring=cross_val_scoring)

In [None]:
regression_report(scores)

Mean Absolute Error:0.5877873583883393
Mean Squared Error:0.5478169927430634
Median Absolute Error:0.4962948992294312
time: 12.7 ms (started: 2022-10-07 14:49:18 +00:00)


**Training and Testing with SVM Regressor**

In [None]:
# Grid search over C and gamma for an SVR wrapped in MultiOutputRegressor.
# The `estimator__` prefix routes each parameter to the wrapped SVR.
svr = SVR()
multi_regressor = MultiOutputRegressor(svr)
params = {
    'estimator__C': [0.1, 0.01, 1],
    'estimator__gamma': [0.1, 1, 10, 1000],
}
clf = GridSearchCV(multi_regressor, params, verbose=3)

In [None]:
clf.fit(X,y)

In [None]:
clf.best_params_

{'estimator__C': 1, 'estimator__gamma': 1}

time: 4.87 ms (started: 2022-10-08 00:46:01 +00:00)


In [None]:
multi_regressor = MultiOutputRegressor(SVR(C=1,gamma=1))

In [None]:
scores = cross_validate(multi_regressor,X,y,cv=5,scoring=cross_val_scoring)

In [None]:
regression_report(scores)

Mean Absolute Error:0.44030738436414085
Mean Squared Error:0.3035907313556448
Median Absolute Error:0.3757517880211312
time: 1.2 ms (started: 2022-10-08 18:35:28 +00:00)


**Training and Testing with XGBRegressor**

In [None]:
# Grid search over min_child_weight, gamma and max_depth for an XGBRegressor
# wrapped in MultiOutputRegressor. The `estimator__` prefix routes each
# parameter to the wrapped XGBRegressor.
xgb = XGBRegressor(n_estimators=100, objective='reg:squarederror')
multi_regressor = MultiOutputRegressor(xgb)
params = {
    'estimator__min_child_weight': [0.1, 1, 10],
    'estimator__gamma': [1, 10, 100],
    'estimator__max_depth': [3, 6, 10],
}
clf = GridSearchCV(multi_regressor, params, verbose=3)

In [None]:
clf.fit(X,y)

In [None]:
clf.best_params_

{'estimator__gamma': 1,
 'estimator__max_depth': 6,
 'estimator__min_child_weight': 10}

time: 5.16 ms (started: 2022-10-08 14:22:31 +00:00)


In [None]:
# Rebuild the regressor with the winning grid-search configuration reported by
# `clf.best_params_`: gamma=1, max_depth=6, min_child_weight=10.
# n_estimators and objective are repeated so the evaluated model matches the
# estimator that was actually searched.
# NOTE(review): re-run the evaluation cells after this one so the reported
# metrics correspond to these parameters.
multi_regressor = MultiOutputRegressor(
    XGBRegressor(n_estimators=100,objective='reg:squarederror',gamma=1,max_depth=6,min_child_weight=10)
)

In [None]:
scores = cross_validate(multi_regressor,X,y,cv=5,scoring=cross_val_scoring)

In [None]:
regression_report(scores)

Mean Absolute Error:0.4459536600999776
Mean Squared Error:0.309199442458324
Median Absolute Error:0.3842533310254415
time: 1.27 ms (started: 2022-10-08 18:48:55 +00:00)


# **Approach with TF Neural Networks**

**Dependencies**

In [None]:
!python -m spacy download en_core_web_lg

In [None]:
import tensorflow as tf
import spacy
import numpy as np

In [None]:
!mkdir /content/checkpoint

**Constants**

In [None]:
MAX_TOKENS = 30000 
OUTPUT_SEQUENCE_LENGTH=6044    #max text length in the dataset

**Preprocessing**

In [None]:
X = df['full_text']
y = df.drop(['text_id','full_text'],axis=1)

In [None]:
dataset = tf.data.Dataset.from_tensor_slices((X,y))

In [None]:
# Shuffle once with a fixed order, then carve out a stable 20% hold-out set.
# reshuffle_each_iteration=False is essential here: with the default (True) the
# dataset is re-shuffled on every pass, so `take`/`skip` would select different
# examples each epoch and validation examples would leak into training.
ds_cardinality = tf.data.experimental.cardinality(dataset)
shuffled_dataset = dataset.shuffle(buffer_size=ds_cardinality,seed=42,reshuffle_each_iteration=False)
test_size = ds_cardinality // 5  #20%
test_set = shuffled_dataset.take(test_size)
# Only the training portion is re-shuffled between epochs.
train_set = shuffled_dataset.skip(test_size).shuffle(buffer_size=ds_cardinality - test_size)

In [None]:
train_set = train_set.batch(32)
test_set = test_set.batch(32)

**Embedding Matrix generation**

In [None]:
text_vectorization = tf.keras.layers.TextVectorization(
    max_tokens=MAX_TOKENS,
    output_sequence_length=OUTPUT_SEQUENCE_LENGTH
)


text_vectorization.adapt(df['full_text'].to_numpy())

In [None]:
num_tokens = len(text_vectorization.get_vocabulary())

In [None]:
nlp = spacy.load('en_core_web_lg')

embedding_dim = len(nlp('The').vector)

In [None]:
# Row i holds the spaCy vector computed for vocabulary entry i, so the matrix
# rows line up with the positions in the TextVectorization vocabulary.
vocabulary = text_vectorization.get_vocabulary()
embedding_matrix = np.zeros((num_tokens,embedding_dim))
for token_id,token in enumerate(vocabulary):
  embedding_matrix[token_id] = nlp(str(token)).vector

In [None]:
embedding_layer = tf.keras.layers.Embedding(num_tokens,
                                            embedding_dim,
                                            input_length=OUTPUT_SEQUENCE_LENGTH,
                                            embeddings_initializer=tf.initializers.Constant(embedding_matrix),
                                            trainable=False)

In [None]:
# Random token ids in [0, num_tokens-1], used to smoke-test the embedding layer.
# `randint` has an exclusive upper bound, so high=num_tokens covers the same range
# as the deprecated `random_integers(high=num_tokens-1)`.
vector = np.random.randint(low=0,high=num_tokens,size=OUTPUT_SEQUENCE_LENGTH)

In [None]:
embedding_layer(vector)

**Training and Testing with LSTM Neural Network**

In [None]:
# Text regressor: raw string -> token ids -> embedding layer -> two stacked LSTMs -> 6 outputs.
# The input tensor is called `text_input` (not `input`) so the Python builtin
# `input()` is not shadowed for the rest of the notebook.
text_input = tf.keras.layers.Input(shape=(),dtype=tf.string)
x = text_vectorization(text_input)
x = embedding_layer(x)
# Both LSTMs return the full sequence; the per-timestep outputs are flattened
# into a single vector that feeds the Dense head.
x = tf.keras.layers.LSTM(256,return_sequences=True)(x)
x = tf.keras.layers.LSTM(256,return_sequences=True)(x)
x = tf.keras.layers.Flatten()(x)
output = tf.keras.layers.Dense(6)(x)

model = tf.keras.Model(inputs=text_input,outputs=output)

# MSE is the training loss; MAE and MSE are also tracked as metrics.
model.compile(optimizer="adam",loss="mse",metrics=["mae","mse"])

In [None]:
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None,)]                 0         
                                                                 
 text_vectorization (TextVec  (None, 6044)             0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 6044, 300)         7541100   
                                                                 
 lstm (LSTM)                 (None, 6044, 256)         570368    
                                                                 
 lstm_1 (LSTM)               (None, 6044, 256)         525312    
                                                                 
 flatten (Flatten)           (None, 1547264)           0         
                                                             

In [None]:
# Save the model to /content/checkpoint each time val_loss reaches a new minimum.
model_checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath="/content/checkpoint",
    monitor='val_loss',
    mode='min',
    save_best_only=True,
)
# Reduce the learning rate after 6 epochs without improvement, logging each reduction.
lr_plateau = tf.keras.callbacks.ReduceLROnPlateau(patience=6, verbose=1)
# Stop training after 16 epochs without improvement.
early_stopping = tf.keras.callbacks.EarlyStopping(patience=16)

In [None]:
model.fit(train_set,validation_data=(test_set),epochs=70,callbacks=[lr_plateau,early_stopping,model_checkpoint])

Epoch 1/70



Epoch 2/70



Epoch 3/70



Epoch 4/70



Epoch 5/70
Epoch 6/70
Epoch 7/70



Epoch 8/70



Epoch 9/70
Epoch 10/70
Epoch 11/70



Epoch 12/70
Epoch 13/70



Epoch 14/70
Epoch 15/70
Epoch 16/70



Epoch 17/70



Epoch 18/70
Epoch 19/70
Epoch 20/70
Epoch 21/70



Epoch 22/70



Epoch 23/70
Epoch 24/70
Epoch 25/70



Epoch 26/70
Epoch 27/70
Epoch 28/70
Epoch 29/70
Epoch 30/70



Epoch 31/70
Epoch 32/70
Epoch 33/70



Epoch 34/70
Epoch 35/70
Epoch 36/70
Epoch 37/70
Epoch 38/70
Epoch 39/70
Epoch 39: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 40/70
Epoch 41/70



Epoch 42/70
Epoch 43/70



Epoch 44/70



Epoch 45/70



Epoch 46/70



Epoch 47/70
Epoch 48/70



Epoch 49/70



Epoch 50/70



Epoch 51/70



Epoch 52/70



Epoch 53/70



Epoch 54/70



Epoch 55/70



Epoch 56/70



Epoch 57/70



Epoch 58/70



Epoch 59/70



Epoch 60/70



Epoch 61/70
Epoch 62/70



Epoch 63/70



Epoch 64/70



Epoch 65/70
Epoch 66/70



Epoch 67/70
Epoch 68/70
Epoch 69/70



Epoch 70/70





<keras.callbacks.History at 0x7f1e5065c690>

time: 2h 19min 54s (started: 2022-10-10 18:50:41 +00:00)


# **Approach with HF Transformers**

**Dependencies**

In [None]:
!pip install --quiet transformers

In [None]:
import tensorflow as tf
from transformers import AutoTokenizer
from transformers import TFAutoModel

In [None]:
import numpy as np

In [None]:
tf.config.run_functions_eagerly(True)

In [None]:
!mkdir /content/checkpoint
!mkdir /content/checkpoint/bert

**Utility Functions**

In [None]:
def bert_preprocess(text):
  """Tokenize one text with the notebook-level `tokenizer` and return the encoding's `.data`.

  The input is coerced with `str()`, padded with padding='max_length', truncated,
  and no attention mask is requested.
  """
  encoding = tokenizer(
      str(text),
      padding='max_length',
      truncation=True,
      return_attention_mask=False,
  )
  return encoding.data

**Preprocessing**

In [None]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [None]:
X = df['full_text']
y = df.drop(['full_text','text_id'],axis=1)  

In [None]:
encodings = tokenizer(X.to_list(),padding='max_length',truncation=True,return_attention_mask=False)

In [None]:
dataset = tf.data.Dataset.from_tensor_slices((
    dict(encodings),
    y
))

In [None]:
dataset = dataset.batch(2)

In [None]:
# `dataset` is already batched, so the cardinality counts batches: the first
# fifth of the batches is held out for evaluation, the rest is used for training.
ds_cardinality = tf.data.experimental.cardinality(dataset)
holdout_batches = ds_cardinality // 5
test_set = dataset.take(holdout_batches)
train_set = dataset.skip(holdout_batches)

In [None]:
MAX_LENGTH = 512

**Training and Testing with BERT**

In [None]:
from transformers import TFBertModel

In [None]:
def build_model():
    """Build and compile a BERT-based regressor with six outputs.

    Loads the pretrained 'bert-base-uncased' encoder, feeds it `input_ids` and
    `token_type_ids` tensors of length MAX_LENGTH, and puts a 6-unit Dense head
    on the hidden state at sequence position 0.

    Returns:
        A compiled `tf.keras.Model` (Adam with learning rate 1e-5, MSE loss,
        MAE and MSE metrics) whose inputs are named "input_ids" and
        "token_type_ids".
    """
    encoder = TFAutoModel.from_pretrained('bert-base-uncased')
    input_word_ids = tf.keras.Input(shape=(MAX_LENGTH,), dtype=tf.int32, name="input_ids")
    token_type_ids = tf.keras.Input(shape=(MAX_LENGTH,), dtype=tf.int32, name="token_type_ids")

    # Pass the tensors by name so the segment ids can never be mistaken for an
    # attention mask through positional ordering. [0] selects the first encoder
    # output, the per-token hidden states.
    embedding = encoder({"input_ids": input_word_ids, "token_type_ids": token_type_ids})[0]

    # Regress the six scores from the hidden state of the first token
    # (the [CLS] position for BERT tokenizers).
    output = tf.keras.layers.Dense(6)(embedding[:,0,:])

    model = tf.keras.Model(inputs=[input_word_ids, token_type_ids], outputs=output)
    # `learning_rate` is the supported argument name; `lr` is a deprecated alias
    # that newer Keras releases reject.
    model.compile(tf.keras.optimizers.Adam(learning_rate=1e-5), loss='mse', metrics=['mae',"mse"])
    return model

In [None]:
bert_model = build_model()

In [None]:
# Save the model to /content/checkpoint/bert each time val_loss reaches a new minimum.
model_checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath="/content/checkpoint/bert",
    monitor='val_loss',
    mode='min',
    save_best_only=True,
)
# Reduce the learning rate after 6 epochs without improvement, logging each reduction.
lr_plateau = tf.keras.callbacks.ReduceLROnPlateau(patience=6, verbose=1)
# Stop training after 16 epochs without improvement.
early_stopping = tf.keras.callbacks.EarlyStopping(patience=16)

In [None]:
bert_model.fit(train_set,validation_data=(test_set),epochs=35,callbacks=[model_checkpoint,lr_plateau,early_stopping])