In [1]:
import numpy as np
import os
import tensorflow as tf
from tensorflow import keras
from keras import layers, models
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import string

In [2]:
train = pd.read_csv('/kaggle/input/feedback-prize-english-language-learning/train.csv')
print(len(train))
display(train.head())

3911


Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5
2,00299B378633,"Dear, Principal\n\nIf u change the school poli...",3.0,3.5,3.0,3.0,3.0,2.5
3,003885A45F42,The best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0
4,0049B1DF5CCC,Small act of kindness can impact in other peop...,2.5,3.0,3.0,3.0,2.5,2.5


In [3]:
targets = train[['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']].values.tolist()
inputs = train['full_text'].values.tolist()

In [4]:
inputs[1], targets[1]

("When a problem is a change you have to let it do the best on you no matter what is happening it can change your mind. sometimes you need to wake up and look what is around you because problems are the best way to change what you want to change along time ago. A\n\nproblem is a change for you because it can make you see different and help you to understand how tings wok.\n\nFirst of all it can make you see different then the others. For example i remember that when i came to the United States i think that nothing was going to change me because i think that nothing was going to change me because everything was different that my country and then i realist that wrong because a problem may change you but sometimes can not change the way it is, but i remember that i was really shy but i think that change a lot because sometimes my problems make me think that there is more thing that i never see in my life but i just need to see it from a different way and dont let nothing happened and ruin

### Feature Engineering

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer 

vectorizer = TfidfVectorizer(smooth_idf = True, sublinear_tf = True, analyzer='char',ngram_range=(1, 3),
                             max_features=2000)

vectorizer.fit(inputs)

TfidfVectorizer(analyzer='char', max_features=2000, ngram_range=(1, 3),
                sublinear_tf=True)

In [6]:
def extract_vectors(x):
    vecs = vectorizer.transform(x)
    return vecs.toarray().flatten()

train['vecs'] = train['full_text'].apply(lambda x: extract_vectors([x]))

The [Flesch–Kincaid readability tests](https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests) are readability tests designed to indicate how difficult a passage in English is to understand. 

In [7]:
def syllable_count(word):
    word = word.lower()
    count = 0
    vowels = "aeiouy"
    if word[0] in vowels:
        count += 1
    for index in range(1, len(word)):
        if word[index] in vowels and word[index - 1] not in vowels:
            count += 1
    if word.endswith("e"):
        count -= 1
    if count == 0:
        count += 1
    return count

In [8]:
def flesch_kincaid_score(x):
    """
    The score is given by 206.835 - 1.015 × (total words ÷ total sentences) - 84.6 × (total syllables ÷ total words)
    """
    total_words = len(x.split())
    total_sentences = (len(x.split('.')))
    syllables = sum([syllable_count(w) for w in x.split()])
    return 206.835 - 1.015 * (total_words / total_sentences) - 84.6 * (syllables / total_words)

In [9]:
# some more features
train.drop(['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions'], axis=1, inplace = True)
train['char_count'] = train['full_text'].apply(len)
train['word_count'] = train['full_text'].apply(lambda x: len(x.split()))
train['word_density'] = train['char_count'] / (train['word_count'] + 1)
train['punctuation_count'] = train['full_text'].apply(lambda x: len("".join(_ for _ in x if _ in string.punctuation)))
train['flesch_kincaid'] = train['full_text'].apply(lambda x: flesch_kincaid_score(x))

In [10]:
eng_features = ['char_count', 'word_count', 'word_density', 'punctuation_count', 'flesch_kincaid']
feature_set = []
for i, row in train.iterrows():
    vecs = row['vecs']
    vals = row[eng_features].astype(float)
    features = np.hstack([vecs, vals]).flatten()
    feature_set.append(features)
inputs = np.array(feature_set)
targets = np.array(targets)

### ML Model

In [11]:
model = models.Sequential(name='baseline')
model.add(layers.Dense(32, activation='relu'))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(6))

2022-12-22 04:52:52.152482: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


In [12]:
model.compile(loss='mse', optimizer='adam', metrics='acc')

In [13]:
history = model.fit(x=inputs, y=targets, epochs=20)

2022-12-22 04:52:52.380714: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


### Evaluation

In [14]:
test = pd.read_csv('../input/feedback-prize-english-language-learning/test.csv')
test.head()

Unnamed: 0,text_id,full_text
0,0000C359D63E,when a person has no experience on a job their...
1,000BAD50D026,Do you think students would benefit from being...
2,00367BB2546B,"Thomas Jefferson once states that ""it is wonde..."


In [15]:
test['vecs'] = test['full_text'].apply(lambda x: extract_vectors([x]))
test['char_count'] = test['full_text'].apply(len)
test['word_count'] = test['full_text'].apply(lambda x: len(x.split()))
test['word_density'] = test['char_count'] / (test['word_count'] + 1)
test['punctuation_count'] = test['full_text'].apply(lambda x: len("".join(_ for _ in x if _ in string.punctuation)))
test['flesch_kincaid'] = test['full_text'].apply(lambda x: flesch_kincaid_score(x))

In [16]:
eng_features = ['char_count', 'word_count', 'word_density', 'punctuation_count', 'flesch_kincaid']
feature_set = []
for i, row in test.iterrows():
    vecs = row['vecs']
    vals = row[eng_features].astype(float)
    features = np.hstack([vecs, vals]).flatten()
    feature_set.append(features)
inputs = np.array(feature_set)

In [17]:
predictions = model.predict(inputs)
predictions = predictions.tolist()
predictions[0]

[3.04675030708313,
 2.552311658859253,
 2.7769510746002197,
 2.758589267730713,
 2.456598997116089,
 2.524026393890381]

In [18]:
submission = pd.DataFrame(predictions, columns=['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions'])
submission = pd.concat([test['text_id'], submission], axis=1)
submission

Unnamed: 0,text_id,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0000C359D63E,3.04675,2.552312,2.776951,2.758589,2.456599,2.524026
1,000BAD50D026,2.998186,2.739042,2.883553,2.842175,2.651947,2.797789
2,00367BB2546B,3.282906,2.896102,3.002149,2.991028,2.836655,2.905456


In [19]:
submission.to_csv('submission.csv', index=False)