In [1]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn import tree
from sklearn.cross_decomposition import PLSRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn import svm
from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MaxAbsScaler
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import validation_curve
from sklearn.model_selection import KFold
np.random.seed(0)
plt.style.use('ggplot')

/kaggle/input/commonlitreadabilityprize/sample_submission.csv
/kaggle/input/commonlitreadabilityprize/train.csv
/kaggle/input/commonlitreadabilityprize/test.csv


In [2]:
# Import training and testing data
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [3]:
# Display first 5 rows of training data
train_data.head()

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805
2,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676
3,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197,0.510845


In [4]:
test_data.head()

Unnamed: 0,id,url_legal,license,excerpt
0,c0f722661,,,My hope lay in Jack's promise that he would ke...
1,f0953f0a5,,,Dotty continued to go to Mrs. Gray's every nig...
2,0df072751,,,It was a bright and cheerful scene that greete...
3,04caf4e0c,https://en.wikipedia.org/wiki/Cell_division,CC BY-SA 3.0,Cell division is the process by which a parent...
4,0e63f8bea,https://en.wikipedia.org/wiki/Debugging,CC BY-SA 3.0,Debugging is the process of finding and resolv...


In [5]:
print(train_data['target'].min())
print(train_data['target'].max())

-3.676267773
1.711389827


We have datasets that score how easy a text is to read using the "target" column which is the output we want. The competition didn't specify what the bounds of this score was but from the training dataset we see the minimum score is -3.68 while the maximum was 1.71. The features we can use to train a model to get this target score is limited to the features in the test data which is id, url, license, and excerpt.

In [6]:
# Only select columns in training data set also in test data
train_data_trim = train_data[["id", "url_legal", "license", "excerpt", "target"]]
train_data_trim.head()

Unnamed: 0,id,url_legal,license,excerpt,target
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372
2,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118
3,dd1000b26,,,And outside before the palace a great garden w...,-1.054013
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197


In [7]:
# Find null values for columns
print("Trainig Data")
for col in train_data_trim.columns:
    print("number of null values in ", col, " = ", train_data_trim[col].isnull().sum(), " out of ", train_data_trim.shape[0])
    
print("\nTest Data")
for col in test_data.columns:
    print("number of null values in ", col, " = ", test_data[col].isnull().sum(), " out of ", test_data.shape[0])

Trainig Data
number of null values in  id  =  0  out of  2834
number of null values in  url_legal  =  2004  out of  2834
number of null values in  license  =  2004  out of  2834
number of null values in  excerpt  =  0  out of  2834
number of null values in  target  =  0  out of  2834

Test Data
number of null values in  id  =  0  out of  7
number of null values in  url_legal  =  4  out of  7
number of null values in  license  =  4  out of  7
number of null values in  excerpt  =  0  out of  7


We see that a substantial amount of data in the url_legal and license columns are missing so it's infeasible to use these columns for training so we'll emit these columns as well. There's no missing rows in the other columns so they should be alright to use.

In [8]:
train_data_trim = train_data[["id", "excerpt", "target"]]
train_data_trim.head()

Unnamed: 0,id,excerpt,target
0,c12129c31,When the young people returned to the ballroom...,-0.340259
1,85aa80a4c,"All through dinner time, Mrs. Fayre was somewh...",-0.315372
2,b69ac6792,"As Roger had predicted, the snow departed as q...",-0.580118
3,dd1000b26,And outside before the palace a great garden w...,-1.054013
4,37c1b32fb,Once upon a time there were Three Bears who li...,0.247197


In [9]:
train_data_trim.dtypes

id          object
excerpt     object
target     float64
dtype: object

As machine learning models only take in numerical input, we need to convert both the id and excerpt columns. This will be infeasible to do for the id column as there's no obvious pattern we could make use of for ids to transform it into numerical data while with the excerpt column, the text data could be tokenized and vectorized which we can do.

In [10]:
# Vectorize words of train data
vectorizer = TfidfVectorizer(
    strip_accents='unicode',
    stop_words='english',
    token_pattern=r'(?u)\b[A-Za-z]+\b',
    lowercase=True,
    max_features=13000) #looped through multiple numbers and had minimal rmse with 13000 features

X = vectorizer.fit_transform(train_data['excerpt'].values)

# Get target values of train data
y = train_data.loc[:,'target']

In [11]:
# Print tokenized words
print(vectorizer.get_feature_names()[:250])

['aai', 'aaminah', 'abagnale', 'abandon', 'abandoned', 'abbey', 'abbreviated', 'abe', 'abebe', 'abel', 'aberration', 'abike', 'abilities', 'ability', 'able', 'aboard', 'abode', 'abounded', 'abroad', 'abruptly', 'absence', 'absolute', 'absolutely', 'absorb', 'absorbed', 'absorbing', 'absorbs', 'absorption', 'abstain', 'abstract', 'abstracted', 'abstraction', 'absurd', 'abula', 'abundance', 'abundant', 'abyss', 'abyssal', 'academic', 'academy', 'accelerate', 'accelerated', 'acceleration', 'accent', 'accept', 'acceptance', 'accepted', 'access', 'accessible', 'accident', 'accidentally', 'accommodate', 'accommodated', 'accommodation', 'accompanied', 'accompany', 'accompanying', 'accomplish', 'accomplished', 'accomplishment', 'accomplishments', 'accord', 'according', 'accordingly', 'account', 'accounted', 'accounts', 'accumulate', 'accumulation', 'accuracy', 'accurate', 'accurately', 'accused', 'accustomed', 'ache', 'achieve', 'achieved', 'achievement', 'achievements', 'acid', 'acids', 'acqu

In [12]:
# Split data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [13]:
# Test different models to see best result
models = [MLPRegressor(random_state=42), tree.DecisionTreeRegressor(), KNeighborsRegressor(), svm.SVR(), KernelRidge()]
modelnames = ["MLP Regressor", "Decision Tree Regressor", "K Neighbors Regressor", "Support Vector Regression", "Kernel Ridge"]

for index,model in enumerate(models):
    # Train model to fit training data
    regr = model.fit(X_train,y_train)
    
    # Get mean square error for validation data
    validation_preds = regr.predict(X_val)
    print(modelnames[index], ": ", mean_squared_error(y_val, validation_preds, squared=False))

MLP Regressor :  0.7558103587312314
Decision Tree Regressor :  1.2176665865952547
K Neighbors Regressor :  0.9012276667825023
Support Vector Regression :  0.7933053740691738
Kernel Ridge :  0.7849454521688869


We can see after testing multiple regression models with their default settings, MLP Regressor gives us the lowest mean square error which is what we're trying to minimize for this contest. We'll use this model and tune the parameters to get an even lower mean square error.

In [14]:
# Increasing initial learning rate as model wasn't converging
estimator = MLPRegressor(learning_rate_init=0.01, random_state=42)

# Define the different parameters you want to test
param_grid = {'hidden_layer_sizes': [(50,50,50), (50,100,50), (100,1)],
          'activation': ['relu','tanh','logistic'],
          'alpha': [0.0001, 0.05],
          'learning_rate': ['constant','adaptive']}

# Test different parameter combinations using root mean square error as score and get best parameters
gsc = GridSearchCV(
    estimator,
    param_grid,
    cv=5, scoring='neg_root_mean_squared_error', verbose=True, n_jobs=-1)

grid_result = gsc.fit(X, y)

best_params = grid_result.best_params_

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  5.7min
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed: 25.6min finished


In [15]:
#Find out what the best parameters were
print(best_params)

{'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (50, 50, 50), 'learning_rate': 'constant'}


In [16]:
# Train MLP Regressor using best parameters
best_mlp = MLPRegressor(hidden_layer_sizes = best_params["hidden_layer_sizes"], 
                    activation = best_params["activation"],
                    alpha = best_params["alpha"],
                    learning_rate = best_params["learning_rate"],
                    max_iter= 1000,
                    n_iter_no_change=100,
                    random_state=42
                    ).fit(X_train, y_train)

In [19]:
# Get mean square error of best mlp regressor
val_preds = best_mlp.predict(X_val)
mean_squared_error(y_val, val_preds, squared=False)

0.7444619170980235

In [20]:
# Get X values from test set
X_test = vectorizer.transform(test_data['excerpt'])

In [21]:
# Predict values for test set
test_preds = best_mlp.predict(X_test)

In [22]:
# Output csv of predictions
predictions = pd.DataFrame()
predictions['id'] = test_data['id']
predictions['target'] = test_preds
predictions.to_csv("submission.csv",index=False)