In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm
tqdm.pandas()

from lightgbm import LGBMRegressor
from sklearn import model_selection
from sklearn import metrics

import string
from nltk.corpus import stopwords

In [None]:
# Directory holding the competition CSVs; change this single constant to point
# the notebook at a different data location.
DATA_DIR = '../input/feedback-prize-english-language-learning'

# Training frame, test frame and the submission template.
train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
ss = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

In [None]:
# Preview the first rows of the training frame.
train.head()

# EDA

## Check distributions

In [None]:
# Distribution of every target score in one labelled grid.
score_columns = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']

# Centre one bin on each score value instead of using the default 10
# equal-width bins, which leave an artificial empty bin in the middle of the
# range. Edges are derived from the data so no observation is dropped.
# NOTE(review): assumes scores are awarded in 0.5 steps — confirm against the
# competition's data description.
lowest_score = train[score_columns].min().min()
highest_score = train[score_columns].max().max()
bin_edges = np.arange(lowest_score - 0.25, highest_score + 0.5, 0.5)

fig, axes = plt.subplots(2, 3, figsize=(15, 8))
for ax, column in zip(axes.ravel(), score_columns):
    train[column].hist(ax=ax, bins=bin_edges)
    ax.set(title=column, xlabel='score', ylabel='number of essays')
fig.tight_layout();

In [None]:
# check length of content
def get_length_of_text(x):
    """Return `len(x)` — the character count when `x` is a string."""
    n_items = len(x)
    return n_items

# Measure every essay once and reuse the resulting Series for all four statistics.
text_lengths = train.full_text.apply(get_length_of_text)

print(f'Average length: {text_lengths.mean():0.2f}')
print(f'Std length: {text_lengths.std():0.2f}')
print(f'Min length: {text_lengths.min():0.2f}')
print(f'Max length: {text_lengths.max():0.2f}')

In [None]:
# Histogram of essay lengths (in characters).
train['full_text'].apply(get_length_of_text).hist();

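First question: why is there no value between 2.5 and 3 for any of the variables? Most likely this is a binning artifact rather than missing data: if the scores only take half-point values (1.0, 1.5, …, 5.0), the default 10 equal-width bins over [1, 5] are each 0.4 wide, so the bin [2.6, 3.0) can never contain a score and appears as a gap in a default histogram.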

## Check nans

In [None]:
# Number of missing values in each column of the training frame.
train.isna().sum()

## Check correlations

In [None]:
# Correlate the numeric columns only. `train` also carries non-numeric columns
# (e.g. `full_text`), and pandas >= 2.0 raises on those instead of silently
# dropping them, so they are filtered out explicitly.
colormap = sns.color_palette("Blues")
numeric_corr = train.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot=True, cmap=colormap);

All of the numeric variables are noticeably correlated with each other.

# Modeling

The goal is to predict all six target variables.

Two approaches:
- Multioutput regression
- Single output regression x6

In [None]:
# Re-display the first rows of the training frame before building features.
train.head()

In [None]:
# Work on a copy so the feature columns added below do not modify `train`.
df = train.copy()
# we are going to iterate through each target variable
# NOTE(review): `target_vars` is not referenced again in the visible cells — confirm it is still needed.
target_vars = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']

In [None]:
# we need to extract the vectors from the text
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectoriser with smoothed IDF and sublinear term-frequency scaling enabled.
vectorizer = TfidfVectorizer(smooth_idf=True, sublinear_tf=True) # this should be tuned in the future
# Fit on every training essay.
# NOTE(review): the cross-validation below reuses features from this fit, so
# each validation fold has already influenced the fitted vectoriser — confirm
# this mild leakage is acceptable.
vectorizer.fit(raw_documents=train.full_text)

In [None]:
def extract_vectors(x):
    """Transform the documents in `x` with the module-level `vectorizer`.

    The transformed matrix is converted to a dense array and flattened to one
    dimension before being returned.
    """
    dense_matrix = vectorizer.transform(x).toarray()
    return dense_matrix.flatten()

# extract_vectors([train.iloc[0].full_text])
# One flattened TF-IDF vector per essay, computed essay by essay with a progress bar.
df['vecs'] = train['full_text'].progress_apply(lambda essay: extract_vectors([essay]))

def syllable_count(word):
    """Estimate the number of syllables in `word` with a vowel-group heuristic.

    Every run of consecutive vowels (``a e i o u y``) counts as one syllable,
    one is subtracted when the word ends in ``e``, and any non-empty word is
    reported as having at least one syllable.

    Parameters
    ----------
    word : str
        A single token; matching is case-insensitive.

    Returns
    -------
    int
        The estimated syllable count, or ``0`` for an empty string (which
        previously raised ``IndexError``).
    """
    if not word:
        # Nothing to count; also avoids indexing into an empty string.
        return 0

    word = word.lower()
    vowels = "aeiouy"
    # A syllable starts at each vowel that is not preceded by another vowel.
    count = sum(
        1
        for index, char in enumerate(word)
        if char in vowels and (index == 0 or word[index - 1] not in vowels)
    )
    if word.endswith("e"):
        count -= 1
    # A non-empty word always has at least one syllable.
    return max(count, 1)


def flesch_kincaid_score(x):
    """Return a readability score for the text `x`.

    206.835 - 1.015 × (total words ÷ total sentences) - 84.6 × (total syllables ÷ total words).

    This is the Flesch reading-ease formula (higher means easier to read),
    despite the function's name. Words are whitespace-separated tokens,
    sentences are the non-blank fragments between full stops, and syllables
    are estimated with `syllable_count`.

    Parameters
    ----------
    x : str
        The text to score.

    Returns
    -------
    float
        The score, or ``nan`` when `x` contains no words (the formula is
        undefined there and previously raised ``ZeroDivisionError``).
    """
    words = x.split()
    total_words = len(words)
    if total_words == 0:
        return float("nan")

    # Ignore blank fragments so a text ending in "." is not credited with an
    # extra empty sentence; always count at least one sentence.
    total_sentences = max(sum(1 for fragment in x.split('.') if fragment.strip()), 1)
    syllables = sum(syllable_count(w) for w in words)
    return 206.835 - 1.015 * (total_words / total_sentences) - 84.6 * (syllables / total_words)

In [None]:
# extract features from text
# Sets give constant-time membership tests; the NLTK stop-word list would
# otherwise be scanned once for every word of every essay.
stop_words = set(stopwords.words('english'))
punctuation_chars = set(string.punctuation)

df['char_count'] = df['full_text'].progress_apply(len)
df['word_count'] = df['full_text'].progress_apply(lambda text: len(text.split()))
# Characters per word; the +1 keeps the division defined for a text with no words.
df['word_density'] = df['char_count'] / (df['word_count'] + 1)
df['punctuation_count'] = df['full_text'].progress_apply(lambda text: sum(ch in punctuation_chars for ch in text))
df['title_word_count'] = df['full_text'].progress_apply(lambda text: sum(wrd.istitle() for wrd in text.split()))
df['upper_case_word_count'] = df['full_text'].progress_apply(lambda text: sum(wrd.isupper() for wrd in text.split()))
df['stopword_count'] = df['full_text'].progress_apply(lambda text: sum(wrd.lower() in stop_words for wrd in text.split()))
df['flesch_kincaid_score'] = df['full_text'].progress_apply(flesch_kincaid_score)

In [None]:
# Engineered feature columns: everything in `df` except the id, the raw text,
# the six targets and the TF-IDF vector column.
non_feature_columns = ['text_id', 'full_text', 'cohesion', 'syntax', 'vocabulary',
                       'phraseology', 'grammar', 'conventions', 'vecs']
eng_features = df.columns.drop(non_feature_columns)

In [None]:
# Show which engineered feature columns were selected.
eng_features

In [None]:
# Assemble the design matrix in one step instead of iterating over rows:
# stack the per-essay TF-IDF vectors into a 2-D block and append the
# engineered feature columns on the right. The result has one row per essay,
# TF-IDF columns first and the `eng_features` columns last, as float values.
tfidf_block = np.vstack(df['vecs'].tolist())
engineered_block = df[eng_features].to_numpy(dtype=float)
X = np.hstack([tfidf_block, engineered_block])

In [None]:
# One target array per scored dimension, taken from `df` in row order.
(cohesion_train, syntax_train, vocabulary_train,
 phraseology_train, grammar_train, conventions_train) = (
    df[column].values
    for column in ('cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions')
)

## Hyperparameter optimization with Optuna

In [None]:
# Disabled Optuna search: the whole cell is a single string literal, so none
# of the code inside it runs. The objective it contains would fit an
# LGBMRegressor on a 70/30 split of (data, target) — by default X and
# `conventions_train` — and return the RMSE on the held-out part.
# NOTE(review): presumably this was run once per target to obtain the
# hard-coded *_best_params dictionaries — confirm.
'''
import optuna
from lightgbm.callback import log_evaluation, early_stopping

def objective(trial, data=X, target=conventions_train):
    
    train_x, test_x, train_y, test_y = model_selection.train_test_split(data, target, test_size=0.3, random_state=42)
    param = {
        'metric': 'rmse', 
        'random_state': 42,
        'n_estimators': trial.suggest_int('n_estimators', 10, 500),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 10.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 10.0),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.001, 0.01, 0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.2, 0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.004, 0.008, 0.01, 0.02, 0.05, .1, 0.2, 0.5]),
        'max_depth': trial.suggest_categorical('max_depth', [10, 20,100, 150]),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 300),
    }
    model = LGBMRegressor(**param)  
    
    model.fit(train_x, train_y, eval_set=[(test_x, test_y)], callbacks=[log_evaluation(period=0)])
    
    preds = model.predict(test_x)
    
    rmse = np.sqrt(metrics.mean_squared_error(test_y, preds))
    
    return rmse
'''

In [None]:
# Disabled Optuna driver: this cell is a string literal and does not execute.
# The code inside would create a minimising study, run 50 trials of
# `objective`, and print the number of trials, the best parameters and the
# best score.
'''
optuna.logging.set_verbosity(optuna.logging.INFO)

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)
print(f'Number of finished trials: {len(study.trials)}')
print(f'Best trial: {study.best_trial.params}')
print(f'Best score: {study.best_value}')
'''

In [None]:
# Hard-coded LGBMRegressor keyword arguments, one dictionary per target.
# NOTE(review): presumably copied from earlier Optuna runs — confirm the provenance.
# NOTE(review): none of these dictionaries sets `subsample_freq`; per the
# LightGBM documentation bagging is only active when the bagging frequency is
# non-zero, so `subsample` may have no effect here — confirm.
cohesion_best_params = {'n_estimators': 420, 'reg_alpha': 0.13919864437901744, 'reg_lambda': 0.5069801040693652, 'colsample_bytree': 0.8, 'subsample': 0.7, 'learning_rate': 0.02, 'max_depth': 100, 'min_child_samples': 99}
syntax_best_params = {'n_estimators': 500, 'reg_alpha': 0.25908918503224804, 'reg_lambda': 0.004271708198460402, 'colsample_bytree': 0.6, 'subsample': 1.0, 'learning_rate': 0.02, 'max_depth': 150, 'min_child_samples': 61}
vocabulary_best_params = {'n_estimators': 428, 'reg_alpha': 0.0010889416899550251, 'reg_lambda': 0.20016253704202466, 'colsample_bytree': 0.4, 'subsample': 0.6, 'learning_rate': 0.02, 'max_depth': 10, 'min_child_samples': 19}
phraseology_best_params = {'n_estimators': 426, 'reg_alpha': 0.09286405380355575, 'reg_lambda': 0.011669618789040185, 'colsample_bytree': 0.3, 'subsample': 0.4, 'learning_rate': 0.02, 'max_depth': 100, 'min_child_samples': 1}
grammar_best_params = {'n_estimators': 410, 'reg_alpha': 0.002402918599536554, 'reg_lambda': 0.011524180731876684, 'colsample_bytree': 0.8, 'subsample': 0.4, 'learning_rate': 0.02, 'max_depth': 10, 'min_child_samples': 40}
conventions_best_params = {'n_estimators': 468, 'reg_alpha': 0.057858250513145686, 'reg_lambda': 4.601288604571916, 'colsample_bytree': 0.8, 'subsample': 0.5, 'learning_rate': 0.05, 'max_depth': 150, 'min_child_samples': 34}

In [None]:
# Accumulators for the cross-validation cell below, which appends one average
# training score and one average validation score per target.
# NOTE(review): because these are initialised in their own cell, re-running
# only the cross-validation cell appends duplicate entries.
training_performances_container = []
val_performances_container = []

In [None]:
# 5-fold shuffled cross-validation, seeded so the folds are reproducible.
cv_strategy = model_selection.KFold(n_splits=5, shuffle=True, random_state=42)

# target name -> mean train / validation RMSE across the folds
performances = {}

# (target name, target values, LGBMRegressor keyword arguments)
cv_targets = [
    ("cohesion", cohesion_train, cohesion_best_params),
    ("syntax", syntax_train, syntax_best_params),
    ("vocabulary", vocabulary_train, vocabulary_best_params),
    ("phraseology", phraseology_train, phraseology_best_params),
    ("grammar", grammar_train, grammar_best_params),
    ("conventions", conventions_train, conventions_best_params),
]

for target_name, y, params in cv_targets:
    print(f'Validating on {target_name}')
    train_scores = []
    val_scores = []

    for fold, (train_idx, val_idx) in enumerate(cv_strategy.split(X, y)):
        X_train, y_train = X[train_idx], y[train_idx]
        X_val, y_val = X[val_idx], y[val_idx]

        # A fresh model per fold so nothing carries over between folds.
        model = LGBMRegressor(**params)
        model.fit(X_train, y_train)

        train_preds = model.predict(X_train)
        val_preds = model.predict(X_val)

        # The reported metric is RMSE (lower is better), not an accuracy.
        train_score = np.sqrt(metrics.mean_squared_error(y_train, train_preds))
        val_score = np.sqrt(metrics.mean_squared_error(y_val, val_preds))

        train_scores.append(train_score)
        val_scores.append(val_score)

        print(f"Fold {fold} ==> Train RMSE: {train_score:0.4f} | Validation RMSE: {val_score:0.4f}")

    training_performance = np.mean(train_scores)
    val_performance = np.mean(val_scores)

    performances[target_name] = {"train_rmse": training_performance, "val_rmse": val_performance}
    training_performances_container.append(training_performance)
    val_performances_container.append(val_performance)
    print(f"END. Average training performance: {training_performance:0.4f} | Average validation performance: {val_performance:0.4f}")


In [None]:
# One unfitted LGBMRegressor per target, each built from its own parameter dictionary.
(cohesion_model, syntax_model, vocabulary_model,
 phraseology_model, grammar_model, conventions_model) = (
    LGBMRegressor(**params)
    for params in (
        cohesion_best_params,
        syntax_best_params,
        vocabulary_best_params,
        phraseology_best_params,
        grammar_best_params,
        conventions_best_params,
    )
)

In [None]:
# Fit every final model on the full training matrix. A single loop replaces
# six copy-pasted print/fit pairs; the printed progress lines are unchanged.
final_fits = [
    ('Cohesion', cohesion_model, cohesion_train),
    ('Syntax', syntax_model, syntax_train),
    ('Vocabulary', vocabulary_model, vocabulary_train),
    ('Phraseology', phraseology_model, phraseology_train),
    ('Grammar', grammar_model, grammar_train),
    ('Conventions', conventions_model, conventions_train),
]
for label, estimator, target in final_fits:
    print(f'Fitting {label} Model')
    estimator.fit(X, target)

In [None]:
# Recreate on the test essays the same feature columns that were built for `df`.
_test = test.copy()
_test['vecs'] = _test['full_text'].apply(lambda essay: extract_vectors([essay]))
_test['char_count'] = _test['full_text'].progress_apply(len)
_test['word_count'] = _test['full_text'].progress_apply(lambda essay: len(essay.split()))
_test['word_density'] = _test['char_count'] / (_test['word_count'] + 1)
_test['punctuation_count'] = _test['full_text'].progress_apply(
    lambda essay: sum(1 for ch in essay if ch in string.punctuation)
)
_test['title_word_count'] = _test['full_text'].progress_apply(
    lambda essay: sum(1 for token in essay.split() if token.istitle())
)
_test['upper_case_word_count'] = _test['full_text'].progress_apply(
    lambda essay: sum(1 for token in essay.split() if token.isupper())
)
_test['stopword_count'] = _test['full_text'].progress_apply(
    lambda essay: sum(1 for token in essay.split() if token.lower() in stop_words)
)
_test['flesch_kincaid_score'] = _test['full_text'].progress_apply(flesch_kincaid_score)

In [None]:
# Assemble the test design matrix in one step instead of iterating over rows:
# TF-IDF vectors stacked into a 2-D block, followed by the `eng_features`
# columns as floats, one row per test essay.
test_tfidf_block = np.vstack(_test['vecs'].tolist())
test_engineered_block = _test[eng_features].to_numpy(dtype=float)
X_test = np.hstack([test_tfidf_block, test_engineered_block])

In [None]:
# Predict each target for the test matrix with its dedicated model.
(cohesion_predictions, syntax_predictions, vocabulary_predictions,
 phraseology_predictions, grammar_predictions, conventions_predictions) = (
    estimator.predict(X_test)
    for estimator in (
        cohesion_model,
        syntax_model,
        vocabulary_model,
        phraseology_model,
        grammar_model,
        conventions_model,
    )
)

In [None]:
# Fill a copy of the submission template with the predictions.
# Columns are set with bracket assignment: attribute-style assignment
# (`submission.cohesion = ...`) only updates a column that already exists and
# otherwise just sets a Python attribute on the frame.
# NOTE(review): assumes the template rows are in the same order as `test` — confirm.
submission = ss.copy()
predictions_by_target = {
    'cohesion': cohesion_predictions,
    'syntax': syntax_predictions,
    'vocabulary': vocabulary_predictions,
    'phraseology': phraseology_predictions,
    'grammar': grammar_predictions,
    'conventions': conventions_predictions,
}
for column, values in predictions_by_target.items():
    submission[column] = values

In [None]:
# Write the predictions to submission.csv in the working directory, without the index column.
submission.to_csv("submission.csv", index=False)