In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
#jsonモジュールのインポート
import json
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from gensim.parsing.preprocessing import remove_stopwords
import nltk
from nltk.stem import SnowballStemmer
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn import metrics

!pip install --upgrade gensim



Requirement already up-to-date: gensim in /usr/local/lib/python3.7/dist-packages (4.0.1)


In [None]:
# English Snowball stemmer instance.
# NOTE(review): no active (uncommented) code in the visible notebook calls it.
snowball = SnowballStemmer(language='english')

In [None]:
# Colab only: mount Google Drive at /drive; the CSV paths read below live under it.
from google.colab import drive
drive.mount('/drive')

Drive already mounted at /drive; to attempt to forcibly remount, call drive.mount("/drive", force_remount=True).


In [None]:
# Load the CommonLit Readability test and train CSVs from the mounted Drive.
# NOTE(review): the absolute Drive paths are hard-coded; adjust them when running elsewhere.
test_df = pd.read_csv("/drive/My Drive/Colab Notebooks/CommonLit Readability/test.csv")
train_df = pd.read_csv("/drive/My Drive/Colab Notebooks/CommonLit Readability/train.csv")

## Baseline Preprocessing

In [None]:
def text_cleaning(text):
    """Normalize text to lowercase ASCII alphanumeric words separated by single spaces.

    Every run of characters outside ``[A-Za-z0-9]`` (punctuation, whitespace,
    underscores, non-ASCII characters) is collapsed into one space, and
    leading/trailing spaces are stripped.

    Parameters
    ----------
    text : object
        The value to clean. It is coerced with ``str()`` first, so non-string
        values are cleaned via their string form (e.g. a float NaN becomes
        ``"nan"``) instead of raising ``TypeError``.

    Returns
    -------
    str
        The cleaned text; an empty string if no alphanumeric characters remain.
    """
    # Coerce before doing anything else: the previous version iterated over
    # `text` character by character *before* calling str(), which raised
    # TypeError for non-iterable input such as NaN or an int.
    lowered = str(text).lower()
    # A single regex pass is enough: punctuation is already outside
    # [A-Za-z0-9], so a separate punctuation-to-space pass is redundant.
    return re.sub('[^A-Za-z0-9]+', ' ', lowered).strip()

In [None]:
# Clean the excerpts in place: the raw 'excerpt' column is overwritten in both
# frames, so the CSVs must be re-read to recover the original text.
train_df['excerpt'] = train_df['excerpt'].apply(text_cleaning)
test_df['excerpt'] = test_df['excerpt'].apply(text_cleaning)

In [None]:
def create_taggedDocument_from_text(row):
    """Build a gensim ``TaggedDocument`` from one DataFrame row.

    The row's ``'excerpt'`` text is split into tokens with
    ``nltk.word_tokenize``; the row's ``'id'`` value becomes the document's
    only tag. No stop-word removal or stemming is applied.

    Parameters
    ----------
    row : mapping-like
        Must provide ``'excerpt'`` (text) and ``'id'`` entries.

    Returns
    -------
    TaggedDocument
        ``words`` is the token list, ``tags`` is ``[row['id']]``.
    """
    tokens = list(nltk.word_tokenize(row['excerpt']))
    return TaggedDocument(words=tokens, tags=[row['id']])

In [None]:
# Download the NLTK 'punkt' resource. Presumably required by nltk.word_tokenize,
# which create_taggedDocument_from_text calls — confirm against the NLTK docs.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Build one TaggedDocument per row (row-wise apply) and store it in a new
# 'taggedDocument' column of each frame.
train_df['taggedDocument'] = train_df.apply(create_taggedDocument_from_text, axis=1)
test_df['taggedDocument'] = test_df.apply(create_taggedDocument_from_text, axis=1)

In [None]:
# Doc2Vec training corpus: train and test documents concatenated, so the test
# excerpts also get learned vectors (they are looked up by id from the model later).
training_docs = train_df['taggedDocument'].values.tolist() + test_df['taggedDocument'].values.tolist()

In [None]:
# Train the Doc2Vec model (all parameters below can be tuned).
# documents: the training data (a list of TaggedDocument) — here training_docs.
# min_count=5: only words that occur at least 5 times are used for training.
# dm=1: training algorithm = DM (distributed memory); dm=0 would select DBOW.
# NOTE(review): no `seed` is passed, so the learned vectors (and every score
# computed from them downstream) presumably differ between runs — confirm
# whether a fixed seed is wanted for reproducibility.
dvmodel = Doc2Vec(documents=training_docs, 
                epochs=50, 
                alpha=0.0025, 
                min_alpha=0.000001, 
                sample=0.001, 
                min_count=5, 
                window=15, 
                negative=5,
                ns_exponent=0.75, 
                dbow_words=0, 
                dm=1)

In [None]:
# Extract the learned Doc2Vec document vectors as model features.
#
# Each resulting frame has one row per document, in the same order as the
# source frame's 'id' column: an 'id' column followed by one integer-labelled
# column per embedding dimension (0, 1, 2, ...).
#
# The vectors are stacked into a single 2-D array and the frame is built in one
# call. The previous approach inserted one column per document into an empty
# DataFrame and transposed afterwards, which is slow for thousands of documents
# and produces a highly fragmented frame.
# Assumes the 'id' values are unique within each frame — TODO confirm.
train_docvecs_df = pd.DataFrame(
    np.asarray([dvmodel.dv[doc_id] for doc_id in train_df["id"]]),
    index=pd.Index(train_df["id"], name='id'),
).reset_index()

test_docvecs_df = pd.DataFrame(
    np.asarray([dvmodel.dv[doc_id] for doc_id in test_df["id"]]),
    index=pd.Index(test_df["id"], name='id'),
).reset_index()

In [None]:
# Hold out 30% of the training rows for validation (random_state fixed so the
# split is repeatable). Features are all vector columns (everything except
# 'id'); the target column comes from train_df.
# NOTE(review): features and target are paired by row position, which relies on
# train_docvecs_df having been built in train_df row order — confirm if either
# frame is ever re-sorted or filtered.
train_X, val_X, train_y, val_y = train_test_split(train_docvecs_df.drop('id', axis=1), train_df['target'], test_size = 0.3, random_state=71)
# Wrap both splits as LightGBM Datasets; the validation set references the training set.
lgb_train = lgb.Dataset(train_X.values, train_y.values)
lgb_eval = lgb.Dataset(val_X.values, val_y.values, reference=lgb_train)

In [None]:
# Baseline LightGBM model trained through the native lgb.train API.
params = {
    # regression task
    'objective': 'regression',
    # evaluate with RMSE
    'metric': 'rmse',
}
lgbModel = lgb.train(params, lgb_train, valid_sets=lgb_eval,
                     verbose_eval=100,  # print the evaluation result every 100 iterations
                     num_boost_round=1000,  # maximum number of boosting iterations
                     early_stopping_rounds=500,  # stop once the validation score has not improved for 500 rounds
                    )

Training until validation scores don't improve for 500 rounds.
[100]	valid_0's rmse: 0.860603
[200]	valid_0's rmse: 0.867301
[300]	valid_0's rmse: 0.868667
[400]	valid_0's rmse: 0.869191
[500]	valid_0's rmse: 0.869261
Early stopping, best iteration is:
[28]	valid_0's rmse: 0.841667


In [None]:
# Validation RMSE of the baseline model, predicting with its best (early-stopped)
# iteration. Kept in rmse_baseline for the final comparison cell.
y_pred = lgbModel.predict(val_X.values, num_iteration=lgbModel.best_iteration)
rmse_baseline = np.sqrt(metrics.mean_squared_error(val_y.values, y_pred))
rmse_baseline

0.8416665319177608

In [None]:
# Predict targets for the test documents from their vectors (the 'id' column is
# dropped), using the baseline model's best iteration.
predicted = lgbModel.predict(test_docvecs_df.drop('id', axis=1).values, num_iteration=lgbModel.best_iteration)

In [None]:
# Display the baseline predictions for the test set.
predicted

array([-1.31582229, -0.14363317, -0.2113018 , -1.35768384, -1.96663057,
       -0.71837581, -0.3442394 ])

## Model Improvement

In [None]:
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb

In [None]:
# LightGBM through its scikit-learn wrapper, with default hyperparameters (no tuning).
lgModel2 = lgb.LGBMRegressor().fit(train_X.values, train_y.values)

In [None]:
# Validation RMSE of the default LGBMRegressor.
# Note: `y_pred` and `rmse` are reassigned by every model-evaluation cell, so
# they always hold the values of whichever such cell ran last.
y_pred = lgModel2.predict(val_X.values)
rmse = np.sqrt(metrics.mean_squared_error(val_y.values, y_pred))
rmse

0.8357415269874051

In [None]:
#Linear Regression
from sklearn.linear_model import LinearRegression

# Fit a default LinearRegression on the training split of the document vectors.
lrModel = LinearRegression().fit(train_X.values, train_y.values)

In [None]:
# Validation RMSE of the linear regression model (overwrites `y_pred` / `rmse`).
y_pred = lrModel.predict(val_X.values)
rmse = np.sqrt(metrics.mean_squared_error(val_y.values, y_pred))
rmse

0.8635875226486939

In [None]:
#Ridge Regression
from sklearn.linear_model import Ridge

# Fit a Ridge regression with default settings on the training split.
rrModel = Ridge().fit(train_X.values, train_y.values)

In [None]:
# Validation RMSE of the ridge regression model (overwrites `y_pred` / `rmse`).
y_pred = rrModel.predict(val_X.values)
rmse = np.sqrt(metrics.mean_squared_error(val_y.values, y_pred))
rmse

0.8559836986869808

In [None]:
#Decision Tree
from sklearn.tree import DecisionTreeRegressor 

# Fit a default DecisionTreeRegressor on the training split.
# NOTE(review): no random_state is passed, so results may vary between runs — confirm.
dtModel = DecisionTreeRegressor().fit(train_X.values, train_y.values)

In [None]:
# Validation RMSE of the decision tree model (overwrites `y_pred` / `rmse`).
y_pred = dtModel.predict(val_X.values)
rmse = np.sqrt(metrics.mean_squared_error(val_y.values, y_pred))
rmse

1.124067296111009

In [None]:
#Random Forest
from sklearn.ensemble import RandomForestRegressor

# Fit a default RandomForestRegressor on the training split.
# NOTE(review): no random_state is passed, so results may vary between runs — confirm.
rfModel = RandomForestRegressor().fit(train_X.values, train_y.values)

In [None]:
# Validation RMSE of the default random forest (overwrites `y_pred` / `rmse`).
y_pred = rfModel.predict(val_X.values)
rmse = np.sqrt(metrics.mean_squared_error(val_y.values, y_pred))
rmse

0.8355171932119455

### Random Forest Hyperparameter Tuning

In [None]:
from sklearn.ensemble import RandomForestRegressor
from pprint import pprint

# Seeded random forest used as the estimator for the randomized search below.
rfModel2 = RandomForestRegressor(random_state = 42)

# Show its default hyperparameters as a reference point for the search space.
print('Parameters currently in use:\n')
pprint(rfModel2.get_params())

Parameters currently in use:

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}


In [None]:
from sklearn.model_selection import RandomizedSearchCV
# Candidate values for the randomized hyperparameter search. Each list below
# also stays available as a notebook-level variable.
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
# NOTE(review): 'auto' is accepted by the scikit-learn version this notebook
# ran on; confirm it is still supported before upgrading scikit-learn.
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
# None is added as an extra candidate alongside the numeric depths
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


In [None]:
# Randomized search over random_grid: 100 sampled parameter combinations, each
# scored with 3-fold cross-validation (300 fits in total) on negative mean
# squared error, using all available cores (n_jobs=-1).
# The saved log shows this is long-running: 158 of the 300 fits took about 94 minutes.
rf_random = RandomizedSearchCV(estimator = rfModel2, param_distributions = random_grid, scoring="neg_mean_squared_error", n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(train_X.values, train_y.values)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed: 21.5min
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed: 94.5min


In [None]:
# Best parameter combination found by the randomized search.
rf_random.best_params_

{'bootstrap': False,
 'max_depth': 30,
 'max_features': 'sqrt',
 'min_samples_leaf': 2,
 'min_samples_split': 10,
 'n_estimators': 800}

In [None]:
# Reference model: a small (10-tree) seeded random forest, scored on the validation split.
base_model = RandomForestRegressor(n_estimators = 10, random_state = 42)
base_model.fit(train_X.values, train_y.values)
pred_base = base_model.predict(val_X.values)
rmse_base = np.sqrt(metrics.mean_squared_error(val_y.values, pred_base))
print("RMSE Base: ",rmse_base)

# Forest re-built with hard-coded copies of the values that rf_random.best_params_
# reported; they must be updated by hand if the search is re-run and picks
# different parameters.
best_random = RandomForestRegressor(bootstrap = False, max_depth = 30, max_features = 'sqrt', min_samples_leaf = 2, min_samples_split = 10, n_estimators = 800, random_state = 42)
best_random.fit(train_X.values, train_y.values)
pred_random = best_random.predict(val_X.values)
rmse_random = np.sqrt(metrics.mean_squared_error(val_y.values, pred_random))
print("RMSE Random: ",rmse_random)

# Relative RMSE reduction of the tuned forest versus the 10-tree reference, in percent.
print('Improvement of {:0.2f}%.'.format( 100 * (rmse_base - rmse_random) / rmse_base))



RMSE Base:  0.8731974651457542
RMSE Random:  0.8242243824779221
Improvement of 5.61%.


In [None]:
from sklearn.model_selection import GridSearchCV
# Narrow, exhaustive grid centred on the randomized-search result:
# bootstrap and max_features are fixed to the values it selected, while the
# other parameters vary around (and include) the selected values.
# 5 * 3 * 3 * 5 = 225 combinations.
param_grid = {
    'bootstrap': [False],
    'max_depth': [30, 40, 50, 60, None],
    'max_features': ['sqrt'],
    'min_samples_leaf': [1, 2, 3],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [600, 700, 800, 900, 1000]
}

pprint(param_grid)

{'bootstrap': [False],
 'max_depth': [30, 40, 50, 60, None],
 'max_features': ['sqrt'],
 'min_samples_leaf': [1, 2, 3],
 'min_samples_split': [8, 10, 12],
 'n_estimators': [600, 700, 800, 900, 1000]}


In [None]:
# Base estimator for the grid search. random_state is fixed to 42, matching the
# other seeded forests in this notebook, so that candidates differ only by
# their hyperparameters and not by the forest's own randomness.
rf = RandomForestRegressor(random_state = 42)
# Exhaustive search over param_grid with 3-fold cross-validation on all cores.
# scoring is set explicitly to negative MSE so candidates are ranked by the same
# criterion as in the randomized search and in the RMSE comparisons; without it
# GridSearchCV falls back to the estimator's default score (R^2 for regressors).
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid,
                          scoring = "neg_mean_squared_error",
                          cv = 3, n_jobs = -1, verbose = 2)

grid_search.fit(train_X.values, train_y.values)

Fitting 3 folds for each of 225 candidates, totalling 675 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed: 17.9min
[Parallel(n_jobs=-1)]: Done 361 tasks      | elapsed: 39.2min


In [None]:
# Best parameter combination found by the grid search.
grid_search.best_params_

{'bootstrap': False,
 'max_depth': None,
 'max_features': 'sqrt',
 'min_samples_leaf': 2,
 'min_samples_split': 12,
 'n_estimators': 700}

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Reference model: the same 10-tree seeded forest as in the random-search
# comparison, re-fit here (this reassigns base_model / pred_base / rmse_base).
base_model = RandomForestRegressor(n_estimators = 10, random_state = 42)
base_model.fit(train_X.values, train_y.values)
pred_base = base_model.predict(val_X.values)
rmse_base = np.sqrt(metrics.mean_squared_error(val_y.values, pred_base))
print("RMSE Base: ",rmse_base)

# Forest re-built with hard-coded copies of the values that
# grid_search.best_params_ reported; update them by hand if the search is re-run.
best_grid = RandomForestRegressor(bootstrap = False, max_depth = None, max_features = 'sqrt', min_samples_leaf = 2, min_samples_split = 12, n_estimators = 700, random_state = 42)
best_grid.fit(train_X.values, train_y.values)
pred_grid = best_grid.predict(val_X.values)
rmse_grid = np.sqrt(metrics.mean_squared_error(val_y.values, pred_grid))
print("RMSE Grid: ",rmse_grid)

# Relative RMSE reduction of the grid-tuned forest versus the 10-tree reference, in percent.
print('Improvement of {:0.2f}%.'.format( 100 * (rmse_base - rmse_grid) / rmse_base))


RMSE Base:  0.8661309124647412
RMSE Grid:  0.8239196044889792
Improvement of 4.87%.


In [None]:
# Final comparison: baseline LightGBM (rmse_baseline) versus the grid-tuned
# random forest (rmse_grid), both measured on the same validation split.
# NOTE(review): the rmse_grid value in this cell's saved output differs slightly
# from the one printed by the cell that computes it, so the stored outputs
# appear to come from different executions — re-run the notebook top to bottom
# before quoting these numbers.
print("RMSE Baseline: ",rmse_baseline)
print("RMSE Improved Model: ",rmse_grid)
print('Improvement of {:0.2f}%.'.format( 100 * (rmse_baseline - rmse_grid) / rmse_baseline))

RMSE Baseline:  0.8416665319177608
RMSE Improved Model:  0.824183877763254
Improvement of 2.08%.
