<a href="https://colab.research.google.com/github/Dash400air/CommonLit/blob/main/CommonLit_LightGBM%2BRoberta.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [77]:
# Mount Google Drive at /content/drive so files stored there can be read from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [78]:
!pip install transformers==4.5.0 pytorch-lightning==1.2.7



In [79]:
!pip install nltk



In [80]:
!pip install textstat



In [81]:
import os
import pandas as pd
import numpy as np


import random
import glob
import re
import string
from tqdm import tqdm
from nltk.corpus import stopwords
import nltk
from nltk.tokenize import word_tokenize

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import RobertaTokenizer, RobertaModel
import pytorch_lightning as pl

from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.feature_extraction.text import TfidfVectorizer

from keras.preprocessing.text import Tokenizer

import lightgbm as lgb

from fastprogress.fastprogress import  progress_bar

In [102]:
# Locations of the competition CSVs on the mounted Drive.
train_csv_path = '/content/drive/MyDrive/Colab Notebooks/data/Common Lit kaggle/train.csv'
test_csv_path = '/content/drive/MyDrive/Colab Notebooks/data/Common Lit kaggle/test.csv'

df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)

# Textstat

In [103]:
import textstat

def get_readability(df):
    """Compute textstat readability metrics for every excerpt in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``excerpt`` column holding the texts to score.

    Returns
    -------
    pandas.DataFrame
        One row per excerpt, in input order, on a fresh positional index,
        with the columns ``fre``, ``smog``, ``fkg``, ``cli``, ``arbi``,
        ``dcrs``, ``dw``, ``lwf`` and ``gf``. The columns are present even
        when ``df`` has no rows.
    """
    columns = ['fre', 'smog', 'fkg', 'cli', 'arbi', 'dcrs', 'dw', 'lwf', 'gf']

    rows = [
        [
            textstat.flesch_reading_ease(text),
            textstat.smog_index(text),
            textstat.flesch_kincaid_grade(text),
            textstat.coleman_liau_index(text),
            textstat.automated_readability_index(text),
            textstat.dale_chall_readability_score(text),
            textstat.difficult_words(text),
            textstat.linsear_write_formula(text),
            textstat.gunning_fog(text),
        ]
        for text in df['excerpt'].values.tolist()
    ]

    # Name the columns at construction time: renaming positional columns
    # afterwards leaves an empty input with no columns at all, which breaks
    # any later selection by metric name.
    return pd.DataFrame(rows, columns=columns)

In [104]:
# Score both splits with the same set of readability metrics.
readability_train = get_readability(df)
readability_test = get_readability(test_df)

In [105]:
# Preview the first rows of the training readability features.
readability_train.head()

Unnamed: 0,fre,smog,fkg,cli,arbi,dcrs,dw,lwf,gf
0,80.31,8.6,6.1,8.06,8.3,6.65,25,9.0,8.31
1,82.54,8.3,5.2,6.78,7.2,5.92,17,7.285714,7.53
2,75.74,10.1,7.9,7.2,10.1,6.29,17,14.75,10.49
3,72.02,6.7,11.4,8.54,16.4,6.61,14,12.5,13.61
4,75.47,8.8,10.0,4.83,11.8,1.57,1,13.5,11.76


# Preprocessing

In [106]:
def clean_text(text):
    """Lower-case ``text`` and strip markup, URLs, punctuation and digits.

    Steps, in order: lower-case; drop ``[...]`` spans; drop ``http(s)://``
    and ``www.`` URLs; drop ``<...>`` tags; drop ASCII punctuation; turn
    newlines into spaces; drop every word that contains a digit.

    Parameters
    ----------
    text : object
        Value to clean; it is converted with ``str()`` first.

    Returns
    -------
    str
        The cleaned text. Runs of spaces left behind by removed tokens are
        not collapsed.
    """
    text = str(text).lower()
    # Raw strings keep the regex escapes (\[, \S, \w, \d) out of Python's own
    # string-escape handling.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # A newline separates words; deleting it outright would glue the last
    # word of one line onto the first word of the next.
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

In [107]:
# Fetch the NLTK stop-word corpus, then extend the English list with a few
# extra short tokens.
nltk.download('stopwords')
more_stopwords = ['u', 'im', 'c']
stop_words = stopwords.words('english') + more_stopwords

def remove_stopwords(text):
    """Return ``text`` with every space-separated token found in ``stop_words`` removed.

    The text is split on single spaces and the surviving tokens are joined
    back with single spaces.
    """
    kept_tokens = [token for token in text.split(' ') if token not in stop_words]
    return ' '.join(kept_tokens)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [108]:
# Module-level English Snowball stemmer shared by the text helpers.
stemmer = nltk.SnowballStemmer("english")

def stemm_text(text):
    """Stem every space-separated token of ``text`` and re-join with spaces."""
    stems = map(stemmer.stem, text.split(' '))
    return ' '.join(stems)

In [109]:
def preprocess_data(text, strip=False):
    """Normalise one excerpt: clean it, then stem every word.

    Parameters
    ----------
    text : object
        Raw excerpt; handed to ``clean_text``.
    strip : bool, default False
        When true, leading and trailing whitespace is removed from the result.

    Returns
    -------
    str
        The cleaned, stemmed text.
    """
    # Clean punctuation, URLs, and so on, then stem each word.
    # Stop-word removal is not applied in this pipeline.
    stemmed = stemm_text(clean_text(text))
    return stemmed.strip() if strip else stemmed

In [110]:
# Normalise the excerpts of BOTH splits. The embeddings and token matrices
# built later read `excerpt` from df and test_df alike, so leaving the test
# text raw would feed them differently formed input for train and test.
df['excerpt'] = df['excerpt'].apply(preprocess_data)
test_df['excerpt'] = test_df['excerpt'].apply(preprocess_data)
df.head()

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
0,c12129c31,,,when the young peopl return to the ballroom it...,-0.340259,0.464009
1,85aa80a4c,,,all through dinner time mrs fayr was somewhat ...,-0.315372,0.480805
2,b69ac6792,,,as roger had predict the snow depart as quick ...,-0.580118,0.476676
3,dd1000b26,,,and outsid befor the palac a great garden was ...,-1.054013,0.450007
4,37c1b32fb,,,onc upon a time there were three bear who live...,0.247197,0.510845


# RoBERTa Dataset and Embeddings

In [111]:
class BertDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenises one excerpt per item.

    This is a data container, not a network layer, so it derives from
    ``torch.utils.data.Dataset`` rather than ``nn.Module``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``excerpt`` column.
    tokenizer : callable
        Called as ``tokenizer(text, return_tensors='pt', max_length=...,
        padding='max_length', truncation=True)``.
    max_len : int, default 128
        Length every item is padded or truncated to.
    """

    def __init__(self, df, tokenizer, max_len=128):
        self.excerpt = df['excerpt'].to_numpy()
        self.max_len = max_len
        self.tokenizer = tokenizer

    def __getitem__(self, idx):
        """Return the tokenizer output for the excerpt at position ``idx``."""
        return self.tokenizer(
            self.excerpt[idx],
            return_tensors='pt',
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
        )

    def __len__(self):
        """Number of excerpts in the dataset."""
        return len(self.excerpt)
    

def get_embeddings(df, path, plot_losses=True, verbose=True):
    """Embed every excerpt in ``df`` with a pretrained RoBERTa encoder.

    The batch size and maximum token length are read from the module-level
    ``config`` dict (keys ``batch_size`` and ``max_len``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``excerpt`` column.
    path : str
        Name or path handed to ``from_pretrained`` for both the model and
        the tokenizer.
    plot_losses, verbose : bool
        Currently unused; kept so existing calls keep working.

    Returns
    -------
    numpy.ndarray
        One row per excerpt, in input order: the first-token vector of the
        model's first output (``outputs[0][:, 0]``).
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"{device} is used")

    model = RobertaModel.from_pretrained(path, num_labels=1)
    tokenizer = RobertaTokenizer.from_pretrained(path)
    model.to(device)
    model.eval()

    loader = DataLoader(
        BertDataset(df, tokenizer, config['max_len']),
        batch_size=config["batch_size"],
        shuffle=False,
        num_workers=4,
        pin_memory=True,
        drop_last=False,
    )

    embeddings = []
    with torch.no_grad():
        # Iterate the loader directly. Wrapping it in list(enumerate(...))
        # tokenised and held every batch in memory before the first forward
        # pass, and the index was never used.
        for batch in progress_bar(loader):
            # Items are tokenised with return_tensors='pt', so each carries
            # its own leading axis; flatten everything after the batch axis.
            batch = {
                key: val.reshape(val.shape[0], -1).to(device)
                for key, val in batch.items()
            }
            outputs = model(**batch)
            embeddings.extend(outputs[0][:, 0].detach().cpu().numpy())

    return np.array(embeddings)

In [112]:
def seed_everything(seed=42):
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Parameters
    ----------
    seed : int, default 42
        Seed applied to ``random``, ``numpy.random``, ``torch`` (CPU and all
        CUDA devices) and exported as ``PYTHONHASHSEED``.
    """
    random.seed(seed)
    # The variable Python reads is PYTHONHASHSEED. It only takes effect for
    # interpreters started after this point (e.g. worker subprocesses).
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode lets cuDNN pick algorithms by timing them, which can
    # differ between runs; it must be off for reproducible results.
    torch.backends.cudnn.benchmark = False

In [113]:
# Run-wide settings: embedding batch size, token limit and RNG seed.
config = {'batch_size': 128, 'max_len': 256, 'seed': 42}
seed_everything(seed=config['seed'])

# Embed the training frame first, then the test frame, with the same checkpoint.
train_embeddings, test_embeddings = (
    get_embeddings(frame, 'roberta-base') for frame in (df, test_df)
)

cuda is used


  cpuset_checked))


cuda is used


# TF-IDF and Word Counts

In [114]:
vectorizer = TfidfVectorizer(max_features=1000)

# Learn the vocabulary and IDF weights from the training excerpts only.
train_word_tf_idfs = vectorizer.fit_transform(df['excerpt'].values).toarray()

# Newer scikit-learn releases replaced get_feature_names() with
# get_feature_names_out(); use whichever this installation provides.
if hasattr(vectorizer, 'get_feature_names_out'):
    tf_idf_columns = list(vectorizer.get_feature_names_out())
else:
    tf_idf_columns = vectorizer.get_feature_names()

train_word_tf_idfs = pd.DataFrame(train_word_tf_idfs, columns=tf_idf_columns)

# Reuse the fitted vectorizer for the test excerpts. Refitting here would
# build a different vocabulary and different IDF weights, so the train and
# test columns would no longer describe the same features.
test_word_tf_idfs = vectorizer.transform(test_df['excerpt'].values).toarray()
test_word_tf_idfs = pd.DataFrame(test_word_tf_idfs, columns=tf_idf_columns)

def count_words(df):
    """Count the alphabetic words in each excerpt of ``df``.

    Every character outside ``a-z``/``A-Z`` is treated as a separator; the
    remaining whitespace-separated chunks are counted.

    Returns
    -------
    list of int
        One count per row of ``df['excerpt']``, in order.
    """
    return [
        len(re.sub("[^a-zA-Z]", " ", sentence).split())
        for sentence in progress_bar(df['excerpt'])
    ]

# Store the per-excerpt word count as a new column on both frames.
df['count_words'] = count_words(df)
test_df['count_words'] = count_words(test_df)

# Text matrix

In [115]:
M_train, M_test = df['excerpt'], test_df['excerpt']
texts_train, texts_test = M_train, M_test

# One Keras tokenizer whose vocabulary covers the train and the test
# excerpts, so both sets of matrices share the same columns.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts_train)
tokenizer.fit_on_texts(texts_test)

# Document-term matrices for the training excerpts in three weightings.
matrix1, matrix2, matrix3 = (
    tokenizer.texts_to_matrix(texts_train, mode)
    for mode in ("binary", "count", "tfidf")
)
M1_train, M2_train, M3_train = (
    pd.DataFrame(matrix) for matrix in (matrix1, matrix2, matrix3)
)

# The same three weightings for the test excerpts.
matrix1, matrix2, matrix3 = (
    tokenizer.texts_to_matrix(texts_test, mode)
    for mode in ("binary", "count", "tfidf")
)
M1_test, M2_test, M3_test = (
    pd.DataFrame(matrix) for matrix in (matrix1, matrix2, matrix3)
)

# Build Feature DataFrames

In [123]:
# The embedding frame and the token matrix are both labelled positionally
# (0, 1, 2, ...). Prefix them before concatenating, otherwise the combined
# frame carries duplicate column names.
X_train = pd.concat(
    [
        pd.DataFrame(train_embeddings).add_prefix('emb_'),
        readability_train[['fre', 'smog', 'dcrs', 'dw']],
        df['count_words'],
        M3_train.add_prefix('tfidf_'),
    ],
    axis=1,
)

X_test = pd.concat(
    [
        pd.DataFrame(test_embeddings).add_prefix('emb_'),
        readability_test[['fre', 'smog', 'dcrs', 'dw']],
        test_df['count_words'],
        M3_test.add_prefix('tfidf_'),
    ],
    axis=1,
)

train_columns = X_train.columns.tolist()
test_columns = X_test.columns.tolist()

#diff_columns = list(set(test_columns) - set(train_columns))
#X_train = X_train.reindex(columns = train_columns + diff_columns)

#diff_columns = list(set(train_columns) - set(test_columns))
#X_test = X_test.reindex(columns = test_columns + diff_columns)

In [124]:
# Regression target for the training rows.
y_train = df['target']

In [125]:
# Put features and target side by side in one frame.
train = pd.concat([X_train, y_train], axis=1)

# K-fold

In [126]:
def create_folds(data, num_splits, random_state=None):
    """Shuffle ``data`` and assign every row to a stratified fold.

    The continuous ``target`` column is cut into equal-width bins (bin count
    from Sturges' rule, ``floor(1 + log2(n))``) and ``StratifiedKFold``
    splits on those bins instead of on the raw target.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``target`` column. It is not modified.
    num_splits : int
        Number of folds.
    random_state : int or None, default None
        Passed to ``DataFrame.sample`` for the shuffle. ``None`` keeps the
        previous behaviour of not fixing a seed here.

    Returns
    -------
    pandas.DataFrame
        A shuffled copy of ``data`` on a fresh positional index with an
        integer ``kfold`` column holding the validation fold of each row.
    """
    # sample() returns a new frame, so the fold column is added to the copy
    # and the caller's frame is left untouched.
    shuffled = data.sample(frac=1, random_state=random_state).reset_index(drop=True)
    shuffled["kfold"] = -1

    # Number of bins by Sturges' rule (floored).
    num_bins = int(np.floor(1 + np.log2(len(shuffled))))
    bins = pd.cut(shuffled["target"], bins=num_bins, labels=False)

    # Stratify on the bins, not on the continuous target itself.
    kf = StratifiedKFold(n_splits=num_splits)
    for fold, (_, val_idx) in enumerate(kf.split(X=shuffled, y=bins.values)):
        shuffled.loc[val_idx, "kfold"] = fold

    return shuffled


# Assign each training row to one of 5 stratified folds (stored in `kfold`).
train = create_folds(train, num_splits=5)

In [127]:
# Split back into target and features; both keep `kfold` so the rows of a
# fold can be selected from either frame.
y_train = train[['target', 'kfold']]
X_train = train.drop(['target'], axis=1)

# Run Model

In [129]:
params = {
    'boosting_type': 'gbdt',
    'metric': 'rmse',
    'objective': 'regression',
    'seed': 42,
    'learning_rate': 0.01,
    "n_jobs": -1,
    'max_depth': 4,
    "verbose": -1
}

# Take the fold ids from the data rather than repeating a literal fold count
# in the loop and again in the averaging step; the two could otherwise drift
# apart from the number of folds actually assigned.
fold_ids = sorted(X_train['kfold'].unique())

pred = np.zeros(X_test.shape[0])
rmses = []

for fold in fold_ids:
    # Rows of the current fold are held out for validation; the rest train.
    x_tr = X_train[X_train.kfold != fold].reset_index(drop=True).drop(['kfold'], axis=1)
    x_va = X_train[X_train.kfold == fold].reset_index(drop=True).drop(['kfold'], axis=1)
    y_tr = y_train[y_train.kfold != fold].reset_index(drop=True).drop(['kfold'], axis=1)
    y_va = y_train[y_train.kfold == fold].reset_index(drop=True).drop(['kfold'], axis=1)

    train_set = lgb.Dataset(x_tr, y_tr)
    val_set = lgb.Dataset(x_va, y_va, reference=train_set)

    model = lgb.train(
        params,
        train_set,
        num_boost_round=10000,
        early_stopping_rounds=100,
        valid_sets=[train_set, val_set],
        verbose_eval=-1
    )

    # Out-of-fold score for this fold.
    y_pred = model.predict(x_va)
    rmse = np.sqrt(mse(y_va, y_pred))
    rmses.append(rmse)

    # Each fold model contributes an equal share to the test prediction.
    tmp_pred = model.predict(X_test)
    pred += tmp_pred / len(fold_ids)

print("\n", "Mean Fold RMSE:", np.mean(rmses))

Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[3175]	training's rmse: 0.406358	valid_1's rmse: 0.623629
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[4482]	training's rmse: 0.355387	valid_1's rmse: 0.636776
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[4028]	training's rmse: 0.366727	valid_1's rmse: 0.64384
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[2694]	training's rmse: 0.424723	valid_1's rmse: 0.652254
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[3325]	training's rmse: 0.397707	valid_1's rmse: 0.624418

 Mean Fold RMSE: 0.6361833655101314
