# Data Wrangling

## Importing datasets

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Load the training and test splits from the local "dataset" directory.
# The paths are relative, so the notebook must be run from the folder that
# contains "dataset".
cl_train = pd.read_csv(os.path.join("dataset", "train.csv"))
cl_test = pd.read_csv(os.path.join("dataset", "test.csv"))

## Data Exploration

In [2]:
# Peek at the first rows to see which columns the training set provides.
cl_train.head()

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805
2,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676
3,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197,0.510845


It looks like `excerpt` is the only usable predictor for `target`: `url_legal` and `license` are empty in these rows, and `id` is just an identifier.

## Data Cleaning

We will use PorterStemmer to stem the words in each excerpt, i.e. strip their suffixes. LancasterStemmer was ruled out because its algorithm is too aggressive and causes over-stemming.

In [3]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

def stem_sentence(sentence):
    """Return `sentence` with every token replaced by its Porter stem.

    The sentence is tokenized with `word_tokenize`; each stemmed token is
    written out followed by a single space, so any non-empty result ends
    with a trailing space.
    """
    stemmer = PorterStemmer()
    tokens = word_tokenize(sentence)
    return "".join(stemmer.stem(token) + " " for token in tokens)

def stem_paragraph(paragraph):
    """Stem a whole paragraph, one line at a time.

    The paragraph is split on newline characters, every line is passed
    through `stem_sentence`, and each stemmed line is written back followed
    by a newline (so the result always ends with a newline).
    """
    lines = paragraph.split("\n")
    return "".join(stem_sentence(line) + "\n" for line in lines)

def stem_dataset(dataset):
    """Return a copy of `dataset` with a stemmed `clean_excerpt` column added.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain an `excerpt` column holding text (each value is passed
        to `stem_paragraph`).

    Returns
    -------
    pandas.DataFrame
        A copy of `dataset` (the input frame is left untouched) with a
        `clean_excerpt` column holding `stem_paragraph(excerpt)` per row.
    """
    dataset_copy = dataset.copy()

    # Map over the column instead of writing through .loc[index, ...] inside
    # iterrows(): label-based assignment hits every row that shares the label,
    # so duplicate index labels would all receive the last excerpt's text, and
    # an empty frame would never get the column at all. It is also much faster.
    dataset_copy['clean_excerpt'] = dataset_copy['excerpt'].map(stem_paragraph)

    return dataset_copy



In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

def get_X(clean_data, vectorizer=None):
    """Turn the stemmed excerpts into a TF-IDF feature matrix.

    Parameters
    ----------
    clean_data : pandas.DataFrame
        Frame with a `clean_excerpt` text column (as produced by
        `stem_dataset`).
    vectorizer : object with a `transform(texts)` method, optional
        An already-fitted vectorizer. When given, it is used as-is (no
        refitting), which lets new data be projected onto a vocabulary
        learned elsewhere. When omitted, a new `TfidfVectorizer` is created
        and fitted on `clean_data` itself, exactly as before.

    Returns
    -------
    pandas.DataFrame
        One row per excerpt, indexed like `clean_data`, with one
        integer-labelled column per TF-IDF feature. When the vectorizer is
        fitted here the frame always has 2000 columns; if the learned
        vocabulary is smaller, the unused trailing columns are NaN.
    """
    n_features = 2000
    texts = clean_data['clean_excerpt'].values

    fitted_here = vectorizer is None
    if fitted_here:
        vectorizer = TfidfVectorizer(lowercase=True,token_pattern=r'(?u)\b[A-Za-z]+\b',stop_words='english',max_features=n_features,strip_accents='unicode')
        vectorizer.fit(texts)

    # Transform the whole corpus in a single call and build the frame once.
    # The previous row-by-row DataFrame.append loop was quadratic, relied on a
    # method that no longer exists in pandas 2.x, and left every row with
    # index label 0.
    X = pd.DataFrame(vectorizer.transform(texts).toarray(), index=clean_data.index)

    if fitted_here:
        # Keep the fixed 2000-column layout callers have always received,
        # even when the corpus yields fewer distinct terms.
        X = X.reindex(columns=range(n_features))

    return X

# Model Selection

Here we will explore several ML models to fit the data.
We use k-fold cross validation to determine the (negative) Mean Squared Prediction Error.
For evaluation of the models, I will use cross-validation rather than a hold-out method because the dataset is not very large and the results could change drastically depending on how the data is split. Also, I'd like to use all the training data to fit the model, which cannot be done with a hold-out method.

In [5]:
def summary_scores(scores):
    """Print the raw scores followed by their mean and standard deviation.

    `scores` must expose `mean()` and `std()` methods (e.g. the array
    returned by `cross_val_score`). Nothing is returned.
    """
    print("scores:", scores)
    # Each statistic is computed right before it is printed.
    for label, statistic in (("Mean:", "mean"), ("StdDev", "std")):
        print(label, getattr(scores, statistic)())

## Random Forest

In [6]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

# Target vector and TF-IDF features built from the stemmed training excerpts.
# X and y are reused by every model cell below.
y = cl_train.loc[:, 'target']
clean_train = stem_dataset(cl_train)
X = get_X(clean_train)

# Seed the forest: its bootstrap sampling and feature sub-sampling are random,
# so without a fixed random_state the scores below (and the grid search that
# reuses forest_reg later on) would change on every run.
forest_reg = RandomForestRegressor(random_state=42)
# 10-fold CV; scikit-learn reports MSE negated so that "higher is better".
scores = cross_val_score(forest_reg, X, y, scoring="neg_mean_squared_error", cv=10)

summary_scores(scores)

scores: [-0.54829039 -0.78331758 -0.76585322 -0.83507622 -0.73874038 -0.82733497
 -0.92774872 -0.89624048 -0.51306613 -0.72331626]
Mean: -0.7558984368568438
StdDev 0.12840414473872985


## XGBoost

In [7]:
from xgboost import XGBRegressor

# Baseline XGBoost regressor with the library's default hyperparameters.
xgb_reg = XGBRegressor()
# Same 10-fold CV and negated-MSE scoring as the other candidates so the
# numbers are directly comparable.
scores = cross_val_score(xgb_reg, X, y, scoring="neg_mean_squared_error", cv=10)

summary_scores(scores)

scores: [-0.60714023 -0.76548463 -0.72409527 -0.72086336 -0.66467657 -0.73704524
 -0.84123137 -0.87512155 -0.60346959 -0.71691881]
Mean: -0.725604660725046
StdDev 0.0837910285446129


## Elastic Net

In [8]:
from sklearn.linear_model import ElasticNet
# Linear baseline; alpha and l1_ratio are fixed here rather than tuned.
elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5)
# Same 10-fold CV and negated-MSE scoring as the tree-based models.
scores = cross_val_score(elastic_net, X, y, scoring="neg_mean_squared_error", cv=10)

summary_scores(scores)

scores: [-0.59897904 -0.94855322 -1.0715376  -1.31934371 -1.08471865 -1.32741218
 -1.66454245 -1.19868725 -0.53320022 -1.1841688 ]
Mean: -1.0931143121188982
StdDev 0.32111513854750146


# Evaluation
The random forest regressor and XGBoost look the most promising, so we will try to fine-tune these two models.

## hyperparameter tuning

In [9]:
from sklearn.model_selection import GridSearchCV
# Two sub-grids for the random forest: the first varies forest size and the
# number of features tried per split; the second repeats a smaller search
# with bootstrapping switched off.
param_grid_rf = [
    {
        'n_estimators': [3, 10, 30, 50, 100],
        'max_features': [2, 4, 6, 8],
    },
    {
        'bootstrap': [False],
        'n_estimators': [3, 10],
        'max_features': [2, 3, 4],
    },
]

grid_search_rf = GridSearchCV(
    forest_reg,
    param_grid_rf,
    cv=10,
    scoring='neg_mean_squared_error',
    return_train_score=True,
)
grid_search_rf.fit(X, y)
result = grid_search_rf.cv_results_

# mean_test_score holds the negated MSE of each candidate; negate it back and
# take the square root to get RMSE, then keep the best (smallest) value.
min_rmse = min(np.sqrt(np.negative(result["mean_test_score"])))

print(f'Minimum RMSE for Random Forest: {min_rmse}')

Minimum RMSE for Random Forest: 0.8613121575635296


In [10]:
# Search over tree depth and learning rate for XGBoost.
param_grid_xgb = [
    {'max_depth': [2, 3, 4, 5, 6], 'learning_rate': [0.1, 0.2, 0.3, 0.5, 0.8]}
]

grid_search_xgb = GridSearchCV(xgb_reg, param_grid_xgb, cv=10, scoring='neg_mean_squared_error', return_train_score=True)
grid_search_xgb.fit(X, y)
result = grid_search_xgb.cv_results_

# mean_test_score holds the negated MSE of each candidate; negate it back and
# take the square root to get RMSE, then keep the best (smallest) value.
min_rmse = min(np.sqrt(-result["mean_test_score"]))

print(f'Minimum RMSE for XGBoost: {min_rmse}')

Minimum RMSE for XGBoost: 0.8349324496113455


I have decided to use XGBoost over random forest because it reached the lower cross-validated RMSE (about 0.835 versus 0.861).

In [59]:
import pickle

# Keep the estimator that the XGBoost grid search selected as best.
final_model = grid_search_xgb.best_estimator_

# Create the target directory if needed and write through a context manager
# so the file handle is flushed and closed even if pickling fails.
model_path = os.path.join("models", "xgb_final.sav")
os.makedirs(os.path.dirname(model_path), exist_ok=True)
with open(model_path, 'wb') as model_file:
    pickle.dump(final_model, model_file)

# Submission

In [58]:
# The model was trained on TF-IDF columns learned from the TRAINING excerpts,
# so the test excerpts have to be projected onto that same vocabulary.
# Calling get_X on the test set would fit a brand-new vocabulary from the test
# excerpts alone, and column i would no longer mean the word the model saw in
# column i during training.
train_vectorizer = TfidfVectorizer(
    lowercase=True,
    token_pattern=r'(?u)\b[A-Za-z]+\b',
    stop_words='english',
    max_features=2000,
    strip_accents='unicode',
)  # settings must mirror the ones used inside get_X
train_vectorizer.fit(clean_train['clean_excerpt'].values)

clean_test = stem_dataset(cl_test)
X_test = pd.DataFrame(
    train_vectorizer.transform(clean_test['clean_excerpt'].values).toarray(),
    index=clean_test.index,
)
# Give the test matrix exactly the column layout of the training matrix X.
X_test = X_test.reindex(columns=X.columns)

final_prediction = final_model.predict(X_test)
# Index the predictions like cl_test so the join below pairs each id with its
# own prediction.
fp = pd.DataFrame(final_prediction, columns=['target'], index=cl_test.index)

output = cl_test[['id']].join(fp)

# output.to_csv("output.csv")