# Data Wrangling

## Importing datasets

In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the train and test splits from the local "dataset" directory.
cl_train, cl_test = (
    pd.read_csv(os.path.join("dataset", csv_name))
    for csv_name in ("train.csv", "test.csv")
)


## Data Exploration

In [2]:
# Preview the first five rows of the training data.
cl_train.head(n=5)

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805
2,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676
3,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197,0.510845


It looks like `excerpt` is the only predictor we can use to predict `target`.

## Data Cleaning

We will use NLTK's `PorterStemmer` to stem every word in the dataset, i.e. strip word suffixes so that inflected forms map to a common stem. `LancasterStemmer` was ruled out because its algorithm is too aggressive and causes over-stemming.

In [3]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

def stem_sentence(sentence):
    """Tokenize ``sentence`` and return its Porter-stemmed tokens as one string.

    Every stemmed token is followed by a single space, so a non-empty
    result always ends with a trailing space.
    """
    stemmer = PorterStemmer()
    return "".join(stemmer.stem(token) + " " for token in word_tokenize(sentence))

def stem_paragraph(paragraph):
    """Stem every line of ``paragraph`` and return the reassembled text.

    The paragraph is split on newline characters, each piece is passed
    through ``stem_sentence``, and a newline is appended after every
    stemmed piece (including the last one).
    """
    return "".join(stem_sentence(line) + "\n" for line in paragraph.split("\n"))

def stem_dataset(dataset):
    """Return a copy of ``dataset`` with a stemmed ``clean_excerpt`` column.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with an ``excerpt`` column holding one paragraph per row.

    Returns
    -------
    pandas.DataFrame
        A copy of ``dataset`` (the input is not modified) with an extra
        ``clean_excerpt`` column containing ``stem_paragraph(excerpt)``
        for each row.
    """
    dataset_copy = dataset.copy()

    # Build the whole column in one positional pass instead of writing one
    # cell at a time with .loc inside iterrows(): this is faster, stays
    # correct when index labels are not unique, and still creates the
    # column when the frame is empty.
    dataset_copy['clean_excerpt'] = [
        stem_paragraph(excerpt) for excerpt in dataset_copy['excerpt']
    ]

    return dataset_copy



In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

def get_X(clean_data, vectorizer=None, max_features=2000):
    """Turn the stemmed excerpts into a dense TF-IDF feature matrix.

    Parameters
    ----------
    clean_data : pandas.DataFrame
        Frame with a ``clean_excerpt`` text column.
    vectorizer : TfidfVectorizer, optional
        An already-fitted vectorizer to reuse (transform only). When
        omitted, a new vectorizer is created and fitted on ``clean_data``
        itself. To featurize held-out data, pass the vectorizer that was
        fitted on the training excerpts; otherwise the vocabulary is
        re-learned and the columns will not line up with the training
        matrix.
    max_features : int, default 2000
        Vocabulary size limit for a newly created vectorizer, and the
        minimum number of columns in the returned frame.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``clean_data`` (sharing its index) and one
        column per TF-IDF term, labelled ``0..n-1``. If fewer than
        ``max_features`` terms exist, the frame is padded with NaN columns
        up to ``max_features`` so its width matches what earlier versions
        of this function returned.
    """
    texts = clean_data['clean_excerpt'].values

    if vectorizer is None:
        vectorizer = TfidfVectorizer(lowercase=True,token_pattern=r'(?u)\b[A-Za-z]+\b',stop_words='english',max_features=max_features,strip_accents='unicode')
        vectorizer.fit(texts)

    # Transform every document in a single call. Growing a DataFrame one row
    # at a time is quadratic, and DataFrame.append no longer exists in
    # pandas >= 2.0.
    tfidf = vectorizer.transform(texts)
    X = pd.DataFrame(tfidf.toarray(), index=clean_data.index)

    # Keep the historical fixed width; never truncate a wider matrix.
    if X.shape[1] < max_features:
        X = X.reindex(columns=range(max_features))

    return X

# Model Selection

Here we will explore several ML models to fit the data.
We use k-fold cross validation to determine the (negative) Mean Squared Prediction Error.
For model evaluation, I will use cross-validation rather than a hold-out set, because the dataset is not very large and the results could change drastically depending on how the data is split. I would also like to use all of the training data to fit the model, which is not possible with the hold-out method.

In [5]:
def summary_scores(scores):
    """Print the raw scores, then their mean and standard deviation.

    ``scores`` must provide ``mean()`` and ``std()`` methods (for example
    the array returned by ``cross_val_score``). Nothing is returned.
    """
    # Each statistic is computed only when its line is printed, so the
    # output order matches a plain sequence of print calls.
    report = (
        ("scores:", lambda values: values),
        ("Mean:", lambda values: values.mean()),
        ("StdDev", lambda values: values.std()),
    )
    for label, compute in report:
        print(label, compute(scores))

## Random Forest

In [6]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

# Target vector and TF-IDF feature matrix shared by the models below.
y = cl_train.loc[:, 'target']
clean_train = stem_dataset(cl_train)
X = get_X(clean_train)

# Fix the seed: a random forest draws random bootstrap samples and feature
# subsets, so without random_state the scores change on every run.
forest_reg = RandomForestRegressor(random_state=42)
scores = cross_val_score(forest_reg, X, y, scoring="neg_mean_squared_error", cv=10)

summary_scores(scores)

scores: [-0.5494264  -0.77342274 -0.76808348 -0.81752546 -0.71320759 -0.81412132
 -0.92116885 -0.89019521 -0.49008581 -0.74388369]
Mean: -0.7481120536214574
StdDev 0.1294762322116122


## XGBoost

In [7]:
from xgboost import XGBRegressor

# XGBoost regressor with library defaults, scored with 10-fold CV on
# negative mean squared error.
xgb_reg = XGBRegressor()
scores = cross_val_score(
    estimator=xgb_reg,
    X=X,
    y=y,
    scoring="neg_mean_squared_error",
    cv=10,
)

summary_scores(scores)

scores: [-0.60714023 -0.76548463 -0.72409527 -0.72086336 -0.66467657 -0.73704524
 -0.84123137 -0.87512155 -0.60346959 -0.71691881]
Mean: -0.725604660725046
StdDev 0.08379102854461291


## Elastic Net

In [8]:
from sklearn.linear_model import ElasticNet
# Elastic net with an even L1/L2 mix, scored with 10-fold CV on
# negative mean squared error.
elastic_net = ElasticNet(l1_ratio=0.5, alpha=0.1)
scores = cross_val_score(
    estimator=elastic_net,
    X=X,
    y=y,
    scoring="neg_mean_squared_error",
    cv=10,
)

summary_scores(scores)

scores: [-0.59897904 -0.94855322 -1.0715376  -1.31934371 -1.08471865 -1.32741218
 -1.66454245 -1.19868725 -0.53320022 -1.1841688 ]
Mean: -1.093114312118898
StdDev 0.3211151385475014


# Evaluation
The random forest regressor and XGBoost look the most promising. We will try to tune the hyperparameters of these two models.

## Hyperparameter tuning

In [16]:
from sklearn.model_selection import GridSearchCV
# Two sub-grids: the first sweeps tree count and max_features with default
# bootstrapping, the second is a smaller sweep with bootstrap disabled.
param_grid_rf = [
    dict(n_estimators=[3, 10, 30, 50, 100], max_features=[2, 4, 6, 8]),
    dict(bootstrap=[False], n_estimators=[3, 10], max_features=[2, 3, 4]),
]

grid_search_rf = GridSearchCV(
    estimator=forest_reg,
    param_grid=param_grid_rf,
    scoring='neg_mean_squared_error',
    cv=10,
    return_train_score=True,
)
grid_search_rf.fit(X, y)

# Print the mean CV score next to the parameter combination that produced it.
result = grid_search_rf.cv_results_
for params, mean_score in zip(result["params"], result["mean_test_score"]):
    print(mean_score, params)

-1.1272475484489852 {'max_features': 2, 'n_estimators': 3}
-0.8493934919743149 {'max_features': 2, 'n_estimators': 10}
-0.7845822072537697 {'max_features': 2, 'n_estimators': 30}
-0.7791184846199336 {'max_features': 2, 'n_estimators': 50}
-0.7728278298944244 {'max_features': 2, 'n_estimators': 100}
-1.0548520213893688 {'max_features': 4, 'n_estimators': 3}
-0.855060500597002 {'max_features': 4, 'n_estimators': 10}
-0.7809480578570392 {'max_features': 4, 'n_estimators': 30}
-0.7683194080375831 {'max_features': 4, 'n_estimators': 50}
-0.751077660763361 {'max_features': 4, 'n_estimators': 100}
-1.0575076450300147 {'max_features': 6, 'n_estimators': 3}
-0.8185961891621819 {'max_features': 6, 'n_estimators': 10}
-0.7644292597009067 {'max_features': 6, 'n_estimators': 30}
-0.7464606903932263 {'max_features': 6, 'n_estimators': 50}
-0.7450545822667392 {'max_features': 6, 'n_estimators': 100}
-1.005575226272889 {'max_features': 8, 'n_estimators': 3}
-0.8279898384970001 {'max_features': 8, 'n_e

In [17]:
# Sweep tree depth against learning rate for the XGBoost regressor.
param_grid_xgb = [
    dict(max_depth=[2, 3, 4, 5, 6], learning_rate=[0.1, 0.2, 0.3, 0.5, 0.8]),
]

grid_search_xgb = GridSearchCV(
    estimator=xgb_reg,
    param_grid=param_grid_xgb,
    scoring='neg_mean_squared_error',
    cv=10,
    return_train_score=True,
)
grid_search_xgb.fit(X, y)

# Print the mean CV score next to the parameter combination that produced it.
result = grid_search_xgb.cv_results_
for params, mean_score in zip(result["params"], result["mean_test_score"]):
    print(mean_score, params)

-0.7858697012544287 {'learning_rate': 0.1, 'max_depth': 2}
-0.7491335550807847 {'learning_rate': 0.1, 'max_depth': 3}
-0.7244502287552528 {'learning_rate': 0.1, 'max_depth': 4}
-0.7131853128195291 {'learning_rate': 0.1, 'max_depth': 5}
-0.7065250615087468 {'learning_rate': 0.1, 'max_depth': 6}
-0.7225685660193695 {'learning_rate': 0.2, 'max_depth': 2}
-0.700593077063764 {'learning_rate': 0.2, 'max_depth': 3}
-0.6971121954140018 {'learning_rate': 0.2, 'max_depth': 4}
-0.6972089150438523 {'learning_rate': 0.2, 'max_depth': 5}
-0.6985663051005833 {'learning_rate': 0.2, 'max_depth': 6}
-0.7100671022445013 {'learning_rate': 0.3, 'max_depth': 2}
-0.7078963752420718 {'learning_rate': 0.3, 'max_depth': 3}
-0.7092652813677149 {'learning_rate': 0.3, 'max_depth': 4}
-0.7185644111652352 {'learning_rate': 0.3, 'max_depth': 5}
-0.725604660725046 {'learning_rate': 0.3, 'max_depth': 6}
-0.7564854851517179 {'learning_rate': 0.5, 'max_depth': 2}
-0.759832062530283 {'learning_rate': 0.5, 'max_depth': 3}


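I have decided to use XGBoost over random forest: its best cross-validated MSE in the results shown above (about 0.697 with `learning_rate=0.2`, `max_depth=4`) is lower than any random forest score shown (about 0.745).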

In [59]:
import pickle

# Keep the best estimator found by the XGBoost grid search.
final_model = grid_search_xgb.best_estimator_

# Create the output directory if needed, and use a context manager so the
# file handle is flushed and closed even if pickling fails.
os.makedirs("models", exist_ok=True)
with open(os.path.join("models", "xgb_final.sav"), 'wb') as model_file:
    pickle.dump(final_model, model_file)

# Submission

In [58]:
# Featurize the test excerpts with a TF-IDF vectorizer fitted on the
# TRAINING excerpts (same settings as get_X). Fitting on the test excerpts
# instead would learn a different vocabulary, so column i of the test matrix
# would not mean the same term the model was trained on.
clean_test = stem_dataset(cl_test)
train_vectorizer = TfidfVectorizer(lowercase=True,token_pattern=r'(?u)\b[A-Za-z]+\b',stop_words='english',max_features=2000,strip_accents='unicode')
train_vectorizer.fit(clean_train['clean_excerpt'].values)

test_tfidf = train_vectorizer.transform(clean_test['clean_excerpt'].values)
# Align to the training columns so the model sees the layout it was fitted on.
X_test = pd.DataFrame(test_tfidf.toarray()).reindex(columns=X.columns)

final_prediction = final_model.predict(X_test)
fp = pd.DataFrame(final_prediction, columns=['target'])

# Pair each test id with its predicted target (both frames share the
# default positional index).
output = cl_test[['id']].join(fp)