In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt



In [4]:
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
import scipy.sparse as sps
import re

# Module-level bag-of-words encoders: preprocess_train() fits them and
# preprocess_test() reuses the fitted vocabularies.
cv_review = CountVectorizer(
    strip_accents='ascii',
    stop_words='english',
)
cv_condition = CountVectorizer(
    strip_accents='ascii',
    stop_words='english',
)

def preprocess_train(dataset):
    """Fit the text vectorizers on ``dataset`` and build the training matrix.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must provide the columns ``review``, ``condition``, ``date``,
        ``usefulCount`` and ``rating``. The frame is not modified.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is the sparse horizontal stack of
        [condition counts, review counts, (year, month, day), usefulCount]
        and ``y`` is ``dataset['rating']`` as a NumPy array.
    """
    # Fill missing text into local Series rather than assigning back into
    # `dataset`, so the caller's DataFrame is left untouched.
    reviews = dataset['review'].fillna('')
    conditions = dataset['condition'].fillna('')

    # fit_transform learns the vocabularies that preprocess_test() reuses.
    review_enc = cv_review.fit_transform(reviews)           # sparse matrix
    condition_enc = cv_condition.fit_transform(conditions)  # sparse matrix

    # Split the date into three numeric columns: year, month, day.
    dates = pd.to_datetime(dataset['date'])
    date_mat = sps.csr_matrix(np.column_stack([
        dates.dt.year.to_numpy(),
        dates.dt.month.to_numpy(),
        dates.dt.day.to_numpy(),
    ]))
    usefulcnt_mat = sps.csr_matrix(dataset['usefulCount'].to_numpy().reshape(-1, 1))

    features = sps.hstack([condition_enc, review_enc, date_mat, usefulcnt_mat])
    return features, dataset['rating'].to_numpy()

def preprocess_test(dataset):
    """Encode ``dataset`` with the already-fitted vectorizers.

    Uses ``transform`` (not ``fit_transform``) on the module-level
    ``cv_review`` / ``cv_condition`` encoders, so the column layout matches
    whatever those encoders were last fitted on.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must provide the columns ``review``, ``condition``, ``date``,
        ``usefulCount`` and ``rating``. The frame is not modified.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is the sparse horizontal stack of
        [condition counts, review counts, (year, month, day), usefulCount]
        and ``y`` is ``dataset['rating']`` as a NumPy array.
    """
    # Fill missing text into local Series rather than assigning back into
    # `dataset`, so the caller's DataFrame is left untouched.
    reviews = dataset['review'].fillna('')
    conditions = dataset['condition'].fillna('')

    review_enc = cv_review.transform(reviews)           # sparse matrix
    condition_enc = cv_condition.transform(conditions)  # sparse matrix

    # Split the date into three numeric columns: year, month, day.
    dates = pd.to_datetime(dataset['date'])
    date_mat = sps.csr_matrix(np.column_stack([
        dates.dt.year.to_numpy(),
        dates.dt.month.to_numpy(),
        dates.dt.day.to_numpy(),
    ]))
    usefulcnt_mat = sps.csr_matrix(dataset['usefulCount'].to_numpy().reshape(-1, 1))

    features = sps.hstack([condition_enc, review_enc, date_mat, usefulcnt_mat])
    return features, dataset['rating'].to_numpy()

## Testing

In [5]:
# Directory holding the CSV splits (relative to the notebook).
dpath = "../data/part1_data"
train_data = pd.read_csv(f"{dpath}/DrugsComTrain.csv")
# Only the last expression of a cell is displayed, so the former bare
# `train_data.head()` in the middle of the cell produced no output and was dropped.
train_data.shape

(112908, 5)

In [6]:
# Fit the vectorizers on the training frame and build the training features/labels.
X_train, y_train = preprocess_train(train_data)

In [7]:
# Encode the validation and test splits with the vectorizers fitted on the
# training data.
val_data = pd.read_csv(f"{dpath}/DrugsComVal.csv")
X_val, y_val = preprocess_test(val_data)

test_data = pd.read_csv(f"{dpath}/DrugsComTest.csv")
X_test, y_test = preprocess_test(test_data)

In [8]:
from sklearn import tree

# Baseline: an unconstrained decision tree. scikit-learn permutes the features
# at every split, so random_state is pinned to make the fit repeatable.
clf = tree.DecisionTreeClassifier(random_state=0)
clf.fit(X_train, y_train)

In [9]:
# Shape of the stacked training feature matrix.
X_train.shape

(112908, 45221)

In [10]:
# Shape of the validation feature matrix; the column count should match X_train.
X_val.shape

(48389, 45221)

In [11]:
from sklearn.metrics import accuracy_score

# Mean accuracy of the baseline tree on each split.
train_acc = clf.score(X_train, y_train)
val_acc = clf.score(X_val, y_val)
test_acc = clf.score(X_test, y_test)

for split_name, split_acc in (
    ("Train", train_acc),
    ("Validation", val_acc),
    ("Test", test_acc),
):
    print(f"{split_name} accuracy: {split_acc:.4f}")

Train accuracy: 1.0000
Validation accuracy: 0.5836
Test accuracy: 0.5777


In [13]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over tree size limits: 4 depths x 4 split sizes x 5 leaf sizes.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt, and the
# cells below that read `grid_searcher` need this fit to complete.
tree_param_grid = {
    'max_depth': range(4, 11, 2),
    'min_samples_split': range(2, 6),
    'min_samples_leaf': range(1, 6),
}
grid_searcher = GridSearchCV(estimator=clf, param_grid=tree_param_grid, n_jobs=-1)
grid_searcher.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
# Parameter combination selected by the grid search (requires a completed fit).
grid_searcher.best_params_

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy of the grid-searched tree on each split.
train_acc = accuracy_score(y_train, grid_searcher.predict(X_train))
val_acc = accuracy_score(y_val, grid_searcher.predict(X_val))
test_acc = accuracy_score(y_test, grid_searcher.predict(X_test))

for split_name, split_acc in (
    ("Train", train_acc),
    ("Validation", val_acc),
    ("Test", test_acc),
):
    print(f"{split_name} accuracy: {split_acc:.4f}")

In [None]:
# Cost-complexity pruning path on the training data: effective alphas and the
# matching total leaf impurities.
path = clf.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas
impurities = path.impurities

In [None]:
fig, ax = plt.subplots(figsize=(8, 6), dpi=150)
# The final path entry is left out of the curve.
ax.plot(ccp_alphas[:-1], impurities[:-1], marker="o", drawstyle="steps-post")
ax.set(
    xlabel="effective alpha",
    ylabel="total impurity of leaves",
    title="Total Impurity vs effective alpha for training set",
)
fig.savefig('impurity_vs_alpha_mode.pdf', bbox_inches='tight')

In [None]:
# Fit one tree per effective alpha on the pruning path.
# A comprehension is used so that no loop variable leaks into the notebook
# namespace: the previous loop rebound the global `clf`, silently replacing the
# baseline classifier with the last (most heavily pruned) tree.
clfs = [
    tree.DecisionTreeClassifier(ccp_alpha=alpha).fit(X_train, y_train)
    for alpha in ccp_alphas
]
print(
    "Number of nodes in the last tree is: {} with ccp_alpha: {}".format(
        clfs[-1].tree_.node_count, ccp_alphas[-1]
    )
)

In [None]:
# Drop the last path entry and its tree. Both sequences are re-derived from
# `path` instead of slicing themselves, so re-running this cell does not trim
# one more element each time.
ccp_alphas = path.ccp_alphas[:-1]
clfs = clfs[:len(ccp_alphas)]

node_counts = [fitted.tree_.node_count for fitted in clfs]
depth = [fitted.tree_.max_depth for fitted in clfs]

# Two stacked panels: tree size and tree depth as functions of alpha.
fig, ax = plt.subplots(2, 1, figsize=(6, 6), dpi=150)
ax[0].plot(ccp_alphas, node_counts, marker=".")
ax[0].set_xlabel("alpha")
ax[0].set_ylabel("number of nodes")
ax[0].set_title("Number of nodes vs alpha")
ax[1].plot(ccp_alphas, depth, marker=".")
ax[1].set_xlabel("alpha")
ax[1].set_ylabel("depth of tree")
ax[1].set_title("Depth vs alpha")
fig.tight_layout()
plt.savefig('nodes_vs_alpha_mode.pdf', bbox_inches='tight')

In [None]:
# Accuracy of every pruned tree on each split.
train_scores = [fitted.score(X_train, y_train) for fitted in clfs]
val_scores = [fitted.score(X_val, y_val) for fitted in clfs]
test_scores = [fitted.score(X_test, y_test) for fitted in clfs]

fig, ax = plt.subplots(figsize=(6, 4), dpi=150)
ax.set(
    xlabel="alpha",
    ylabel="accuracy",
    title="Accuracy vs alpha for training and testing sets",
)
# One curve per split, drawn in train / val / test order.
for curve_label, curve_scores in (
    ("train", train_scores),
    ("val", val_scores),
    ("test", test_scores),
):
    ax.plot(ccp_alphas, curve_scores, marker=".", label=curve_label)
ax.legend()
fig.savefig('accuracy_vs_alpha_mode.pdf', bbox_inches='tight')

In [None]:
# Pick the pruned tree with the highest validation accuracy.
best_tree_idx = np.argmax(val_scores)
best_node_count = clfs[best_tree_idx].tree_.node_count
best_alpha_value = ccp_alphas[best_tree_idx]
print(
    "Number of nodes in the best tree is: {} with ccp_alpha: {}".format(
        best_node_count, best_alpha_value
    )
)

In [None]:
# Keep handles to the tree selected on validation accuracy and to its alpha.
best_tree = clfs[best_tree_idx]
best_tree_alpha = ccp_alphas[best_tree_idx]

In [None]:
# Mean accuracy of the selected pruned tree on each split.
train_acc = best_tree.score(X_train, y_train)
val_acc = best_tree.score(X_val, y_val)
test_acc = best_tree.score(X_test, y_test)

for split_name, split_acc in (
    ("Train", train_acc),
    ("Validation", val_acc),
    ("Test", test_acc),
):
    print(f"{split_name} accuracy on best clf: {split_acc:.4f}")

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Bootstrapped forest with out-of-bag scoring enabled; fit() returns the
# estimator itself, so construction and fitting are chained.
clf = RandomForestClassifier(oob_score=True, bootstrap=True).fit(X_train, y_train)
clf.score(X_val, y_val)

In [None]:
def oob_score(estimator, X_test, y_test):
    """Scorer callable that reports a fitted estimator's ``oob_score_``.

    ``X_test`` and ``y_test`` are accepted to satisfy the
    ``(estimator, X, y)`` scorer signature but are not used.
    """
    out_of_bag = estimator.oob_score_
    return out_of_bag

# Tune the forest, ranking candidates with the out-of-bag scorer defined above.
forest_param_grid = {
    'n_estimators': range(50, 201, 50),
    'max_features': range(1, 5),
    'min_samples_split': range(2, 6),
}
gs = GridSearchCV(estimator=clf, param_grid=forest_param_grid, scoring=oob_score)
gs.fit(X_train, y_train)

In [None]:
# Evaluate the refitted best forest from the grid search.
clf_best = gs.best_estimator_

forest_train_acc = accuracy_score(y_train, clf_best.predict(X_train))
print(f"Train accuracy: {forest_train_acc:.4f}")

forest_oob_acc = clf_best.oob_score_
print(f"Out of bag accuracy: {forest_oob_acc:.4f}")

forest_val_acc = accuracy_score(y_val, clf_best.predict(X_val))
print(f"Validation accuracy: {forest_val_acc:.4f}")

forest_test_acc = accuracy_score(y_test, clf_best.predict(X_test))
print(f"Test accuracy: {forest_test_acc:.4f}")


In [None]:
# Forest hyper-parameters selected by the grid search.
gs.best_params_

In [None]:
from xgboost import XGBClassifier

# NOTE(review): `no_impute` is not defined anywhere in the visible notebook, and
# these CSV names (train/val/test.csv) differ from the DrugsCom*.csv files loaded
# above — presumably carried over from another notebook; confirm before running.
X_train_raw, y_train_raw = no_impute(pd.read_csv(f"{dpath}/train.csv"))
X_val_raw, y_val_raw = no_impute(pd.read_csv(f"{dpath}/val.csv"))
X_test_raw, y_test_raw = no_impute(pd.read_csv(f"{dpath}/test.csv"))

# Grid search over boosting rounds, row subsampling ratio and tree depth.
# Rebinds `clf` and `gs`, replacing the random-forest objects from earlier cells.
clf = XGBClassifier()
gs = GridSearchCV(clf, {'n_estimators': range(10,51,10), 'subsample': np.arange(0.1,0.61,0.1), 'max_depth': range(4,11)})
gs.fit(X_train_raw, y_train_raw)

In [None]:
# Hyper-parameters selected by the most recent grid search bound to `gs`.
gs.best_params_

In [None]:
# Score the fitted grid search on each raw split, in train / val / test order.
xgb_train_acc = gs.score(X_train_raw, y_train_raw)
print(f"Train accuracy: {xgb_train_acc:.4f}")

xgb_val_acc = gs.score(X_val_raw, y_val_raw)
print(f"Validation accuracy: {xgb_val_acc:.4f}")

xgb_test_acc = gs.score(X_test_raw, y_test_raw)
print(f"Test accuracy: {xgb_test_acc:.4f}")