In [22]:
#imports
import json
import pandas as pd
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

Read in the cleaned data from the previous notebook. I will split into train and validate (stratifying according to the target label) and transform using the tf-idf vectorizer.

In [2]:
#read in df
df = pd.read_csv('clean_stemmed.csv', index_col=0)
df.head()

Unnamed: 0,label,text
0,Thyroid_Cancer,thyroid surgeri children singl institut osama ...
1,Thyroid_Cancer,adopt strategi use prior year base four exclus...
2,Thyroid_Cancer,coronari arterybypass graft thrombosi brin bri...
3,Thyroid_Cancer,solitari plasmacytoma sp skull uncommon clinic...
4,Thyroid_Cancer,studi aim investig serum matrix metalloprotein...


In [3]:
#labels are balanced enough
df.label.value_counts()

Lung_Cancer       452
Thyroid_Cancer    283
Colon_Cancer      261
Name: label, dtype: int64

In [4]:
#split
train, val = train_test_split(df, train_size=0.7,
                              stratify=df.label)

train.shape, val.shape

((697, 2), (299, 2))

In [5]:
#transform train using vectorizer
vectorizer = TfidfVectorizer()
train_vec = vectorizer.fit_transform(train.text)

train_vec.shape

(697, 124793)

There are about 125k unique words found in my body of text. I'm going to mess with some of the parameters of the vectorizer to bring that value down. I'll only select the top 1,000 most frequently occuring words.

In [6]:
#round 2
vectorizer = TfidfVectorizer(max_features=1_000)
train_vec = vectorizer.fit_transform(train.text)

train_vec.shape

(697, 1000)

In [7]:
train_vec = pd.DataFrame(train_vec.toarray(), columns=vectorizer.vocabulary_.keys())
train_vec.head()

Unnamed: 0,heterogen,densiti,amount,respect,tumor,locat,area,significantli,differ,accord,...,hcc,ablat,circ,atm,scfvmtbhsp,circrna,wait,keloid,chl,uplcesiqtofmsm
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.035233,0.0,0.0,0.0,0.0,0.013543,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.012075,0.01887,0.0,0.084698,0.01509,0.0,0.030736,0.023047,0.01766
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.03489,0.124067,0.010776,0.0,0.0,0.0,0.021319,0.0,0.0,0.054637,...,0.057295,0.013217,0.0,0.007125,0.020601,0.008258,0.0,0.0,0.01892,0.048325
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.02046,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Still a lot of features. I'll use recursive feature elimination with a basic decision tree to reduce this to 50.

In [8]:
#run rfe
tree = DecisionTreeClassifier(random_state=42)
selector = RFE(tree, n_features_to_select=50, step=50)
selector.fit(train_vec, train.label)

RFE(estimator=DecisionTreeClassifier(random_state=42), n_features_to_select=50,
    step=50)

Here we can see the final 50 features that have been selected. Each row is a document from the train set. I will use these features to predict the label.

In [9]:
final_train = train_vec.loc[:, selector.support_]
final_train.head()

Unnamed: 0,lung,howev,signific,despit,higher,tabl,high,grade,associ,lower,...,accuraci,led,efficaci,period,stabl,coloni,sarscov,nk,ibd,exercis
0,0.0,0.0,0.0,0.009236,0.0,0.0,0.0,0.0,0.012403,0.0,...,0.0,0.0,0.0,0.024715,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.017456,0.030876,0.007472,0.0,0.016232,0.0,0.0,0.010902,...,0.035404,0.008456,0.0,0.078027,0.0,0.033445,0.0,0.0,0.021548,0.0
2,0.0,0.0,0.0,0.030571,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.027268,0.0,0.0,0.0,0.0,0.0,0.0
3,0.061663,0.0,0.0,0.107017,0.008179,0.0,0.0,0.0,0.068071,0.0,...,0.0,0.0,0.010631,0.020095,0.0,0.0,0.010145,0.0,0.029482,0.050725
4,0.0,0.0,0.0,0.061036,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.015554,0.0,0.028334,0.0,0.0,0.0,0.06282


In [10]:
le = LabelEncoder()
y_train = le.fit_transform(train.label)

First, let's establish a baseline accuracy. The baseline will be calculated by predicting the most prevalent class every time.

In [11]:
train.label.value_counts()

Lung_Cancer       316
Thyroid_Cancer    198
Colon_Cancer      183
Name: label, dtype: int64

In [12]:
round(train.label.value_counts().max() / train.label.value_counts().sum() * 100, 2)

45.34

Our baseline accuracy is about 45% when guessing the most prevalent class (Lung_Cancer) for every document. Let's see if we can beat that baseline with an xgboost model with all default parameters.

In [13]:
clf = XGBClassifier(random_state=42)
clf.fit(final_train, y_train)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, objective='multi:softprob', ...)

In [14]:
clf.score(final_train, y_train)

1.0

Perfect accuracy on the train dataset.. I'm concerned. Let's prepare the validate dataset the same way as train, and see how our model fares on unseen data.

In [15]:
val_vec = vectorizer.transform(val.text)
val_vec = pd.DataFrame(val_vec.toarray(), columns=vectorizer.vocabulary_.keys())
final_val = val_vec.loc[:, selector.support_]
final_val.head()

Unnamed: 0,lung,howev,signific,despit,higher,tabl,high,grade,associ,lower,...,accuraci,led,efficaci,period,stabl,coloni,sarscov,nk,ibd,exercis
0,0.042803,0.01639,0.0,0.045614,0.0,0.0,0.0,0.0,0.024501,0.0,...,0.0,0.014991,0.0,0.016274,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.017168,0.006232,0.016514,0.006769,0.030325,0.0,0.06365,...,0.0,0.0,0.008101,0.099532,0.0,0.009298,0.015461,0.012204,0.0,0.0
2,0.0,0.0,0.0,0.009706,0.0,0.0,0.0,0.0,0.104273,0.0,...,0.0,0.0,0.0,0.06926,0.0,0.094626,0.0,0.0,0.020322,0.0
3,0.0,0.033239,0.0,0.00925,0.0,0.0,0.0,0.0,0.0,0.039195,...,0.0,0.0,0.017459,0.024752,0.0,0.01002,0.0,0.0,0.038734,0.0
4,0.0,0.0,0.175127,0.01475,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.058562,0.047933,0.0,0.0,0.0,0.079703


In [16]:
y_val = le.transform(val.label)

Now that our validate dataset has been prepared in the same manner as train, it can be used to generate predictions from our trained model.

In [17]:
clf.score(final_val, y_val)

0.8662207357859532

Over 86% accuracy! This is a great start for the project. We have almost doubled the baseline accuracy (45%). This proves the project has merit, and future feature selection / model fine-tuning could improve the result. Now, the goal is to pickle this model and make it callable from the command line.

In [19]:
with open('xgb.pickle', 'wb') as f:
    pickle.dump(clf, f, pickle.HIGHEST_PROTOCOL)

I will also need to pickle the vectorizer, so it can be used to prepare the text data of future articles.

In [20]:
with open('vec.pickle', 'wb') as f:
    pickle.dump(vectorizer, f, pickle.HIGHEST_PROTOCOL)

I had an idea. I may need my label encoder as well. This object will help me get back the human-readable labels that are predicted by my model.

In [21]:
with open('enc.pickle', 'wb') as f:
    pickle.dump(le, f, pickle.HIGHEST_PROTOCOL)

In [30]:
with open('feature_list.json', 'w') as f:
    json.dump(selector.support_.tolist(), f)