In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.naive_bayes import MultinomialNB 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [6]:
def read_corpus(corpus_file, use_sentiment, stop_words=None, stemmer=None):
    """Read a whitespace-tokenized corpus file into documents and labels.

    Each non-blank line is split on whitespace. The first field is used as the
    topic label and the second as the sentiment label; the third field is
    ignored (presumably a document id -- confirm against the data file), and
    the remaining fields are the document tokens. Stop words are removed from
    the raw tokens first, then the surviving tokens are stemmed.

    Parameters
    ----------
    corpus_file : str or path-like
        Path to the UTF-8 encoded corpus file, one document per line.
    use_sentiment : bool
        If true, label each document with the second field (2-class problem:
        positive vs negative); otherwise with the first field (6-class
        problem: books, camera, dvd, health, music, software).
    stop_words : iterable of str, optional
        Tokens to drop. Defaults to NLTK's English stop word list.
    stemmer : object with a ``stem(word)`` method, optional
        Stemmer applied to every kept token. Defaults to ``PorterStemmer()``.

    Returns
    -------
    documents : numpy.ndarray
        1-D object array in which each element is a list of stemmed tokens.
    labels : numpy.ndarray
        Array of label strings aligned with ``documents``.
    """
    if stemmer is None:
        stemmer = PorterStemmer()
    if stop_words is None:
        stop_words = stopwords.words('english')
    # A set gives O(1) membership tests; the original list was scanned per token.
    stop_words = frozenset(stop_words)

    documents = []
    labels = []
    with open(corpus_file, encoding='utf-8') as f:
        for line in f:
            tokens = line.strip().split()
            if not tokens:
                # Blank line (e.g. trailing newline at end of file): nothing to label.
                continue

            # Remove stop words first, then stem what is left.
            doc = [stemmer.stem(token) for token in tokens[3:]
                   if token not in stop_words]
            documents.append(doc)

            # Field 1 is the sentiment label, field 0 the topic label.
            labels.append(tokens[1] if use_sentiment else tokens[0])

    # Documents have different lengths. np.array(documents) raises ValueError on
    # NumPy >= 1.24 for ragged input, and would silently build a 2-D array when
    # all lengths happen to match, so fill a 1-D object array element by element.
    doc_array = np.empty(len(documents), dtype=object)
    for i, doc in enumerate(documents):
        doc_array[i] = doc

    return doc_array, np.array(labels)

In [17]:
# Load the training corpus, using the topic labels (not sentiment) as targets.
train_path = 'trainset.txt'
X, Y = read_corpus(train_path, use_sentiment=False)

In [140]:
def identity(x):
    """Return ``x`` unchanged.

    A pass-through for places that require a callable but where no
    transformation of the input is wanted.
    """
    return x

In [141]:
# The documents are already lists of tokens, so both the preprocessing and the
# tokenizing hooks are pass-throughs.
vec = TfidfVectorizer(
    preprocessor=identity,
    tokenizer=identity,
    # token_pattern is only consulted when no tokenizer is given; setting it to
    # None avoids scikit-learn's "token_pattern will not be used" warning.
    token_pattern=None,
)

In [159]:
# Candidate smoothing values 0.30, 0.31, ..., 0.79 (50 values).
# Built from an integer range and divided, because np.arange with a fractional
# step accumulates float error (e.g. 0.5400000000000003) and does not guarantee
# the number of elements.
params = {
    "cls__alpha": np.arange(30, 80) / 100
}

In [160]:
# Chain the TF-IDF vectorizer with a multinomial naive Bayes classifier.
steps = [
    ('vec', vec),
    ('cls', MultinomialNB()),
]
clf = Pipeline(steps)

In [172]:
# 5-fold grid search over `params`, scored by macro-averaged F1, using 4 jobs.
GS = GridSearchCV(
    estimator=clf,
    param_grid=params,
    cv=5,
    scoring="f1_macro",
    n_jobs=4,
    verbose=1,
    return_train_score=True,
)

In [173]:
# Run the grid search; the trailing semicolon suppresses the cell's output repr.
GS.fit(X, y=Y);

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   12.3s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   54.4s
[Parallel(n_jobs=4)]: Done 250 out of 250 | elapsed:  1.2min finished


In [174]:
# Tabulate the per-candidate cross-validation results.
cv_results = GS.cv_results_
df = pd.DataFrame(cv_results)

In [175]:
# Rank candidates by mean test score and keep the top row.
score_table = df[["params", "mean_test_score"]]
ranked = score_table.sort_values(by="mean_test_score", ascending=False)
best_setting = ranked.iloc[0]

In [176]:
# Display the parameter setting with the highest mean test score.
best_setting

params             {'cls__alpha': 0.5400000000000003}
mean_test_score                               0.90988
Name: 24, dtype: object