In [18]:
import sklearn
from sklearn.datasets import load_files
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
import matplotlib.pyplot as plt

In [19]:
#Load the original dataset
moviedir = r'./data/movie_reviews/txt_sentoken'
movies = load_files(moviedir, shuffle=True)
print("n_samples: %d" % len(movies.data))

n_samples: 2000


In [20]:
#Split the dataset into a training and a test set
docs_train, docs_test, train_target, test_target = train_test_split(
        movies.data, movies.target, test_size=0.1, random_state=None)

In [21]:
#Build a pipeline to filter out the tokens that are too rare or too frequent
pipeline = Pipeline([
        ('vect', TfidfVectorizer(min_df=3, max_df=0.95)),
        ('clf', LinearSVC(C=1000)),
])

In [23]:
# Set up a grid search to find out the best parameter.
parameters = {
        'vect__ngram_range': [(1, 1), (1, 2)],
}
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1)
grid_search.fit(docs_train, train_target)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('vect',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=0.95,
                                                        max_features=None,
                                                        min_df=3,
                                                        ngram_range=(1, 1),
                                    

In [24]:
# print the mean and std for each candidate along with the parameter 
# settings for all the candidates explored by grid search.
n_candidates = len(grid_search.cv_results_['params'])
for i in range(n_candidates):
    print(i, 'params - %s; mean - %0.2f; std - %0.2f'
        % (grid_search.cv_results_['params'][i],
           grid_search.cv_results_['mean_test_score'][i],
           grid_search.cv_results_['std_test_score'][i]))

0 params - {'vect__ngram_range': (1, 1)}; mean - 0.85; std - 0.01
1 params - {'vect__ngram_range': (1, 2)}; mean - 0.86; std - 0.01


In [25]:
# Predict the outcome on the testing set and store it in a variable named y_predicted
target_predicted = grid_search.predict(docs_test)

In [28]:
# Print the classification report
print(metrics.classification_report(test_target, target_predicted,
                                        target_names=movies.target_names))

              precision    recall  f1-score   support

         neg       0.88      0.84      0.86        95
         pos       0.86      0.90      0.88       105

    accuracy                           0.87       200
   macro avg       0.87      0.87      0.87       200
weighted avg       0.87      0.87      0.87       200



In [29]:
# Print and plot the confusion matrix
cm = metrics.confusion_matrix(test_target, target_predicted)
print(cm)

[[80 15]
 [11 94]]


In [None]:
plt.matshow(cm)