In [1]:
import nltk
nltk.download('movie_reviews')

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


True

In [2]:
import pickle
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.utils import shuffle
from sklearn.pipeline import Pipeline
from sklearn.externals import joblib
from sklearn.metrics import accuracy_score
from nltk.corpus import movie_reviews



In [3]:
negids = movie_reviews.fileids('neg')
posids = movie_reviews.fileids('pos')

negfeats = [' '.join(movie_reviews.words(fileids=[f])) for f in negids]
posfeats = [' '.join(movie_reviews.words(fileids=[f])) for f in posids]

texts = negfeats + posfeats
labels = [0] * len(negfeats) + [1] * len(posfeats)

In [4]:
def make_pipeline(vectorizer, transformer, classifier):
    return Pipeline([
            ('vectorizer', vectorizer),
            ('transformer', transformer),
            ('classifier', classifier)
        ])
    

def make_classifier(text, label):
    svc = make_pipeline(CountVectorizer(min_df=1, ngram_range=(1, 5), max_df=0.9, stop_words=None),
                        TfidfTransformer(smooth_idf=True,use_idf=True),
                        LinearSVC(max_iter=500, loss='hinge', C=1.9, tol=0.001, random_state=777))
    svc.fit(text, label)
    return svc

In [5]:
classifier = make_classifier(texts, labels)



In [6]:
print('Точность предсказания равна = {}'.format(accuracy_score(labels, classifier.predict(texts))))

Точность предсказания равна = 1.0


In [7]:
with open('sentiment_classifier.pkl', 'wb') as f:
    pickle.dump(classifier, f)