In [7]:
import os

import nltk
import numpy as np
import pyprind
import pandas as pd

In [2]:
pbar = pyprind.ProgBar(50004)  # 进度条对象, 50004 是迭代总次数, 即要读取的文档数
labels = {"pos": 1, "neg": 0}

df = pd.DataFrame()

In [3]:
# 读取文本, 比较直观的

for s in ("test", "train"):
    for l in ("pos", "neg"):
        path = "aclImdb/%s/%s" % (s, l)
        for file in os.listdir(path):
            with open(os.path.join(path, file), "r") as infile:
                txt = infile.read()
            df = df.append([[txt, labels[l]]], ignore_index=True)
            pbar.update()
df.columns = ["review", "sentiment"]

0% [############################# ] 100% | ETA: 00:00:21

In [4]:
df.head()

Unnamed: 0,review,sentiment
0,This film is definetly Fonda's best film. The ...,1
1,There is no plot. There are no central charact...,1
2,Read a biography of the late George C. Scott a...,1
3,Alexander Nevsky is rightfully held up as a ci...,1
4,Kar Wai Wong's incredibly impressive romance t...,1


In [5]:
np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))
df.to_csv("moive_data.csv", index=False)

In [3]:
df = pd.read_csv("moive_data.csv")

df.head(3)

Unnamed: 0,review,sentiment
0,"I recently rented Twister, a movie I'd seen se...",1
1,"An awful film; badly written, badly acted, cli...",0
2,"It was a good story, but not very well told. I...",0


In [14]:
df.loc[0, "review"][-50:]

"smerizing. If you like his work, you'll like this."

In [4]:
import re

def preprocessor(text):
    text = re.sub("<[^>]*>", "", text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = re.sub('[\W]+', ' ', text.lower()) + " ".join(emoticons).replace('-', '')
    return text

In [23]:
preprocessor("</a>This :) is :( a test :-)!")

'this is a test :) :( :)'

In [25]:
df["review"] = df["review"].apply(preprocessor)

In [5]:
def tokenizer(text):
    return text.split()

In [27]:
tokenizer("runners like running and thus they run")

['runners', 'like', 'running', 'and', 'thus', 'they', 'run']

In [6]:
from nltk.stem.porter import PorterStemmer


porter = PorterStemmer()

def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

In [29]:
tokenizer_porter("runners like running and thus they run")

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

In [8]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /home/kissg/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [9]:
from nltk.corpus import stopwords

stop = stopwords.words("english")
print([w for w in tokenizer_porter("a runner likes running and runs a lot")[-10:] if w not in stop])

['runner', 'like', 'run', 'run', 'lot']


In [11]:
X_train = df.loc[:25000, "review"].values
y_train = df.loc[:25000, "sentiment"].values

X_test = df.loc[25000:, "review"].values
y_test = df.loc[25000:, "sentiment"].values

In [14]:
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None)

param_grid = [
    {
        "vect__ngram_range": [(1, 1)],
        "vect__stop_words": [stop, None],
        "vect__tokenizer": [tokenizer, tokenizer_porter],
        "clf__penalty": ["l1", "l2"],
        "clf__C": [1.0, 10.0, 100.0]
    },{
        "vect__ngram_range": [(1, 1)],
        "vect__stop_words": [stop, None],
        "vect__tokenizer": [tokenizer, tokenizer_porter],
        "vect__use_idf": [False],
        "vect__norm": [None],
        "clf__penalty": ["l1", "l2"],
        "clf__C": [1.0, 10.0, 100.0]
    },
]

lr_tfidf = Pipeline([("vect", tfidf), ("clf", LogisticRegression(random_state=0))])

gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid, scoring="accuracy", cv=5, verbose=1, n_jobs=-1)

gs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 27.3min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 141.7min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed: 187.5min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ...nalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid=[{'vect__ngram_range': [(1, 1)], 'vect__stop_words': [['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', '...se_idf': [False], 'vect__norm': [None], 'clf__penalty': ['l1', 'l2'], 'clf__C': [1.0, 10.0, 100.0]}],
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=1