In [26]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [2]:
path = "F:/for learn/Python_ML_and_Kaggle/Datasets/imdb/"

In [3]:
train = pd.read_csv(path + "labeledTrainData.tsv", delimiter='\t')
test = pd.read_csv(path + "testData.tsv", delimiter='\t')

In [11]:
def review_to_text(review, remove_stopwords):
    ## 去掉 html标记
    raw_text = BeautifulSoup(review, 'html').get_text()
    ## 去掉非字母字符
    words = re.sub('[^a-zA-Z]', ' ', raw_text).lower().split()
    ## 去掉停用词
    if remove_stopwords:
        stop_words = set(stopwords.words('english'))
        words = [w for w in words if w not in stop_words]
    return words

In [17]:
def fileGenerator(file, feature):
    output = []
    for review in file[feature]:
        words = review_to_text(review, True)
        text = ' '.join(words)
        output.append(text)
    return output

In [18]:
x_train = fileGenerator(train, "review")
x_test = fileGenerator(test, "review")



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


In [24]:
y_train = train["sentiment"]

In [27]:
# 分别采用 CountVectorizer, TfidfVectorizer进行贝叶斯训练
pipe_count = Pipeline([('count_vec', CountVectorizer(analyzer="word")),
                       ('mnb', MultinomialNB())])
pipe_tfidf = Pipeline([('tfidf_vec', TfidfVectorizer(analyzer="word")),
                       ('mnb', MultinomialNB())])

In [31]:
params_count = {'count_vec__binary': [True, False], 
                'count_vec__ngram_range': [(1, 1), (1, 2)], 
                'mnb__alpha': [0.1, 1.0, 10.0]}
params_tfidf = {'tfidf_vec__binary': [True, False],
                'tfidf_vec__ngram_range': [(1, 1), (1, 2)], 
                'mnb__alpha': [0.1, 1.0, 10.0]}

In [33]:
gs_count = GridSearchCV(estimator=pipe_count, 
                        param_grid=params_count, 
                        cv=4, 
                        n_jobs=-1, 
                        verbose=1)
gs_tfidf = GridSearchCV(estimator=pipe_tfidf, 
                        param_grid=params_tfidf, 
                        cv=4, 
                        n_jobs=-1, 
                        verbose=1)

In [34]:
gs_count.fit(x_train, y_train)

Fitting 4 folds for each of 12 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:  4.5min finished


GridSearchCV(cv=4, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('count_vec', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preproc...nizer=None, vocabulary=None)), ('mnb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'count_vec__binary': [True, False], 'count_vec__ngram_range': [(1, 1), (1, 2)], 'mnb__alpha': [0.1, 1.0, 10.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=1)

In [40]:
print(gs_count.best_params_)
print(gs_count.best_score_)
count_y_predict = gs_count.predict(x_test)

{'count_vec__binary': True, 'count_vec__ngram_range': (1, 2), 'mnb__alpha': 1.0}
0.88216


In [38]:
gs_tfidf.fit(x_train, y_train)

Fitting 4 folds for each of 12 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:  5.4min finished


GridSearchCV(cv=4, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('tfidf_vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=Tru...rue,
        vocabulary=None)), ('mnb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'tfidf_vec__binary': [True, False], 'tfidf_vec__ngram_range': [(1, 1), (1, 2)], 'mnb__alpha': [0.1, 1.0, 10.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=1)

In [39]:
print(gs_tfidf.best_params_)
print(gs_tfidf.best_score_)
tfidf_y_predict = gs_tfidf.predict(x_test)

{'mnb__alpha': 0.1, 'tfidf_vec__binary': True, 'tfidf_vec__ngram_range': (1, 2)}
0.88712


In [88]:
def subMission(result, file):
    sub = pd.DataFrame({'id': test['id'], 'sentiment': result})
    sub.to_csv(file,index=False)

In [89]:
subMission(count_y_predict, path + "sub_count.csv")
subMission(tfidf_y_predict, path + "sub_tfidf.csv")

In [117]:
# 用 word2vec进行预测
import nltk.data
from gensim.models import word2vec, Word2Vec
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

In [49]:
unlabeled_train = pd.read_csv(path + "unlabeledTrainData.tsv", delimiter='\t', quoting=3)
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [46]:
def review_to_sentences(review, tokenizer):
    raw_sentences = tokenizer.tokenize(review.strip())
    sentences = []
    for raw_sentence in raw_sentences:
        if len(raw_sentence)>0:
            sentences.append(review_to_text(raw_sentence, False))
    return sentences

In [66]:
corpora = []
for review in unlabeled_train['review']:
    corpora += review_to_sentences(review.encode('utf-8').decode('utf-8'), tokenizer)



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup


In [69]:
num_features = 300
min_word_count = 20
num_workers = 4
context = 10
downsampling = 1e-3

In [72]:
model = word2vec.Word2Vec(sentences=corpora,
                          workers=num_workers,
                          size=num_features,
                          min_count=min_word_count, 
                          window=context, 
                          sample=downsampling)

In [74]:
model.init_sims(replace=True)
# model.save(path + "300features_20minwords_10context")
# model = Word2Vec.load(path + "300features_20minwords_10context")
# model.most_similar("man")

In [106]:
# 词向量产生文本特征向量
def makeFeatureVec(words, mdoel, num_features):
    featureVec = np.zeros((num_features,), dtype="float32")
    nwords = 0
    index2word_set = set(model.wv.index2word)
    for word in words:
        if word in index2word_set:
            nwords += 1
            featureVec += model[word]
    featureVec = featureVec/nwords
    return featureVec    

In [112]:
# 每个词条影评转化为基于词向量的特征向量（平均词向量）
def getAvgFeatureVecs(reviews, model, num_features):
    counter = 0
    reviewFeatureVecs = np.zeros((len(reviews),num_features), dtype="float32")
    for review in reviews:
        reviewFeatureVecs[counter] = makeFeatureVec(review, model, num_features)
        counter += 1
    return reviewFeatureVecs

In [108]:
clean_train_reviews = []
for review in train["review"]:
    clean_train_reviews.append(review_to_text(review, remove_stopwords=True))



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


In [115]:
clean_test_reviews = []
for review in test["review"]:
    clean_test_reviews.append(review_to_text(review, remove_stopwords=True))



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


In [113]:
trainDataVecs = getAvgFeatureVecs(clean_train_reviews, model, num_features)
testDataVecs = getAvgFeatureVecs(clean_test_reviews, model, num_features)

In [122]:
gbc = GradientBoostingClassifier()

In [123]:
params_gbc = {'n_estimators': [10, 100, 500], 
              'learning_rate': [0.01, 0.1, 1.0],
              'max_depth': [2, 3, 4]}

In [124]:
gs =  GridSearchCV(estimator=gbc, param_grid=params_gbc, cv=4, n_jobs=-1, verbose=1)

In [None]:
gs.fit(trainDataVecs, y_train)

In [129]:
print(gs.best_params_)
print(gs.best_score_)

Unnamed: 0,id,review
0,"""9999_0""","""Watching Time Chasers, it obvious that it was..."
1,"""45057_0""","""I saw this film about 20 years ago and rememb..."
2,"""15561_0""","""Minor Spoilers<br /><br />In New York, Joan B..."
3,"""7161_0""","""I went to see this film with a great deal of ..."
4,"""43971_0""","""Yes, I agree with everyone on this site this ..."


In [None]:
wv_y_predict = gs.predict(testDatavecs)
subMisson(wv_y_predict, path + "sub_wv.csv")