参考链接：https://www.jianshu.com/p/57a9b6103fe5
提交链接：https://www.kaggle.com/c/word2vec-nlp-tutorial

In [None]:
import pandas as pd
#import nltk
from bs4 import BeautifulSoup
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer#词干提取
from nltk.stem import WordNetLemmatizer#词形还原

In [2]:
# Load the labelled training reviews from the tab-separated file.
# header=0 takes the column names from the first row; quoting=3 is
# csv.QUOTE_NONE, so double quotes are kept as ordinary characters.
train_data = pd.read_csv(
    'data/labeledTrainData.tsv',
    sep='\t',
    header=0,
    quoting=3,
)

In [None]:
# Preview the first three rows of the training frame.
train_data.head(3)

### 文本预处理
1. 去除HTML标签：BeautifulSoup
2. 将所有词转为小写
3. 去除数字和标点，用空格置换（后续可以保留类似于 "!!!" 或 ":-(" 这样带有情感的符号）
4. 去除停用词：nltk.stopwords
5. 词干提取和词形还原：https://www.cnblogs.com/cwp-bg/p/9510513.html

In [None]:
# Strip the HTML markup from the first review, using the lxml parser.
from bs4 import BeautifulSoup
example_1 = BeautifulSoup(markup=train_data['review'][0], features='lxml')
# Show the plain text that remains once the tags are removed.
example_1.get_text()

In [None]:
import re

# Lower-case the tag-free text of the first review.
example_2 = example_1.get_text().lower()
# Replace every character that is not an ASCII letter (digits, punctuation,
# whitespace, ...) with a single space.
example_3 = re.sub(pattern='[^a-zA-Z]', repl=' ', string=example_2)
example_3

In [None]:
# `import nltk` is commented out in the first cell, so the bare name `nltk`
# is undefined there; import it here so this cell runs on its own.
import nltk

# Download the Punkt tokenizer models used by word_tokenize.
# NOTE(review): recent NLTK releases may ask for 'punkt_tab' instead - confirm
# against the installed NLTK version.
nltk.download('punkt')

In [None]:
# Tokenize the cleaned text, then show the token count and the tokens.
from nltk.tokenize import word_tokenize
word = word_tokenize(example_3)
print(len(word), word, sep='\n')

In [None]:
# `import nltk` is commented out in the first cell, so the bare name `nltk`
# is undefined there; import it here so this cell runs on its own.
import nltk

# Download the stop-word lists read by nltk.corpus.stopwords.
nltk.download('stopwords')

In [None]:
# Remove English stop words from the token list.
from nltk.corpus import stopwords
# A set gives O(1) membership tests (the list returned by stopwords.words is
# scanned linearly for every token) and matches what txt2wordbag does.
stops = set(stopwords.words('english'))
word2 = [w for w in word if w not in stops]
word2

In [None]:
# `import nltk` is commented out in the first cell, so the bare name `nltk`
# is undefined there; import it here so this cell runs on its own.
import nltk

# Download the WordNet corpus required by WordNetLemmatizer.
nltk.download('wordnet')

In [None]:
# Stemming followed by lemmatization.
from nltk.stem.porter import PorterStemmer  # stemming
from nltk.stem import WordNetLemmatizer  # lemmatization
po_stem = PorterStemmer()
w_lem = WordNetLemmatizer()
word3 = [po_stem.stem(w) for w in word2]
# Lemmatize the *stemmed* tokens. The earlier version lemmatized word2 again,
# which overwrote word3 and silently threw the stemming result away.
word3 = [w_lem.lemmatize(w) for w in word3]
word3

In [None]:
# Inspect which words were altered by stemming / lemmatization.
# First: original tokens that no longer appear after processing.
t = list(filter(lambda w: w not in word3, word2))
print(t)
# Then: processed tokens that were not present before processing.
t = list(filter(lambda w: w not in word2, word3))
print(t)

### 现在把上面的所有步骤都整合在一起，写成一个函数：

In [5]:
# Holds one tuple of shared resources for txt2wordbag, created on first use so
# the stop-word corpus is read once instead of once per review.
_TXT2WORDBAG_RESOURCES = []


def _txt2wordbag_resources():
    """Return the cached (stemmer, lemmatizer, stop words, non-letter regex)."""
    if not _TXT2WORDBAG_RESOURCES:
        # The tuple is fully built before it is stored, so a failure while
        # loading a corpus never leaves a half-initialised cache behind.
        _TXT2WORDBAG_RESOURCES.append((
            # Author's note: PorterStemmer is avoided because it raised an
            # error on some out-of-vocabulary words such as "OED".
            LancasterStemmer(),
            WordNetLemmatizer(),
            frozenset(stopwords.words('english')),
            re.compile(r'[^a-zA-Z]'),
        ))
    return _TXT2WORDBAG_RESOURCES[0]


def txt2wordbag(raw_txt):
    """Clean one raw review and return its tokens as a space-joined string.

    Steps, in order: strip HTML with BeautifulSoup (lxml parser), lower-case,
    replace every non-ASCII-letter character with a space, tokenize with
    word_tokenize, drop English stop words, stem each token with
    LancasterStemmer, then pass each stem through WordNetLemmatizer.

    Args:
        raw_txt: the raw review markup handed to BeautifulSoup.

    Returns:
        A single string of the processed tokens separated by spaces.
    """
    l_stem, w_lem, stops, non_letter = _txt2wordbag_resources()
    txt = BeautifulSoup(raw_txt, 'lxml').get_text().lower()
    txt = non_letter.sub(' ', txt)
    word = [w for w in word_tokenize(txt) if w not in stops]
    # NOTE(review): the lemmatizer is applied to stems rather than to the
    # original words; order kept as-is to preserve the existing features.
    word = [w_lem.lemmatize(l_stem.stem(w)) for w in word]
    return ' '.join(word)

In [6]:
# Run every training review through txt2wordbag, reporting progress every
# 1000 reviews.
size = train_data['review'].size
all_wordbag = []
for count, review in enumerate(train_data['review'], start=1):
    if count % 1000 == 0:
        print("process: %d of %d" % (count, size))
    all_wordbag.append(txt2wordbag(review))

process: 1000 of 25000
process: 2000 of 25000
process: 3000 of 25000
process: 4000 of 25000
process: 5000 of 25000
process: 6000 of 25000
process: 7000 of 25000
process: 8000 of 25000
process: 9000 of 25000
process: 10000 of 25000
process: 11000 of 25000
process: 12000 of 25000
process: 13000 of 25000
process: 14000 of 25000
process: 15000 of 25000
process: 16000 of 25000
process: 17000 of 25000
process: 18000 of 25000
process: 19000 of 25000
process: 20000 of 25000
process: 21000 of 25000
process: 22000 of 25000
process: 23000 of 25000
process: 24000 of 25000
process: 25000 of 25000


### 使用sklearn提取词袋特征
从评论中提取的词中选取频次最高的前5000个单词作为词典

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer limited to 5000 features (max_features).
vect = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=5000,
)
# Learn the vocabulary from the cleaned training reviews and convert the
# resulting sparse matrix of per-review term counts into a dense array.
train_data_feature = vect.fit_transform(all_wordbag).toarray()
train_data_feature.shape

(25000, 5000)

In [8]:
# Vocabulary learned by the vectorizer.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
# .tolist() keeps `vocab` a plain list of str, as before.
if hasattr(vect, "get_feature_names_out"):
    vocab = vect.get_feature_names_out().tolist()
else:
    vocab = vect.get_feature_names()
vocab[:20]

['ab',
 'abandon',
 'abbot',
 'abc',
 'abduc',
 'abl',
 'abomin',
 'aborigin',
 'abort',
 'abound',
 'abraham',
 'abrupt',
 'absolv',
 'absorb',
 'abstract',
 'absurd',
 'abud',
 'abund',
 'abus',
 'abysm']

### 使用随机森林预测（也可以尝试极端随机森林、梯度提升）

In [9]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Fit a 100-tree random forest on the bag-of-words features, using the
# 'sentiment' column as the target. fit() returns the estimator itself.
forest = RandomForestClassifier(n_estimators=100).fit(
    train_data_feature,
    train_data['sentiment'],
)

In [10]:
# Load the test reviews (same TSV layout and quoting as the training file)
# and run each one through txt2wordbag, reporting progress every 1000 reviews.
test_data = pd.read_csv(
    'data/testData.tsv',
    sep='\t',
    header=0,
    quoting=3,
)
size = test_data['review'].size
all_wordbag = []
for count, review in enumerate(test_data['review'], start=1):
    if count % 1000 == 0:
        print("process: %d of %d" % (count, size))
    all_wordbag.append(txt2wordbag(review))

process: 1000 of 25000
process: 2000 of 25000
process: 3000 of 25000
process: 4000 of 25000
process: 5000 of 25000
process: 6000 of 25000
process: 7000 of 25000
process: 8000 of 25000
process: 9000 of 25000
process: 10000 of 25000
process: 11000 of 25000
process: 12000 of 25000
process: 13000 of 25000
process: 14000 of 25000
process: 15000 of 25000
process: 16000 of 25000
process: 17000 of 25000
process: 18000 of 25000
process: 19000 of 25000
process: 20000 of 25000
process: 21000 of 25000
process: 22000 of 25000
process: 23000 of 25000
process: 24000 of 25000
process: 25000 of 25000


In [11]:
# Encode the test reviews with the vocabulary already learned from the
# training reviews. transform() must be used here: fit_transform() would
# re-learn the vocabulary from the test set, so the feature columns would no
# longer correspond to the ones the classifier was trained on.
test_data_feature = vect.transform(all_wordbag)
test_data_feature = test_data_feature.toarray()  # dense per-review term counts

In [13]:
# Predict a sentiment label for every test review with the fitted forest.
result = forest.predict(test_data_feature)

### 创建提交文件

In [16]:
# Pair each test id with its predicted sentiment.
output = pd.DataFrame(data={
    "id": test_data["id"],
    "sentiment": result,
})

# Write the submission file without the index column; quoting=3 is
# csv.QUOTE_NONE, so no extra quotes are added around the values.
output.to_csv(
    "data/result/Bag_of_Words_model.csv",
    index=False,
    quoting=3,
)