In [1]:
import re
from bs4 import BeautifulSoup 
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer as TFIV

In [4]:
# Import both the training and test data.
# Both files are tab-separated with a header row. quoting=3 is
# csv.QUOTE_NONE, so quote characters inside reviews are kept as plain text.
tsv_options = {"header": 0, "delimiter": "\t", "quoting": 3}
train = pd.read_csv('./dataset/labeledTrainData.tsv', **tsv_options)
test = pd.read_csv('./dataset/testData.tsv', **tsv_options)

In [2]:
# Text-cleaning helper: turns one raw review into a list of words.
def review_to_wordlist(review):
    '''
    Convert a single IMDB review into a list of lowercase words.

    Parameters
    ----------
    review : str
        Raw review text, possibly containing HTML markup.

    Returns
    -------
    list of str
        The alphabetic tokens of the review, lower-cased, in their
        original order.
    '''
    # First remove the HTML. The parser is named explicitly so the result
    # does not depend on which optional parsers happen to be installed, and
    # so BeautifulSoup no longer warns that no parser was specified.
    # "html.parser" ships with Python, so no extra dependency is needed.
    review_text = BeautifulSoup(review, "html.parser").get_text()

    # Replace every character that is not an ASCII letter with a space.
    review_text = re.sub("[^a-zA-Z]", " ", review_text)

    # Lower-case the text and split it on whitespace into separate words.
    return review_text.lower().split()

# 数据预处理

In [5]:
# Labels: the 'sentiment' column of the training frame.
y_train = train.loc[:, 'sentiment']

In [7]:
# Build the input format TF-IDF needs: a flat list of length num_samples in
# which every sample is one cleaned review joined back into a single string.
# Iterating over the column's values (instead of looking up review[i] by
# label) also works when the frame does not have the default 0..n-1 index.
traindata = [" ".join(review_to_wordlist(review)) for review in train['review']]
testdata = [" ".join(review_to_wordlist(review)) for review in test['review']]

# Configure scikit-learn's TF-IDF text feature extractor:
#   - min_df=3 drops terms that occur in fewer than three documents,
#   - ngram_range=(1, 2) extracts unigrams and bigrams,
#   - the three flags are real booleans; recent scikit-learn releases
#     validate parameter types and reject the integers 1/0 here.
tfv = TFIV(min_df=3, max_features=None,
        strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
        ngram_range=(1, 2), use_idf=True, smooth_idf=True, sublinear_tf=True,
        stop_words='english')



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


In [10]:
# Vectorize train and test together so both share one vocabulary and one
# set of IDF weights, then split the matrix back into its two parts.
lentrain = len(traindata)

# This is the slow part! fit_transform learns the vocabulary and builds the
# matrix in a single pass, instead of tokenizing the whole corpus twice with
# a separate fit() followed by transform(); the resulting matrix is the same.
X_all = tfv.fit_transform(traindata + testdata)

# The first `lentrain` rows are the training reviews, the rest are test.
X = X_all[:lentrain]
X_test = X_all[lentrain:]

# 构建分类器：
LR/朴素贝叶斯/SGD

In [11]:
from sklearn.linear_model import LogisticRegression as LR
# GridSearchCV lives in sklearn.model_selection; the old sklearn.grid_search
# module was deprecated in scikit-learn 0.18 and removed in 0.20.
from sklearn.model_selection import GridSearchCV



In [13]:
# Decide which settings you want for the grid search. Only one value of the
# inverse regularization strength C is tried here.
grid_values = {'C': [30]}

# Try to set the scoring on what the contest is asking for.
# The contest says scoring is for area under the ROC curve, so use this.
# solver='liblinear' is spelled out because dual=True is only supported by
# liblinear; it used to be the default solver, but newer scikit-learn
# releases default to lbfgs, which raises an error for dual=True.
model_LR = GridSearchCV(
    LR(penalty='l2', dual=True, solver='liblinear', random_state=0),
    grid_values, scoring='roc_auc', cv=20)

model_LR.fit(X, y_train)  # Fit the model.

GridSearchCV(cv=20, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1, param_grid={'C': [30]},
       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc', verbose=0)

In [14]:
from sklearn.naive_bayes import MultinomialNB as MNB

In [15]:
# Multinomial Naive Bayes with default settings, trained on the TF-IDF
# features of the training reviews.
model_NB = MNB()
model_NB.fit(X=X, y=y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [16]:
# cross_val_score lives in sklearn.model_selection; the old
# sklearn.cross_validation module was removed in scikit-learn 0.20.
from sklearn.model_selection import cross_val_score
import numpy as np

# This gives us a 20-fold cross validation score that looks at ROC AUC, so
# we can compare Naive Bayes with Logistic Regression on the same metric.
nb_cv_scores = cross_val_score(model_NB, X, y_train, cv=20, scoring='roc_auc')
print("20 Fold CV Score for Multinomial Naive Bayes: ", np.mean(nb_cv_scores))

20 Fold CV Score for Multinomial Naive Bayes:  0.94963712


In [17]:
from sklearn.linear_model import SGDClassifier as SGD

# Candidate values for alpha, the regularization parameter.
sgd_params = {'alpha': [0.00006, 0.00007, 0.00008, 0.0001, 0.0005]}

# Grid-search over alpha with 20-fold cross validation, scored by ROC AUC,
# to find out which regularization strength works best.
sgd_estimator = SGD(loss='modified_huber', shuffle=True, random_state=0)
model_SGD = GridSearchCV(sgd_estimator, sgd_params, cv=20, scoring='roc_auc')

model_SGD.fit(X, y_train)  # Fit the model.



GridSearchCV(cv=20, error_score='raise',
       estimator=SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='modified_huber', max_iter=5,
       n_iter=None, n_jobs=1, penalty='l2', power_t=0.5, random_state=0,
       shuffle=True, tol=None, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'alpha': [6e-05, 7e-05, 8e-05, 0.0001, 0.0005]},
       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc', verbose=0)

# 生成预测结果

In [22]:
# Write the logistic-regression predictions for the test set to a CSV file.
# NOTE(review): predict() yields hard class labels; if the contest metric is
# ROC AUC, submitting probabilities may score better - confirm before changing.
LR_result = model_LR.predict(X_test)
LR_out = pd.DataFrame({"id": test["id"], "sentiment": LR_result})
LR_out.to_csv("Logistic_Reg.csv", quoting=3, index=False)

In [24]:
# Write the Naive Bayes predictions for the test set to a CSV file.
NB_result = model_NB.predict(X_test)
NB_out = pd.DataFrame({"id": test["id"], "sentiment": NB_result})
NB_out.to_csv("MNB.csv", quoting=3, index=False)

In [25]:
# Write the SGD model's predictions for the test set to a CSV file.
# The frame gets its own name (SGD_out); it was previously assigned to
# NB_out, which silently overwrote the Naive Bayes output frame.
SGD_result = model_SGD.predict(X_test)
SGD_out = pd.DataFrame(data={"id": test["id"], "sentiment": SGD_result})
SGD_out.to_csv("SGD.csv", index=False, quoting=3)