In [1]:
import pandas as pd
import os

In [2]:
# Review folders under aclImdb/train/: positive, negative and "unsup" sub-directories.
train_root = "aclImdb/train/"
train_pos_folder = train_root + "pos/"
train_neg_folder = train_root + "neg/"
train_unsup_folder = train_root + "unsup/"

In [3]:
# os.listdir returns entries in arbitrary order; sort the listing so that the
# row order of anything built from it is the same on every run and machine.
train_pos_files = sorted(os.listdir(train_pos_folder))
print(len(train_pos_files))

12500


In [4]:
# os.listdir returns entries in arbitrary order; sort the listing so that it
# is the same on every run and machine.
train_unsup_files = sorted(os.listdir(train_unsup_folder))
print(len(train_unsup_files))

50000


In [5]:
# os.listdir returns entries in arbitrary order; sort the listing so that the
# row order of anything built from it is the same on every run and machine.
train_neg_files = sorted(os.listdir(train_neg_folder))
print(len(train_neg_files))

12500


In [6]:
# Schema of the positive-review frame: one row per review file.
review_columns = ["Unique ID", "File Name", "File ID", "Rating", "Review"]
train_pos_df = pd.DataFrame(columns=review_columns)

In [7]:
%%time

# Collect every row first and build the frame once. Enlarging the frame with
# `df.loc[len(df)] = ...` for each file is slow, and re-running the cell would
# append the same reviews a second time; rebuilding from a list is idempotent.
# Note: "File ID" and "Rating" now get an integer dtype instead of object.
dataset = "train_pos_"
pos_rows = []
for f in train_pos_files:
    # `with` closes each review file as soon as it has been read.
    with open(train_pos_folder + f) as open_f:
        # The reviews carry HTML line breaks; turn them into real newlines.
        review = open_f.read().replace("<br />", "\n")
    file_name = f
    # File names are parsed as "<file id>_<rating>.<extension>".
    name_parts = f.split("_")
    file_id = int(name_parts[0])
    rating = int(name_parts[1].split(".")[0])
    unique_id = dataset + str(file_id)
    pos_rows.append([unique_id, file_name, file_id, rating, review])

train_pos_df = pd.DataFrame(
    pos_rows, columns=["Unique ID", "File Name", "File ID", "Rating", "Review"])

CPU times: user 45.8 s, sys: 156 ms, total: 46 s
Wall time: 46 s


In [8]:
# Schema of the negative-review frame: one row per review file.
neg_review_columns = ["Unique ID", "File Name", "File ID", "Rating", "Review"]
train_neg_df = pd.DataFrame(columns=neg_review_columns)

In [9]:
%%time

dataset = "train_neg_"
for f in train_neg_files:
    open_f = open(train_neg_folder+f)
    review = (open_f.read()).replace("<br />","\n")
    #print review
    file_name = f
    file_id = int(f.split("_")[0])
    rating = int((f.split("_")[1]).split(".")[0])
    unique_id = dataset + str(file_id) 
    train_neg_df.loc[len(train_neg_df)] = [unique_id,file_name,file_id,rating,review]

CPU times: user 46.7 s, sys: 212 ms, total: 47 s
Wall time: 46.9 s


In [10]:
# Stack the positive rows first, then the negative rows, under a fresh 0..N-1 index.
labelled_frames = [train_pos_df, train_neg_df]
train_df = pd.concat(labelled_frames, ignore_index=True)

In [13]:
# A rating of 5 or more is labelled "pos"; everything else is labelled "neg".
is_positive = train_df["Rating"] >= 5
train_df["Label"] = is_positive.map({True: "pos", False: "neg"})

In [16]:
# Class balance check: number of rows per label.
label_groups = train_df.groupby("Label")
label_groups.size()

Label
neg    12500
pos    12500
dtype: int64

In [18]:
import re, string
from nltk.stem import SnowballStemmer
from nltk import word_tokenize
import pdb
from pycorenlp import StanfordCoreNLP

# Shared English Snowball stemmer; tokenize_text stems every token it keeps with it.
stemmer = SnowballStemmer(language='english')

def tokenize_text(txt):
    """Turn a review into a list of stemmed, letters-only tokens.

    Args:
        txt: the review text.

    Returns:
        [] when the text has fewer than 10 whitespace-separated words;
        otherwise the Snowball stems of every token longer than two characters.

    Every character outside A-Z/a-z is replaced by a space before tokenizing,
    so digits and punctuation never reach the tokenizer. The previous
    per-character substitutions for quotes, commas and parentheses, and the
    "keep punctuation tokens" filter branch, ran after that replacement and
    therefore could never match; they were removed without changing the output.
    """
    # Very short texts contribute no features at all.
    if len(txt.split()) < 10:
        return []

    # Keep letters only, then collapse the runs of whitespace this leaves behind.
    txt = re.sub(r"[^A-Za-z]", " ", txt)
    txt = re.sub(r"\s{2,}", " ", txt)

    tokens = word_tokenize(txt)

    # Drop one- and two-letter tokens and stem the rest.
    return [stemmer.stem(tok) for tok in tokens if len(tok) > 2]

In [20]:
# Model inputs: the review text column as features, the Label column as target.
X_train, y_train = train_df["Review"], train_df["Label"]

In [22]:
%%time
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline

# TF-IDF (sublinear term frequency) over 1- to 3-grams of the stemmed tokens
# produced by tokenize_text.
vectorizer = TfidfVectorizer(
    tokenizer=tokenize_text,
    ngram_range=(1, 3),
    sublinear_tf=True,
)

# Candidate values of C for the class-balanced linear SVM.
params_svc = {'C': [0.01, 0.05, 0.1, 0.3, 1, 3, 5, 10]}
svc = LinearSVC(class_weight='balanced')
gridsearch = GridSearchCV(svc, params_svc, cv=5, scoring='recall_micro')

# The grid search is the last pipeline step: the vectorizer is fitted once on
# X_train and the 5-fold search then runs on the resulting feature matrix.
pipe = Pipeline([('vectorizer', vectorizer), ('clf', gridsearch)])
pipe.fit(X_train, y_train)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


CPU times: user 6min 40s, sys: 4.16 s, total: 6min 44s
Wall time: 6min 44s


In [23]:
# Parameter setting selected by the grid search.
print(gridsearch.best_params_)

{'C': 10}


In [24]:
# Work on an independent copy of the training frame (DataFrame.copy is deep by default).
X_train_df = train_df.copy()

In [25]:
from sklearn.model_selection import KFold
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
import pdb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV

df_errors_dfs = []

avg_precision = 0
avg_recall = 0
avg_f1 = 0

#vectorizer = TfidfVectorizer(sublinear_tf=True, tokenizer=tokenize_text, ngram_range=(1, 3))
kf = KFold(n_splits = 5)
for train_inds, test_inds in kf.split(X_train_df):
    
    cv_train_df = X_train_df.iloc[train_inds, :]
    cv_test_df = X_train_df.iloc[test_inds, :]
    
    train_sents = cv_train_df['Review']
    train_labels = cv_train_df['Label']
    test_ids = cv_test_df['Unique ID']
    test_sents = cv_test_df['Review']
    test_labels = cv_test_df['Label']
    
    # Using best params: re-assemble pipeline (using best classifier) and train model
    clf = gridsearch.best_estimator_
    #clf = LinearSVC(class_weight='balanced', C=5)
    pipe = Pipeline([('vectorizer', vectorizer), ('clf', clf)])
    %time pipe.fit(train_sents, train_labels)            
    %time test_preds = pipe.predict(test_sents)
    
    calibrated_model = CalibratedClassifierCV(clf, cv=5)
    calibrated_model.fit(vectorizer.transform(train_sents), train_labels)
    test_preds_scores = calibrated_model.predict_proba(vectorizer.transform(test_sents))
    test_preds_scores = test_preds_scores[:,[0]]
        
    avg_precision += precision_score(test_labels, test_preds, pos_label = 'pos')
    avg_recall += recall_score(test_labels, test_preds, pos_label = 'pos')
    avg_f1 += f1_score(test_labels, test_preds, pos_label = 'pos')

    df_errors_subset = pd.DataFrame()
    df_errors_subset['Unique ID'] = test_ids
    df_errors_subset['Review'] = test_sents
    df_errors_subset['Label'] = test_labels
    df_errors_subset['Model Prediction'] = test_preds
    df_errors_subset['Prediction Probability'] = test_preds_scores
    
    df_errors_subset = df_errors_subset[df_errors_subset['Label'] != df_errors_subset['Model Prediction']]
    df_errors_dfs.append(df_errors_subset)
    
    print 25*'-'
    
df_errors_train = pd.concat(df_errors_dfs)

print "Average precision: " + str(avg_precision / 5.0)
print "Average recall: " + str(avg_recall / 5.0)
print "Average f1: " + str(avg_f1 / 5.0)

CPU times: user 2min 23s, sys: 624 ms, total: 2min 24s
Wall time: 2min 24s
CPU times: user 25.5 s, sys: 16 ms, total: 25.5 s
Wall time: 25.5 s
-------------------------
CPU times: user 2min 22s, sys: 536 ms, total: 2min 22s
Wall time: 2min 22s
CPU times: user 25.5 s, sys: 16 ms, total: 25.5 s
Wall time: 25.5 s
-------------------------
CPU times: user 2min 22s, sys: 748 ms, total: 2min 23s
Wall time: 2min 23s
CPU times: user 25.4 s, sys: 76 ms, total: 25.5 s
Wall time: 25.5 s
-------------------------
CPU times: user 2min 24s, sys: 552 ms, total: 2min 25s
Wall time: 2min 25s
CPU times: user 25.1 s, sys: 28 ms, total: 25.1 s
Wall time: 25.1 s


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


-------------------------
CPU times: user 2min 27s, sys: 456 ms, total: 2min 27s
Wall time: 2min 27s
CPU times: user 24.5 s, sys: 12 ms, total: 24.5 s
Wall time: 24.5 s
-------------------------
Average precision: 0.5798193953670985
Average recall: 0.5342
Average f1: 0.5553946660258815


In [27]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Bag-of-words counts -> TF-IDF weighting -> multinomial naive Bayes.
# NOTE: this rebinds `vectorizer`, which earlier cells used for the TfidfVectorizer.
vectorizer = CountVectorizer()
text_clf = Pipeline(steps=[
    ('vect', vectorizer),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [28]:
from sklearn.model_selection import GridSearchCV
# Search space for the text_clf pipeline; keys are "<step name>__<parameter>".
parameters = dict(
    vect__ngram_range=[(1, 1), (1, 2), (1, 3), (1, 4), (2, 3), (2, 4)],
    tfidf__use_idf=(True, False),
    clf__alpha=(1, 0.1, 0.01, 0.001),
)

In [30]:
# Exhaustive search over `parameters` with GridSearchCV's default CV and
# scoring; fit() returns the fitted search object itself.
gs_clf = GridSearchCV(estimator=text_clf, param_grid=parameters).fit(X_train, y_train)

In [31]:
# Best mean cross-validated score, then the parameter combination that achieved it.
for summary in (gs_clf.best_score_, gs_clf.best_params_):
    print(summary)

0.89812
{'vect__ngram_range': (1, 4), 'tfidf__use_idf': True, 'clf__alpha': 0.1}


In [None]:
def show_most_informative_features(vectorizer, clf, n=30):
    """Print the n lowest- and n highest-weighted features for each row of clf.coef_.

    Args:
        vectorizer: fitted vectorizer exposing get_feature_names_out() or
            get_feature_names(); the names are paired positionally with the
            entries of each coefficient row.
        clf: fitted estimator exposing coef_ as an iterable of coefficient rows.
        n: how many features to show at each end of the ranking.

    For every coefficient row a header line is printed, followed by up to n
    lines that pair the k-th lowest coefficient (left) with the k-th highest
    coefficient (right).
    """
    # Newer scikit-learn releases only provide get_feature_names_out();
    # older ones only provide get_feature_names(). Support both.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = list(vectorizer.get_feature_names_out())
    else:
        feature_names = vectorizer.get_feature_names()

    for c in clf.coef_:
        print("Most Predictive Features for:")
        # Sort ascending by coefficient (ties broken by feature name).
        coefs_with_fns = sorted(zip(c, feature_names))
        # Walk the ranking from both ends at once: lowest-first and highest-first.
        top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
        for (coef_1, fn_1), (coef_2, fn_2) in top:
            print("\t%.4f\t%-15s\t\t\t%.4f\t%-15s" % (coef_1, fn_1, coef_2, fn_2))

In [None]:
# GridSearchCV fits clones of text_clf, so neither `vectorizer` nor `text_clf`
# is ever fitted by the search, and a Pipeline exposes no coef_ of its own.
# Inspect the steps of the refitted best pipeline instead.
# NOTE(review): MultinomialNB.coef_ only exists in older scikit-learn
# releases - confirm against the installed version.
best_text_clf = gs_clf.best_estimator_
show_most_informative_features(best_text_clf.named_steps['vect'],
                               best_text_clf.named_steps['clf'])