In [None]:
import pandas as pd
import os

In [None]:
# Locate the dataset on the machine.
# The dataset root defaults to the location used when this notebook was
# written; set the IMDB_DATA_DIR environment variable to point the notebook
# at another copy of the data without editing this cell.
imdb_data_dir = os.environ.get(
    "IMDB_DATA_DIR",
    "/home/bhumika/Documents/ss18/ml_summer_school/Day6/IMDB_SummerSchool",
)
# The trailing "" makes os.path.join end every folder with a path separator,
# so the default values are identical to the previously hard-coded strings
# and plain `folder + file_name` concatenation keeps working.
train_pos_folder = os.path.join(imdb_data_dir, "train", "pos", "")
train_neg_folder = os.path.join(imdb_data_dir, "train", "neg", "")
train_unsup_folder = os.path.join(imdb_data_dir, "train", "unsup", "")

In [None]:
# Dataset files - utility code: list the file names in each training folder
# and print how many were found, as a quick check that the folders resolved.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
train_pos_files = os.listdir(train_pos_folder)
print(len(train_pos_files))
train_neg_files = os.listdir(train_neg_folder)
print(len(train_neg_files))
train_unsup_files = os.listdir(train_unsup_folder)
print(len(train_unsup_files))


In [None]:
# Get file/review properties
REVIEW_COLUMNS = ["Unique ID", "File Name", "File ID", "Rating", "Review"]


def _load_reviews(folder, file_names, dataset):
    """Read the review files of one dataset split into a DataFrame.

    Each file name is parsed as "<file id>_<rating>.<extension>": the part
    before the first underscore is the integer file id, and the part between
    the underscore and the first dot is the integer rating.

    Parameters
    ----------
    folder : str
        Directory that contains the review files.
    file_names : list of str
        Names of the files inside `folder` to read.
    dataset : str
        Prefix prepended to the file id to form the "Unique ID" column
        (e.g. "train_pos_").

    Returns
    -------
    pandas.DataFrame
        One row per file with the columns listed in REVIEW_COLUMNS. The
        "Review" text has every "<br />" replaced by a newline.

    Raises
    ------
    ValueError, IndexError
        If a file name does not follow the "<file id>_<rating>" pattern.
    """
    records = []
    for file_name in file_names:
        # The context manager closes each file as soon as it has been read;
        # there is one handle per review, so leaking them adds up quickly.
        with open(os.path.join(folder, file_name)) as review_file:
            review = review_file.read().replace("<br />", "\n")
        name_parts = file_name.split("_")
        file_id = int(name_parts[0])
        rating = int(name_parts[1].split(".")[0])
        unique_id = dataset + str(file_id)
        records.append([unique_id, file_name, file_id, rating, review])
    # Build the frame once from all records instead of appending row by row,
    # which re-allocates the frame on every insert.
    return pd.DataFrame(records, columns=REVIEW_COLUMNS)


train_pos_df = _load_reviews(train_pos_folder, train_pos_files, "train_pos_")
train_neg_df = _load_reviews(train_neg_folder, train_neg_files, "train_neg_")

# Positive reviews first, then negative reviews, with a fresh 0..n-1 index.
train_df = pd.concat([train_pos_df, train_neg_df], ignore_index=True)

# Label the reviews according to the rating: ratings of 5 and above are
# labelled "pos", everything else "neg".
train_df["Label"] = "neg"
train_df.loc[train_df["Rating"] >= 5, "Label"] = "pos"

# Check groups - just a utility code
train_df.groupby("Label").size()

In [None]:
# Tokenize the review
import re, string
from nltk.stem import SnowballStemmer
from nltk import word_tokenize
import pdb
from pycorenlp import StanfordCoreNLP

stemmer = SnowballStemmer('english')

def tokenize_text(txt):
    """Turn one review into a list of stemmed word tokens.

    Parameters
    ----------
    txt : str
        Raw review text.

    Returns
    -------
    list of str
        Stemmed tokens of the words longer than two characters. Reviews with
        fewer than 10 whitespace-separated words produce an empty list.
    """
    # Very short reviews are ignored entirely.
    if len(txt.split()) < 10:
        return []

    # Keep ASCII letters only; every other character (digits, punctuation,
    # newlines, ...) becomes a space. Because nothing but letters and spaces
    # survives this step, the former quote/comma/parenthesis padding
    # substitutions could never match and have been removed.
    txt = re.sub(r"[^A-Za-z]", " ", txt)
    # Collapse the runs of spaces left behind by the substitution above.
    txt = re.sub(r"\s{2,}", " ", txt)

    tokens = word_tokenize(txt)

    # Drop one- and two-letter tokens and stem the rest. The tokens are made
    # of letters only at this point, so the old "or the token is punctuation"
    # escape hatch was never taken.
    return [stemmer.stem(tok) for tok in tokens if len(tok) > 2]

In [None]:
# X = the review text; y = the corresponding review category to predict (pos/neg)
X_train, y_train = train_df["Review"], train_df["Label"]

In [None]:
# K-fold CV
X_train_df = train_df.copy(deep=True)
from sklearn.model_selection import KFold
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
import pdb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

df_errors_dfs = []

avg_precision = 0
avg_recall = 0
avg_f1 = 0

# Vectorize the tokens!
vectorizer = TfidfVectorizer(sublinear_tf=True, tokenizer=tokenize_text, ngram_range=(1, 3))
# Generate the train/validation sets
kf = KFold(n_splits = 5)
for train_inds, test_inds in kf.split(X_train_df):
    
    cv_train_df = X_train_df.iloc[train_inds, :]
    cv_test_df = X_train_df.iloc[test_inds, :]
    
    train_sents = cv_train_df['Review']
    train_labels = cv_train_df['Label']
    test_ids = cv_test_df['Unique ID']
    test_sents = cv_test_df['Review']
    test_labels = cv_test_df['Label']
    
    # Select the classifier
    from sklearn.linear_model import SGDClassifier
    clf = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, max_iter=5, random_state=42)
    # Put it into the pipeline finally, for vectorization & classification !
    pipe = Pipeline([('vectorizer', vectorizer), ('clf', clf)])
    # Fit onto the (k-1) training sets
    %time pipe.fit(train_sents, train_labels)
    # Predict on the k_th validation set
    %time test_preds = pipe.predict(test_sents)
    
    # Evaluate the model
    avg_precision += precision_score(test_labels, test_preds, pos_label = 'pos')
    avg_recall += recall_score(test_labels, test_preds, pos_label = 'pos')
    avg_f1 += f1_score(test_labels, test_preds, pos_label = 'pos')

    df_errors_subset = pd.DataFrame()
    df_errors_subset['Unique ID'] = test_ids
    df_errors_subset['Review'] = test_sents
    df_errors_subset['Label'] = test_labels
    df_errors_subset['Model Prediction'] = test_preds
    #df_errors_subset['Prediction Probability'] = test_preds_scores
    
    df_errors_subset = df_errors_subset[df_errors_subset['Label'] != df_errors_subset['Model Prediction']]
    df_errors_dfs.append(df_errors_subset)
    
    print 25*'-'
    
df_errors_train = pd.concat(df_errors_dfs)

print "Average precision: " + str(avg_precision / 5.0)
print "Average recall: " + str(avg_recall / 5.0)
print "Average f1: " + str(avg_f1 / 5.0)