# NLP Sentiment with NLTK


In [40]:
# Step 1: Load and Pre-process: Shuffle, Keep Labeled Data, Clean
print("load  data ...")
# PATH/'train' is the pool that training data is drawn from;
# PATH/'test' is kept aside as the holdout set.
train_texts, train_labels = get_texts(PATH/'train')
val_texts, val_labels = get_texts(PATH/'test')
print(train_labels)
print("loaded data:", len(train_texts), len(val_texts))


# Shuffle
# Seed NumPy's global RNG first so both permutations below are repeatable.
np.random.seed(42)
train_idx = np.random.permutation(len(train_texts))
val_idx = np.random.permutation(len(val_texts))
# Texts and labels are indexed with the same permutation so pairs stay aligned.
train_texts, train_labels = train_texts[train_idx], train_labels[train_idx]
val_texts, val_labels = val_texts[val_idx], val_labels[val_idx]
print("shuffled data:", len(train_texts), len(train_labels))

# Keep Labeled Data
# Label 2 is the index of 'unsup' in CLASSES; those rows are dropped from the training pool.
idx = np.nonzero(train_labels != 2)[0]
train_texts, train_labels = train_texts[idx], train_labels[idx]
print("labled data:", len(train_texts), len(train_labels))

# Clean
#  run the htmlfix and pre-process functions (appendix)
# Training pool first (training and validation are extracted from it), then the holdout set.
reviews_train_clean = preprocess_reviews(train_texts)
reviews_val_clean = preprocess_reviews(val_texts)
print("reviews clean:", len(reviews_train_clean))

load  data ...
[0 0 0 ... 2 2 2]
loaded data: 75000 25000
shuffled data: 75000 75000
labled data: 25000 25000
reviews clean: 25000


In [38]:
# Step 2:  Tokenize and Vectorize
print("CountvVectorizor ... ")
stop_words = ['in', 'of', 'at', 'a', 'the']
# Binary (present/absent) features over 1- to 3-grams, skipping the stop words above.
ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 3), stop_words=stop_words)
# To use lemmatized text instead, pass the lemmatized versions of BOTH inputs below
# (e.g. the output of lemmatize(...)) so training and holdout features stay consistent.
# fit_transform learns the vocabulary and builds the training matrix in a single pass,
# instead of tokenizing the training corpus once in fit() and again in transform().
X = ngram_vectorizer.fit_transform(reviews_train_clean)
# The holdout set is only transformed, so it is encoded with the training vocabulary.
X_val = ngram_vectorizer.transform(reviews_val_clean)
print("vecctorized:", X.shape)

lemmatize ...
lematized: 25000 25000
CountvVectorizor ... 
vecctorized: (25000, 5443695)


In [39]:
# Step 3 Sentiment Classification
# Fit a linear SVM on the training features, then report accuracy on the holdout set.
final = LinearSVC(C=0.01).fit(X, train_labels)
print("Final Accuracy: %s" % accuracy_score(y_true=val_labels, y_pred=final.predict(X_val)))

Final Accuracy: 0.90024


# Appendix ... Imports and Helper Functions

In [27]:
import html
from path import Path
import pandas as pd
import numpy as np
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC

In [28]:
# In the future we could read CSV files directly for convenience ...
# Load np arrays directly from files
#   an easier way might be to load a dataframe and then convert it to np.array,
#   but the information is in separate files, so we would first need to create a dataframe
#   USF Fastai does it this way, and then creates a DataFrame

PATH=Path('./data/aclImdb')
CLASSES = ['neg', 'pos', 'unsup']
def get_texts(path):
    """Read every review under ``path`` into parallel text and label arrays.

    For each name in CLASSES, the files matching ``*.*`` inside
    ``path/<name>`` are read as UTF-8 text. The label stored for a file is
    the index of its sub-directory name in CLASSES
    (0 = 'neg', 1 = 'pos', 2 = 'unsup').

    Args:
        path: Directory object that supports ``/`` joining and ``.glob``
            (the notebook passes ``PATH/'train'`` and ``PATH/'test'``).

    Returns:
        Tuple ``(texts, labels)`` of NumPy arrays with one entry per file,
        grouped by class in CLASSES order and sorted by file path within
        each class.
    """
    texts,labels = [],[]
    for idx,label in enumerate(CLASSES):
        # Sort the matches: glob order is filesystem-dependent, and the seeded
        # shuffle applied to these arrays is only repeatable if the load order is stable.
        for fname in sorted((path/label).glob('*.*')):
            # Context manager closes each handle as soon as the file is read,
            # instead of leaving tens of thousands of files open.
            with fname.open('r', encoding='utf-8') as review_file:
                texts.append(review_file.read())
            labels.append(idx)
    return np.array(texts),np.array(labels)

In [29]:
# htmlfix and pre-process functions

# htmlfix function
re1 = re.compile(r'  +')
def htmlfix(x):
    """Undo common markup/escape residue in a review string.

    Applies a fixed list of literal substitutions in order, then HTML-unescapes
    the result and collapses every run of two or more spaces into one space.

    Args:
        x: Text to clean.

    Returns:
        The cleaned string.
    """
    # Order matters: the bare-backslash rule must stay last so the
    # backslash-n and backslash-quote rules see their input first.
    substitutions = (
        ('#39;', "'"),
        ('amp;', '&'),
        ('#146;', "'"),
        ('nbsp;', ' '),
        ('#36;', '$'),
        ('\\n', "\n"),
        ('quot;', "'"),
        ('<br />', "\n"),
        ('\\"', '"'),
        ('<unk>', 'u_n'),
        (' @.@ ', '.'),
        (' @-@ ', '-'),
        ('\\', ' \\ '),
    )
    for old, new in substitutions:
        x = x.replace(old, new)
    unescaped = html.unescape(x)
    return re1.sub(' ', unescaped)

# Raw string literal: the previous non-raw literal depended on the invalid
# escape sequences "\[" and "\]", which newer Python versions warn about at
# compile time. The compiled pattern is unchanged.
punctuationfix = re.compile(r"""[.;:!'?,"()\[\]]""")
# Remove punctuation
# Remove html
def preprocess_reviews(reviews):
    """Lower-case each review, strip punctuation, then apply htmlfix.

    Args:
        reviews: Iterable of review strings.

    Returns:
        A new list of cleaned strings, one per input review, in input order.
    """
    # NOTE(review): punctuation (including ';', '.' and '"') is removed before
    # htmlfix runs, so htmlfix's replacements that contain those characters
    # (e.g. '#39;', 'nbsp;', ' @.@ ') cannot match here -- confirm this order is intended.
    return [htmlfix(punctuationfix.sub("", line.lower())) for line in reviews]


In [30]:
# NLTK Wordnet Lemmatizer

def lemmatize(corpus):
    """Lemmatize every whitespace-separated token of every review.

    Args:
        corpus: Iterable of review strings.

    Returns:
        List with one string per review: its tokens, each passed through
        NLTK's WordNetLemmatizer, joined back together with single spaces.
    """
    # Function-scope import: nltk is loaded only when lemmatize() is called.
    from nltk.stem import WordNetLemmatizer
    wordnet = WordNetLemmatizer()
    lemmatized_reviews = []
    for review in corpus:
        lemmas = map(wordnet.lemmatize, review.split())
        lemmatized_reviews.append(' '.join(lemmas))
    return lemmatized_reviews

