In [11]:
# nltk imports
import nltk

In [3]:
import string
import numpy as np
import os

# Root folder of the project, resolved against the current working directory.
# os.path.join picks the separator for the host OS; the trailing "" component
# keeps a trailing separator so `basePath + "..."` concatenation still works.
basePath = os.path.join(os.path.abspath(''), "sentiment-analysis-nlp", "")

In [4]:
# Path of the labelled-sentence file, joined component by component so the
# separator is chosen by os.path rather than hard-coded as a backslash.
file = os.path.join(basePath, "NLP", "sentiment_labelled_sentences", "full_set.txt")

# Read every line of the file into a list (each entry keeps its trailing newline).
with open(file) as f:
    content = f.readlines()

# Preview the first ten raw lines.
content[0:10]

['So there is no way for me to plug it in here in the US unless I go by a converter.\t0\n',
 'Good case, Excellent value.\t1\n',
 'Great for the jawbone.\t1\n',
 'Tied to charger for conversations lasting more than 45 minutes.MAJOR PROBLEMS!!\t0\n',
 'The mic is great.\t1\n',
 'I have to jiggle the plug to get it to line up right to get decent volume.\t0\n',
 'If you have several dozen or several hundred contacts, then imagine the fun of sending each of them one by one.\t0\n',
 'If you are Razr owner...you must have this!\t1\n',
 'Needless to say, I wasted my money.\t0\n',
 'What a waste of money and time!.\t0\n']

In [5]:
# Removing leading/trailing white space (including the trailing newline)
content = [x.strip() for x in content]

# Separating sentences from labels.
# Each line is split once, on its LAST tab, so a sentence that itself contains
# a tab is kept whole and the label is always the final field.
split_rows = [x.rsplit("\t", 1) for x in content]
sentences = [row[0] for row in split_rows]
labels = [row[1] for row in split_rows]

In [6]:
# Preview the first ten sentences
sentences[:10]

['So there is no way for me to plug it in here in the US unless I go by a converter.',
 'Good case, Excellent value.',
 'Great for the jawbone.',
 'Tied to charger for conversations lasting more than 45 minutes.MAJOR PROBLEMS!!',
 'The mic is great.',
 'I have to jiggle the plug to get it to line up right to get decent volume.',
 'If you have several dozen or several hundred contacts, then imagine the fun of sending each of them one by one.',
 'If you are Razr owner...you must have this!',
 'Needless to say, I wasted my money.',
 'What a waste of money and time!.']

In [7]:
# Preview the first ten labels
labels[:10]

['0', '1', '1', '0', '1', '0', '0', '1', '0', '0']

In [8]:
# Re-encode the labels from {0, 1} to {-1, +1}:
#   -1 represents a negative sentence, +1 a positive one.
# The label strings are parsed as int8, then mapped with 2*label - 1.
y = 2 * np.array(labels, dtype='int8') - 1
y

array([-1,  1,  1, ..., -1, -1, -1], dtype=int8)

In [13]:
# Removing extras - stopwords, digits, punctuation

def remove_elements(x, removal_list):
    """Return ``x`` with every occurrence of each item in ``removal_list`` replaced by one space.

    ``removal_list`` may be any iterable of strings; passing a plain string
    iterates over its individual characters.
    """
    cleaned = x
    for unwanted in removal_list:
        cleaned = cleaned.replace(unwanted, ' ')
    return cleaned

# Digits: every character '0'..'9' is replaced by a space
digit_list = list(string.digits)
digits_removed = [remove_elements(s, digit_list) for s in sentences]

# Punctuation: every character of string.punctuation is replaced by a space
punctuations_removed = [remove_elements(s, string.punctuation) for s in digits_removed]

# Lower-case each sentence, then trim leading/trailing white space
sentences = [s.lower().strip() for s in punctuations_removed]

# Removing stopwords
def remove_stopwords(stopword, text):
    """Return ``text`` without the whitespace-separated words found in ``stopword``.

    The surviving words are re-joined with single spaces, so runs of white
    space in ``text`` collapse. Matching is exact (case-sensitive).
    """
    kept = []
    for token in text.split():
        if token in stopword:
            continue
        kept.append(token)
    return ' '.join(kept)

# A small hand-picked stop-word list (kept as a list, despite the name)
stop_set = [
    'the', 'a', 'an', 'i', 'he', 'she', 'they', 'to', 'of', 'it', 'from',
]

# Drop those words from every cleaned sentence and preview the first ten results
preprocessed = [remove_stopwords(stop_set, sentence) for sentence in sentences]
preprocessed[:10]

['so there is no way for me plug in here in us unless go by converter',
 'good case excellent value',
 'great for jawbone',
 'tied charger for conversations lasting more than minutes major problems',
 'mic is great',
 'have jiggle plug get line up right get decent volume',
 'if you have several dozen or several hundred contacts then imagine fun sending each them one by one',
 'if you are razr owner you must have this',
 'needless say wasted my money',
 'what waste money and time']

In [None]:
def porter_stemmer(words):
    """Stem each item of ``words`` with a fresh ``nltk.PorterStemmer`` and return the stems as a list."""
    stem = nltk.PorterStemmer().stem
    return list(map(stem, words))

# Stem every word of each preprocessed sentence, then glue the stems back
# together with single spaces; preview the first ten stemmed sentences.
stemmed_sentences = [" ".join(porter_stemmer(sentence.split())) for sentence in preprocessed]
stemmed_sentences[:10]

In [None]:
# TF-IDF

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Count word n-grams of length 1 to 5, keeping at most 6000 features and
# applying scikit-learn's 'english' stop-word list.
vectorizer = CountVectorizer(
    analyzer="word",
    preprocessor=None,
    stop_words='english',
    max_features=6000,
    ngram_range=(1, 5),
)

# NOTE(review): the counts are fitted on `preprocessed`, not on
# `stemmed_sentences` - confirm that skipping the stemmed text is intended.
date_features = vectorizer.fit_transform(preprocessed)

# Re-weight the raw counts with TF-IDF, then convert the result to a dense array.
tfidf_transformer = TfidfTransformer()
data_features_tfidf = tfidf_transformer.fit_transform(date_features)
data_matrix = data_features_tfidf.toarray()
data_matrix

In [None]:
# Creating Training and Test Sets

# Seed NumPy's global RNG so the same test rows are drawn on every run.
np.random.seed(0)

# Balanced test set: the same number of rows is drawn, without replacement,
# from the negative (-1) class first and then from the positive (+1) class.
n_test_per_class = 250
negative_rows = np.where(y == -1)[0]
positive_rows = np.where(y == 1)[0]
test_negative = np.random.choice(negative_rows, n_test_per_class, replace=False)
test_positive = np.random.choice(positive_rows, n_test_per_class, replace=False)
test_index = np.append(test_negative, test_positive)

# Every row not chosen for testing is used for training. sorted() fixes the
# training rows in ascending order instead of relying on set iteration order,
# which Python does not specify.
train_index = sorted(set(range(len(labels))) - set(test_index))

train_data = data_matrix[train_index,]
train_labels = y[train_index]

test_data = data_matrix[test_index,]
test_labels = y[test_index]