In [1]:
import nltk
from nltk.corpus import stopwords
from nltk import ngrams
import pandas as pd
import numpy as np
import string
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import re

In [2]:
def openFile(path):
    #param path: path/to/file.ext (str)
    #Returns contents of file (str)
    with open(path) as file:
        data = file.read()
    return data
imdb_data = openFile('./dataset/imdb_labelled.txt')
amzn_data = openFile('./dataset/amazon_cells_labelled.txt')
yelp_data = openFile('./dataset/yelp_labelled.txt')

In [3]:
datasets = [imdb_data, amzn_data, yelp_data]
combined_dataset = []
# separate samples from each other
for dataset in datasets:
    combined_dataset.extend(dataset.split('\n'))

# separate each label from each sample
dataset = [sample.split('\t') for sample in combined_dataset]

In [4]:
df = pd.DataFrame(data=dataset, columns=['Reviews', 'Labels'])

# Remove any blank reviews
df = df[df["Labels"].notnull()]

# shuffle the dataset for later.
# Note this isn't necessary (the dataset is shuffled again before used), 
# but is good practice.
df = df.sample(frac=1)

In [5]:
df.head()

Unnamed: 0,Reviews,Labels
2951,"Maybe it's just their Vegetarian fare, but I'v...",0
1745,If you like a loud buzzing to override all you...,0
6,Wasted two hours.,0
2034,I found this place by accident and I could not...,1
1008,If you are Razr owner...you must have this!,1


In [6]:
positive_review = []
negative_review = []

In [7]:
#for every review if its lable is 1 than store in positive review else store in negative review
for i in range(len(df)):
    if(df.iloc[i][1] == '1'):
        positive_review.append(df.iloc[i][0])
    else:
        negative_review.append(df.iloc[i][0])

In [8]:
word_index_map = {} #dict
current_index = 0
positive_tokenized = [] #array
negative_tokenized = [] #array
orig_reviews = [] #array
stopword_english = stopwords.words('english')
important_words = ['above', 'below', 'off', 'over', 'under', 'more', 'most', 'such', 'no', 'nor', 'not', 'only', 'so', 'than', 'too', 'very', 'just', 'but']
ENGLISH_STOPWORDS = set(stopword_english) - set(important_words)
NEG_CONTRACTIONS = [
    (r'aren\'t', 'are not'),
    (r'can\'t', 'can not'),
    (r'couldn\'t', 'could not'),
    (r'daren\'t', 'dare not'),
    (r'didn\'t', 'did not'),
    (r'doesn\'t', 'does not'),
    (r'don\'t', 'do not'),
    (r'isn\'t', 'is not'),
    (r'hasn\'t', 'has not'),
    (r'haven\'t', 'have not'),
    (r'hadn\'t', 'had not'),
    (r'mayn\'t', 'may not'),
    (r'mightn\'t', 'might not'),
    (r'mustn\'t', 'must not'),
    (r'needn\'t', 'need not'),
    (r'oughtn\'t', 'ought not'),
    (r'shan\'t', 'shall not'),
    (r'shouldn\'t', 'should not'),
    (r'wasn\'t', 'was not'),
    (r'weren\'t', 'were not'),
    (r'won\'t', 'will not'),
    (r'wouldn\'t', 'would not'),
    (r'ain\'t', 'am not') # not only but stopword anyway
]
OTHER_CONTRACTIONS = {
    "'m": 'am',
    "'ll": 'will',
    "'s": 'has', # or 'is' but both are stopwords
    "'d": 'had'  # or 'would' but both are stopwords
}

In [9]:
def bag_of_words(words):    
    words_dictionary = [word for word in words]    
    return words_dictionary
 
# feature extractor function for ngrams (bigram)
def bag_of_ngrams(words, n=2):
    words_ng = []
    for item in iter(ngrams(words, n)):
        words_ng.append(item)
    words_dictionary = [word for word in words_ng]     
    return words_dictionary

In [10]:
def my_tokenizer(doc):
    all_features = []
    doc = doc.lower() # downcase
    # transform negative contractions (e.g don't --> do not)
    for t in NEG_CONTRACTIONS:
        doc = re.sub(t[0], t[1], doc)
    tokens = nltk.tokenize.word_tokenize(doc) # split string into words (tokens)
    # transform other contractions (e.g 'll --> will)
    tokens = [OTHER_CONTRACTIONS[token] if OTHER_CONTRACTIONS.get(token) 
                else token for token in tokens]
    # remove punctuation
#     r = r'[a-z]+'
    tokens = [word for word in tokens if word not in string.punctuation]
    
    words_clean = [t for t in tokens if t not in stopword_english] # remove all stopwords
    words_clean_for_bigrams = [t for t in tokens if t not in ENGLISH_STOPWORDS] # remove stopwords
    
    unigram_features = bag_of_words(words_clean)
    bigram_features = bag_of_ngrams(words_clean_for_bigrams)
    
    all_features = unigram_features.copy()
    for data in bigram_features:
        all_features.append(data)
    
    return words_clean_for_bigrams
    #return all_features

In [11]:
review_data = my_tokenizer("hello my name is Heriz Shreshta. i don't like this movie")
print(review_data)

['hello', 'name', 'heriz', 'shreshta', 'not', 'like', 'movie']


In [12]:
for review in positive_review:
    orig_reviews.append(review) #positive review add garyo
    tokens = my_tokenizer(review) #positive review tokenize garyo
    positive_tokenized.append(tokens) #positive token append garyo
    for token in tokens:
        if token not in word_index_map:
            word_index_map[token] = current_index #token word ko dict bhanayo
            current_index += 1

In [13]:
for review in negative_review:
    orig_reviews.append(review)
    tokens = my_tokenizer(review)
    negative_tokenized.append(tokens)
    for token in tokens:
        if token not in word_index_map:
            word_index_map[token] = current_index
            current_index += 1

In [14]:
print("len(word_index_map):", len(word_index_map))

len(word_index_map): 5252


In [15]:
def tokens_to_vector(tokens, label):
    x = np.zeros(len(word_index_map) + 1) # last element is for the label
    for t in tokens:
        i = word_index_map[t]
        x[i] += 1
    x = x / x.sum() # normalize it before setting label
    x[-1] = label
    return x

In [16]:
N = len(positive_tokenized) + len(negative_tokenized)
# (N x D+1 matrix - keeping them together for now so we can shuffle more easily later
data = np.zeros((N, len(word_index_map) + 1))

In [17]:
data.shape

(3000, 5253)

In [18]:
i = 0
for tokens in positive_tokenized:
    xy = tokens_to_vector(tokens, 1)
    data[i,:] = xy
    i += 1

In [19]:
for tokens in negative_tokenized:
    xy = tokens_to_vector(tokens, 0)
    data[i,:] = xy
    i += 1

In [20]:
from sklearn.utils import shuffle
orig_reviews, data = shuffle(orig_reviews, data)

In [21]:
X = data[:,:-1]
Y = data[:,-1]

In [22]:
Xtrain,xTest,Ytrain,yTest = train_test_split(
                            X, Y, test_size=0.2, random_state=42)

In [23]:
iteration = 100
alpha = 0.001
reg = 10

In [24]:
def sigmoid(z):
    """
    The sigmoid / logistic function.
    Args:
        z: any real number.
    Returns:
        A value between O and 1.
    """
    return 1.0 / (1.0 + np.exp(-1.0 * z))

In [25]:
def accuracy(x,y,weight):
    '''
    tyo presentation ma gareko hisab le gaar
    '''
    [n,m] = x.shape
    accuracyValue = 0
    for i in range(n):
        z = np.dot(x,weight)
        h = sigmoid(z)
    for i in range(len(h)):
        if(h[i]>= 0.5 and y[i] == 1):
            accuracyValue += 1
        elif(h[i] < 0.5 and y[i] == 0):
            accuracyValue += 1
        else:
            accuracyValue += 0
    return ((accuracyValue/len(y))*100)

In [52]:
def train(x,y,iteration = 600,lr=0.01,reg = 10,h = lambda a,b: sigmoid(np.dot(a,b))):
    """
    Compute w (Batch gradient descent).
    Args:
        w: weight vector (numpy.array)
        x: documents matrix (numpy.array)
        y: output vector (numpy.array)
        h: function of x and w
    Returns:
        The gradient vector (list of float values).
    """
    [n, m] = x.shape
    
    weight = np.zeros(x.shape[1])
    for i in range(iteration):
        for j in range(m):
            REG = reg * weight[j] / n
            weight[j] = weight[j] - lr * ((h(x[i], weight) - y[i]) * x[i,j] - REG)
    return weight

In [53]:
#h = lambda a,b: sigmoid(np.dot(a,b))
weight_data = train(Xtrain,Ytrain)

In [54]:
print(weight_data[0:10])

[-0.00060541  0.01096489  0.         -0.00064057 -0.0786936   0.
  0.          0.         -0.00522667 -0.00050351]


In [55]:
result = accuracy(xTest,yTest,weight_data)
print(result)

76.66666666666667


In [67]:
#store the result in dictionary
i = 0
for key in word_index_map:
    word_index_map[key] = weight_data[i]
    i += 1

In [80]:
word_index_map['long']

-0.0006434160875956348

In [74]:
reviewSen = "it was a very good movie"
reviewSen = my_tokenizer(reviewSen)
value = 0
for data in reviewSen:
    try:
        value += word_index_map[data]
    except:
        value += 0

result = sigmoid(value)
print(result * 5)

2.522399915294322


NameError: name 'word_index_map' is not defined