In [9]:
# Import the data set
from keras.datasets import imdb
# split the data set into training and testing target-data
(training_data, training_targets), (testing_data, testing_targets) = imdb.load_data()

In [10]:
# get all the words already tokenized(remember that the words are unique, each having an ID) -> returns a dictionary
index = imdb.get_word_index()
# reverse key-value pair in dictionary
reverse_index = dict([(value, key) for (key, value) in index.items()])

## N-Grams

In [3]:
# Get top 20 unigrams in the data set
# Steps:
# 1 decode all the sentences
# 2 tokenize
# 3 count how many times each word recurs
# 4 order the result and print the words
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\david\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [11]:
import numpy as np
# STEP 1: decode all the sentences
# extract all the sentences from training data and test data decoded
data = np.concatenate((training_data, testing_data), axis=0)
targets = np.concatenate((training_targets, testing_targets), axis=0)

decoded_sentences = []
# extract from training data
for x in range(len(data)):
    decoded_sentences.append(" ".join([reverse_index.get(i - 3, "") for i in data[x]]))


In [13]:
import re
documents = []

from nltk.stem import WordNetLemmatizer

stemmer = WordNetLemmatizer()

for sen in range(0, len(decoded_sentences)):
    # Remove all the special characters
    document = re.sub(r'\W', ' ', str(decoded_sentences[sen]))
    
    # remove all single characters
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
    
    # Remove single characters from the start
    document = re.sub(r'\^[a-zA-Z]\s+', ' ', document) 
    
    # Substituting multiple spaces with single space
    document = re.sub(r'\s+', ' ', document, flags=re.I)
    
    # Removing prefixed 'b'
    document = re.sub(r'^b\s+', '', document)
    
    # Converting to Lowercase
    document = document.lower()
    
    # Lemmatization
    document = document.split()

    document = [stemmer.lemmatize(word) for word in document]
    document = ' '.join(document)
    
    documents.append(document)

In [14]:
decoded_sentencessentencessentences_sentences = documents

In [15]:
# STEP 2: Tokenize
from nltk import word_tokenize
from nltk.util import ngrams

# The function will split everything in Unigrams
def extract_unigram(sentences): 

  tokens = []
  for sentence in sentences:
    tok = word_tokenize(sentence)
    for t in tok:
      tokens.append(t)
  return tokens



# The function will split everything in Bigrams
def extract_bigrams(sentences): 

  all_bigrams = []
  for sentence in sentences:
    token = word_tokenize(sentence)
    bigrams = ngrams(token,2)
    for b in bigrams:
      all_bigrams.append(b)
  return all_bigrams

In [16]:
# Extract top unigrams
extracted_unigrams = extract_unigram(decoded_sentences);

# Get most frequent unigrams into a dictionary
frequent_unigrams = dict()
for x in extracted_unigrams:
    if x in frequent_unigrams:
            frequent_unigrams[x] += 1
    else:
        frequent_unigrams[x] = 1


In [17]:

reversed_frequent_unigrams = dict([(value, key) for (key, value) in frequent_unigrams.items()])
reversed_frequent_unigrams_top_values = sorted(reversed_frequent_unigrams.keys(), reverse=True)[:]
#print(reversed_frequent_unigrams_top_values)

# print unigrams top values
for x in reversed_frequent_unigrams_top_values:
    print(reversed_frequent_unigrams[x])

the
and
a
of
to
is
br
it
in
i
this
that
's
was
as
movie
for
with
but
film
you
on
n't
not
are
he
his
have
one
be
all
at
they
by
an
who
so
from
like
there
or
just
do
her
out
about
if
has
what
some
good
when
more
very
she
would
up
no
time
my
even
can
which
only
story
really
see
their
had
were
me
did
well
we
does
than
much
could
bad
get
been
other
people
great
will
also
into
because
how
him
first
most
'
made
its
them
then
make
way
too
movies
any
after
characters
think
watch
character
films
two
many
seen
being
never
plot
life
acting
where
show
best
know
little
over
off
ever
man
your
better
end
here
scene
still
say
these
scenes
why
should
while
something
've
such
go
through
back
those
'm
real
now
thing
watching
actors
director
years
though
old
10
another
work
before
actually
nothing
makes
look
find
going
same
lot
new
every
few
again
part
world
're
down
cast
us
things
want
quite
pretty
got
horror
around
ca
seems
young
take
however
thought
big
fact
enough
long
both
give
may
own
between
series


In [18]:
# Extract top bigrams
extracted_bigrams = extract_bigrams(decoded_sentences);

# Get most frequent bigrams into a dictionary
frequent_bigrams = dict()
for x in extracted_bigrams:
    if x in frequent_bigrams:
            frequent_bigrams[x] += 1
    else:
        frequent_bigrams[x] = 1
        
    
reversed_frequent_bigrams = dict([(value, key) for (key, value) in frequent_bigrams.items()])
reversed_frequent_bigrams_top_values = sorted(reversed_frequent_bigrams.keys(), reverse=True)[:20]
# print(reversed_frequent_bigrams_top_values)

# print unigrams top values
for x in reversed_frequent_bigrams_top_values:
    print(reversed_frequent_bigrams[x])

('br', 'br')
('of', 'the')
('in', 'the')
('it', "'s")
('this', 'movie')
('the', 'film')
('and', 'the')
('is', 'a')
('the', 'movie')
('to', 'the')
('to', 'be')
('this', 'film')
('it', 'is')
('this', 'is')
('it', 'was')
('on', 'the')
('in', 'a')
('do', "n't")
('one', 'of')
('for', 'the')


## BOW

In [None]:
# Extracting BOW
import pandas as pd
from sklearn.tree import DecisionTreeClassifier 
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
#CountVectorizer implements both tokenization and occurrence counting in a single class.

In [None]:

sentences_as_bag_of_words = vectorizer.fit_transform(decoded_sentences)


In [None]:
bow_vectors = sentences_as_bag_of_words.toarray()

In [None]:

labels = pd.DataFrame(targets)
dataset_frame = pd.DataFrame(data=bow_vectors)
dataset_frame['Class'] = labels

In [None]:
# select lessa data
selected_number = 2001
dataset_reduced = dataset_frame[:selected_number]

In [None]:
# decision tree
import pandas as pd
from sklearn.tree import DecisionTreeClassifier 

#Split the data into a training and a testing set
bound_param = 1930
train_features = dataset_reduced.iloc[:bound_param,:-1] 
test_features = dataset_reduced.iloc[bound_param:,:-1] 
train_labels = dataset_reduced.iloc[:bound_param,-1] 
test_labels = dataset_reduced.iloc[bound_param:,-1]

tree = DecisionTreeClassifier(criterion = 'entropy').fit(train_features,train_labels)#The accuracy is then calculated through the score function offered by sklearn
print("The prediction accuracy is: ",tree.score(test_features,test_labels)*100,"%")


In [None]:
# convert test labels to array to iterate through
labels_dataFrame = pd.DataFrame(test_labels.values)
labels_dataFrame = labels_dataFrame.to_numpy()

# make predictions
prediction = tree.predict(test_features)
prediction

correct_count = 0
wrong_count = 0
for count,x in enumerate(prediction):
    if labels_dataFrame[count] == x:
        print(f'Correct, {x}')
        correct_count+=1
    else:
        print(f'Wrong, {x}')
        wrong_count+=1
        

#print(f'Correctly predicted: {correct_count} \nWrongly predicted: {wrong_count}')

In [None]:
import io
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus
dot_data = io.StringIO()
export_graphviz(tree, out_file=dot_data,  
                filled=True, rounded=True,
                special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
Image(graph.create_png())

In [None]:
# Evaluation
# f1 score best value at 1 and worse at 0
from sklearn.metrics import f1_score
y_true = labels_dataFrame
y_pred = prediction
f1_score(y_true, y_pred, average='micro')
