In [56]:
import re
import pandas as pd
import numpy as np
import math

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from collections import defaultdict

import nltk
from nltk.corpus import stopwords
from nltk.corpus import gutenberg
from nltk.corpus import webtext
from nltk.corpus import udhr

### Reading data from csv

In [58]:
books = nltk.corpus.gutenberg
texts = nltk.corpus.webtext
spanish_cess = nltk.corpus.cess_esp
nltk.corpus.udhr.fileids()
nltk.corpus.udhr.words('Javanese-Latin1')


#data = IO CREATE A DATASET THAT ASSOCIATE TO THIS CORPUS A LABEL "english - 1" AND "nonenglish - 0"

['PRANYATAN', 'UMUM', 'NGENANI', 'HAK', '-', 'HAK', ...]

In [27]:
data = pd.read_csv('IMDB Dataset.csv')

Cose da fare:
1. Fare in modo che si possa dare un input oltre a quello già presente, fatto come si vuole
2. Modificare il codice piano piano
3. Timeline:
    - da consegnare entro il 30 nov.
    - prima implementazione entro il 20

### Removing Tags
Since the dataset presents both html tags, urls and so on

In [28]:
def remove_tags(string):
    removelist = ""
    result = re.sub('','',string)          #remove HTML tags
    result = re.sub('https://.*','',result)   #remove URLs
    result = re.sub(r'[^w'+removelist+']', ' ',result)    #remove non-alphanumeric characters 
    result = result.lower()
    return result

In [41]:
data['review']=data['review'].apply(lambda cw : remove_tags(cw)) 
stop_words = set(stopwords.words('english'))
data['review'] = data['review'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

## Creates tokenizer, lematizer and lematize function

In [30]:
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

In [31]:
def lemmatize_text(text):
    st = ""
    for w in w_tokenizer.tokenize(text):
        st = st + lemmatizer.lemmatize(w) + " "
    return st

In [32]:
data['review'] = data.review.apply(lemmatize_text)

## Splitting dataset from its labels
LabelEncoder() from sklearn.preprocessing is used to convert the labels (‘positive’, ‘negative’) into 1’s and 0’s respectively.

In [33]:
reviews = data['review'].values
labels = data['sentiment'].values
encoder = LabelEncoder()
encoded_labels = encoder.fit_transform(labels)

## Creating training and testing set

In [34]:
train_sentences, test_sentences, train_labels, test_labels = train_test_split(reviews, encoded_labels, stratify = encoded_labels)

## Building the classifier
We are going to be building our own classifier from scratch using the formulas described earlier. We start by using the CountVectorizer from sklearn.feature_extraction.text to get the frequency of each word appearing in the training set. We store them in a dictionary called ‘word_counts’. All the unique words in the corpus are stored in ‘vocab’.


In [35]:
vec = CountVectorizer(max_features = 3000)
X = vec.fit_transform(train_sentences)
vocab = vec.get_feature_names()
X = X.toarray()
word_counts = {}
for l in range(2):
    word_counts[l] = defaultdict(lambda: 0)
for i in range(X.shape[0]):
    l = train_labels[i]
    for j in range(len(vocab)):
        word_counts[l][vocab[j]] += X[i][j]



## Laplace Smoothing

In [36]:
def laplace_smoothing(n_label_items, vocab, word_counts, word, text_label):
    a = word_counts[text_label][word] + 1
    b = n_label_items[text_label] + len(vocab)
    return math.log(a/b)

## Defining fit and predict functions

In [37]:
def group_by_label(x, y, labels):
    data = {}
    for l in labels:
        data[l] = x[np.where(y == l)]
    return data

In [38]:
def fit(x, y, labels):
    n_label_items = {}
    log_label_priors = {}
    n = len(x)
    grouped_data = group_by_label(x, y, labels)
    for l, data in grouped_data.items():
        n_label_items[l] = len(data)
        log_label_priors[l] = math.log(n_label_items[l] / n)
    return n_label_items, log_label_priors

In [39]:
def predict(n_label_items, vocab, word_counts, log_label_priors, labels, x):
    result = []
    for text in x:
        label_scores = {l: log_label_priors[l] for l in labels}
        words = set(w_tokenizer.tokenize(text))
        for word in words:
            if word not in vocab: continue
            for l in labels:
                log_w_given_l = laplace_smoothing(n_label_items, vocab, word_counts, word, l)
                label_scores[l] += log_w_given_l
        result.append(max(label_scores, key=label_scores.get))
    return result

## Fitting the Model on Training Set and Evaluating Accuracies on the Test Set

In [40]:
labels = [0,1]
n_label_items, log_label_priors = fit(train_sentences,train_labels,labels)
pred = predict(n_label_items, vocab, word_counts, log_label_priors, labels, test_sentences)
print("Accuracy of prediction on test set : ", accuracy_score(test_labels,pred))

Accuracy of prediction on test set :  0.49992
