In [2]:
import pandas as pd

import numpy as np

from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report

from sklearn.metrics import accuracy_score

import math

import nltk

from nltk.tokenize import word_tokenize 

from sklearn.feature_extraction.text import CountVectorizer

from collections import defaultdict

data = pd.read_csv('test.csv', index_col=[0])
data

Unnamed: 0,comments,sentiments,clean_comments
0,Thanks a lot for the tutorial! It helps a lot!...,1,"['thanks', 'lot', 'tutorial', 'lot', 'definite..."
1,this is extremely helpful mate! keep making su...,1,"['extremely', 'helpful', 'mate', 'keep', 'make..."
2,can't we just use pytube to print the title?,0,"['cant', 'use', 'print', 'title']"
3,"Hi, is it legal to scrap youtube comments ? Th...",0,"['hi', 'legal', 'scrap', 'thanks']"
4,Is the code of this Video available? thanks,1,"['code', 'video', 'available', 'thanks']"
5,It is sayinf that >= does not work with int an...,0,"['work', 'non', 'type', 'help']"
6,it works Thanks but it is very slow. Can you d...,1,"['work', 'thanks', 'slow', 'via']"
7,Can we scrape emails from these comments,0,['scrape']
8,Where you run this code,0,"['run', 'code']"
9,i am so happy after watching this video. Than...,1,"['happy', 'watch', 'video', 'much', 'content']"


In [12]:
reviews = data['clean_comments'].values
labels = data['sentiments'].values
encoder = LabelEncoder()
encoded_labels = encoder.fit_transform(labels)
encoder

LabelEncoder()

In [4]:
train_sentences, test_sentences, train_labels, test_labels = train_test_split(reviews, encoded_labels, stratify = encoded_labels)


In [5]:
vec = CountVectorizer(max_features = 3000)
X = vec.fit_transform(train_sentences)
vocab = vec.get_feature_names()
X = X.toarray()
word_counts = {}
for l in range(2):
    word_counts[l] = defaultdict(lambda: 0)
for i in range(X.shape[0]):
    l = train_labels[i]
    for j in range(len(vocab)):
        word_counts[l][vocab[j]] += X[i][j]




In [6]:
def laplace_smoothing(n_label_items, vocab, word_counts, word, text_label):
    a = word_counts[text_label][word] + 1
    b = n_label_items[text_label] + len(vocab)
    return math.log(a/b)

In [7]:
def group_by_label(x, y, labels):
    data = {}
    for l in labels:
        data[l] = x[np.where(y == l)]
    return data

In [8]:
def fit(x, y, labels):
    n_label_items = {}
    log_label_priors = {}
    n = len(x)
    grouped_data = group_by_label(x, y, labels)
    for l, data in grouped_data.items():
        n_label_items[l] = len(data)
        log_label_priors[l] = math.log(n_label_items[l] / n)
    return n_label_items, log_label_priors

In [9]:
def predict(n_label_items, vocab, word_counts, log_label_priors, labels, x):
    result = []
    for text in x:
        label_scores = {l: log_label_priors[l] for l in labels}
        words = set(text)
        for word in words:
            if word not in vocab: continue
            for l in labels:
                log_w_given_l = laplace_smoothing(n_label_items, vocab, word_counts, word, l)
                label_scores[l] += log_w_given_l
        result.append(max(label_scores, key=label_scores.get))
    return result

In [10]:
labels = [0,1]
n_label_items, log_label_priors = fit(train_sentences,train_labels,labels)
pred = predict(n_label_items, vocab, word_counts, log_label_priors, labels, test_sentences)
print("Accuracy of prediction on test set : ", accuracy_score(test_labels,pred))

Accuracy of prediction on test set :  0.4


In [11]:
pred

[0, 0, 0, 0, 0]