In [56]:
import multiprocessing
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import learning_curve
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import pickle
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_recall_curve, average_precision_score

def get_data_frame():
    dataset = pd.read_csv('Sentiment_Analysis_Dataset/labelled.csv')

    # get positive class
    class_positive = dataset[dataset['Label'] == '__label__2']

    # get negative class
    class_negative = dataset[dataset['Label'] == '__label__1']

    # under sample positive class randomly with the size of negative class
    positive_under = class_positive.sample(class_negative.shape[0],replace=True)

    # concatenate negative and new under sampled positive class
    df_test_under = pd.concat([positive_under, class_negative], axis=0)

    return df_test_under

def split_train_test(data):
    
    # creating the feature matrix
    matrix = CountVectorizer(max_features=1000, stop_words="english")
    X = matrix.fit_transform(data.iloc[:, -1].astype('U')).toarray()
    y_tmp = data.iloc[:, 0]
    
    y = []
    for idx, val in y_tmp.iteritems():
        if val == '__label__2':
            y.append("positive")
        elif val == '__label__1':
            y.append("negative")

    # split train and test data
    return train_test_split(X, y,test_size=0.15)


def y_to_float(y):
    y_float = []
    #print(y)
    for val in y:
        if val == "positive":
            y_float.append(1)
        elif val == "negative":
            y_float.append(0)
    #print(y_float)
    return y_float


def run_gaussianNB(X_train, X_test, y_train, y_test):
    y_train_f = y_to_float(y_train)
    #print(y_train_f)
    y_test_f = y_to_float(y_test)

    # Naive Bayes
    classifier = GaussianNB()
    classifier.fit(X_train, y_train_f)

    # predict class
    y_pred = classifier.predict(X_test)

    # Confusion matrix
    print("GAUSSIAN NAIVE BAYES")
    print(confusion_matrix(y_test_f, y_pred))
    print(classification_report(y_test_f, y_pred))
    print(accuracy_score(y_test_f, y_pred))


def run_multinomialNB(X_train, X_test, y_train, y_test):
    y_train_f = y_to_float(y_train)
    y_test_f = y_to_float(y_test)
    tf_transformer = TfidfTransformer().fit_transform(X_train)

    classifier = MultinomialNB()
    classifier.fit(tf_transformer, y_train_f)
    
    y_pred = classifier.predict(X_test)

    # Confusion matrix
    print("MULTINOMIAL NAIVE BAYES")
    print(confusion_matrix(y_test_f, y_pred))
    print(classification_report(y_test_f, y_pred))
    print(accuracy_score(y_test_f, y_pred))

def run_bernoulliNB(X_train, X_test, y_train, y_test):
    y_train_f = y_to_float(y_train)
    y_test_f = y_to_float(y_test)

    classifier = BernoulliNB()
    classifier.fit(X_train, y_train_f)

    #classifier.save('my_model') 

    y_pred = classifier.predict(X_test)

    # Confusion matrix
    print("BERNOULLI NAIVE BAYES")
    print(confusion_matrix(y_test_f, y_pred))
    print(classification_report(y_test_f, y_pred))
    print(accuracy_score(y_test_f, y_pred))


if __name__ == "__main__":
    data = get_data_frame()
    X_train, X_test, y_train, y_test = split_train_test(data)

    #run_gaussianNB(X_train, X_test, y_train, y_test)
    run_multinomialNB(X_train, X_test, y_train, y_test)
    #run_bernoulliNB(X_train, X_test, y_train, y_test)    
    

MULTINOMIAL NAIVE BAYES
[[73 20]
 [ 8 61]]
              precision    recall  f1-score   support

           0       0.90      0.78      0.84        93
           1       0.75      0.88      0.81        69

    accuracy                           0.83       162
   macro avg       0.83      0.83      0.83       162
weighted avg       0.84      0.83      0.83       162

0.8271604938271605


In [58]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.metrics import accuracy_score


model = Pipeline([('vect', TfidfVectorizer()), 
                      ('clf', MultinomialNB()) ])
#model = Pipeline([ ('vect', CountVectorizer(stop_words='english')),
#                  ('tfidf', TfidfTransformer()),
#                  ('clf', MultinomialNB()) ])
#print(model)

df = pd.read_csv('Sentiment_Analysis_Dataset/labelled.csv')
df.head()

X=df.drop(['Label'],axis=1)
y=df['Label']
#print(X.head())
#print(y.head())

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.15)

vt = Pipeline([('vect', TfidfVectorizer()), 
                      ('clf', MultinomialNB()) ])
print(vt)
vt.fit(X_train.transpose().apply(lambda x: ' '.join(x)),y_train)

prediction = vt.predict(X_test.transpose().apply(lambda x: ' '.join(x)))

print(accuracy_score(y_test, prediction))
print(classification_report(y_test, prediction))
print(confusion_matrix(y_test, prediction))

predict=vt.predict(['The Phone was of good quality but its has a large amount of issues within it.','not so good and nor bad','Very poor phone','Lithium batteries are something new introduced in the market there average developing cost is relatively high but Stallion doesn\'t compromise on quality and provides us with the best at a low cost.<br />There are so many in built technical assistants that act like a sensor in their particular fortÃ©. The battery keeps my phone charged up and it works at every voltage and a high voltage is never risked'])
print(predict)

Pipeline(memory=None,
         steps=[('vect',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)
0.8263473053892215
              precision   

  'precision', 'predicted', average, warn_for)
