In [15]:
import pandas as pd
import re
import pickle
import nltk
from nltk.corpus import stopwords
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy
from sklearn.model_selection import train_test_split

In [24]:
# Load the Reddit comment dataset and preview its first rows.
data_path = "Reddit_Data.csv"
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,clean_comment,category
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1


In [25]:
#preprocess the texts
# The English stopword list is a separate NLTK corpus download. Fetch it on
# demand so this cell also works on a machine where it is not installed yet.
try:
    stopword = set(stopwords.words("english"))
except LookupError:
    nltk.download("stopwords", quiet=True)
    stopword = set(stopwords.words("english"))

def process_text(text):
    """Turn a raw comment into a list of lowercase, stopword-free tokens.

    Args:
        text: The comment to tokenize. Non-string values are converted with
            ``str()``; a missing value (``None`` or a float NaN, which is how
            pandas represents an empty cell) yields no tokens.

    Returns:
        list[str]: Whitespace-separated tokens, in original order, containing
        only the letters a-z and excluding anything in ``stopword``.
    """
    # Without this guard str() would turn a missing comment into the literal
    # word "none"/"nan", which would then be learned as a real feature.
    # `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return []
    text = str(text).lower()
    # Drop every character that is not a lowercase letter or whitespace
    # (digits, punctuation, apostrophes, non-ASCII letters).
    text = re.sub(r'[^a-z\s]', '', text)
    return [word for word in text.split() if word not in stopword]
    

In [26]:
# Tokenize every comment into a new column; the original text column is kept.
df["Cleaned_Comment"] = df["clean_comment"].map(process_text)
df.head()

Unnamed: 0,clean_comment,category,Cleaned_Comment
0,family mormon have never tried explain them t...,1,"[family, mormon, never, tried, explain, still,..."
1,buddhism has very much lot compatible with chr...,1,"[buddhism, much, lot, compatible, christianity..."
2,seriously don say thing first all they won get...,-1,"[seriously, say, thing, first, get, complex, e..."
3,what you have learned yours and only yours wha...,0,"[learned, want, teach, different, focus, goal,..."
4,for your own benefit you may want read living ...,1,"[benefit, may, want, read, living, buddha, liv..."


In [40]:
def word_features(words):
    """Encode tokens as a presence-only feature dict.

    Each distinct token in ``words`` becomes a key mapped to ``True``;
    repeated tokens collapse into a single key.
    """
    return dict.fromkeys(words, True)

# Pair each comment's feature dict with its label, producing the
# (featureset, label) tuples that are later split and used for training.
features = [(word_features(tokens), label) for tokens, label in zip(df["Cleaned_Comment"],df["category"])]
# Preview only the first two examples: echoing the whole list prints one dict
# per comment and floods the notebook output.
features[:2]

[({'family': True,
   'mormon': True,
   'never': True,
   'tried': True,
   'explain': True,
   'still': True,
   'stare': True,
   'puzzled': True,
   'time': True,
   'like': True,
   'kind': True,
   'strange': True,
   'creature': True,
   'nonetheless': True,
   'come': True,
   'admire': True,
   'patience': True,
   'calmness': True,
   'equanimity': True,
   'acceptance': True,
   'compassion': True,
   'developed': True,
   'things': True,
   'buddhism': True,
   'teaches': True},
  1),
 ({'buddhism': True,
   'much': True,
   'lot': True,
   'compatible': True,
   'christianity': True,
   'especially': True,
   'considering': True,
   'sin': True,
   'suffering': True,
   'almost': True,
   'thing': True,
   'caused': True,
   'wanting': True,
   'things': True,
   'want': True,
   'going': True,
   'getting': True,
   'wrong': True,
   'way': True,
   'christian': True,
   'would': True,
   'mean': True,
   'coincide': True,
   'god': True,
   'without': True,
   'aid': Tru

In [42]:
# Hold out 20% of the examples for evaluation. The split is seeded so it is
# reproducible and stratified on the label column.
train_data, test_data = train_test_split(
    features,
    test_size=0.2,
    random_state=42,
    stratify=df["category"],
)
model = NaiveBayesClassifier.train(train_data)

In [44]:
# Score the trained classifier on the held-out examples.
test_accuracy = accuracy(model, test_data)
print(f"accuracy: {test_accuracy:.2f}")

accuracy: 0.52


In [51]:
# List the features the classifier found most discriminative between labels.
top_n = 20
model.show_most_informative_features(top_n)

Most Informative Features
                    good = True                1 : 0      =    358.5 : 1.0
                   first = True                1 : 0      =    357.8 : 1.0
                    sure = True                1 : 0      =    239.4 : 1.0
                  really = True                1 : 0      =    189.9 : 1.0
                     far = True                1 : 0      =    177.9 : 1.0
                    kind = True                1 : 0      =    175.2 : 1.0
                     top = True                1 : 0      =    156.3 : 1.0
                     old = True                1 : 0      =    147.5 : 1.0
               seriously = True               -1 : 0      =    140.2 : 1.0
                    able = True                1 : 0      =    137.5 : 1.0
                    past = True               -1 : 0      =    137.1 : 1.0
                    hate = True               -1 : 0      =    134.5 : 1.0
                    poor = True               -1 : 0      =    132.8 : 1.0

In [54]:
# Save the trained classifier to disk, then read it straight back so the
# prediction cells use the persisted copy.
model_path = 'naive_bayes.pk'

with open(model_path, 'wb') as out_file:
    pickle.dump(model, out_file)

with open(model_path, 'rb') as in_file:
    loaded_model = pickle.load(in_file)

In [101]:
# Hand-written sample comments used to spot-check the reloaded classifier.
new_comments = [
    "I really admire them",
    "That was terrible and frustrating",
    "I like his art",
    "Insert bad comment here"
]

In [102]:
#predict the new comments

def predict_sentiment(comment):
    """Return the label the reloaded classifier assigns to a raw comment.

    The comment is tokenized with ``process_text`` and encoded with
    ``word_features``, the same steps used to build the training examples.
    """
    return loaded_model.classify(word_features(process_text(comment)))

# Print each sample comment next to its predicted label.
for comment, sentiment in zip(new_comments, map(predict_sentiment, new_comments)):
    print(f"Comment: '{comment}' -> predicted Sentiment: {sentiment}")

Comment: 'I really admire them' -> predicted Sentiment: 1
Comment: 'That was terrible and frustrating' -> predicted Sentiment: -1
Comment: 'I like his art' -> predicted Sentiment: 1
Comment: 'Insert bad comment here' -> predicted Sentiment: -1
