In [2]:
import sys
import pandas as pd
import nltk
nltk.download('punkt') 
from convokit import Corpus, download
corpus = Corpus(filename=download("reddit-corpus-small"))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Daniel\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Dataset already exists at C:\Users\Daniel\.convokit\downloads\reddit-corpus-small


In [3]:
from random import randint
import nltk
from pprint import pprint
from nltk.sentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

In [4]:
from nltk.corpus import twitter_samples
from random import shuffle
import numpy as np
import re
from statistics import mean

global counter
counter = 0


twitter_samples.fileids()
unwanted = nltk.corpus.stopwords.words("english")
unwanted.extend([w.lower() for w in nltk.corpus.names.words()])

def is_wanted(word_tag_tuple: tuple) -> bool:    
    word, tag = word_tag_tuple
    if not word.isalpha() or word in unwanted:
        return False
    if tag == "PRP$" or tag == "IN" or tag == "PRP" or tag == "MD" or tag == "TO" or len(tag) == 1:
        return False
    return True


def check_positive(tweet: str) -> bool:
    score = sia.polarity_scores(tweet)["compound"]
    if score > 0: return True
    return False


def is_correct(tweet: str, positive_check: bool, positive_tweets: list) -> bool:
    global counter
    if tweet in positive_tweets:
        if positive_check:
            counter += 1
            return True
        else:
            return False
    elif not positive_check:
        counter += 1
        return True
    return False


def find_expressions(tweet: str, sad_expression: list, happy_expression: list):
    exp_count = 0
    for emote in sad_expression:
        if emote in tweet:
            exp_count -= 1
    for emote in happy_expression:
        if emote in tweet:
            exp_count += 1
    return exp_count



def remove_tweet_unwanted(tweet: str) -> str:
    updated_tweet = re.sub("@\w+", "", tweet)
    updated_tweet = re.sub("http\S+", "", updated_tweet)
    updated_tweet = re.sub(" .{1} ", " ", updated_tweet)
    return updated_tweet

def extract_features(text):
    features = dict()
    pos_wordcount = 0
    compound_scores = list()
    positive_scores = list()
    exp_count = 0
    sad_expressions = [":(", ":'(", ":-(", ":'-(", "=("]
    happy_expressions = [":)", ":-)", ":D", "=)", ":]", ":>", ":^)"]

    for sentence in nltk.sent_tokenize(text):
        exp_count += find_expressions(sentence, sad_expressions, happy_expressions)
        for word in nltk.word_tokenize(sentence):
            if word.lower() in top_100_positive:
                pos_wordcount += 1
            elif word.lower() in top_100_negative:
                pos_wordcount -= 1
        compound_scores.append(sia.polarity_scores(sentence)["compound"])
        positive_scores.append(sia.polarity_scores(sentence)["pos"])

    # Adding 1 to the final compound score to always have positive numbers
    # since some classifiers you'll use later don't work with negative numbers.
    features["mean_compound"] = mean(compound_scores)
    features["mean_positive"] = mean(positive_scores)
    features["pos_wordcount"] = pos_wordcount
    # features["emote_score"] = exp_count

    return features

neg_tweets = twitter_samples.strings('negative_tweets.json')
# neg_tweets_exp_count = [find_expressions(tweet, sad_expressions) for tweet in neg_tweets]
# neg_tweets = [remove_tweet_unwanted(tweet) for tweet in neg_tweets]

pos_tweets = twitter_samples.strings('positive_tweets.json')
# pos_tweets_exp_count = [find_expressions(tweet, happy_expressions) for tweet in pos_tweets]
# pos_tweets = [remove_tweet_unwanted(tweet) for tweet in pos_tweets]

all_tweets = neg_tweets + pos_tweets
neg_tweet_words = []
pos_tweet_words = []
for i, tweet in enumerate(neg_tweets):
    tweet = remove_tweet_unwanted(tweet)
    tweet_words_tags = nltk.pos_tag(nltk.word_tokenize(tweet))
    neg_tweet_words += [word for word, tag in filter(is_wanted, tweet_words_tags)]
    pass

for i, tweet in enumerate(pos_tweets):
    tweet = remove_tweet_unwanted(tweet)
    tweet_words_tags = nltk.pos_tag(nltk.word_tokenize(tweet))
    pos_tweet_words += [word for word, tag in filter(is_wanted, tweet_words_tags)]
    pass

positive_fd = nltk.FreqDist(pos_tweet_words)
negative_fd = nltk.FreqDist(neg_tweet_words)

common_set = set(positive_fd).intersection(negative_fd)

for word in common_set:
    del positive_fd[word]
    del negative_fd[word]

top_100_positive = {word for word, count in positive_fd.most_common(100)}
top_100_negative = {word for word, count in negative_fd.most_common(100)}

shuffle(all_tweets)

# for i in range(0,len(tweet_samp)): is_correct(tweet_samp[i],check_positive(tweet_samp[i]),pos_tweets)
# print(f"Accuracy is: {counter/len(tweet_samp)}%")

features = [
    (extract_features(tweet),"pos")
    for tweet in pos_tweets
]
features.extend([
    (extract_features(tweet),"neg")
    for tweet in neg_tweets
])
pass

train_count = int(len(features) * (5/6))
shuffle(features)
classifier = nltk.NaiveBayesClassifier.train(features[:train_count])
classifier.show_most_informative_features(10)
print(nltk.classify.accuracy(classifier, features[train_count:]))

# for i in range(0,len(tweet_samp)): is_correct(tweet_samp[i],check_positive(tweet_samp[i]),pos_tweets)
# print(f"Accuracy is: {counter/len(tweet_samp)}%")


Most Informative Features
           mean_compound = -0.4404           neg : pos    =    426.4 : 1.0
           mean_compound = 0.4588            pos : neg    =    135.6 : 1.0
           mean_compound = 0.2294            pos : neg    =    131.6 : 1.0
           mean_positive = 0.5               pos : neg    =     74.2 : 1.0
           mean_positive = 0.375             pos : neg    =     49.2 : 1.0
           mean_compound = 0.3412            neg : pos    =     46.1 : 1.0
           pos_wordcount = 1                 pos : neg    =     43.0 : 1.0
           mean_positive = 0.6               pos : neg    =     39.9 : 1.0
           mean_positive = 0.1255            pos : neg    =     33.2 : 1.0
           mean_compound = 0.7096            pos : neg    =     27.7 : 1.0
0.8452309538092382


In [5]:
# movie_reviews = input()
# print(classifier.classify(extract_features(movie_reviews)))
# # print(classifier.classify({"mean_compound": extract_features(movie_reviews)["mean_compound"]}))
# extract_features(movie_reviews)


In [7]:
import pickle
f = open('classifier.pickle', 'wb')
pickle.dump(classifier, f)
f.close()