# Authentication File

In [1]:
class Authentication:
    """Twitter API credentials read from ``resources/MyCredentials.txt``.

    The file must hold one value per line, in this order: consumer key,
    consumer secret, access token, access token secret.  They are exposed as
    the attributes ``ckey``, ``csecret``, ``atoken`` and ``asecret``.

    Raises:
        OSError: if the credentials file cannot be opened.
        IndexError: if the file contains fewer than four lines.
    """

    def __init__(self):
        # The context manager closes the handle even if reading fails.
        with open("resources/MyCredentials.txt", "r", encoding='utf-8', errors='replace') as cred_file:
            # Strip surrounding whitespace so a stray space or carriage return
            # does not become part of a key and silently break authentication.
            cred = [line.strip() for line in cred_file.read().split('\n')]

        if len(cred) < 4:
            # Same exception type the bare indexing used to raise, but with a
            # message that says what is actually wrong.
            raise IndexError(
                "resources/MyCredentials.txt must contain 4 lines "
                "(consumer key, consumer secret, access token, access secret); "
                "found %d" % len(cred))

        self.ckey = cred[0]
        self.csecret = cred[1]
        self.atoken = cred[2]
        self.asecret = cred[3]

# Get the Data Source, and Prepare Training and Test set

In [2]:
from nltk.tokenize import word_tokenize
from nltk import pos_tag, FreqDist
import random
import pickle


def DataSources():
    """Read the raw positive and negative corpora from disk.

    Returns:
        tuple[str, str]: the full text of ``resources/positive.txt`` and of
        ``resources/negative.txt``, in that order.  Undecodable bytes are
        replaced rather than raising (``errors='replace'``).
    """
    def _read_text(path):
        # ``with`` closes the handle deterministically instead of leaking it.
        with open(path, "r", encoding='utf-8', errors='replace') as handle:
            return handle.read()

    positiveData = _read_text("resources/positive.txt")
    negativeData = _read_text("resources/negative.txt")
    return positiveData, negativeData


def PrepareData():
    """Build labelled feature sets from the positive and negative corpora.

    Steps:
      1. Every non-blank line of the two corpora becomes a document labelled
         ``"pos"`` or ``"neg"``.
      2. Tokens whose part-of-speech tag starts with an allowed letter are
         lower-cased and counted.
      3. The 5000 most frequent of those words form the feature vocabulary.
      4. The documents are pickled to ``saved/documents.p`` and the vocabulary
         to ``saved/word_features5k.p`` so later runs can skip this step.

    Returns:
        list[tuple[dict, str]]: one ``(features, label)`` pair per document,
        where ``features`` is the dict produced by ``find_features``.
    """
    train_pos, train_neg = DataSources()
    documents = []
    all_words = []

    # POS-tag prefixes to keep: J = adjective, R = adverb, V = verb.
    # Only adjectives are used for now; extend to ["J", "R", "V"] to widen it.
    allowed_word_types = ["J"]

    for corpus, label in ((train_pos, "pos"), (train_neg, "neg")):
        for line in corpus.split('\n'):
            # A trailing newline would otherwise yield an empty document that
            # carries a label but no information.
            if not line.strip():
                continue
            documents.append((line, label))
            for word, tag in pos_tag(word_tokenize(line)):
                # tag[:1] (rather than tag[0]) tolerates an empty tag.
                if tag[:1] in allowed_word_types:
                    all_words.append(word.lower())

    with open("saved/documents.p", "wb") as save_documents:
        pickle.dump(documents, save_documents)

    word_counts = FreqDist(all_words)
    # keys() is not ordered by frequency, so slicing it kept arbitrary words;
    # most_common() really selects the 5000 most frequent ones.
    word_features = [word for word, _ in word_counts.most_common(5000)]

    with open("saved/word_features5k.p", "wb") as save_word_features:
        pickle.dump(word_features, save_word_features)

    features = [(find_features(rev, word_features), category) for (rev, category) in documents]
    return features


def find_features(document, word_features):
    """Turn a text into a bag-of-words presence vector.

    Args:
        document (str): raw text to analyse.
        word_features (iterable of str): the feature vocabulary.

    Returns:
        dict[str, bool]: for every vocabulary word, whether it occurs among
        the tokens of ``document``.

    NOTE(review): tokens are compared case-sensitively, while PrepareData
    lower-cases the vocabulary, so capitalised words never match.  Changing
    that would require retraining the saved classifiers.
    """
    # A set makes each membership test O(1); the result is identical to
    # scanning the token list once per vocabulary word.
    tokens = set(word_tokenize(document))
    return {w: (w in tokens) for w in word_features}


def TestTrainData(train_size=10000, seed=None):
    """Shuffle the prepared feature sets and split them into train and test.

    Args:
        train_size (int): number of shuffled examples placed in the training
            set; everything after that goes to the test set.  Defaults to the
            previously hard-coded 10000.
        seed: optional seed for a reproducible shuffle.  When ``None`` (the
            default) the module-level ``random`` generator is used, as before.

    Returns:
        tuple[list, list]: ``(training_set, testing_set)``.  The test set is
        empty when there are no more than ``train_size`` examples.
    """
    featuresets = PrepareData()
    # A private generator keeps a seeded run from disturbing global random state.
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(featuresets)
    training_set = featuresets[:train_size]
    testing_set = featuresets[train_size:]
    return training_set, testing_set

# Train Classifiers and store them 

In [3]:
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from nltk import NaiveBayesClassifier, classify
# from gsu.data import TestTrainData
import pickle


def TrainClassifiers():
    """Train every classifier, report its test accuracy, and pickle them all.

    The NLTK Naive Bayes classifier and five scikit-learn estimators (wrapped
    in ``SklearnClassifier``) are trained on the training split returned by
    ``TestTrainData``.  Accuracy on the test split is printed for each, then
    the classifiers and their names are handed to ``SaveClassifiers``.

    Returns:
        list: the trained classifiers, in the same order as the saved names
        (NaiveBayes, MultinomialNB, BernoulliNB, LogisticRegression,
        LinearSVC, SGD).
    """
    training_set, testing_set = TestTrainData()

    # Each entry: (name used for the pickle file, label used in the accuracy
    # report, trained classifier).  Keeping the three together makes it
    # impossible to store one classifier under another's name, which is how
    # the LinearSVC model used to be replaced by the LogisticRegression one.
    trained = [
        ("NaiveBayesClassifier", "Naive_Bayes Algo",
         NaiveBayesClassifier.train(training_set)),
    ]

    sklearn_models = [
        ("MultinomialNBClassifier", "MNB_classifier", MultinomialNB),
        ("BernoulliNBClassifier", "BernoulliNB_classifier", BernoulliNB),
        ("LogisticRegressionClassifier", "LogisticRegression_classifier", LogisticRegression),
        ("LinearSVCClassifier", "LinearSVC_classifier", LinearSVC),
        ("SGDClassifier", "SGDClassifier", SGDClassifier),
    ]
    for save_name, report_label, estimator_cls in sklearn_models:
        wrapped = SklearnClassifier(estimator_cls())
        wrapped.train(training_set)
        trained.append((save_name, report_label, wrapped))

    # Report only after everything is trained, so the output stays grouped.
    for _, report_label, classifier in trained:
        print(report_label + " accuracy percent:",
              (classify.accuracy(classifier, testing_set))*100)

    classifiers = [classifier for _, _, classifier in trained]
    classifier_name = [save_name for save_name, _, _ in trained]

    SaveClassifiers(classifiers, classifier_name)

    return classifiers


def SaveClassifiers(classifiers, classifier_name):
    """Pickle each classifier plus the list of names needed to reload them.

    Args:
        classifiers (list): trained classifier objects.
        classifier_name (list of str): one name per classifier, same order.
            Classifier ``i`` is written to ``saved/<classifier_name[i]>.p``.

    The name list itself is written to ``saved/classifier_name.p``.

    Raises:
        ValueError: if the two lists differ in length.  Nothing is written in
            that case, so no partial, unloadable set of files is left behind.
    """
    if len(classifiers) != len(classifier_name):
        raise ValueError(
            "got %d classifiers but %d names" % (len(classifiers), len(classifier_name)))

    for name, classifier in zip(classifier_name, classifiers):
        with open("saved/" + name + ".p", "wb") as save_classifier_path:
            pickle.dump(classifier, save_classifier_path)

    # ``with`` closes (and therefore flushes) the index file as well; it was
    # previously left open.
    with open("saved/classifier_name.p", "wb") as save_classifier_path:
        pickle.dump(classifier_name, save_classifier_path)

# Rather than retraining the same classifiers every time when neither the data nor the parameters have changed, we load the saved classifiers from disk on subsequent runs

In [4]:
import random
import pickle
from nltk.tokenize import word_tokenize
from nltk import classify



def find_features(document, word_features):
    """Turn a text into a bag-of-words presence vector.

    Args:
        document (str): raw text to analyse.
        word_features (iterable of str): the feature vocabulary.

    Returns:
        dict[str, bool]: for every vocabulary word, whether it occurs among
        the tokens of ``document``.  Matching is case-sensitive.

    This mirrors the definition in the data-preparation cell, so that the
    load-only path can run without executing that cell.  The two must produce
    identical features, otherwise saved classifiers see different inputs at
    prediction time than at training time.
    """
    # A set makes each membership test O(1); the result is identical to
    # scanning the token list once per vocabulary word.
    tokens = set(word_tokenize(document))
    return {w: (w in tokens) for w in word_features}


def LoadData(shuffle=False, train_size=10000):
    """Rebuild the train/test feature sets from the pickled documents.

    Args:
        shuffle (bool): shuffle the feature sets before splitting.
        train_size (int): number of examples placed in the training set; the
            rest form the test set.  Defaults to the previously hard-coded
            10000.

    Returns:
        tuple[list, list]: ``(training_set, testing_set)`` of
        ``(features, label)`` pairs.
    """
    # NOTE: pickle.load can execute arbitrary code; only load files that were
    # written by this notebook.
    with open("saved/documents.p", "rb") as documents_f:
        documents = pickle.load(documents_f)

    with open("saved/word_features5k.p", "rb") as features_f:
        word_features = pickle.load(features_f)

    features = [(find_features(rev, word_features), category) for (rev, category) in documents]

    if shuffle:
        random.shuffle(features)

    training_set = features[:train_size]
    testing_set = features[train_size:]

    return training_set, testing_set


def LoadClassifiers():
    """Load every previously saved classifier from the ``saved/`` directory.

    ``saved/classifier_name.p`` holds the list of classifier names; each
    classifier is then read from ``saved/<name>.p``.

    Returns:
        list: the unpickled classifiers, in the order of the saved name list.
    """
    # NOTE: pickle.load can execute arbitrary code; only load files that were
    # written by this notebook.
    with open("saved/classifier_name.p", "rb") as names_file:
        classifier_name = pickle.load(names_file)

    classifiers = list()
    for name in classifier_name:
        # ``with`` closes the handle even when unpickling fails.
        with open("saved/" + name + ".p", "rb") as classifier_file:
            classifiers.append(pickle.load(classifier_file))

    return classifiers


def LoadFeatures():
    """Load the pickled feature vocabulary.

    Returns:
        The object stored in ``saved/word_features5k.p`` (the list of feature
        words written during data preparation).
    """
    # NOTE: pickle.load can execute arbitrary code; only load files that were
    # written by this notebook.
    with open("saved/word_features5k.p", "rb") as documents_f:
        return pickle.load(documents_f)

# This class combines the classifiers by majority vote and also reports the confidence of that vote

In [5]:
from nltk.classify import ClassifierI
from statistics import mode, StatisticsError


class VoteClassifier(ClassifierI):
    """Majority-vote ensemble over several already-trained classifiers.

    Every wrapped classifier casts one vote (its ``classify`` result).  The
    label with strictly the most votes wins.  When there is no unique winner
    (a tie, or no classifiers at all) ``classify`` falls back to ``"neg"``
    and ``confidence`` reports ``0.5``.

    The votes are tallied explicitly rather than with ``statistics.mode``:
    from Python 3.8 on, ``mode`` no longer raises on ties, so the tie
    fallback used to depend on the interpreter version.
    """

    def __init__(self, classifiers):
        self._classifiers = classifiers

    def _vote(self, features):
        """Return ``(label, votes_for_label, total_votes)`` or ``None`` when
        there is no unique winner."""
        counts = {}
        total = 0
        for c in self._classifiers:
            label = c.classify(features)
            counts[label] = counts.get(label, 0) + 1
            total += 1

        if not counts:
            return None

        top = max(counts.values())
        leaders = [label for label, n in counts.items() if n == top]
        if len(leaders) != 1:
            return None
        return leaders[0], top, total

    def classify(self, features):
        """Return the winning label, or ``"neg"`` when the vote is undecided."""
        outcome = self._vote(features)
        if outcome is None:
            return "neg"
        return outcome[0]

    def confidence(self, features):
        """Return the fraction of classifiers that voted for the winning
        label, or ``0.5`` when the vote is undecided."""
        outcome = self._vote(features)
        if outcome is None:
            return 0.5
        _, choice_votes, total = outcome
        return choice_votes / total

# This class gives us the sentiment of every sentence, whether it is positive or negative

In [6]:
# # from gsu.train import TrainClassifiers
# from gsu.load import LoadClassifiers
# from gsu.load import LoadFeatures
# from gsu.data import find_features
# from gsu.VoteClassifier import VoteClassifier


class Sentiment:
    """Sentiment analyser backed by the saved classifiers.

    On construction the pickled classifiers are loaded and wrapped in a
    ``VoteClassifier`` (``votedClassifier``), and the saved feature vocabulary
    is loaded into ``new_features``.
    """

    def __init__(self):
        # To retrain instead of loading, pass TrainClassifiers() here.
        self.votedClassifier = VoteClassifier(LoadClassifiers())
        self.new_features = LoadFeatures()

    def Analyse(self, text):
        """Classify ``text``.

        Returns:
            tuple: ``(label, confidence)`` as reported by the vote classifier.
        """
        feature_vector = find_features(text, self.new_features)
        label = self.votedClassifier.classify(feature_vector)
        certainty = self.votedClassifier.confidence(feature_vector)
        return label, certainty

# This will now predict whether the sentence is positive or negative

In [7]:
# from gsu.Sentiment import Sentiment

s = Sentiment()

# Smoke test: print the (label, confidence) pair for a few sample sentences.
samples = (
    "This movie was awesome! The acting was great, plot was wonderful, and there were "
    "pythons...so yea!",
    "I am happy and awesome",
    "This movie was awesome",
)
for sample in samples:
    print(s.Analyse(sample))

('pos', 1.0)
('pos', 1.0)
('neg', 1.0)


# In this exercise we built a scalable project with good coding practices. The performance could have been better if we had had more data and more time to tune the models

In [8]:
# from gsu.Sentiment import Sentiment
# from gsu.Authentication import Authentication
from tweepy import Stream
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
import json
import os
# from elasticsearch import Elasticsearch


class Listener(StreamListener):
    """Stream listener that scores every English tweet with ``Sentiment``.

    Each English tweet is printed together with its predicted label and vote
    confidence.  When the confidence is at least 80%, the label is appended as
    one line to ``OUTPUT_PATH``.
    """

    # File that receives one sentiment label per confidently classified tweet.
    OUTPUT_PATH = "saved/obama-out.txt"

    def __init__(self):
        super().__init__()
        self.s = Sentiment()
        # Number of English tweets processed so far.
        self.count = 0
        # Start every run with a fresh output file.  This used to delete
        # "saved/twitter-out.txt", which is not the file written below, so
        # results from earlier runs kept accumulating.
        try:
            os.remove(self.OUTPUT_PATH)
        except OSError:
            # Best effort: the file simply may not exist yet.
            pass

    def on_data(self, data):
        """Handle one raw JSON message from the stream.

        Always returns True so the stream keeps running.
        """
        all_data = json.loads(data)
        # Skip payloads that are not English tweets.  Using .get() also covers
        # messages lacking "lang"/"text" (presumably non-tweet notices such as
        # deletions or rate-limit warnings -- verify against the API), which
        # previously raised KeyError.
        if all_data.get("lang") != "en" or "text" not in all_data:
            return True
        self.count += 1
        tweet = all_data["text"]
        sentiment_value, confidence = self.s.Analyse(tweet)
        print(tweet, sentiment_value, confidence)

        if confidence*100 >= 80:
            # ``with`` closes the file even if a write fails.
            with open(self.OUTPUT_PATH, "a") as output:
                output.write(sentiment_value)
                output.write('\n')
        return True

    def on_error(self, status):
        """Print the error status; disconnect when rate limited (420)."""
        print(status)
        if status == 420:
            # Returning False tells tweepy to disconnect instead of retrying,
            # which would otherwise lengthen the rate-limit penalty.
            return False


def main():
    """Authenticate against Twitter and stream tweets that mention "Obama".

    Credentials come from ``Authentication``; every incoming message is
    handled by a fresh ``Listener``.
    """
    credentials = Authentication()
    oauth = OAuthHandler(credentials.ckey, credentials.csecret)
    oauth.set_access_token(credentials.atoken, credentials.asecret)
    tweet_listener = Listener()
    Stream(oauth, tweet_listener).filter(track=["Obama"])


# Entry point: start streaming only when this code is executed directly.
# In a notebook kernel __name__ is '__main__', so running this cell starts the
# stream immediately; it keeps running until interrupted.
if __name__ == '__main__':
    main()

The Obama Doctrine: https://t.co/5D0k2RWm5c neg 1.0
RT @Paul1Singh: On all this fury about Ken, let's remember what Boris said about #Obama. Has David Cameron suspended #BorisJohnson yet? neg 1.0
RT @boomerforbernie: @panegron Obama was defending his friends in the banks today. Guess he is on the list for big speaker fees. neg 0.8333333333333334
RT @TimGclaw: Plus nothing said about his overtly racist comments concerning Africans and Obama, strange that as well https://t.co/gK4JOpCk… neg 1.0
BREAKING: Government Of Kenya Has Released Obama's Authentic Birth Certificate https://t.co/Hkm4UIUALY via @usheraldnews neg 1.0
@realDonaldTrump what do you think about reports of Michelle Obama being a male ? neg 1.0
BREAKING: Government Of Kenya Has Released Obama's Authentic Birth Certificate https://t.co/oaMzGxwTNF via @usheraldnews neg 1.0
Michelle Obama In Harlem To Promote Higher Education During College Signing Day 
https://t.co/c5Kk6JSku7
#education neg 1.0
RT @JudgeMoroz: Every US Presid

KeyboardInterrupt: 