In [1]:
import nltk
from nltk.corpus import stopwords
from  nltk.stem import SnowballStemmer
# Make sure the NLTK stop-word corpus is available locally before it is read.
nltk.download('stopwords')

# Module-level resources shared by the tweet preprocessing code:
# the English stop-word list and an English Snowball stemmer.
stop_words = stopwords.words("english")
stemmer = SnowballStemmer("english")

[nltk_data] Downloading package stopwords to /home/aditi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [35]:
import re 
import tweepy 
from tweepy import OAuthHandler 
from textblob import TextBlob 

class TwitterClient(object):
    '''
    Twitter client that fetches tweets for a query, normalises their text
    and labels each one with a TextBlob polarity-based sentiment.

    Relies on module-level names defined elsewhere in the notebook:
    `stop_words`, `stemmer`, `re`, `tweepy`, `OAuthHandler` and `TextBlob`.
    '''

    def __init__(self, consumer_key=None, consumer_secret=None,
                 access_token=None, access_token_secret=None):
        '''
        Build the authenticated tweepy API handle.

        Each credential is resolved in this order: explicit argument,
        environment variable (TWITTER_CONSUMER_KEY, TWITTER_CONSUMER_SECRET,
        TWITTER_ACCESS_TOKEN, TWITTER_ACCESS_TOKEN_SECRET), and finally the
        masked placeholder the notebook originally shipped with. Calling
        TwitterClient() with no arguments therefore still works as before.

        On failure `self.api` is set to None (instead of being left
        undefined) so later calls can detect the condition.
        '''
        # Local import keeps this block self-contained; real secrets should
        # come from arguments or the environment, never from the notebook.
        import os

        # keys and tokens from the Twitter Dev Console
        consumer_key = consumer_key or os.environ.get(
            'TWITTER_CONSUMER_KEY', 'XXXXXXXXXXXXXXXXXXXXXXXXXX')
        consumer_secret = consumer_secret or os.environ.get(
            'TWITTER_CONSUMER_SECRET', 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
        access_token = access_token or os.environ.get(
            'TWITTER_ACCESS_TOKEN', 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
        access_token_secret = access_token_secret or os.environ.get(
            'TWITTER_ACCESS_TOKEN_SECRET', 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXxx')

        self.auth = None
        self.api = None

        # attempt authentication
        try:
            # create OAuthHandler object
            self.auth = OAuthHandler(consumer_key, consumer_secret)
            # set access token and secret
            self.auth.set_access_token(access_token, access_token_secret)
            # create tweepy API object to fetch tweets
            self.api = tweepy.API(self.auth)
        except Exception as e:
            # Catch Exception rather than everything, so KeyboardInterrupt
            # and SystemExit still propagate.
            self.api = None
            print("Error: Authentication Failed")
            print("Error : " + str(e))

    def preprocess(self, text, stem=False):
        '''
        Remove English stop words from whitespace-separated text.

        text: string to filter; it is split on whitespace.
        stem: when True, every kept token is also passed through the
              module-level Snowball `stemmer`.

        Returns the kept tokens joined by single spaces. Matching against
        `stop_words` is exact, so callers should lowercase the text first.
        '''
        kept = []
        for token in text.split():
            if token in stop_words:
                continue
            kept.append(stemmer.stem(token) if stem else token)
        return " ".join(kept)

    def clean_tweet(self, tweet):
        '''
        Utility function to clean tweet text with simple regex statements:
        lowercases it, replaces links with URL and @handles with
        USER_MENTION, unwraps #hashtags, drops the "rt" marker, maps
        emoticons to EMO_POS / EMO_NEG, collapses whitespace and finally
        removes stop words via preprocess().
        '''
        tweet = tweet.lower()
        # Replaces URLs with the word URL
        tweet = re.sub(r'((www\.[\S]+)|(https?://[\S]+))', ' URL ', tweet)
        # Replace @handle with the word USER_MENTION
        tweet = re.sub(r'@[\S]+', 'USER_MENTION', tweet)
        # Replaces #hashtag with hashtag
        tweet = re.sub(r'#(\S+)', r' \1 ', tweet)
        # Remove RT (retweet)
        tweet = re.sub(r'\brt\b', '', tweet)
        # Replace 2+ dots with space
        tweet = re.sub(r'\.{2,}', ' ', tweet)
        # Strip space, " and ' from tweet
        tweet = tweet.strip(' "\'')
        # Replace emojis with either EMO_POS or EMO_NEG
        # Smile -- :), : ), :-), (:, ( :, (-:, :')
        tweet = re.sub(r'(:\s?\)|:-\)|\(\s?:|\(-:|:\'\))', ' EMO_POS ', tweet)
        # Laugh -- :D, : D, :-D, xD, x-D, XD, X-D
        # The text is already lowercased, so the "D" must be matched
        # case-insensitively or these emoticons are never found. The
        # lookarounds stop ordinary text such as "note: do" or "boxd" from
        # being mistaken for an emoticon.
        tweet = re.sub(r'(:\s?D|:-D|(?<!\w)x-?D)(?!\w)', ' EMO_POS ', tweet,
                       flags=re.IGNORECASE)
        # Love -- <3, :*
        tweet = re.sub(r'(<3|:\*)', ' EMO_POS ', tweet)
        # Wink -- ;-), ;), ;-D, ;D, (;,  (-;
        # Same case-insensitive handling for the "D" variants as above.
        tweet = re.sub(r'(;-?\)|;-?D(?!\w)|\(-?;)', ' EMO_POS ', tweet,
                       flags=re.IGNORECASE)
        # Sad -- :-(, : (, :(, ):, )-:
        tweet = re.sub(r'(:\s?\(|:-\(|\)\s?:|\)-:)', ' EMO_NEG ', tweet)
        # Cry -- :,(, :'(, :"(
        tweet = re.sub(r'(:,\(|:\'\(|:"\()', ' EMO_NEG ', tweet)
        # Replace multiple spaces with a single space
        tweet = re.sub(r'\s+', ' ', tweet)
        tweet = self.preprocess(tweet)
        return ' '.join(tweet.split())

    def _polarity_label(self, cleaned_text):
        '''
        Map the TextBlob polarity of already-cleaned text to
        'positive' (> 0), 'neutral' (== 0) or 'negative' (< 0).
        '''
        polarity = TextBlob(cleaned_text).sentiment.polarity
        if polarity > 0:
            return 'positive'
        if polarity == 0:
            return 'neutral'
        return 'negative'

    def get_tweet_sentiment(self, tweet):
        '''
        Utility function to classify sentiment of passed (raw) tweet text
        using textblob's sentiment method. The text is cleaned with
        clean_tweet() first. Returns 'positive', 'neutral' or 'negative'.
        '''
        return self._polarity_label(self.clean_tweet(tweet))

    def get_tweets(self, query, count = 10):
        '''
        Main function to fetch tweets and parse them.

        query: search string passed to the Twitter search endpoint.
        count: number of tweets requested from the API.

        Returns a list of dicts with keys 'text' (cleaned tweet text) and
        'resultant_sentiment'. A list is always returned: if the client is
        not authenticated or the API call fails, the error is printed and
        whatever was collected so far (possibly nothing) is returned.
        '''
        # empty list to store parsed tweets
        tweets = []

        if getattr(self, 'api', None) is None:
            print("Error : Twitter API client is not available")
            return tweets

        # tweepy 4 renamed API.search -> API.search_tweets and replaced
        # TweepError with TweepyException; support both generations.
        search = getattr(self.api, 'search_tweets', None) or self.api.search
        api_error = getattr(tweepy, 'TweepyException', None) or tweepy.TweepError

        try:
            # call twitter api to fetch tweets
            fetched_tweets = search(q = query, count = count)

            # parsing tweets one by one
            for tweet in fetched_tweets:
                # clean once and reuse the result for both fields
                cleaned = self.clean_tweet(tweet.text)
                parsed_tweet = {
                    'text': cleaned,
                    'resultant_sentiment': self._polarity_label(cleaned),
                }

                # appending parsed tweet to tweets list
                if tweet.retweet_count > 0:
                    # if tweet has retweets, ensure that it is appended only once
                    if parsed_tweet not in tweets:
                        tweets.append(parsed_tweet)
                else:
                    tweets.append(parsed_tweet)

        except api_error as e:
            # print error (if any)
            print("Error : " + str(e))

        # return parsed tweets
        return tweets

 
    # creating object of TwitterClient Class 
import pandas as pd

# Instantiate the client; its constructor sets up the tweepy API handle.
api = TwitterClient()

# Prompt for the search keyword on its own line, then read it from stdin.
print("Enter the keyword for which you want to see the sentiments of the public.")
a = input()

# Fetch up to 200 matching tweets and tabulate them
# (one row per tweet: cleaned text plus its sentiment label).
tweets = api.get_tweets(a, count=200)
df = pd.DataFrame(tweets)

Enter the keyword for which you want to see the sentiments of the public.
Donald trump


In [19]:
from sklearn.linear_model import LogisticRegression

In [20]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

In [21]:
# Preview the first rows of the fetched tweets (cleaned text and sentiment label).
df.head()

Unnamed: 0,text,resultant_sentiment
0,"USER_MENTION kung biglang magsungit man ako, k...",negative
1,USER_MENTION spongebob never made bad track. g...,positive
2,USER_MENTION USER_MENTION think exact opposite...,positive
3,"USER_MENTION years still happening day, please...",positive
4,"USER_MENTION you’re bad day, remember. probabl...",negative


In [22]:
import pickle
filename = 'finalized_model.sav'
# NOTE(security): unpickling executes arbitrary code embedded in the file;
# only load model files that come from a trusted source.
# The context manager guarantees the file handle is closed after loading.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)



In [36]:
# Tokenise every cleaned tweet on whitespace: one list of tokens per row of df.
documents = [tweet_text.split() for tweet_text in df["text"]]

In [24]:
from gensim.models import Word2Vec

In [25]:
import numpy as np

In [26]:
# Hand-written sample sentences, each wrapped as a one-document tokenised corpus.
test1 = ["we are going to love you".split()]
test2 = ["good boy".split()]

In [27]:
# Load the Word2Vec model stored in "word2vec.model" (path is relative to the working directory).
model = Word2Vec.load("word2vec.model")

In [37]:
def average_word_vectors(words, model, vocabulary, num_features):
    """Return the mean embedding of the in-vocabulary tokens of one document.

    Parameters
    ----------
    words : iterable of str
        Tokens of a single document.
    model : object
        Either a model exposing its vectors on ``.wv`` (e.g. gensim
        ``Word2Vec``) or any mapping-like object indexable by word.
    vocabulary : container of str
        Words that have a vector; tokens outside it are ignored.
    num_features : int
        Length of each word vector.

    Returns
    -------
    numpy.ndarray
        float64 vector of shape ``(num_features,)``; all zeros when no token
        of the document is in ``vocabulary``.
    """
    # Indexing the model object directly (model[word]) is deprecated in
    # gensim 3.x and removed in gensim 4; vectors live on `.wv`. Fall back to
    # the object itself so KeyedVectors or plain dicts keep working.
    vectors = getattr(model, "wv", model)

    feature_vector = np.zeros((num_features,), dtype="float64")
    nwords = 0

    for word in words:
        if word in vocabulary:
            nwords += 1
            feature_vector = np.add(feature_vector, vectors[word])

    # Guard against division by zero for documents with no known words.
    if nwords:
        feature_vector = np.divide(feature_vector, nwords)

    return feature_vector
    
def averaged_word_vectorizer(corpus, model, num_features):
    """Embed every tokenised document of ``corpus`` as its mean word vector.

    Parameters
    ----------
    corpus : iterable of list of str
        Tokenised documents.
    model : object
        Word-embedding model; its vocabulary is read from ``model.wv``
        (or from ``model`` itself when it has no ``wv`` attribute).
    num_features : int
        Length of each word vector.

    Returns
    -------
    numpy.ndarray
        One row per document, as produced by ``average_word_vectors``.
    """
    keyed_vectors = getattr(model, "wv", model)
    # gensim 4 renamed `index2word` to `index_to_key`; accept either name.
    vocab_words = getattr(keyed_vectors, "index_to_key", None)
    if vocab_words is None:
        vocab_words = keyed_vectors.index2word
    # A set makes the per-token membership test constant time.
    vocabulary = set(vocab_words)

    features = [
        average_word_vectors(tokenized_sentence, model, vocabulary, num_features)
        for tokenized_sentence in corpus
    ]
    return np.array(features)


# get document level embeddings
# The vector width is read from the loaded model instead of being hard-coded,
# so the feature matrix always matches the embedding size of `model`.
w2v_feature_array = averaged_word_vectorizer(corpus=documents, model=model,
                                             num_features=model.wv.vector_size)
pd.DataFrame(w2v_feature_array)

  if __name__ == '__main__':


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.250259,0.363413,0.304923,-0.098786,-0.322563,0.143053,0.154211,-0.067273,-0.165551,0.134352,...,-0.224850,0.445878,0.219864,0.159322,0.459781,0.132345,-0.072165,-0.022730,-0.336262,-0.288855
1,0.666645,0.407935,-0.152610,0.065117,-0.036662,0.357301,0.387878,0.467660,0.150157,0.333346,...,-0.111404,0.158134,-0.013688,-0.527503,0.380493,-0.388031,-0.056256,0.563432,-0.389901,0.403929
2,0.283720,0.242255,-0.157147,0.142395,0.036252,0.363503,0.076850,0.163377,-0.388005,0.149404,...,-0.052186,0.148862,-0.249630,0.378697,0.181724,0.110277,-0.351581,-0.301933,-0.155605,0.210676
3,0.202060,0.490750,0.245878,-0.311543,-0.147089,0.208966,0.137801,0.345110,-0.015593,0.077107,...,-0.384091,0.128436,-0.029694,-0.034807,-0.257957,-0.171794,0.006300,-0.226728,-0.406216,0.043717
4,0.274666,0.412749,0.202931,-0.089640,-0.031670,0.259341,0.251115,0.353040,0.125677,0.161168,...,-0.002296,-0.109795,0.207997,0.072212,-0.025799,-0.193316,-0.228833,0.072230,-0.044962,0.222814
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76,0.294585,0.394379,0.091458,0.025754,-0.289807,-0.092539,-0.044057,0.264067,0.430964,-0.067173,...,-0.254263,0.201133,0.120866,-0.166033,0.087171,-0.281296,0.152616,-0.033951,-0.564000,0.071353
77,0.494898,-0.048262,-0.415276,-0.037659,-0.471465,-0.593650,0.086959,0.130786,0.110508,0.367451,...,0.143945,0.374847,0.135156,0.331725,0.133048,0.060897,-0.046874,-0.507817,-0.294147,-0.096055
78,-0.062984,-0.000565,0.009214,0.021747,-0.234296,0.121840,0.071136,-0.005935,0.162295,0.096997,...,-0.450455,0.269456,0.041603,0.118159,0.116374,-0.168266,0.212704,-0.125080,-0.057838,-0.031910
79,-0.429995,-0.029373,-0.002410,-0.380935,-0.318411,-0.773685,0.337890,0.190631,0.722611,0.270579,...,0.138991,1.169573,0.041825,0.013802,0.614721,-0.310854,-0.045122,0.601854,-0.574123,-0.521743


In [38]:
prediction = loaded_model.predict(w2v_feature_array) # one predicted label per fetched tweet (row of the averaged-embedding matrix)
# NOTE(review): loaded_model is whatever was pickled in 'finalized_model.sav';
# presumably a classifier trained on the same averaged Word2Vec features - confirm.


In [39]:
prediction # display the predicted labels; per the original author, 0 means negative and 1 means positive

array([1., 0., 1., 1., 1., 1., 1., 0., 1., 0., 1., 1., 0., 1., 0., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 0., 1., 1., 1.,
       1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1.,
       1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1.])