In [83]:
from tweepy import Stream
import json
from keys import *
from companies import *
import pandas as pd
import time

In [84]:
import numpy as np
import pandas as pd
import os
import re
import string
import nltk
import matplotlib.pyplot as plt
plt.rc('figure',figsize=(17,13))
import seaborn as sns
sns.set_style('darkgrid')
import plotly.express as ex
from plotly.subplots import make_subplots
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import datetime
import warnings
warnings.filterwarnings("ignore")

In [4]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('vader_lexicon')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\darsh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\darsh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\darsh\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [79]:
class TwitterStream(Stream):

    def __init__(self, key, secret, token, token_secret, responses, df, limit = 1):
        """Configure the SentimentListener."""
        self.responses = responses
        self.df = df
        self.tweet_count = 0
        self.TWEET_LIMIT = limit

        super().__init__(key, secret, token, token_secret)


    """
    Inherits from tweepy's Stream class. We are only modifying the few functions
    we need to customize, namely what to do when new tweets come through.
    """

    def get_companies(self, tweet):
        """
        :param tweet: The string text of a tweet.
        :return: A list of companies mentioned in the tweet.
        """
        tweet = tweet.lower()
        companies_mentioned = []
        for stock in stocks:
            if stock in tweet:
                companies_mentioned.append(stock)

        return companies_mentioned
    
    def make_dataframe(self, responses):
        #Takes the text of the tweet and converts it into a dataframe.
        self.df.loc[len(self.df)] = self.responses
    
    def url_removal(self, df):
        #removes the url from the tweet
        self.df['Tweet'] = self.df['Tweet'].apply(lambda x:re.sub(r"http\S+", "", x))

    def punctuation_removal(self, df):
        #Removing of the Punctuations from the tweet's text
        punctuation_removal = string.punctuation
        def remove_punctuation(text):
            return text.translate(str.maketrans('', '', punctuation_removal))
        self.df['Tweet'] = self.df['Tweet'].apply(lambda text: remove_punctuation(text))
    
    def single_and_double_character_space_removal(self, df):
        #Removal of Single and Double character space from the tweet's text
        self.df['Tweet'] = self.df['Tweet'].apply(lambda x:re.sub(r'\s+[a-zA-Z]\s+', '', x))
        self.df['Tweet'] = self.df['Tweet'].apply(lambda x:re.sub(r'\s+', ' ', x, flags=re.I))
    
    def stopwords_removal(self,df):
        #Removing Stop words from the tweet like ["like", "such", "a"]
        #NLTK has a predefined list of stopwords which makes it easy to just remove them from the tweet's text.
        STOPWORDS = set(stopwords.words('english'))
        def remove_stopwords(text):
            return " ".join([word for word in str(text).split() if word not in STOPWORDS])
        self.df['Tweet'] = self.df['Tweet'].apply(lambda text: remove_stopwords(text))

    def remove_emoji(self, df):
        #Removal of Emoji's from the tweet's text
        def emoji(string):
            emoji_pattern = re.compile("["
                                    u"\U0001F600-\U0001F64F"  # emoticons
                                    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                                    u"\U0001F680-\U0001F6FF"  # transport & map symbols
                                    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                                    u"\U00002500-\U00002BEF"  # chinese char
                                    u"\U00002702-\U000027B0"
                                    u"\U00002702-\U000027B0"
                                    u"\U000024C2-\U0001F251"
                                    u"\U0001f926-\U0001f937"
                                    u"\U00010000-\U0010ffff"
                                    u"\u2640-\u2642"
                                    u"\u2600-\u2B55"
                                    u"\u200d"
                                    u"\u23cf"
                                    u"\u23e9;"
                                    u"\u231a"
                                    u"\ufe0f"  # dingbats
                                    u"\u3030"
                                    "]+", flags=re.UNICODE)
            return emoji_pattern.sub(r'', string)
        self.df['Tweet'] = self.df['Tweet'].apply(str)
        self.df['Tweet'] = self.df['Tweet'].apply(emoji)

    def tokenization(self, df):
        #tokenizing the entire tweet's text
        #example: "I like dog" becomes ["I", "like", "dog"]
        def tokens(text):
            text = re.split('\W+', text)
            return text
        self.df['Tokenized'] = self.df['Tweet'].apply(lambda x: tokens(x.lower()))
    
    def lemmatizer(self, df):
        #lemmatization-capable machine would know that “studies” is the singular verb form of the word “study” in the present tense.
        wn = nltk.WordNetLemmatizer()
        def lemmatizor(text):
            text = [wn.lemmatize(word) for word in text]
            return text 
        self.df['Lemmatized'] = self.df['Tokenized'].apply(lambda x: lemmatizor(x))
    
    def sentiment_score_generator(self, df):
        #Generating the Sentiment Score
        #The compound score generated indicates the level of polarity towards positive and Negative sentiment.
        #If a compound score is more towards -1, then the tweet has a strong negative sentiment.
        #If a compound score is more towards +1, then the tweet has a strong positive sentiment.
        sid = SIA()
        self.df['sentiments'] = self.df["Tweet"].apply(lambda x: sid.polarity_scores(' '.join(re.findall(r'\w+',str(x).lower()))))
        self.df['Positive Sentiment'] = self.df['sentiments'].apply(lambda x: x['pos']+1*(10**-6))
        self.df['Neutral Sentiment'] = self.df['sentiments'].apply(lambda x: x['neu']+1*(10**-6))
        self.df['Negative Sentiment'] = self.df['sentiments'].apply(lambda x: x['neg']+1*(10**-6))



    def on_data(self, raw_data):
        """
        Gets called every time a new tweet gets filtered through.
        """
        if 'extended_tweet' in self.process_data(raw_data):
            self.responses.append(self.process_data(raw_data)['extended_tweet']['full_text'])
        else:
            self.responses.append(self.process_data(raw_data)['text'])

        self.make_dataframe(responses)
        self.url_removal(df)
        self.punctuation_removal(df)
        self.single_and_double_character_space_removal(df)
        self.stopwords_removal(df)
        self.remove_emoji(df)
        self.tokenization(df)
        self.lemmatizer(df)
        self.sentiment_score_generator(df)


        self.tweet_count += 1  # track number of tweets processed

        #if TWEET_LIMIT is reached, return False to terminate streaming
        if self.tweet_count == self.TWEET_LIMIT:
            self.disconnect()



    def process_data(self, raw_data):
        """
        Sends data through the Kafka log.
        """
        response = json.loads(raw_data)
        

        # Get tweet text
        if 'extended_tweet' in response:
            tweet_text = response['extended_tweet']['full_text']
        else:
            tweet_text = response['text']
        

        # Associate each company mentioned with the tweet
        companies_mentioned = self.get_companies(tweet_text)
        for corp in companies_mentioned:
            response['company_name'] = corp

        return response
            
    def on_error(self, status_code):
        """
        Returning false disconnects the stream.
        """
        if status_code == 420:
            return False

In [80]:
limit = 1
responses= []
df = pd.DataFrame(columns=["Tweet"])
stream = TwitterStream(API_KEY, API_KEY_SECERT,ACCESS_TOKEN, ACCESS_TOKEN_SECRET, responses, df, limit)

In [81]:
stream.filter(track=['Microsoft'], languages=['en'])

Stream connection closed by Twitter


In [82]:
pd.set_option('display.max_colwidth', None)
df

Unnamed: 0,Tweet,Tokenized,Lemmatized,sentiments,Positive Sentiment,Neutral Sentiment,Negative Sentiment
0,RT ionstorm Your AD compromised yet SOC chats Microsoft Teams AD Connected VOIP without knowing full details breach a…,"[rt, ionstorm, your, ad, compromised, yet, soc, chats, microsoft, teams, ad, connected, voip, without, knowing, full, details, breach, a, ]","[rt, ionstorm, your, ad, compromised, yet, soc, chat, microsoft, team, ad, connected, voip, without, knowing, full, detail, breach, a, ]","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}",1e-06,1.000001,1e-06
