<a href="https://colab.research.google.com/github/ChristianBugtai/Twitter-Sentiment-Analysis/blob/main/utils.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#!pip install twython

Collecting twython
  Downloading https://files.pythonhosted.org/packages/24/80/579b96dfaa9b536efde883d4f0df7ea2598a6f3117a6dd572787f4a2bcfb/twython-3.8.2-py3-none-any.whl
Installing collected packages: twython
Successfully installed twython-3.8.2


In [None]:
import pandas as pd
import re

from twython import Twython
import json

from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('wordnet')

import pickle


def searchTweets(query, result_type='popular', count=1000, lang='en'):
    """Search Twitter through Twython and collect a few fields per tweet.

    The consumer key and secret are read from the project's
    ``twitter_credentials.json`` file on every call.

    Parameters
    ----------
    query : search string, sent as the ``q`` argument.
    result_type, count, lang : forwarded unchanged to ``Twython.search``.

    Returns
    -------
    dict of parallel lists keyed by 'user', 'date', 'text',
    'favorite_count' and 'location', one entry per returned status.
    """
    credentials_path = "/content/drive/MyDrive/Lighthouselabs/Project_Planning/Final_Project/twitter_credentials.json"
    with open(credentials_path, "r") as file:
        creds = json.load(file)

    client = Twython(creds['CONSUMER_KEY'], creds['CONSUMER_SECRET'])

    # Keyword arguments for the search call.
    search_args = dict(q=query, result_type=result_type, count=count, lang=lang)
    statuses = client.search(**search_args)['statuses']

    # Column-oriented result: one list per field, aligned by position.
    collected = {'user': [], 'date': [], 'text': [], 'favorite_count': [], 'location': []}
    for status in statuses:
        author = status['user']
        collected['user'].append(author['screen_name'])
        collected['date'].append(status['created_at'])
        collected['text'].append(status['text'])
        collected['favorite_count'].append(status['favorite_count'])
        collected['location'].append(author['location'])

    return collected


# Text emoticon -> short meaning. Insertion order is the order in which
# the replacements are tried when the dict is iterated.
emojis = {
    ':)': 'smile',
    ':-)': 'smile',
    ';d': 'wink',
    ':-E': 'vampire',
    ':(': 'sad',
    ':-(': 'sad',
    ':-<': 'sad',
    ':P': 'raspberry',
    ':O': 'surprised',
    ':-@': 'shocked',
    ':@': 'shocked',
    ':-$': 'confused',
    ':\\': 'annoyed',
    ':#': 'mute',
    ':X': 'mute',
    ':^)': 'smile',
    ':-&': 'confused',
    '$_$': 'greedy',
    '@@': 'eyeroll',
    ':-!': 'confused',
    ':-D': 'smile',
    ':-0': 'yell',
    'O.o': 'confused',
    '<(-_-)>': 'robot',
    'd[-_-]b': 'dj',
    ":'-)": 'sadsmile',
    ';)': 'wink',
    ';-)': 'wink',
    'O:-)': 'angel',
    'O*-)': 'angel',
    '(:-D': 'gossip',
    '=^.^=': 'cat',
}

# English stopwords, kept as an ordered list (not a set).
# The stopword check in preprocess() is currently commented out, so this
# list is defined but not consulted there.
stopwordlist = """
a about above after again ain all am an
and any are as at be because been before
being below between both by can d did do
does doing down during each few for from
further had has have having he her here
hers herself him himself his how i if in
into is it its itself just ll m ma
me more most my myself now o of on once
only or other our ours ourselves out own re
s same she shes should shouldve so some such
t than that thatll the their theirs them
themselves then there these they this those
through to too under until up ve very was
we were what when where which while who whom
why will with won y you youd youll youre
youve your yours yourself yourselves
""".split()

def preprocess(textdata):
    """Normalise raw tweets into space-separated, lemmatized token strings.

    Each tweet goes through these steps, in order:
      1. lower-casing;
      2. URLs (http://, https://, or ' www.') replaced by ' URL';
      3. every key of the module-level ``emojis`` dict replaced by
         'EMOJI' + its meaning;
      4. @mentions replaced by ' USER';
      5. every non-alphanumeric character replaced by a space;
      6. any character repeated three or more times collapsed to two;
      7. tokens of length 1 dropped, the rest lemmatized with WordNet.

    Parameters
    ----------
    textdata : iterable of str
        The raw tweets. Pass a list even for a single tweet: a bare
        string would be iterated character by character.

    Returns
    -------
    list of str
        One processed string per input tweet. Each kept token is followed
        by a single space, so non-empty results end with a trailing space.
    """
    lemmatizer = WordNetLemmatizer()

    # All patterns are raw strings. The previous '@[^\s]+' literal relied
    # on an invalid string escape, which newer Python versions warn about.
    # Compiled once here rather than looked up for every tweet.
    url_re = re.compile(r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)")
    user_re = re.compile(r'@[^\s]+')
    non_alnum_re = re.compile(r"[^a-zA-Z0-9]")
    repeat_re = re.compile(r"(.)\1\1+")
    repeat_replacement = r"\1\1"

    processedText = []
    for tweet in textdata:
        tweet = tweet.lower()

        # Replace all URLs with ' URL'.
        tweet = url_re.sub(' URL', tweet)

        # Replace emoticons with 'EMOJI<meaning>'.
        # NOTE(review): the tweet is already lower-cased here, so keys that
        # contain uppercase letters (':P', ':-D', 'O.o', ...) can never
        # match. Left unchanged because any trained vectoriser presumably
        # saw text produced by this same pipeline -- confirm before fixing.
        for emoji, meaning in emojis.items():
            tweet = tweet.replace(emoji, "EMOJI" + meaning)

        # Replace @username with ' USER'.
        tweet = user_re.sub(' USER', tweet)
        # Replace everything that is not a letter or digit with a space.
        tweet = non_alnum_re.sub(" ", tweet)
        # Collapse runs of 3+ identical characters down to 2.
        tweet = repeat_re.sub(repeat_replacement, tweet)

        # Stopword filtering is intentionally not applied; only
        # single-character tokens are dropped. The trailing space after
        # each token is kept so the output matches the previous
        # implementation exactly.
        processedText.append(''.join(
            lemmatizer.lemmatize(word) + ' '
            for word in tweet.split()
            if len(word) > 1
        ))

    return processedText


def load_models(model_dir='/content/drive/MyDrive/Lighthouselabs/Project_Planning/Final_Project'):
    """Load the pickled vectoriser and logistic-regression model.

    Parameters
    ----------
    model_dir : str, optional
        Directory containing ``vectoriser.pickle`` and
        ``LogisticRegression.pickle``. Defaults to the project's Google
        Drive folder, so existing ``load_models()`` calls are unchanged.

    Returns
    -------
    tuple
        ``(vectoriser, LRmodel)`` exactly as they were pickled.

    Raises
    ------
    OSError
        If either file cannot be opened.
    """
    def _load(filename):
        # SECURITY: unpickling can execute arbitrary code. Only point
        # model_dir at files you created or otherwise trust.
        # The context manager closes the handle even if unpickling fails.
        with open(f'{model_dir}/{filename}', 'rb') as file:
            return pickle.load(file)

    vectoriser = _load('vectoriser.pickle')
    LRmodel = _load('LogisticRegression.pickle')

    # The Bernoulli Naive Bayes model ('NaiveBayes.pickle') is deliberately
    # not loaded or returned.
    return vectoriser, LRmodel

def getConfidence(sentiment, probaScore):
    """Return the percentage probability of each predicted label.

    For row ``i`` the value is ``probaScore[i][sentiment[i]]`` multiplied
    by 100 and rounded to two decimals, so ``sentiment`` entries are used
    as column indices into the matching ``probaScore`` row.
    """
    return [
        round(probaScore[row][label] * 100, 2)
        for row, label in enumerate(sentiment)
    ]

def predict(vectoriser, model, text):
    """Classify each text and report the label with its confidence.

    Parameters
    ----------
    vectoriser : object with a ``transform`` method
        Applied to the output of ``preprocess(text)``.
    model : object with ``predict`` and ``predict_proba`` methods
        Labels 0 and 1 are reported as "Negative" and "Positive".
    text : list of str
        Raw texts. It is iterated twice, so pass a list, not a generator.

    Returns
    -------
    pandas.DataFrame
        Columns 'text' (the raw input), 'sentiment' and 'confidence'
        (percentage probability of the predicted label).
    """
    features = vectoriser.transform(preprocess(text))
    sentiment = model.predict(features)
    # Use the model that was passed in. This previously read the global
    # ``LRmodel``, which ignored the argument and raised NameError when no
    # such global existed.
    probaScore = model.predict_proba(features)
    confidence = getConfidence(sentiment, probaScore)

    # One (text, label, confidence) row per input.
    rows = list(zip(text, sentiment, confidence))
    df = pd.DataFrame(rows, columns=['text', 'sentiment', 'confidence'])

    # Translate labels in the sentiment column only. A frame-wide replace
    # would also rewrite a 'text' or 'confidence' cell equal to 0 or 1.
    df['sentiment'] = df['sentiment'].replace([0, 1], ["Negative", "Positive"])
    return df

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
def getSentiment(query, result_type='popular', count=1000, lang='en'):
    """Fetch tweets for ``query`` and return their predicted sentiment.

    Chains ``searchTweets`` -> ``load_models`` -> ``predict`` on the tweet
    texts and returns the resulting DataFrame. If any step raises
    ``ValueError``, 'No Results' is printed and ``None`` is returned.
    """
    try:
        # Fetch the tweets.
        fetched = searchTweets(query=query, result_type=result_type, count=count, lang=lang)
        # Load the fitted vectoriser and classifier.
        fitted_vectoriser, classifier = load_models()
        # Score the tweet texts.
        return predict(fitted_vectoriser, classifier, fetched['text'])
    except ValueError:
        print('No Results')
        return None

In [None]:
vectoriser, LRmodel = load_models()

In [None]:
TEXT = ['This text is just a test, i love DataScience!']
pred = predict(vectoriser, LRmodel, TEXT)

In [None]:
pred

Unnamed: 0,text,sentiment
0,"This text is just a test, i love DataScience!",Positive


In [None]:
import requests

In [None]:
# Build a request against the Twitter v1.1 search endpoint and print the
# final URL that `requests` assembled from the parameters.
# NOTE(review): no credentials are attached to this call, so it is only
# useful for inspecting the URL -- confirm before relying on the response.
query = 'covid'
parameters = {'q': query, 'lang': 'en', 'count': 100}
# Without a timeout the call can block forever if the server never answers.
r = requests.get('https://api.twitter.com/1.1/search/tweets.json?', params=parameters, timeout=10)
print(r.url)

https://api.twitter.com/1.1/search/tweets.json?q=covid&lang=en&count=100


In [None]:
!pip install twitter

Collecting twitter
[?25l  Downloading https://files.pythonhosted.org/packages/85/e2/f602e3f584503f03e0389491b251464f8ecfe2596ac86e6b9068fe7419d3/twitter-1.18.0-py2.py3-none-any.whl (54kB)
[K     |██████                          | 10kB 15.6MB/s eta 0:00:01[K     |████████████                    | 20kB 20.4MB/s eta 0:00:01[K     |██████████████████              | 30kB 10.9MB/s eta 0:00:01[K     |████████████████████████        | 40kB 9.2MB/s eta 0:00:01[K     |██████████████████████████████▏ | 51kB 4.4MB/s eta 0:00:01[K     |████████████████████████████████| 61kB 3.1MB/s 
[?25hInstalling collected packages: twitter
Successfully installed twitter-1.18.0


In [None]:
from twitter import *
import json
from IPython.display import JSON

In [None]:
# Read the Twitter API credentials from the project's JSON file.
with open("/content/drive/MyDrive/Lighthouselabs/Project_Planning/Final_Project/twitter_credentials.json", "r") as file:
        creds = json.load(file)
# Build the `twitter` package client using OAuth with four values from the
# credentials file: access token, access secret, consumer key and consumer
# secret. `t` is the client used for the search calls in the cells below.
t = Twitter(
    auth=OAuth(creds['ACCESS_TOKEN'], creds['ACCESS_SECRET'], creds['CONSUMER_KEY'], creds['CONSUMER_SECRET']))

In [None]:
result = t.search.tweets(q="data science", lang = 'en', count = 100, result_type = 'mixed')

In [None]:
len(result['statuses'])

97

In [None]:
count = result['search_metadata']['count']

In [None]:
# Collect selected fields from every returned status into parallel lists.
# Iterate over the statuses themselves rather than range(count): `count`
# comes from search_metadata, and the API may return fewer statuses than
# that, which made the index-based loop raise IndexError.
dict_ = {'user': [], 'date': [], 'text': [], 'location':[]}
for status in result['statuses']:
  dict_['user'].append(status['user']['screen_name'])
  dict_['date'].append(status['created_at'])
  dict_['text'].append(status['text'])
  dict_['location'].append(status['user']['location'])

In [None]:
import pandas as pd
pd.DataFrame(dict_)

Unnamed: 0,user,date,text,location
0,Jillie_Alexis,Sun Nov 29 18:30:22 +0000 2020,There is no data or science that supports the ...,"Washington, DC"
1,BogochIsaac,Mon Nov 30 12:14:32 +0000 2020,Moderna's #COVID19 vaccine final results are i...,University of Toronto
2,SecAzar,Mon Nov 30 13:40:09 +0000 2020,"With today's announcement from Moderna, we'll ...","Washington, DC"
3,Sdimaso,Mon Nov 30 13:43:33 +0000 2020,Why? Our RT was 1.14 as of yesterday! \nGov he...,New Jersey
4,laura_nelson,Mon Nov 30 00:08:52 +0000 2020,There are about 15 people here in Echo Park pr...,"Los Angeles, CA"
5,johnandkenshow,Mon Nov 30 22:10:54 +0000 2020,No Science - No Data - No Shutdown // Dozens o...,Southern California
6,fchollet,Mon Nov 30 16:51:03 +0000 2020,"Every year, Kaggle runs a large-sample-size su...",United States
7,Esri,Sun Nov 29 18:25:04 +0000 2020,Find free guided lessons in imagery and remote...,"Redlands, CA"
8,SteveHiltonx,Tue Dec 01 01:44:09 +0000 2020,"why are schools STILL closed when DATA, SCIENC...",California
9,wef,Mon Nov 30 05:00:04 +0000 2020,"The key to stopping Alzheimer's, according to ...","Geneva, Switzerland"


In [None]:
def searchTweets(query, count=100, result_type='popular', lang='en'):
    """Search Twitter with the `twitter` package and collect tweet fields.

    Credentials (access token/secret and consumer key/secret) are read
    from the project's ``twitter_credentials.json`` file on every call.

    Parameters
    ----------
    query : search string, sent as ``q``.
    count : maximum number of statuses to request and to keep.
    result_type : forwarded to the search call.
    lang : language filter forwarded to the search call. Defaults to
        'en', the value that used to be hard-coded, so existing callers
        are unaffected and ``lang=`` can now be passed.

    Returns
    -------
    dict of parallel lists keyed by 'user', 'date', 'text' and
    'location', one entry per usable status.
    """
    # Load credentials from json file
    with open("/content/drive/MyDrive/Lighthouselabs/Project_Planning/Final_Project/twitter_credentials.json", "r") as file:
        creds = json.load(file)

    # Instantiate the API client
    t = Twitter(
        auth=OAuth(creds['ACCESS_TOKEN'], creds['ACCESS_SECRET'], creds['CONSUMER_KEY'], creds['CONSUMER_SECRET']))

    # Run the search
    result = t.search.tweets(q=query, count=count, result_type=result_type, lang=lang)

    columns = ('user', 'date', 'text', 'location')
    dict_ = {column: [] for column in columns}

    # Walk the statuses actually returned (there may be fewer than
    # `count`) instead of indexing up to `count` and swallowing every
    # exception with a bare `except`.
    for status in result['statuses'][:count]:
        try:
            # Extract every field first, so a malformed status is skipped
            # as a whole and the four lists stay the same length.
            row = (
                status['user']['screen_name'],
                status['created_at'],
                status['text'],
                status['user']['location'],
            )
        except (KeyError, TypeError):
            continue
        for column, value in zip(columns, row):
            dict_[column].append(value)

    return dict_

In [None]:
result = getSentiment('covid')

UFuncTypeError: ignored

In [None]:
result

Unnamed: 0,text,sentiment,confidence
0,"33,772,990 people around the world have gotten...",Negative,52.57
1,My father-in-law ‘Coco’s dad’ was a serious ‘N...,Negative,89.51
2,Our response to covid sometimes makes me feel ...,Negative,70.53
3,Percent of wages currently subsidized by gover...,Positive,52.62
4,"A friend’s husband, quarantined at home with C...",Negative,87.99
5,I strongly support this Supreme Court ruling d...,Positive,78.24
6,8+ months into COVID and the average American ...,Negative,56.04
7,We have no time to lose to address the economi...,Negative,63.57
8,"To all the “tough guys"" who don't want to wear...",Negative,59.17
9,Florida Gov. Ron DeSantis says public schools ...,Negative,53.52


In [None]:
vectoriser, LRmodel = load_models()

In [None]:
# Score four hand-written sentences to inspect the model's labels,
# class probabilities and the derived confidence values.
samples = ['I Hate Python!','I love Python!', 'python is not bad', 'python is not good']
cleaned = preprocess(samples)
textdata = vectoriser.transform(cleaned)
sentiment = LRmodel.predict(textdata)
probaScore = LRmodel.predict_proba(textdata)
confidence = getConfidence(sentiment, probaScore)

In [None]:
probaScore

array([[0.96285099, 0.03714901],
       [0.0203129 , 0.9796871 ],
       [0.22348136, 0.77651864],
       [0.98662315, 0.01337685]])

In [None]:
sentiment

array([0, 1, 1, 0])

In [None]:
confidence

97.96870986282545

In [None]:
getConfidence(sentiment, probaScore)

[96.29, 97.97, 77.65, 98.66]