In [1]:
import pandas as pd
import numpy as np
import re
import string
import emoji
import nltk
nltk.download('words')
words = set(nltk.corpus.words.words())
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
# ML Libraries
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Global Parameters
#stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Christopher\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [2]:
df = pd.read_csv('Resources/covid_vaccine_090721.csv') #Dataset from https://www.kaggle.com/kaushiksuresh147/covidvaccine-tweets
df.head()


Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet
0,NSW Multicultural Health Communication Service,,Official account for NSW Multicultural Health ...,2018-08-20 02:00:05,1277,243,2311,True,2021-09-06 23:55:48,“Please get vaccinated. Take COVID seriously. ...,,TweetDeck,False
1,Climate Detective,,Physicist (MA & PhD) & socialist exploring cli...,2020-11-07 17:03:33,94,155,354,False,2021-09-06 23:46:16,@apsmunro That is a lot more than the risk fro...,['CovidVaccine'],Twitter Web App,False
2,david sirias,"Port Townsend, WA",Lefty songwriter/musician/producer/publisher. ...,2011-10-28 20:14:55,1262,1115,489,False,2021-09-06 23:38:34,"As I said months ago, anyone supporting a blan...","['FascistEmployerMandate', 'FascistVaccinePass...",Twitter for iPad,False
3,COVID News,Estados Unidos,News about COVID-19,2013-06-05 08:52:31,3330,1472,130,False,2021-09-06 23:36:50,More crazy Chiropractors!!! This time in Flori...,,Revive Social App,False
4,gain-of-function research,Undr the pine hunting lobsters,@Ayjchan's mandate: 'ONLY look @ a -Leak- frm ...,2013-12-24 20:30:14,272,20,24001,False,2021-09-06 23:29:20,Great 18 point case about why you should not t...,"['CovidVaccine', 'COVIDVaccination']",Twitter for Android,False


In [3]:
df.dtypes

user_name           object
user_location       object
user_description    object
user_created        object
user_followers       int64
user_friends         int64
user_favourites      int64
user_verified         bool
date                object
text                object
hashtags            object
source              object
is_retweet            bool
dtype: object

In [4]:
# def load_dataset(filename):
#     dataset = pd.read_csv('../Resources/covid_vaccine_090721.csv')
#     dataset.fillna('', inplace = True)
#     #dataset.columns = cols
#     return dataset

In [5]:
n_df = df.drop(columns = ['user_name', 'user_location','user_description','user_created',
                         'user_followers','user_friends','user_favourites','source','is_retweet'], axis = 1)
n_df

Unnamed: 0,user_verified,date,text,hashtags
0,True,2021-09-06 23:55:48,“Please get vaccinated. Take COVID seriously. ...,
1,False,2021-09-06 23:46:16,@apsmunro That is a lot more than the risk fro...,['CovidVaccine']
2,False,2021-09-06 23:38:34,"As I said months ago, anyone supporting a blan...","['FascistEmployerMandate', 'FascistVaccinePass..."
3,False,2021-09-06 23:36:50,More crazy Chiropractors!!! This time in Flori...,
4,False,2021-09-06 23:29:20,Great 18 point case about why you should not t...,"['CovidVaccine', 'COVIDVaccination']"
...,...,...,...,...
5674,False,2021-09-01 00:04:20,Excellent summary of where we are headed as a ...,
5675,False,2021-09-01 00:02:26,@krispykreme is #sweetening its #COVIDvaccine ...,"['sweetening', 'COVIDvaccine']"
5676,False,2021-09-01 00:02:16,#CovidVaccine Does not work.. https://t.co/Xoi...,['CovidVaccine']
5677,False,2021-09-01 00:00:41,"We are excited to be soaring together again, b...",


In [6]:
#regex the hashtags 
n_df['hashtags'] = n_df['hashtags'].apply(lambda x: re.sub('\[.*?\]',' ', str(x)))
n_df['hashtags']

0       nan
1          
2          
3       nan
4          
       ... 
5674    nan
5675       
5676       
5677    nan
5678    nan
Name: hashtags, Length: 5679, dtype: object

In [7]:
# def remove_unwanted_cols(dataset, cols):
#     for col in cols:
#         del dataset[col]
#     return dataset

In [8]:
n_df[['text']]

Unnamed: 0,text
0,“Please get vaccinated. Take COVID seriously. ...
1,@apsmunro That is a lot more than the risk fro...
2,"As I said months ago, anyone supporting a blan..."
3,More crazy Chiropractors!!! This time in Flori...
4,Great 18 point case about why you should not t...
...,...
5674,Excellent summary of where we are headed as a ...
5675,@krispykreme is #sweetening its #COVIDvaccine ...
5676,#CovidVaccine Does not work.. https://t.co/Xoi...
5677,"We are excited to be soaring together again, b..."


In [9]:
def cleaner(tweet):
    tweet = re.sub("@[A-Za-z0-9]+","",tweet) #Remove @ sign
    tweet = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", tweet) #Remove http links
    tweet = " ".join(tweet.split())
    tweet = ''.join(c for c in tweet if c not in emoji.UNICODE_EMOJI) #Remove Emojis
    tweet = tweet.replace("#", "").replace("_", " ") #Remove hashtag sign but keep the text
    tweet = " ".join(w for w in nltk.wordpunct_tokenize(tweet) \
         if w.lower() in words or not w.isalpha())
    return tweet
n_df['text'] = n_df['text'].map(lambda x: cleaner(x))
#n_df.to_csv('') #specify location

In [10]:
n_df.head()

Unnamed: 0,user_verified,date,text,hashtags
0,True,2021-09-06 23:55:48,“ Please get . Take COVID seriously . It ’ s n...,
1,False,2021-09-06 23:46:16,That is a lot more than the risk from the . An...,
2,False,2021-09-06 23:38:34,"As I said ago , anyone supporting a blanket is...",
3,False,2021-09-06 23:36:50,More crazy !!! This time in where a non - medi...,
4,False,2021-09-06 23:29:20,Great 18 point case about why you should not t...,


In [11]:
def clean_tweet(temp):
    
    #for temp in range(0, len(n_df)) :
         #if type(temp) == float:
         #    return ""
         #else: 
            #print(temp)
    temp = temp.lower()
    temp = re.sub("'", "", str(temp)) # to avoid removing contractions in english
    temp = re.sub("@[A-Za-z0-9_]+","", str(temp))
    temp = re.sub("#[A-Za-z0-9_]+","", str(temp))
    temp = re.sub(r"www.\S+", "", temp)
    temp = re.sub(r"http\S+", "", temp)
    temp = re.sub('[()!?]', ' ', temp)
    temp = re.sub('\[.*?\]',' ', temp)
    temp = re.sub("[^a-z0-9]"," ", temp)
    temp = temp.split()
    stopwords = ["for", "on", "an", "a", "of", "and", "in", "the", "to", "from"]
    temp = [w for w in temp if not w in stopwords]
    temp = " ".join(word for word in temp)
    return temp
n_df['text'] = n_df['text'].map(lambda x: clean_tweet(x))

In [12]:
n_df[['text']]

Unnamed: 0,text
0,please get take covid seriously it s not joke
1,that is lot more than risk risk vaccine is pos...
2,as i said ago anyone supporting blanket is not...
3,more crazy this time where non medical doctor ...
4,great 18 point case about why you should not take
...,...
5674,excellent summary where we are headed as race ...
5675,is sweetening its deal 3 0 through sept 5 will...
5676,does not work
5677,we are excited be soaring together again but i...


In [13]:
# #lower case
temp = n_df['text'].str.lower()
df = pd.DataFrame(temp)
df
#temp

Unnamed: 0,text
0,please get take covid seriously it s not joke
1,that is lot more than risk risk vaccine is pos...
2,as i said ago anyone supporting blanket is not...
3,more crazy this time where non medical doctor ...
4,great 18 point case about why you should not take
...,...
5674,excellent summary where we are headed as race ...
5675,is sweetening its deal 3 0 through sept 5 will...
5676,does not work
5677,we are excited be soaring together again but i...


In [14]:
#remove the mentions and hashtags 

def remove_mentions(temp):
    temp = re.sub("@[A-Za-z0-9_]+","", temp)
    temp = re.sub("#[A-Za-z0-9_]+","", temp)
    return temp
    
#temp1 = 
#temp1 = remove_mentions(df['text'])
o_df = df['text'].map(lambda x: remove_mentions(x))
o_df = pd.DataFrame(o_df)

In [15]:
#remove links 
def remove_links(temp):
    temp = re.sub(r"http\S+", "", temp)
    temp = re.sub(r"www.\S+", "", temp)
    return temp
#print(temp)

p_df = o_df['text'].map(lambda x: remove_links(x))

p_df = pd.DataFrame(p_df)
p_df

Unnamed: 0,text
0,please get take covid seriously it s not joke
1,that is lot more than risk risk vaccine is pos...
2,as i said ago anyone supporting blanket is not...
3,more crazy this time where non medical doctor ...
4,great 18 point case about why you should not take
...,...
5674,excellent summary where we are headed as race ...
5675,is sweetening its deal 3 0 through sept 5 will...
5676,does not work
5677,we are excited be soaring together again but i...


In [16]:
# #remove punctuations

def remove_punctation(temp):
    temp = re.sub('[()!?]', ' ', temp)
    temp = re.sub('\[.*?\]',' ', temp)
    return temp
p_df = p_df['text'].map(lambda x: remove_punctation(x))
p_df = pd.DataFrame(p_df)
p_df

Unnamed: 0,text
0,please get take covid seriously it s not joke
1,that is lot more than risk risk vaccine is pos...
2,as i said ago anyone supporting blanket is not...
3,more crazy this time where non medical doctor ...
4,great 18 point case about why you should not take
...,...
5674,excellent summary where we are headed as race ...
5675,is sweetening its deal 3 0 through sept 5 will...
5676,does not work
5677,we are excited be soaring together again but i...


In [17]:
# #Filtering non-alphanumeric characters

def non_alpha(temp):
    temp = re.sub("[^a-z0-9]"," ", temp)
    return temp

p_df = p_df['text'].map(lambda x: non_alpha(x))
n_df = pd.DataFrame(p_df)
n_df

Unnamed: 0,text
0,please get take covid seriously it s not joke
1,that is lot more than risk risk vaccine is pos...
2,as i said ago anyone supporting blanket is not...
3,more crazy this time where non medical doctor ...
4,great 18 point case about why you should not take
...,...
5674,excellent summary where we are headed as race ...
5675,is sweetening its deal 3 0 through sept 5 will...
5676,does not work
5677,we are excited be soaring together again but i...


In [20]:
# #tokenize 

def tokenize(temp):
    temp = temp.split()
    return temp

n_df = n_df['text'].map(lambda x: tokenize(x))
n_df = pd.DataFrame(p_df)
n_df


Unnamed: 0,text
0,please get take covid seriously it s not joke
1,that is lot more than risk risk vaccine is pos...
2,as i said ago anyone supporting blanket is not...
3,more crazy this time where non medical doctor ...
4,great 18 point case about why you should not take
...,...
5674,excellent summary where we are headed as race ...
5675,is sweetening its deal 3 0 through sept 5 will...
5676,does not work
5677,we are excited be soaring together again but i...


In [35]:
temp = df[['text']]
temp

Unnamed: 0,text
0,please get take covid seriously it s not joke
1,that is lot more than risk risk vaccine is pos...
2,as i said ago anyone supporting blanket is not...
3,more crazy this time where non medical doctor ...
4,great 18 point case about why you should not take
...,...
5674,excellent summary where we are headed as race ...
5675,is sweetening its deal 3 0 through sept 5 will...
5676,does not work
5677,we are excited be soaring together again but i...


In [37]:
# #remove stopwords
def stopwords(tweet):
        
    stopwords = ["for", "on", "an", "a", "of", "and", "in", "the", "to", "from"]
    tweet = tweet.split()
    temp = [w for w in tweet if not w in stopwords]
    tweet = " ".join(word for word in temp)
    return tweet




p_df = temp['text'].map(lambda x: stopwords(x))




p_df = pd.DataFrame(p_df)
p_df



Unnamed: 0,text
0,please get take covid seriously it s not joke
1,that is lot more than risk risk vaccine is pos...
2,as i said ago anyone supporting blanket is not...
3,more crazy this time where non medical doctor ...
4,great 18 point case about why you should not take
...,...
5674,excellent summary where we are headed as race ...
5675,is sweetening its deal 3 0 through sept 5 will...
5676,does not work
5677,we are excited be soaring together again but i...


In [38]:
def get_feature_vector(train_fit):
    vector = TfidfVectorizer(sublinear_tf=True)
    vector.fit(train_fit)
    return vector

In [39]:
def int_to_string(sentiment):
    if sentiment == 0:
        return "Negative"
    elif sentiment == 2:
        return "Neutral"
    else:
        return "Positive"

In [42]:
temp = df[['text']]
temp

Unnamed: 0,text
0,please get take covid seriously it s not joke
1,that is lot more than risk risk vaccine is pos...
2,as i said ago anyone supporting blanket is not...
3,more crazy this time where non medical doctor ...
4,great 18 point case about why you should not take
...,...
5674,excellent summary where we are headed as race ...
5675,is sweetening its deal 3 0 through sept 5 will...
5676,does not work
5677,we are excited be soaring together again but i...


In [43]:

# Split dataset into Train, Test

# Same tf vector will be used for Testing sentiments on unseen trending data
tf_vector = get_feature_vector(np.array(n_df.iloc[:, 1]).ravel())
X = tf_vector.transform(np.array(n_df.iloc[:, 1]).ravel())
y = np.array(n_df.iloc[:, 0]).ravel()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)

# Training Naive Bayes model
NB_model = MultinomialNB()
NB_model.fit(X_train, y_train)
y_predict_nb = NB_model.predict(X_test)
print(accuracy_score(y_test, y_predict_nb))

# Training Logistics Regression model
LR_model = LogisticRegression(solver='lbfgs')
LR_model.fit(X_train, y_train)
y_predict_lr = LR_model.predict(X_test)
print(accuracy_score(y_test, y_predict_lr))

IndexError: single positional indexer is out-of-bounds

In [None]:
import tweepy as tw
import config
from tqdm import tqdm
import os

In [None]:
consumer_api_key = config.consumer_key
consumer_api_secret = config.consumer_secret

In [None]:
auth = tw.OAuthHandler(consumer_api_key, consumer_api_secret)
api = tw.API(auth, wait_on_rate_limit=True)

In [None]:
search_words = "#covidvaccine -filter:retweets" 

date_since = "2021-09-02"
date_until="2021-09-08"
# Collect tweets
tweets = tw.Cursor(api.search,
              q=search_words,
              lang="en",
              since=date_since,
              until=date_until     
              ).items(7500) 

In [None]:
tweets_copy = []
for tweet in tqdm(tweets):
    tweets_copy.append(tweet)

In [None]:
print(f"New tweets retrieved: {len(tweets_copy)}")

In [None]:
tweets_df = pd.DataFrame()
for tweet in tqdm(tweets_copy):
    hashtags = []
    try:
        for hashtag in tweet.entities["hashtags"]:
            hashtags.append(hashtag["text"])
    except:
        pass
    tweets_df = tweets_df.append(pd.DataFrame({'user_name': tweet.user.name, 
                                               'user_location': tweet.user.location,\
                                               'user_description': tweet.user.description,
                                               'user_created': tweet.user.created_at,
                                               'user_followers': tweet.user.followers_count,
                                               'user_friends': tweet.user.friends_count,
                                               'user_favourites': tweet.user.favourites_count,
                                               'user_verified': tweet.user.verified,
                                               'date': tweet.created_at,
                                               'text': tweet.text, 
                                               'hashtags': [hashtags if hashtags else None],
                                               'source': tweet.source,
                                               'is_retweet': tweet.retweeted}, index=[0]))

In [None]:
tweets_df

In [None]:
#tweets_df.to_csv('../Resources/covid_vaccine_090721.csv',index=False)

In [None]:
n_tweets_df = tweets_df.drop(columns = ['user_name', 'user_location','user_description','user_created',
                         'user_followers','user_friends','user_favourites','source','is_retweet'], axis = 1)
n_tweets_df.head()

In [None]:
n_tweets_df['text'] = n_tweets_df['text'].map(lambda x: cleaner(x))
n_tweets_df.head()

In [None]:


# Creating text feature
n_tweets_df['text'] = n_tweets_df['text'].map(lambda x: cleaner(x))
test_feature = tf_vector.transform(np.array(n_tweets_df.iloc[:, 1]).ravel())

# Using Logistic Regression model for prediction
test_prediction_lr = LR_model.predict(test_feature)

# Averaging out the hashtags result
test_result_ds = pd.DataFrame({'hashtag': n_tweets_df.hashtags, 'prediction':test_prediction_lr})
test_result = test_result_ds.groupby(['hashtags']).max().reset_index()
test_result.columns = ['hashtags', 'predictions']
test_result.predictions = test_result['predictions'].apply(int_to_string)

print(test_result)