In [1]:
import pandas as pd
import numpy as np
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
# ML Libraries
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Global Parameters
stop_words = set(stopwords.words('english'))

In [2]:
df = pd.read_csv('../Resources/covidvaccine.csv') #Dataset from https://www.kaggle.com/kaushiksuresh147/covidvaccine-tweets
df.head()

Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet
0,MyNewsNE,Assam,MyNewsNE a dedicated multi-lingual media house...,24-05-2020 10:18,64.0,11.0,110.0,False,18-08-2020 12:55,Australia to Manufacture Covid-19 Vaccine and ...,['CovidVaccine'],Twitter Web App,False
1,Shubham Gupta,,I will tell about all experiences of my life f...,14-08-2020 16:42,1.0,17.0,0.0,False,18-08-2020 12:55,#CoronavirusVaccine #CoronaVaccine #CovidVacci...,"['CoronavirusVaccine', 'CoronaVaccine', 'Covid...",Twitter for Android,False
2,Journal of Infectiology,,Journal of Infectiology (ISSN 2689-9981) is ac...,14-12-2017 07:07,143.0,566.0,8.0,False,18-08-2020 12:46,Deaths due to COVID-19 in Affected Countries\n...,,Twitter Web App,False
3,Zane,,Fresher than you.,18-09-2019 11:01,29.0,25.0,620.0,False,18-08-2020 12:45,@Team_Subhashree @subhashreesotwe @iamrajchoco...,,Twitter for Android,False
4,Ann-Maree O’Connor,"Adelaide, South Australia",Retired university administrator. Melburnian b...,24-01-2013 14:53,83.0,497.0,10737.0,False,18-08-2020 12:45,@michellegrattan @ConversationEDU This is what...,,Twitter Web App,False


In [2]:
def load_dataset(filename, cols):
    dataset = pd.read_csv('../Resources/covidvaccine.csv', encoding='latin-1')
    dataset.columns = cols
    return dataset

In [3]:
def remove_unwanted_cols(dataset, cols):
    for col in cols:
        del dataset[col]
    return dataset

In [4]:
def preprocess_tweet_text(tweet):
    #tweet.lower()
    # Remove urls
    tweet = re.sub(r"http\S+|www\S+|https\S+", '', tweet, flags=re.MULTILINE)
    # Remove user @ references and '#' from tweet
    tweet = re.sub(r'\@\w+|\#','', tweet)
    # Remove punctuations
    tweet = tweet.translate(str.maketrans('', '', string.punctuation))
    # Remove stopwords
    tweet_tokens = word_tokenize(tweet)
    filtered_words = [w for w in tweet_tokens if not w in stop_words]
    
    #ps = PorterStemmer()
    #stemmed_words = [ps.stem(w) for w in filtered_words]
    #lemmatizer = WordNetLemmatizer()
    #lemma_words = [lemmatizer.lemmatize(w, pos='a') for w in stemmed_words]
    
    return " ".join(filtered_words)

In [5]:
def get_feature_vector(train_fit):
    vector = TfidfVectorizer(sublinear_tf=True)
    vector.fit(train_fit)
    return vector

In [6]:
def int_to_string(sentiment):
    if sentiment == 0:
        return "Negative"
    elif sentiment == 2:
        return "Neutral"
    else:
        return "Positive"

In [7]:
# Load dataset
dataset = load_dataset("..Resources/covidvaccine.csv", cols=['user_name', 'user_location','user_description',
                                          'user_created','user_followers','user_friends', 'text','date',
                                           'user_favourites','user_verified','hashtags','source',
                                           'is_retweet'])
# Remove unwanted columns from dataset
n_dataset = remove_unwanted_cols(dataset, cols=['user_name','user_location','user_description',
                                                'user_created','user_followers','user_friends',
                                                'user_favourites','user_verified', 
                                                'source', 'is_retweet'])
#Preprocess data
dataset.text = dataset['text'].apply(preprocess_tweet_text)
# Split dataset into Train, Test

# Same tf vector will be used for Testing sentiments on unseen trending data
tf_vector = get_feature_vector(np.array(dataset.iloc[:, 1]).ravel())
X = tf_vector.transform(np.array(dataset.iloc[:, 1]).ravel())
y = np.array(dataset.iloc[:, 0]).ravel()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)

# Training Naive Bayes model
NB_model = MultinomialNB()
NB_model.fit(X_train, y_train)
y_predict_nb = NB_model.predict(X_test)
print(accuracy_score(y_test, y_predict_nb))

# Training Logistics Regression model
LR_model = LogisticRegression(solver='lbfgs')
LR_model.fit(X_train, y_train)
y_predict_lr = LR_model.predict(X_test)
print(accuracy_score(y_test, y_predict_lr))

TypeError: expected string or bytes-like object

In [10]:
#df = df.drop(columns = ['user_name','user_location','user_description',
                       #'user_created','user_followers','user_friends',
                       #'user_favourites','user_verified', 'hashtags',
                       #'source', 'is_retweet'],axis=1)
#df.head()

In [10]:
import tweepy as tw
import config
from tqdm import tqdm
import os

In [11]:
consumer_api_key = config.consumer_key
consumer_api_secret = config.consumer_secret

In [12]:
auth = tw.OAuthHandler(consumer_api_key, consumer_api_secret)
api = tw.API(auth, wait_on_rate_limit=True)

In [16]:
search_words = "#covidvaccine -filter:retweets" 

date_since = "2021-09-01"
date_until="2021-09-07"
# Collect tweets
tweets = tw.Cursor(api.search_tweets,
              q=search_words,
              lang="en",
              since=date_since,
              until=date_until     
              ).items(7500) 

In [17]:
tweets_copy = []
for tweet in tqdm(tweets):
    tweets_copy.append(tweet)

0it [00:00, ?it/s]Unexpected parameter: since
1it [00:00,  3.75it/s]Unexpected parameter: since
14it [00:00, 28.89it/s]Unexpected parameter: since
29it [00:00, 40.55it/s]Unexpected parameter: since
43it [00:01, 46.50it/s]Unexpected parameter: since
57it [00:01, 50.41it/s]Unexpected parameter: since
72it [00:01, 53.65it/s]Unexpected parameter: since
84it [00:01, 52.72it/s]Unexpected parameter: since
97it [00:02, 52.06it/s]Unexpected parameter: since
112it [00:02, 54.02it/s]Unexpected parameter: since
127it [00:02, 53.44it/s]Unexpected parameter: since
142it [00:02, 56.63it/s]Unexpected parameter: since
157it [00:03, 58.48it/s]Unexpected parameter: since
172it [00:03, 59.67it/s]Unexpected parameter: since
186it [00:03, 52.62it/s]


KeyboardInterrupt: 

In [19]:
print(f"New tweets retrieved: {len(tweets_copy)}")

New tweets retrieved: 5679


In [20]:
tweets_df = pd.DataFrame()
for tweet in tqdm(tweets_copy):
    hashtags = []
    try:
        for hashtag in tweet.entities["hashtags"]:
            hashtags.append(hashtag["text"])
    except:
        pass
    tweets_df = tweets_df.append(pd.DataFrame({'user_name': tweet.user.name, 
                                               'user_location': tweet.user.location,\
                                               'user_description': tweet.user.description,
                                               'user_created': tweet.user.created_at,
                                               'user_followers': tweet.user.followers_count,
                                               'user_friends': tweet.user.friends_count,
                                               'user_favourites': tweet.user.favourites_count,
                                               'user_verified': tweet.user.verified,
                                               'date': tweet.created_at,
                                               'text': tweet.text, 
                                               'hashtags': [hashtags if hashtags else None],
                                               'source': tweet.source,
                                               'is_retweet': tweet.retweeted}, index=[0]))

100%|██████████| 5679/5679 [00:23<00:00, 238.24it/s]


In [21]:
tweets_df

Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet
0,NSW Multicultural Health Communication Service,,Official account for NSW Multicultural Health ...,2018-08-20 02:00:05,1277,243,2311,True,2021-09-06 23:55:48,“Please get vaccinated. Take COVID seriously. ...,,TweetDeck,False
0,Climate Detective,,Physicist (MA & PhD) & socialist exploring cli...,2020-11-07 17:03:33,94,155,354,False,2021-09-06 23:46:16,@apsmunro That is a lot more than the risk fro...,[CovidVaccine],Twitter Web App,False
0,david sirias,"Port Townsend, WA",Lefty songwriter/musician/producer/publisher. ...,2011-10-28 20:14:55,1262,1115,489,False,2021-09-06 23:38:34,"As I said months ago, anyone supporting a blan...","[FascistEmployerMandate, FascistVaccinePassport]",Twitter for iPad,False
0,COVID News,Estados Unidos,News about COVID-19,2013-06-05 08:52:31,3330,1472,130,False,2021-09-06 23:36:50,More crazy Chiropractors!!! This time in Flori...,,Revive Social App,False
0,gain-of-function research,Undr the pine hunting lobsters,@Ayjchan's mandate: 'ONLY look @ a -Leak- frm ...,2013-12-24 20:30:14,272,20,24001,False,2021-09-06 23:29:20,Great 18 point case about why you should not t...,"[CovidVaccine, COVIDVaccination]",Twitter for Android,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,🇦🇺 I say 'HELL NO'!🇦🇺,Tristan da Cunha,Immunity: Natural\nVaccination Status: Uncomp...,2021-04-18 22:22:03,444,697,4041,False,2021-09-01 00:04:20,Excellent summary of where we are headed as a ...,,Twitter for Android,False
0,Stephanie,Cleveland!,Hey Its Stephanie!\nVideo/Social Media Produce...,2010-02-01 23:49:52,3668,5001,8745,False,2021-09-01 00:02:26,@krispykreme is #sweetening its #COVIDvaccine ...,"[sweetening, COVIDvaccine]",Twitter for Android,False
0,Not in My Arm-Russ,Somewhere (NOT) Vaccinated,"It's easy to make money at the Track, Bet more...",2014-12-17 23:34:04,4507,5000,13368,False,2021-09-01 00:02:16,#CovidVaccine Does not work.. https://t.co/Xoi...,[CovidVaccine],Twitter Web App,False
0,UH-Clear Lake 😷🧴,"Houston, TX, United States","University of Houston-Clear Lake, home of the ...",2009-02-10 17:41:52,5778,900,5384,False,2021-09-01 00:00:41,"We are excited to be soaring together again, b...",,Hootsuite Inc.,False


In [22]:
tweets_df.to_csv('../Resources/covid_vaccine_090721.csv',index=False)

In [24]:
#load testing file
test_file_name = "../Resources/covid_vaccine_090721.csv" 
test_ds = load_dataset(test_file_name, ['user_name', 'user_location','user_description',
                                          'user_created','user_followers','user_friends', 'text','date',
                                           'user_favourites','user_verified','hashtags','source',
                                           'is_retweet'])
test_ds = remove_unwanted_cols(test_ds, ['user_name','user_location','user_description',
                                                'user_created','user_followers','user_friends',
                                                'user_favourites','user_verified', 
                                                'source', 'is_retweet'])

# Creating text feature
test_ds.text = test_ds["text"].apply(preprocess_tweet_text)
test_feature = tf_vector.transform(np.array(test_ds.iloc[:, 1]).ravel())

# Using Logistic Regression model for prediction
test_prediction_lr = LR_model.predict(test_feature)

# Averaging out the hashtags result
test_result_ds = pd.DataFrame({'hashtag': test_ds.hashtag, 'prediction':test_prediction_lr})
test_result = test_result_ds.groupby(['hashtag']).max().reset_index()
test_result.columns = ['heashtag', 'predictions']
test_result.predictions = test_result['predictions'].apply(int_to_string)

print(test_result)

TypeError: expected string or bytes-like object