In [1]:
import pandas as pd
import numpy as np
import nltk
nltk.download('wordnet')
import string
import re
import fasttext
import contractions
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
import sys

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Cisco\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
tweets = pd.read_csv('covidvaccine.csv')

In [3]:
tweets.shape

(38459, 13)

In [4]:
#Checks out what type of data we're working with.

tweets.dtypes

user_name           object
user_location       object
user_description    object
user_created        object
user_followers       int64
user_friends         int64
user_favourites      int64
user_verified         bool
date                object
text                object
hashtags            object
source              object
is_retweet            bool
dtype: object

In [5]:
# Checks out what columns have null values.

for col in tweets.columns:
    print(col,tweets[col].isnull().sum())

user_name 0
user_location 8443
user_description 2373
user_created 0
user_followers 0
user_friends 0
user_favourites 0
user_verified 0
date 0
text 0
hashtags 11621
source 1
is_retweet 0


In [6]:
#Filters data, so the results have are on topic to what this analysis is looking for.

search_Keywords = ['vaccine','vaccines', 'covid-19', 'covid', 'cure','cures', 'corona','virus' 'coronavirus', '#covidvaccine','#coronavaccine'
                  '#coronavirusvaccine','#covid-19','#vaccine','#covid19vaccine']
filter_tweets_by_keywords = tweets['text'].apply(lambda x: len(set(search_Keywords) - set(x.lower().split()))) < len(search_Keywords)
tweets_filtered = tweets[filter_tweets_by_keywords].reset_index()

In [7]:
#Deletes an unnecessary column for our analysis.

del tweets_filtered['is_retweet']

In [8]:
tweets_filtered.shape

(24526, 13)

In [9]:
#Remove any contractions within the 'text' column.
#Then we recombine the elements within the 'no_contractions' column into a string.

tweets_filtered['no_contractions'] = tweets_filtered['text'].apply(lambda x: [contractions.fix(word) for word in x.split()])
tweets_filtered['text_str'] = [' '.join(map(str, l)) for l in tweets_filtered['no_contractions']]
tweets_filtered.head()

Unnamed: 0,index,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,no_contractions,text_str
0,0,MyNewsNE,Assam,MyNewsNE a dedicated multi-lingual media house...,24-05-2020 10:18,64,11,110,False,18-08-2020 12:55,Australia to Manufacture Covid-19 Vaccine and ...,['CovidVaccine'],Twitter Web App,"[Australia, to, Manufacture, Covid-19, Vaccine...",Australia to Manufacture Covid-19 Vaccine and ...
1,1,Shubham Gupta,,I will tell about all experiences of my life f...,14-08-2020 16:42,1,17,0,False,18-08-2020 12:55,#CoronavirusVaccine #CoronaVaccine #CovidVacci...,"['CoronavirusVaccine', 'CoronaVaccine', 'Covid...",Twitter for Android,"[#CoronavirusVaccine, #CoronaVaccine, #CovidVa...",#CoronavirusVaccine #CoronaVaccine #CovidVacci...
2,2,Journal of Infectiology,,Journal of Infectiology (ISSN 2689-9981) is ac...,14-12-2017 07:07,143,566,8,False,18-08-2020 12:46,Deaths due to COVID-19 in Affected Countries\n...,,Twitter Web App,"[Deaths, due, to, COVID-19, in, Affected, Coun...",Deaths due to COVID-19 in Affected Countries R...
3,7,AKisASocialisolationist wash yer damn hands,The Great Pacific Northwest,"Iconoclast, cat person, soccer fan, textile & ...",07-02-2015 07:24,2321,3236,264351,False,18-08-2020 12:30,"@MSNBC Well, let’s qualify that: would anyone ...",['CovidVaccine'],Twitter for iPhone,"[@MSNBC, Well,, let us, qualify, that:, would,...","@MSNBC Well, let us qualify that: would anyone..."
4,9,VUMC OAP,"Nashville, TN","Office of Advanced Practice, Vanderbilt Univer...",16-03-2017 20:22,282,96,788,False,18-08-2020 11:57,#DNA zooms up charts in 1st week; hear #vacci...,"['DNA', 'vaccines', 'pandemic', 'COVID19', 'Co...",Twitter Web App,"[#DNA, zooms, up, charts, in, 1st, week;, hear...",#DNA zooms up charts in 1st week; hear #vaccin...


In [10]:
#Tokenize the words in each of the tweets & lowercase all its elements.

tweets_filtered['tokenized'] = tweets_filtered['text_str'].apply(word_tokenize)
tweets_filtered['low_tokenized'] = tweets_filtered.text.str.lower()
tweets_filtered.head()

Unnamed: 0,index,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,no_contractions,text_str,tokenized,low_tokenized
0,0,MyNewsNE,Assam,MyNewsNE a dedicated multi-lingual media house...,24-05-2020 10:18,64,11,110,False,18-08-2020 12:55,Australia to Manufacture Covid-19 Vaccine and ...,['CovidVaccine'],Twitter Web App,"[Australia, to, Manufacture, Covid-19, Vaccine...",Australia to Manufacture Covid-19 Vaccine and ...,"[Australia, to, Manufacture, Covid-19, Vaccine...",australia to manufacture covid-19 vaccine and ...
1,1,Shubham Gupta,,I will tell about all experiences of my life f...,14-08-2020 16:42,1,17,0,False,18-08-2020 12:55,#CoronavirusVaccine #CoronaVaccine #CovidVacci...,"['CoronavirusVaccine', 'CoronaVaccine', 'Covid...",Twitter for Android,"[#CoronavirusVaccine, #CoronaVaccine, #CovidVa...",#CoronavirusVaccine #CoronaVaccine #CovidVacci...,"[#, CoronavirusVaccine, #, CoronaVaccine, #, C...",#coronavirusvaccine #coronavaccine #covidvacci...
2,2,Journal of Infectiology,,Journal of Infectiology (ISSN 2689-9981) is ac...,14-12-2017 07:07,143,566,8,False,18-08-2020 12:46,Deaths due to COVID-19 in Affected Countries\n...,,Twitter Web App,"[Deaths, due, to, COVID-19, in, Affected, Coun...",Deaths due to COVID-19 in Affected Countries R...,"[Deaths, due, to, COVID-19, in, Affected, Coun...",deaths due to covid-19 in affected countries\n...
3,7,AKisASocialisolationist wash yer damn hands,The Great Pacific Northwest,"Iconoclast, cat person, soccer fan, textile & ...",07-02-2015 07:24,2321,3236,264351,False,18-08-2020 12:30,"@MSNBC Well, let’s qualify that: would anyone ...",['CovidVaccine'],Twitter for iPhone,"[@MSNBC, Well,, let us, qualify, that:, would,...","@MSNBC Well, let us qualify that: would anyone...","[@, MSNBC, Well, ,, let, us, qualify, that, :,...","@msnbc well, let’s qualify that: would anyone ..."
4,9,VUMC OAP,"Nashville, TN","Office of Advanced Practice, Vanderbilt Univer...",16-03-2017 20:22,282,96,788,False,18-08-2020 11:57,#DNA zooms up charts in 1st week; hear #vacci...,"['DNA', 'vaccines', 'pandemic', 'COVID19', 'Co...",Twitter Web App,"[#DNA, zooms, up, charts, in, 1st, week;, hear...",#DNA zooms up charts in 1st week; hear #vaccin...,"[#, DNA, zooms, up, charts, in, 1st, week, ;, ...",#dna zooms up charts in 1st week; hear #vacci...


In [12]:
#Here we'll remove the punctiation.

p = re.compile(r'[^\w\s]+')
tweets_filtered['no_punct'] = [p.sub('', x) for x in tweets_filtered['low_tokenized'].tolist()]

In [15]:
#Removes any stopwords from out data.

stop_words = set(stopwords.words('english'))
tweets_filtered['stopwords_rem'] = tweets_filtered['no_punct'].apply(lambda x: [word for word in x.split() if word not in stop_words])
tweets_filtered.head()

Unnamed: 0,index,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,no_contractions,text_str,tokenized,low_tokenized,no_punct,stopwords_rem
0,0,MyNewsNE,Assam,MyNewsNE a dedicated multi-lingual media house...,24-05-2020 10:18,64,11,110,False,18-08-2020 12:55,Australia to Manufacture Covid-19 Vaccine and ...,['CovidVaccine'],Twitter Web App,"[Australia, to, Manufacture, Covid-19, Vaccine...",Australia to Manufacture Covid-19 Vaccine and ...,"[Australia, to, Manufacture, Covid-19, Vaccine...",australia to manufacture covid-19 vaccine and ...,australia to manufacture covid19 vaccine and g...,"[australia, manufacture, covid19, vaccine, giv..."
1,1,Shubham Gupta,,I will tell about all experiences of my life f...,14-08-2020 16:42,1,17,0,False,18-08-2020 12:55,#CoronavirusVaccine #CoronaVaccine #CovidVacci...,"['CoronavirusVaccine', 'CoronaVaccine', 'Covid...",Twitter for Android,"[#CoronavirusVaccine, #CoronaVaccine, #CovidVa...",#CoronavirusVaccine #CoronaVaccine #CovidVacci...,"[#, CoronavirusVaccine, #, CoronaVaccine, #, C...",#coronavirusvaccine #coronavaccine #covidvacci...,coronavirusvaccine coronavaccine covidvaccine ...,"[coronavirusvaccine, coronavaccine, covidvacci..."
2,2,Journal of Infectiology,,Journal of Infectiology (ISSN 2689-9981) is ac...,14-12-2017 07:07,143,566,8,False,18-08-2020 12:46,Deaths due to COVID-19 in Affected Countries\n...,,Twitter Web App,"[Deaths, due, to, COVID-19, in, Affected, Coun...",Deaths due to COVID-19 in Affected Countries R...,"[Deaths, due, to, COVID-19, in, Affected, Coun...",deaths due to covid-19 in affected countries\n...,deaths due to covid19 in affected countries\nr...,"[deaths, due, covid19, affected, countries, re..."
3,7,AKisASocialisolationist wash yer damn hands,The Great Pacific Northwest,"Iconoclast, cat person, soccer fan, textile & ...",07-02-2015 07:24,2321,3236,264351,False,18-08-2020 12:30,"@MSNBC Well, let’s qualify that: would anyone ...",['CovidVaccine'],Twitter for iPhone,"[@MSNBC, Well,, let us, qualify, that:, would,...","@MSNBC Well, let us qualify that: would anyone...","[@, MSNBC, Well, ,, let, us, qualify, that, :,...","@msnbc well, let’s qualify that: would anyone ...",msnbc well lets qualify that would anyone of a...,"[msnbc, well, lets, qualify, would, anyone, pa..."
4,9,VUMC OAP,"Nashville, TN","Office of Advanced Practice, Vanderbilt Univer...",16-03-2017 20:22,282,96,788,False,18-08-2020 11:57,#DNA zooms up charts in 1st week; hear #vacci...,"['DNA', 'vaccines', 'pandemic', 'COVID19', 'Co...",Twitter Web App,"[#DNA, zooms, up, charts, in, 1st, week;, hear...",#DNA zooms up charts in 1st week; hear #vaccin...,"[#, DNA, zooms, up, charts, in, 1st, week, ;, ...",#dna zooms up charts in 1st week; hear #vacci...,dna zooms up charts in 1st week hear vaccines...,"[dna, zooms, charts, 1st, week, hear, vaccines..."


In [16]:
#Identifies each word's part of speech.

tweets_filtered['pos_tags'] = tweets_filtered['stopwords_rem'].apply(nltk.tag.pos_tag)
tweets_filtered.head()

Unnamed: 0,index,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,no_contractions,text_str,tokenized,low_tokenized,no_punct,stopwords_rem,pos_tags
0,0,MyNewsNE,Assam,MyNewsNE a dedicated multi-lingual media house...,24-05-2020 10:18,64,11,110,False,18-08-2020 12:55,Australia to Manufacture Covid-19 Vaccine and ...,['CovidVaccine'],Twitter Web App,"[Australia, to, Manufacture, Covid-19, Vaccine...",Australia to Manufacture Covid-19 Vaccine and ...,"[Australia, to, Manufacture, Covid-19, Vaccine...",australia to manufacture covid-19 vaccine and ...,australia to manufacture covid19 vaccine and g...,"[australia, manufacture, covid19, vaccine, giv...","[(australia, JJ), (manufacture, NN), (covid19,..."
1,1,Shubham Gupta,,I will tell about all experiences of my life f...,14-08-2020 16:42,1,17,0,False,18-08-2020 12:55,#CoronavirusVaccine #CoronaVaccine #CovidVacci...,"['CoronavirusVaccine', 'CoronaVaccine', 'Covid...",Twitter for Android,"[#CoronavirusVaccine, #CoronaVaccine, #CovidVa...",#CoronavirusVaccine #CoronaVaccine #CovidVacci...,"[#, CoronavirusVaccine, #, CoronaVaccine, #, C...",#coronavirusvaccine #coronavaccine #covidvacci...,coronavirusvaccine coronavaccine covidvaccine ...,"[coronavirusvaccine, coronavaccine, covidvacci...","[(coronavirusvaccine, NN), (coronavaccine, NN)..."
2,2,Journal of Infectiology,,Journal of Infectiology (ISSN 2689-9981) is ac...,14-12-2017 07:07,143,566,8,False,18-08-2020 12:46,Deaths due to COVID-19 in Affected Countries\n...,,Twitter Web App,"[Deaths, due, to, COVID-19, in, Affected, Coun...",Deaths due to COVID-19 in Affected Countries R...,"[Deaths, due, to, COVID-19, in, Affected, Coun...",deaths due to covid-19 in affected countries\n...,deaths due to covid19 in affected countries\nr...,"[deaths, due, covid19, affected, countries, re...","[(deaths, NNS), (due, JJ), (covid19, RB), (aff..."
3,7,AKisASocialisolationist wash yer damn hands,The Great Pacific Northwest,"Iconoclast, cat person, soccer fan, textile & ...",07-02-2015 07:24,2321,3236,264351,False,18-08-2020 12:30,"@MSNBC Well, let’s qualify that: would anyone ...",['CovidVaccine'],Twitter for iPhone,"[@MSNBC, Well,, let us, qualify, that:, would,...","@MSNBC Well, let us qualify that: would anyone...","[@, MSNBC, Well, ,, let, us, qualify, that, :,...","@msnbc well, let’s qualify that: would anyone ...",msnbc well lets qualify that would anyone of a...,"[msnbc, well, lets, qualify, would, anyone, pa...","[(msnbc, RB), (well, RB), (lets, VBZ), (qualif..."
4,9,VUMC OAP,"Nashville, TN","Office of Advanced Practice, Vanderbilt Univer...",16-03-2017 20:22,282,96,788,False,18-08-2020 11:57,#DNA zooms up charts in 1st week; hear #vacci...,"['DNA', 'vaccines', 'pandemic', 'COVID19', 'Co...",Twitter Web App,"[#DNA, zooms, up, charts, in, 1st, week;, hear...",#DNA zooms up charts in 1st week; hear #vaccin...,"[#, DNA, zooms, up, charts, in, 1st, week, ;, ...",#dna zooms up charts in 1st week; hear #vacci...,dna zooms up charts in 1st week hear vaccines...,"[dna, zooms, charts, 1st, week, hear, vaccines...","[(dna, NN), (zooms, NNS), (charts, NNS), (1st,..."


In [17]:
#This will concise the part of speech tags from above (Is it accurate?).

def wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN
    
tweets_filtered['wordnet_pos'] = tweets_filtered['pos_tags'].apply(lambda x: [(word, wordnet_pos(pos_tag)) for (word, pos_tag) in x])
tweets_filtered.head()

Unnamed: 0,index,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,...,hashtags,source,no_contractions,text_str,tokenized,low_tokenized,no_punct,stopwords_rem,pos_tags,wordnet_pos
0,0,MyNewsNE,Assam,MyNewsNE a dedicated multi-lingual media house...,24-05-2020 10:18,64,11,110,False,18-08-2020 12:55,...,['CovidVaccine'],Twitter Web App,"[Australia, to, Manufacture, Covid-19, Vaccine...",Australia to Manufacture Covid-19 Vaccine and ...,"[Australia, to, Manufacture, Covid-19, Vaccine...",australia to manufacture covid-19 vaccine and ...,australia to manufacture covid19 vaccine and g...,"[australia, manufacture, covid19, vaccine, giv...","[(australia, JJ), (manufacture, NN), (covid19,...","[(australia, a), (manufacture, n), (covid19, n..."
1,1,Shubham Gupta,,I will tell about all experiences of my life f...,14-08-2020 16:42,1,17,0,False,18-08-2020 12:55,...,"['CoronavirusVaccine', 'CoronaVaccine', 'Covid...",Twitter for Android,"[#CoronavirusVaccine, #CoronaVaccine, #CovidVa...",#CoronavirusVaccine #CoronaVaccine #CovidVacci...,"[#, CoronavirusVaccine, #, CoronaVaccine, #, C...",#coronavirusvaccine #coronavaccine #covidvacci...,coronavirusvaccine coronavaccine covidvaccine ...,"[coronavirusvaccine, coronavaccine, covidvacci...","[(coronavirusvaccine, NN), (coronavaccine, NN)...","[(coronavirusvaccine, n), (coronavaccine, n), ..."
2,2,Journal of Infectiology,,Journal of Infectiology (ISSN 2689-9981) is ac...,14-12-2017 07:07,143,566,8,False,18-08-2020 12:46,...,,Twitter Web App,"[Deaths, due, to, COVID-19, in, Affected, Coun...",Deaths due to COVID-19 in Affected Countries R...,"[Deaths, due, to, COVID-19, in, Affected, Coun...",deaths due to covid-19 in affected countries\n...,deaths due to covid19 in affected countries\nr...,"[deaths, due, covid19, affected, countries, re...","[(deaths, NNS), (due, JJ), (covid19, RB), (aff...","[(deaths, n), (due, a), (covid19, r), (affecte..."
3,7,AKisASocialisolationist wash yer damn hands,The Great Pacific Northwest,"Iconoclast, cat person, soccer fan, textile & ...",07-02-2015 07:24,2321,3236,264351,False,18-08-2020 12:30,...,['CovidVaccine'],Twitter for iPhone,"[@MSNBC, Well,, let us, qualify, that:, would,...","@MSNBC Well, let us qualify that: would anyone...","[@, MSNBC, Well, ,, let, us, qualify, that, :,...","@msnbc well, let’s qualify that: would anyone ...",msnbc well lets qualify that would anyone of a...,"[msnbc, well, lets, qualify, would, anyone, pa...","[(msnbc, RB), (well, RB), (lets, VBZ), (qualif...","[(msnbc, r), (well, r), (lets, v), (qualify, v..."
4,9,VUMC OAP,"Nashville, TN","Office of Advanced Practice, Vanderbilt Univer...",16-03-2017 20:22,282,96,788,False,18-08-2020 11:57,...,"['DNA', 'vaccines', 'pandemic', 'COVID19', 'Co...",Twitter Web App,"[#DNA, zooms, up, charts, in, 1st, week;, hear...",#DNA zooms up charts in 1st week; hear #vaccin...,"[#, DNA, zooms, up, charts, in, 1st, week, ;, ...",#dna zooms up charts in 1st week; hear #vacci...,dna zooms up charts in 1st week hear vaccines...,"[dna, zooms, charts, 1st, week, hear, vaccines...","[(dna, NN), (zooms, NNS), (charts, NNS), (1st,...","[(dna, n), (zooms, n), (charts, n), (1st, n), ..."


In [20]:
#Converts words into their root form. (didn't work for some - like 'affected' on index[2]).

word_Lemm = WordNetLemmatizer()
tweets_filtered['lemmatized'] = tweets_filtered['wordnet_pos'].apply(lambda x: [word_Lemm.lemmatize(word, tag) for word, tag in x])
tweets_filtered['lemmatized'].head(20)

0     [australia, manufacture, covid19, vaccine, giv...
1     [coronavirusvaccine, coronavaccine, covidvacci...
2     [death, due, covid19, affected, country, read,...
3     [msnbc, well, let, qualify, would, anyone, par...
4     [dna, zoom, chart, 1st, week, hear, vaccine, e...
5     [covid19millionares, covid19, corona, covidvac...
6     [great, news, pfizers, vaccine, enter, phase, ...
7     [dangerous, yet, come, covidvaccine, corona, i...
8     [whole, narrative, covidvaccine, go, deathly, ...
9     [seruminstindia, look, raise, 1, billion, arou...
10    [global, vaccine, tracker, vaccine, covidvacci...
11    [coronavirus, update, american, businessman, t...
12    [search, chennai, corona, affect, street, area...
13    [vaccinate, world, vacine, vaccinate, coronavi...
14    [islamabad, pakistan, begin, phase, iii, clini...
15    [covid19, vocabulary, english, incubation, per...
16    [omg, expose, covid19, vaccine, ingredient, su...
17    [covidvaccine, 29, candidate, begin, trial