This notebook contains code for the cleaning the raw data obtained from the Twitter query.

In [None]:
!pip install nltk

Importing Libraries

In [None]:
import re
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer 
from nltk.tokenize import RegexpTokenizer

import pandas as pd
import numpy as np

nltk.download('wordnet')

Getting the data needs to be cleaned

In [2]:
data = pd.read_csv("covid19_tweets.csv", encoding = "ISO-8859-1", engine="python")
print(data.head())

                       user_name  ... is_retweet
0  Covid Waves - Barbara Volkwyn  ...      False
1                        ET Edge  ...      False
2         Tag My School Magazine  ...      False
3                   Nine Dragons  ...      False
4                 Rajesh Kshitij  ...      False

[5 rows x 13 columns]


Only text data is needed for the sentiment analysis

In [3]:
data=data[['text']]
data['text']=data['text'].str.lower() # all tweets cconverted to the lowercase for working better
print(data.head())

                                                text
0  #saps cops told theyâll face attempted murde...
1  come, be a part of the august audience and joi...
2  lgbtqi activist on the verge of glory read mor...
3  vaccine progress:\nlooking at the progress in ...
4  #himachalpradesh - 155 new #covid19 cases, 187...


Cleaning and removing punctuations

In [4]:
english_punctuations = string.punctuation
punctuations_list = english_punctuations
def cleaning_punctuations(text):
    translator = str.maketrans('', '', punctuations_list)
    return text.translate(translator)
data['text']= data['text'].apply(lambda x: cleaning_punctuations(x))
data['text'].head()

0    saps cops told theyâll face attempted murder...
1    come be a part of the august audience and join...
2    lgbtqi activist on the verge of glory read mor...
3    vaccine progress\nlooking at the progress in t...
4    himachalpradesh  155 new covid19 cases 1878 re...
Name: text, dtype: object

Removing email, URL's & Numeric numbers

In [5]:
def cleaning_email(data):
    return re.sub('@[^\s]+', ' ', data)

data['text']= data['text'].apply(lambda x: cleaning_email(x))

def cleaning_URLs(data):
    return re.sub('((www\.[^\s]+)|(https?://[^\s]+))',' ',data)

data['text'] = data['text'].apply(lambda x: cleaning_URLs(x))


def cleaning_numbers(data):
    return re.sub('[0-9]+', '', data)

data['text'] = data['text'].apply(lambda x: cleaning_numbers(x))
data['text'].head()

0    saps cops told theyâll face attempted murder...
1    come be a part of the august audience and join...
2    lgbtqi activist on the verge of glory read mor...
3    vaccine progress\nlooking at the progress in t...
4    himachalpradesh   new covid cases  recoveries ...
Name: text, dtype: object

Cleaning stopwords from the dataset

In [6]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def cleaning_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in stop_words])
data['text'] = data['text'].apply(lambda text: cleaning_stopwords(text))
data['text'].head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


0    saps cops told theyâll face attempted murder...
1    come part august audience join us virtual summ...
2    lgbtqi activist verge glory read httpstcobkkcf...
3    vaccine progress looking progress countries kn...
4    himachalpradesh new covid cases recoveries amp...
Name: text, dtype: object

Tokenization

In [7]:
tokenizer = RegexpTokenizer(r'\w+')
data['text'] = data['text'].apply(tokenizer.tokenize)
data['text'].head()

0    [saps, cops, told, theyâ, ll, face, attempted,...
1    [come, part, august, audience, join, us, virtu...
2    [lgbtqi, activist, verge, glory, read, httpstc...
3    [vaccine, progress, looking, progress, countri...
4    [himachalpradesh, new, covid, cases, recoverie...
Name: text, dtype: object

Stemming

In [8]:
st = nltk.PorterStemmer()
def stemming_on_text(data):
    text = [st.stem(word) for word in data]
    return data

data['text']= data['text'].apply(lambda x: stemming_on_text(x))
data['text'].head()

0    [saps, cops, told, theyâ, ll, face, attempted,...
1    [come, part, august, audience, join, us, virtu...
2    [lgbtqi, activist, verge, glory, read, httpstc...
3    [vaccine, progress, looking, progress, countri...
4    [himachalpradesh, new, covid, cases, recoverie...
Name: text, dtype: object

Applying Lemmatizer

In [9]:
lm = nltk.WordNetLemmatizer()
def lemmatizer_on_text(data):
    text = [lm.lemmatize(word) for word in data]
    return data

data['text'] = data['text'].apply(lambda x: lemmatizer_on_text(x))
data['text'].head()

0    [saps, cops, told, theyâ, ll, face, attempted,...
1    [come, part, august, audience, join, us, virtu...
2    [lgbtqi, activist, verge, glory, read, httpstc...
3    [vaccine, progress, looking, progress, countri...
4    [himachalpradesh, new, covid, cases, recoverie...
Name: text, dtype: object

In [10]:
data.to_csv('test.csv')