# <div align="center"><h1>Preprocessing COVID-19 Dataset</h1></div>

## Importing Libraries

In [155]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
import string

In [156]:
# Instantiate the NLTK Porter stemmer and WordNet lemmatizer once at module level
ps = PorterStemmer()
wn = WordNetLemmatizer()

In [157]:
# Fetch the NLTK resources: tokenizer models, the WordNet corpus and the
# stop-word lists.
# NOTE(review): recent NLTK releases may additionally need 'punkt_tab' for
# word_tokenize — confirm against the installed version.
for resource in ('punkt', 'wordnet', 'stopwords'):
    nltk.download(resource)

# English stop words, kept as a set
stopword = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aksha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\aksha\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aksha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Loading Data

In [158]:
# Read the tweet CSV (path is relative to the notebook's working directory)
data = pd.read_csv(filepath_or_buffer='covid19_tweets.csv')

In [159]:
# Preview the first five rows of the freshly loaded frame
data.head(n=5)

Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet
0,ᏉᎥ☻լꂅϮ,astroworld,wednesday addams as a disney princess keepin i...,2017-05-26 05:46:42,624,950,18775,False,2020-07-25 12:27:21,If I smelled the scent of hand sanitizers toda...,,Twitter for iPhone,False
1,Tom Basile 🇺🇸,"New York, NY","Husband, Father, Columnist & Commentator. Auth...",2009-04-16 20:06:23,2253,1677,24,True,2020-07-25 12:27:17,Hey @Yankees @YankeesPR and @MLB - wouldn't it...,,Twitter for Android,False
2,Time4fisticuffs,"Pewee Valley, KY",#Christian #Catholic #Conservative #Reagan #Re...,2009-02-28 18:57:41,9275,9525,7254,False,2020-07-25 12:27:14,@diane3443 @wdunlap @realDonaldTrump Trump nev...,['COVID19'],Twitter for Android,False
3,ethel mertz,Stuck in the Middle,#Browns #Indians #ClevelandProud #[]_[] #Cavs ...,2019-03-07 01:45:06,197,987,1488,False,2020-07-25 12:27:10,@brookbanktv The one gift #COVID19 has give me...,['COVID19'],Twitter for iPhone,False
4,DIPR-J&K,Jammu and Kashmir,🖊️Official Twitter handle of Department of Inf...,2017-02-12 06:45:15,101009,168,101,False,2020-07-25 12:27:08,25 July : Media Bulletin on Novel #CoronaVirus...,"['CoronaVirusUpdates', 'COVID19']",Twitter for Android,False


In [160]:
# List the column labels of the loaded frame
data.columns

Index(['user_name', 'user_location', 'user_description', 'user_created',
       'user_followers', 'user_friends', 'user_favourites', 'user_verified',
       'date', 'text', 'hashtags', 'source', 'is_retweet'],
      dtype='object')

In [161]:
# Show the dtype of each column
data.dtypes

user_name           object
user_location       object
user_description    object
user_created        object
user_followers       int64
user_friends         int64
user_favourites      int64
user_verified         bool
date                object
text                object
hashtags            object
source              object
is_retweet            bool
dtype: object

In [162]:
# Checking for Null values: count the missing entries in each column
null_counts = data.isnull().sum()
print(null_counts)

# Choose the message from the computed counts rather than printing a fixed
# claim, so the report stays correct if the data changes.
print("\n---------------------------------")
if null_counts.any():
    print("There are missing or Null Value")
else:
    print("There is no missing or Null Value")
print("---------------------------------")

user_name               0
user_location       36771
user_description    10286
user_created            0
user_followers          0
user_friends            0
user_favourites         0
user_verified           0
date                    0
text                    0
hashtags            51334
source                 77
is_retweet              0
dtype: int64

---------------------------------
There are missing or Null Value
---------------------------------


In [163]:
# Remove, in place, every row that has a missing value in at least one column.
# NOTE(review): this also discards rows whose only gap is in a column that is
# dropped later in this notebook (e.g. 'hashtags') — confirm this is intended.
data.dropna(axis=0, how='any', inplace=True)

In [164]:
# Checking for Null values again: recount the missing entries in each column
null_counts = data.isnull().sum()
print(null_counts)

# Choose the message from the computed counts rather than asserting a fixed
# result, so a failed or skipped clean-up step is actually reported.
print("\n---------------------------------")
if null_counts.any():
    print("There are missing or Null Value")
else:
    print("There is no missing or Null Value")
print("---------------------------------")

user_name           0
user_location       0
user_description    0
user_created        0
user_followers      0
user_friends        0
user_favourites     0
user_verified       0
date                0
text                0
hashtags            0
source              0
is_retweet          0
dtype: int64

---------------------------------
There is no missing or Null Value
---------------------------------


## Preprocess data

In [165]:
# Define a function to preprocess text

# English stop words, loaded lazily on first use and then reused, so the
# corpus is not read again for every value passed through `.apply`.
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop words as a cached frozenset."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def preprocess_text(text):
    """Clean a single text value for downstream analysis.

    Steps, in order: strip URLs (anything starting with ``http``), remove
    every character that is neither a word character nor whitespace, remove
    remaining ASCII punctuation, lowercase, tokenize with ``word_tokenize``,
    drop English stop words, and join the remaining tokens with single spaces.

    Parameters
    ----------
    text : str or None or float
        The raw text. ``None`` and NaN are treated as missing.

    Returns
    -------
    str
        The cleaned text; an empty string when ``text`` is missing.

    Notes
    -----
    NOTE(review): punctuation is stripped before stop-word filtering, so a
    contraction such as "don't" becomes "dont", which presumably no longer
    matches the stop-word list — confirm whether that is intended.
    """
    # Missing values (None, or NaN which is the only float unequal to itself)
    # have no text to clean; return an empty string instead of raising.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove special characters
    text = re.sub(r'[^\w\s]', '', text)
    # Remove punctuation; the underscore is a word character for the regex
    # above, so it is only stripped here.
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Convert to lowercase
    text = text.lower()
    # Tokenization
    tokens = word_tokenize(text)
    # Remove stopwords and join the remaining tokens back into a string
    stop_words = _english_stop_words()
    return ' '.join(word for word in tokens if word not in stop_words)

# Clean the free-text columns one after another, overwriting each in place
for column in ('text', 'user_description', 'user_name'):
    data[column] = data[column].apply(preprocess_text)

In [166]:
# Preview the first five rows after text preprocessing
data.head(n=5)

Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet
2,time4fisticuffs,"Pewee Valley, KY",christian catholic conservative reagan republi...,2009-02-28 18:57:41,9275,9525,7254,False,2020-07-25 12:27:14,diane3443 wdunlap realdonaldtrump trump never ...,['COVID19'],Twitter for Android,False
3,ethel mertz,Stuck in the Middle,browns indians clevelandproud cavs resist,2019-03-07 01:45:06,197,987,1488,False,2020-07-25 12:27:10,brookbanktv one gift covid19 give appreciation...,['COVID19'],Twitter for iPhone,False
4,diprjk,Jammu and Kashmir,official twitter handle department information...,2017-02-12 06:45:15,101009,168,101,False,2020-07-25 12:27:08,25 july media bulletin novel coronavirusupdate...,"['CoronaVirusUpdates', 'COVID19']",Twitter for Android,False
5,franz schubert,Новоро́ссия,новороссия novorossiya оставайсядома stayathom...,2018-03-19 16:29:52,1180,1071,1287,False,2020-07-25 12:27:06,coronavirus covid19 deaths continue rise almos...,"['coronavirus', 'covid19']",Twitter Web App,False
6,hr bartender,"Gainesville, FL",workplace tips advice served friendly place co...,2008-08-12 18:19:49,79956,54810,3801,False,2020-07-25 12:27:03,covid19 change work general recruiting specifi...,"['COVID19', 'Recruiting']",Buffer,False


### Dropping unwanted columns 

In [167]:
# Columns to remove from the frame
unwanted_columns = ['user_created', 'user_verified', 'date', 'hashtags', 'source']
# Drop them in place along the column axis
data.drop(labels=unwanted_columns, axis='columns', inplace=True)

In [168]:
# Preview the first five rows of the reduced frame
data.head(n=5)

Unnamed: 0,user_name,user_location,user_description,user_followers,user_friends,user_favourites,text,is_retweet
2,time4fisticuffs,"Pewee Valley, KY",christian catholic conservative reagan republi...,9275,9525,7254,diane3443 wdunlap realdonaldtrump trump never ...,False
3,ethel mertz,Stuck in the Middle,browns indians clevelandproud cavs resist,197,987,1488,brookbanktv one gift covid19 give appreciation...,False
4,diprjk,Jammu and Kashmir,official twitter handle department information...,101009,168,101,25 july media bulletin novel coronavirusupdate...,False
5,franz schubert,Новоро́ссия,новороссия novorossiya оставайсядома stayathom...,1180,1071,1287,coronavirus covid19 deaths continue rise almos...,False
6,hr bartender,"Gainesville, FL",workplace tips advice served friendly place co...,79956,54810,3801,covid19 change work general recruiting specifi...,False
