# <div align="center"><h1>Preprocessing COVID-19 Dataset</h1></div>

## Importing Libraries

In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
import string
from wordReplace import bruteGen

In [2]:
# Stemmer and lemmatizer instances, built from the classes imported from nltk.stem
ps = PorterStemmer()
wn = WordNetLemmatizer()

In [3]:
# Fetch the NLTK resources this notebook relies on, then build the English stopword set
for resource in ('punkt', 'wordnet', 'stopwords'):
    nltk.download(resource)
stopword = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aksha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\aksha\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aksha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Loading Data

In [4]:
# Read the raw tweet export into a DataFrame
# NOTE(review): the first rows previewed below appear pairwise identical (same tweet_id);
# confirm whether de-duplication is wanted before further processing.
data = pd.read_csv(filepath_or_buffer='canada_tweets.csv')

In [5]:
# Preview the first rows of the raw data
data.head()

Unnamed: 0,has_media,hashtags,img_urls,is_replied,is_reply_to,likes,links,parent_tweet_id,replies,reply_to_users,...,text,text_html,timestamp,timestamp_epochs,tweet_id,tweet_url,user_id,username,video_url,city
0,False,"['COVID19', 'onpoli']",[],False,True,1,[],1.235931e+18,0,"[{'screen_name': 'Yair_Rosenberg', 'user_id': ...",...,As a recent cancer patient w/ long-standing re...,"<p class=""TweetTextSize js-tweet-text tweet-te...",2020-03-06 14:38:24,2020-03-06 14:38:24,1235937783283437568,/marlawd/status/1235937783283437573,49687037,Marla Waltman 🎯,,Toronto
1,False,"['COVID19', 'onpoli']",[],False,True,1,[],1.235931e+18,0,"[{'screen_name': 'Yair_Rosenberg', 'user_id': ...",...,As a recent cancer patient w/ long-standing re...,"<p class=""TweetTextSize js-tweet-text tweet-te...",2020-03-06 14:38:24,2020-03-06 14:38:24,1235937783283437568,/marlawd/status/1235937783283437573,49687037,Marla Waltman 🎯,,Toronto
2,False,"['Covid_19', 'onpoli']",[],True,False,4,[],,4,[],...,I’m supposed to attend the @OntLiberal leaders...,"<p class=""TweetTextSize js-tweet-text tweet-te...",2020-03-03 14:10:21,2020-03-03 14:10:21,1234843558009544704,/marlawd/status/1234843558009544704,49687037,Marla Waltman 🎯,,Toronto
3,False,"['Covid_19', 'onpoli']",[],True,False,4,[],,4,[],...,I’m supposed to attend the @OntLiberal leaders...,"<p class=""TweetTextSize js-tweet-text tweet-te...",2020-03-03 14:10:21,2020-03-03 14:10:21,1234843558009544704,/marlawd/status/1234843558009544704,49687037,Marla Waltman 🎯,,Toronto
4,False,[],[],False,True,1,[],1.233909e+18,0,"[{'screen_name': 'RockyMountViews', 'user_id':...",...,Buy in bulk when things are on sale and my fre...,"<p class=""TweetTextSize js-tweet-text tweet-te...",2020-03-01 02:28:25,2020-03-01 02:28:25,1233942136384577536,/ghlake/status/1233942136384577536,169051022,Linda Bowser@🏠,,Toronto


In [6]:
# List the column names available in the raw data
data.columns

Index(['has_media', 'hashtags', 'img_urls', 'is_replied', 'is_reply_to',
       'likes', 'links', 'parent_tweet_id', 'replies', 'reply_to_users',
       'retweets', 'screen_name', 'text', 'text_html', 'timestamp',
       'timestamp_epochs', 'tweet_id', 'tweet_url', 'user_id', 'username',
       'video_url', 'city'],
      dtype='object')

In [7]:
# Show the dtype pandas assigned to each column
data.dtypes

has_media              bool
hashtags             object
img_urls             object
is_replied             bool
is_reply_to            bool
likes                 int64
links                object
parent_tweet_id     float64
replies               int64
reply_to_users       object
retweets              int64
screen_name          object
text                 object
text_html            object
timestamp            object
timestamp_epochs     object
tweet_id              int64
tweet_url            object
user_id               int64
username             object
video_url           float64
city                 object
dtype: object

In [12]:
# Count the missing values in every column and show the totals
missing_per_column = data.isnull().sum()
print(missing_per_column)

# Banner stating the conclusion drawn from the counts above
for banner_line in (
    "\n-------------------------------------------------------------------------------------",
    "There are missing or Null Value and those columns are not needed so we can drop them",
    "-------------------------------------------------------------------------------------",
):
    print(banner_line)

has_media              0
hashtags               0
img_urls               0
is_replied             0
is_reply_to            0
likes                  0
links                  0
parent_tweet_id     4952
replies                0
reply_to_users         0
retweets               0
screen_name            0
text                   0
text_html              0
timestamp              0
timestamp_epochs       0
tweet_id               0
tweet_url              0
user_id                0
username               0
video_url           9206
city                   0
dtype: int64

-------------------------------------------------------------------------------------
There are missing or Null Value and those columns are not needed so we can drop them
-------------------------------------------------------------------------------------


### Dropping unwanted columns 

In [13]:
# Columns removed before the text preprocessing steps
unwanted_columns = [
    'has_media', 'hashtags', 'img_urls', 'is_replied', 'is_reply_to',
    'links', 'parent_tweet_id', 'replies', 'reply_to_users', 'retweets',
    'text_html', 'timestamp_epochs', 'tweet_url', 'video_url',
]
# Rebind instead of mutating in place, and ignore names that are already gone,
# so re-running this cell does not raise KeyError on the second execution.
data = data.drop(columns=unwanted_columns, errors='ignore')

In [14]:
# Preview the frame after the unwanted columns were dropped
data.head()

Unnamed: 0,likes,screen_name,text,timestamp,tweet_id,user_id,username,city
0,1,marlawd,As a recent cancer patient w/ long-standing re...,2020-03-06 14:38:24,1235937783283437568,49687037,Marla Waltman 🎯,Toronto
1,1,marlawd,As a recent cancer patient w/ long-standing re...,2020-03-06 14:38:24,1235937783283437568,49687037,Marla Waltman 🎯,Toronto
2,4,marlawd,I’m supposed to attend the @OntLiberal leaders...,2020-03-03 14:10:21,1234843558009544704,49687037,Marla Waltman 🎯,Toronto
3,4,marlawd,I’m supposed to attend the @OntLiberal leaders...,2020-03-03 14:10:21,1234843558009544704,49687037,Marla Waltman 🎯,Toronto
4,1,ghlake,Buy in bulk when things are on sale and my fre...,2020-03-01 02:28:25,1233942136384577536,169051022,Linda Bowser@🏠,Toronto


### Function to expand abbreviations

In [15]:
# Parsed abbreviation tables keyed by file path, so each file is read and
# compiled once instead of once per tweet. Re-running this cell resets the
# cache, which is how to pick up edits made to the abbreviation file.
_ABBREVIATION_CACHE = {}


def _load_abbreviations(path):
    """Return ``(mapping, pattern)`` for the abbreviation file at ``path``.

    Each useful line has the form ``abbreviation=full form``. Blank lines and
    lines without ``=`` are skipped, and only the first ``=`` separates the
    two parts, so a full form may itself contain ``=``. ``pattern`` is
    ``None`` when the file yields no entries.
    """
    if path in _ABBREVIATION_CACHE:
        return _ABBREVIATION_CACHE[path]

    mapping = {}
    with open(path, 'r') as file:
        for line in file:
            abbreviation, separator, full_form = line.strip().partition('=')
            abbreviation = abbreviation.strip()
            if not separator or not abbreviation:
                continue
            mapping[abbreviation] = full_form.strip()

    pattern = None
    if mapping:
        # Longest keys first so a longer abbreviation is preferred over a
        # shorter one that is its prefix.
        alternatives = '|'.join(
            re.escape(key) for key in sorted(mapping, key=len, reverse=True)
        )
        # Look-arounds (rather than \b) keep the match on whole tokens even
        # when an abbreviation starts or ends with a non-word character.
        pattern = re.compile(r'(?<!\w)(?:' + alternatives + r')(?!\w)')

    _ABBREVIATION_CACHE[path] = (mapping, pattern)
    return mapping, pattern


def replace_abbreviations(tweet, abbreviations_path='Abbreviations.txt'):
    """Expand the abbreviations found in ``tweet`` to their full forms.

    Matching is case-sensitive and only applies to whole tokens, so a short
    abbreviation is not rewritten when it merely occurs inside a longer word.
    All replacements happen in a single pass, so text inserted by one
    expansion is never expanded again.

    Parameters
    ----------
    tweet : str
        Text to process. Non-string values are returned unchanged.
    abbreviations_path : str, optional
        File with one ``abbreviation=full form`` entry per line.

    Returns
    -------
    str
        ``tweet`` with every known abbreviation replaced.

    Raises
    ------
    OSError
        If the abbreviation file cannot be opened on first use.
    """
    if not isinstance(tweet, str):
        return tweet

    mapping, pattern = _load_abbreviations(abbreviations_path)
    if pattern is None:
        return tweet
    return pattern.sub(lambda match: mapping[match.group(0)], tweet)

## Preprocess data

In [16]:
# Patterns are compiled once when the cell runs instead of on every call.
_URL_PATTERN = re.compile(r'http\S+')
# Everything that is neither a word character nor whitespace, plus the
# underscore (which counts as a word character but is ASCII punctuation).
_NON_WORD_PATTERN = re.compile(r'[^\w\s]|_')


def preprocess_text(text):
    """Normalise a piece of text for downstream analysis.

    Steps, in order: strip ``http...`` URLs, drop every character that is not
    a word character or whitespace (underscores included), lowercase,
    tokenize with ``word_tokenize`` and discard English stopwords.

    Parameters
    ----------
    text : str
        Raw text, e.g. a tweet body or a display name.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    text = _URL_PATTERN.sub('', text)
    text = _NON_WORD_PATTERN.sub('', text)
    text = text.lower()
    tokens = word_tokenize(text)
    # `stopword` is the English stopword set built once in the NLTK setup
    # cell; reusing it avoids reloading the corpus for every row.
    return ' '.join(token for token in tokens if token not in stopword)

# Overwrite each free-text column with its cleaned version: tweet body, then display name
for column_name in ('text', 'username'):
    data[column_name] = data[column_name].apply(preprocess_text)

In [17]:
# Inspect the cleaned text and username columns
data.head()

Unnamed: 0,likes,screen_name,text,timestamp,tweet_id,user_id,username,city
0,1,marlawd,recent cancer patient w longstanding respirato...,2020-03-06 14:38:24,1235937783283437568,49687037,marla waltman,Toronto
1,1,marlawd,recent cancer patient w longstanding respirato...,2020-03-06 14:38:24,1235937783283437568,49687037,marla waltman,Toronto
2,4,marlawd,im supposed attend ontliberal leadership conve...,2020-03-03 14:10:21,1234843558009544704,49687037,marla waltman,Toronto
3,4,marlawd,im supposed attend ontliberal leadership conve...,2020-03-03 14:10:21,1234843558009544704,49687037,marla waltman,Toronto
4,1,ghlake,buy bulk things sale freezer stocked im good e...,2020-03-01 02:28:25,1233942136384577536,169051022,linda bowser,Toronto


In [18]:
# Run every tweet through bruteGen, imported from the local wordReplace module
# NOTE(review): bruteGen's behaviour is not visible in this notebook — confirm what it replaces
data['text'] = data['text'].apply(bruteGen)

In [19]:
# Expand abbreviations in every tweet with replace_abbreviations (reads Abbreviations.txt)
data['text'] = data['text'].apply(replace_abbreviations)

In [21]:
# Write the cleaned frame to a new CSV file, leaving out the index, then confirm
data.to_csv(
    path_or_buf='preprocessed_canada_tweets.csv',
    index=False,
)

print("Preprocessed dataset saved successfully.")

Preprocessed dataset saved successfully.
