# Preprocessing - COVID

### 0. Imports

In [110]:
import emoji
import pandas as pd
import re
import string

from textblob import TextBlob

### 1. Loading data

In [111]:
# Location of the COVID headlines CSV. The path is relative, so it is resolved
# against the directory the notebook is run from.
COVID_PATH = "data/basic/covid/data.csv"

In [112]:
# Load the raw CSV into a DataFrame using pandas' default parsing options.
# NOTE(review): the displayed frame carries an "Unnamed: 0" column, which usually
# means the file was written together with its index. Passing index_col=0 would
# drop it -- confirm nothing downstream relies on that column first.
dataset = pd.read_csv(COVID_PATH)

In [113]:
dataset

Unnamed: 0,headlines,outcome
0,A post claims compulsory vacination violates t...,0
1,A photo claims that this person is a doctor wh...,0
2,Post about a video claims that it is a protest...,0
3,All deaths by respiratory failure and pneumoni...,0
4,The dean of the College of Biologists of Euska...,0
...,...,...
10196,A Chinese market caused the new coronavirus (v...,0
10197,The peak of the new coronavirus will happen in...,0
10198,Stores and supermarkets in Veracruz (Mexico) w...,0
10199,"A chain message circulated on Tuesday, Jan. 14...",0


### 2. Hashtags preprocessing

In [114]:
def find_hashtags(row: pd.Series) -> list:
    """Return the hashtags found in a row's headline, without the leading ``#``.

    Args:
        row: One dataset row, as handed over by ``DataFrame.apply(axis=1)``,
            with the headline text stored under ``'headlines'``.

    Returns:
        The hashtag bodies in order of appearance, duplicates included. A
        hashtag body is the run of word characters directly after a ``#``;
        a ``#`` that is not followed by a word character is ignored.
    """
    # The capture group makes ``findall`` return only the text after "#",
    # so no second pass is needed to strip the marker.
    return re.findall(r"#(\w+)", row['headlines'])

In [115]:
def remove_hashtags(row: pd.Series) -> str:
    """Return the row's headline with every ``#`` character deleted.

    Only the marker is removed; the hashtag's text stays in the headline
    (``"#covid"`` becomes ``"covid"``).

    Args:
        row: One dataset row, as handed over by ``DataFrame.apply(axis=1)``,
            with the headline text stored under ``'headlines'``.

    Returns:
        The headline without any ``#`` characters.
    """
    # A literal single-character deletion does not need the regex engine.
    return row['headlines'].replace("#", "")

In [116]:
def preprocess_hashtags(dataset: pd.DataFrame) -> pd.DataFrame:
    """Move hashtags into their own column and drop the ``#`` markers.

    ``dataset`` is modified in place: a ``'hashtags'`` column is added (one
    list per row, produced by ``find_hashtags``) and ``'headlines'`` is
    overwritten with the output of ``remove_hashtags``.

    Args:
        dataset: Frame with a ``'headlines'`` column.

    Returns:
        The same ``dataset`` object, after modification.
    """
    # Tags have to be collected first: once the "#" markers are stripped from
    # the headlines there is nothing left to locate them by.
    tags_per_row = dataset.apply(find_hashtags, axis=1)
    dataset['hashtags'] = tags_per_row

    headlines_without_markers = dataset.apply(remove_hashtags, axis=1)
    dataset['headlines'] = headlines_without_markers

    return dataset

In [117]:
# Hashtag step: adds a 'hashtags' column and strips the "#" markers from 'headlines'.
dataset = preprocess_hashtags(dataset)

In [118]:
dataset

Unnamed: 0,headlines,outcome,hashtags
0,A post claims compulsory vacination violates t...,0,[]
1,A photo claims that this person is a doctor wh...,0,[]
2,Post about a video claims that it is a protest...,0,[]
3,All deaths by respiratory failure and pneumoni...,0,[]
4,The dean of the College of Biologists of Euska...,0,[]
...,...,...,...
10196,A Chinese market caused the new coronavirus (v...,0,[]
10197,The peak of the new coronavirus will happen in...,0,[]
10198,Stores and supermarkets in Veracruz (Mexico) w...,0,[]
10199,"A chain message circulated on Tuesday, Jan. 14...",0,[]


### 3. Lowercase preprocessing

In [119]:
def convert_to_lowercase(dataset: pd.DataFrame) -> pd.DataFrame:
    """Lowercase the text of the ``'headlines'`` column.

    The column is replaced on ``dataset`` itself (no copy is made) and the
    same object is returned, so the call can be reassigned or chained.

    Args:
        dataset: Frame with a string ``'headlines'`` column.

    Returns:
        ``dataset``, with every headline lowercased. Missing headlines stay
        missing instead of raising.
    """
    # The vectorized string accessor passes missing values through, which a
    # per-element ``text.lower()`` call cannot do.
    dataset['headlines'] = dataset['headlines'].str.lower()

    return dataset

In [120]:
# Lowercase step: rewrites 'headlines' in lowercase.
dataset = convert_to_lowercase(dataset)

In [121]:
dataset

Unnamed: 0,headlines,outcome,hashtags
0,a post claims compulsory vacination violates t...,0,[]
1,a photo claims that this person is a doctor wh...,0,[]
2,post about a video claims that it is a protest...,0,[]
3,all deaths by respiratory failure and pneumoni...,0,[]
4,the dean of the college of biologists of euska...,0,[]
...,...,...,...
10196,a chinese market caused the new coronavirus (v...,0,[]
10197,the peak of the new coronavirus will happen in...,0,[]
10198,stores and supermarkets in veracruz (mexico) w...,0,[]
10199,"a chain message circulated on tuesday, jan. 14...",0,[]


### 4. Punctuation preprocessing

In [122]:
# Deletion table for ``str.translate``; built once so every call is a single
# pass over the headline instead of a membership scan per character.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def remove_punctuation(row: pd.Series) -> str:
    """Return the row's headline with ASCII punctuation deleted.

    Characters listed in ``string.punctuation`` are removed outright (not
    replaced by a space), so ``"covid-19"`` becomes ``"covid19"``. Characters
    outside that set, such as typographic quotes, are left untouched.

    Args:
        row: One dataset row, as handed over by ``DataFrame.apply(axis=1)``,
            with the headline text stored under ``'headlines'``.

    Returns:
        The headline without ASCII punctuation.
    """
    return row['headlines'].translate(_PUNCTUATION_TABLE)

In [123]:
def preprocess_punctuation(dataset: pd.DataFrame) -> pd.DataFrame:
    """Overwrite ``'headlines'`` with punctuation-free text.

    Each row is passed to ``remove_punctuation`` and the results replace the
    ``'headlines'`` column on ``dataset`` itself.

    Args:
        dataset: Frame with a ``'headlines'`` column.

    Returns:
        The same ``dataset`` object, after modification.
    """
    # axis="columns" is the spelled-out form of axis=1: the function receives one row at a time.
    cleaned_headlines = dataset.apply(remove_punctuation, axis="columns")
    dataset['headlines'] = cleaned_headlines
    return dataset

In [124]:
# Punctuation step: deletes the characters in string.punctuation from 'headlines'.
dataset = preprocess_punctuation(dataset)

In [125]:
dataset

Unnamed: 0,headlines,outcome,hashtags
0,a post claims compulsory vacination violates t...,0,[]
1,a photo claims that this person is a doctor wh...,0,[]
2,post about a video claims that it is a protest...,0,[]
3,all deaths by respiratory failure and pneumoni...,0,[]
4,the dean of the college of biologists of euska...,0,[]
...,...,...,...
10196,a chinese market caused the new coronavirus video,0,[]
10197,the peak of the new coronavirus will happen in...,0,[]
10198,stores and supermarkets in veracruz mexico wil...,0,[]
10199,a chain message circulated on tuesday jan 14 w...,0,[]


### 5. Emojis preprocessing

In [126]:
def find_emojis(row: pd.Series) -> list:
    """Return the distinct emojis that occur in the row's headline.

    Args:
        row: One dataset row, as handed over by ``DataFrame.apply(axis=1)``,
            with the headline text stored under ``'headlines'``.

    Returns:
        Each emoji found in the headline exactly once, in sorted order.

    NOTE(review): the notebook output shows ``trade_mark`` entries, so plain
    symbols such as the trade mark sign are reported as emojis as well --
    confirm that this is intended for this dataset.
    """
    # set() guards against duplicates; sorted() pins the order, which would
    # otherwise change between interpreter runs because string hashing is
    # randomized per process.
    return sorted(set(emoji.distinct_emoji_list(row['headlines'])))

def interpret_emojis(row: pd.Series) -> list:
    """Translate the row's emojis into their textual names.

    Args:
        row: One dataset row, as handed over by ``DataFrame.apply(axis=1)``,
            with a list of emoji characters stored under ``'emojis'``.

    Returns:
        One name per entry of ``row['emojis']``, in the same order. An empty
        input list yields an empty list.
    """
    # Empty delimiters make demojize emit the bare name instead of wrapping
    # it in its default delimiters.
    bare_name = ("", "")
    return [emoji.demojize(symbol, delimiters=bare_name) for symbol in row['emojis']]

def remove_emojis(row: pd.Series) -> str:
    """Return the row's headline after passing it through ``emoji.replace_emoji``.

    No replacement text is supplied, so the library's default substitution
    applies to every emoji it finds.

    Args:
        row: One dataset row, as handed over by ``DataFrame.apply(axis=1)``,
            with the headline text stored under ``'headlines'``.

    Returns:
        The headline as returned by ``emoji.replace_emoji``.
    """
    headline = row['headlines']
    return emoji.replace_emoji(headline)

def preprocess_emojis(dataset: pd.DataFrame) -> pd.DataFrame:
    """Record each headline's emojis by name and strip them from the text.

    ``dataset`` is modified in place: an ``'emojis'`` column is added and
    ``'headlines'`` is overwritten with the output of ``remove_emojis``.

    Args:
        dataset: Frame with a ``'headlines'`` column.

    Returns:
        The same ``dataset`` object, after modification.
    """
    # Order matters: the emojis are collected from the headline, then the
    # collected characters are turned into names (reading the column written
    # by the previous step), and only then is the headline itself cleaned.
    row_steps = (
        ('emojis', find_emojis),
        ('emojis', interpret_emojis),
        ('headlines', remove_emojis),
    )
    for target_column, row_function in row_steps:
        dataset[target_column] = dataset.apply(row_function, axis=1)

    return dataset

In [127]:
# Emoji step: adds an 'emojis' column of emoji names and rewrites 'headlines' via remove_emojis.
dataset = preprocess_emojis(dataset)

In [128]:
dataset

Unnamed: 0,headlines,outcome,hashtags,emojis
0,a post claims compulsory vacination violates t...,0,[],[trade_mark]
1,a photo claims that this person is a doctor wh...,0,[],[]
2,post about a video claims that it is a protest...,0,[],[]
3,all deaths by respiratory failure and pneumoni...,0,[],[]
4,the dean of the college of biologists of euska...,0,[],[trade_mark]
...,...,...,...,...
10196,a chinese market caused the new coronavirus video,0,[],[]
10197,the peak of the new coronavirus will happen in...,0,[],[]
10198,stores and supermarkets in veracruz mexico wil...,0,[],[]
10199,a chain message circulated on tuesday jan 14 w...,0,[],[]


### 6. Sentiment analysis

In [129]:
def define_sentiment(row: pd.Series) -> str:
    """Label a row by the sign of its polarity score.

    Args:
        row: One dataset row, as handed over by ``DataFrame.apply(axis=1)``,
            with a numeric score stored under ``'polarity'``.

    Returns:
        ``'negative'`` for a score below zero, ``'positive'`` for a score
        above zero, and ``'neutral'`` otherwise.
    """
    polarity = row['polarity']
    if polarity < 0:
        return 'negative'
    if polarity > 0:
        return 'positive'
    # Exactly zero -- or NaN, for which both comparisons above are false.
    return 'neutral'

def analyze_sentiment(dataset: pd.DataFrame) -> pd.DataFrame:
    """Attach TextBlob sentiment scores and a sentiment label to every row.

    ``dataset`` is modified in place and gains three columns:
    ``'polarity'`` and ``'subjectivity'`` (read from each headline's
    ``TextBlob(...).sentiment``) and ``'sentiment'`` (the label that
    ``define_sentiment`` derives from the polarity).

    Args:
        dataset: Frame with a ``'headlines'`` column of text.

    Returns:
        The same ``dataset`` object, after modification.
    """
    # Score each headline once and read both components from that result.
    scores = [TextBlob(headline).sentiment for headline in dataset['headlines'].tolist()]

    dataset['polarity'] = [score.polarity for score in scores]
    dataset['subjectivity'] = [score.subjectivity for score in scores]

    # The label needs the 'polarity' column, so it is computed last.
    labels = dataset.apply(define_sentiment, axis=1)
    dataset['sentiment'] = labels

    return dataset

In [130]:
# Sentiment step: adds 'polarity', 'subjectivity' and a 'sentiment' label based on the polarity sign.
dataset = analyze_sentiment(dataset)

In [131]:
dataset

Unnamed: 0,headlines,outcome,hashtags,emojis,polarity,subjectivity,sentiment
0,a post claims compulsory vacination violates t...,0,[],[trade_mark],0.033333,0.500000,positive
1,a photo claims that this person is a doctor wh...,0,[],[],0.500000,0.500000,positive
2,post about a video claims that it is a protest...,0,[],[],0.000000,0.000000,neutral
3,all deaths by respiratory failure and pneumoni...,0,[],[],-0.316667,0.300000,negative
4,the dean of the college of biologists of euska...,0,[],[trade_mark],-0.400000,0.600000,negative
...,...,...,...,...,...,...,...
10196,a chinese market caused the new coronavirus video,0,[],[],0.068182,0.227273,positive
10197,the peak of the new coronavirus will happen in...,0,[],[],0.123603,0.572811,positive
10198,stores and supermarkets in veracruz mexico wil...,0,[],[],0.003788,0.276515,positive
10199,a chain message circulated on tuesday jan 14 w...,0,[],[],0.300000,0.450000,positive
