# Preprocessing - COVID

### 0. Imports

In [3]:
import re
import string

import emoji
import pandas as pd
from textblob import TextBlob

ModuleNotFoundError: No module named 'textblob'

### 1. Loading data

In [None]:
# Relative path of the CSV file that is read into `dataset` in the next cell.
COVID_PATH = "data/basic/covid/data.csv"


In [None]:
# Load the raw CSV into a DataFrame; the preprocessing cells below read and
# overwrite its 'headlines' column.
dataset = pd.read_csv(COVID_PATH)

In [None]:
# Display the freshly loaded frame as the cell output for a quick visual check.
dataset

### 2. Hashtags preprocessing

In [None]:
def find_hashtags(row: pd.Series) -> list:
    """Collect the hashtags that appear in a row's headline.

    Args:
        row: One record of the dataset, as handed over by
            ``DataFrame.apply(..., axis=1)``; only ``row['headlines']``
            is read.

    Returns:
        The hashtag names in order of appearance, without the leading
        ``#``. Repeated hashtags are kept; the list is empty when the
        headline contains none.
    """
    # The capture group drops the '#' while matching, so no second regex
    # pass over every hit is needed.
    return re.findall(r"#(\w+)", row['headlines'])

In [None]:
def remove_hashtags(row: pd.Series) -> str:
    """Return a row's headline with every ``#`` character deleted.

    Only the ``#`` symbol is removed; the word that followed it stays in
    the text.

    Args:
        row: One record of the dataset, as handed over by
            ``DataFrame.apply(..., axis=1)``; only ``row['headlines']``
            is read.

    Returns:
        The headline without any ``#`` characters.
    """
    # A literal single-character delete does not need the regex engine.
    return row['headlines'].replace("#", "")

In [None]:
def preprocess_hashtags(dataset: pd.DataFrame) -> pd.DataFrame:
    """Extract hashtags into their own column and strip ``#`` from headlines.

    The frame is modified in place: a 'hashtags' column is added and the
    'headlines' column is overwritten.

    Args:
        dataset: Frame with a text column named 'headlines'.

    Returns:
        The same ``dataset`` object, after modification.
    """
    # Order matters: hashtags must be collected while the '#' markers are
    # still present in the headlines.
    steps = (
        ('hashtags', find_hashtags),
        ('headlines', remove_hashtags),
    )
    for column, row_function in steps:
        dataset[column] = dataset.apply(row_function, axis=1)

    return dataset

In [None]:
# Add the 'hashtags' column and strip '#' from the headlines.
dataset = preprocess_hashtags(dataset)

In [None]:
# Display the frame to check the new 'hashtags' column and the cleaned headlines.
dataset

### 3. Lowercase preprocessing

In [None]:
def convert_to_lowercase(dataset: pd.DataFrame) -> pd.DataFrame:
    """Lowercase every entry of the 'headlines' column.

    The frame is modified in place: 'headlines' is overwritten with its
    lowercased version. No other column is touched.

    Args:
        dataset: Frame with a text column named 'headlines'.

    Returns:
        The same ``dataset`` object, after modification.
    """
    # Vectorised string accessor instead of a per-element lambda; it also
    # passes missing values through instead of raising on them.
    dataset['headlines'] = dataset['headlines'].str.lower()

    return dataset

In [None]:
# Lowercase the headlines. The 'hashtags' column was extracted earlier and
# therefore keeps its original casing.
dataset = convert_to_lowercase(dataset)

In [None]:
# Display the frame to check the lowercased headlines.
dataset

### 4. Punctuation preprocessing

In [None]:
# Translation table that deletes every character of string.punctuation
# (ASCII punctuation only); built once so that each call is a single pass
# over the text instead of a per-character Python loop.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def remove_punctuation(row: pd.Series) -> str:
    """Return a row's headline with ASCII punctuation removed.

    Characters outside ``string.punctuation`` (letters, digits, whitespace,
    non-ASCII symbols such as typographic quotes or emojis) are kept.

    Args:
        row: One record of the dataset, as handed over by
            ``DataFrame.apply(..., axis=1)``; only ``row['headlines']``
            is read.

    Returns:
        The headline without any ``string.punctuation`` characters.
    """
    return row['headlines'].translate(_PUNCTUATION_TABLE)

In [None]:
def preprocess_punctuation(dataset: pd.DataFrame) -> pd.DataFrame:
    """Strip punctuation from every headline.

    The frame is modified in place: 'headlines' is overwritten with the
    result of ``remove_punctuation`` applied row by row.

    Args:
        dataset: Frame with a text column named 'headlines'.

    Returns:
        The same ``dataset`` object, after modification.
    """
    stripped_headlines = dataset.apply(remove_punctuation, axis=1)
    dataset['headlines'] = stripped_headlines

    return dataset

In [None]:
# Strip the characters of string.punctuation from the headlines.
dataset = preprocess_punctuation(dataset)

In [None]:
# Display the frame to check the punctuation-free headlines.
dataset

### 5. Emojis preprocessing

In [None]:
def find_emojis(row: pd.Series) -> list:
    """List the distinct emojis found in a row's headline.

    Args:
        row: One record of the dataset, as handed over by
            ``DataFrame.apply(..., axis=1)``; only ``row['headlines']``
            is read.

    Returns:
        Each emoji that occurs in the headline, once. The list is empty
        when the headline contains no emoji.
    """
    # distinct_emoji_list already de-duplicates its result, so the former
    # list(set(...)) round-trip around it added nothing.
    return emoji.distinct_emoji_list(row['headlines'])

def interpret_emojis(row: pd.Series) -> list:
    """Translate the emojis stored in ``row['emojis']`` into text names.

    Args:
        row: One record of the dataset, as handed over by
            ``DataFrame.apply(..., axis=1)``; ``row['emojis']`` must be
            an iterable of emoji strings.

    Returns:
        One ``emoji.demojize`` name per entry of ``row['emojis']``, in the
        same order.
    """
    # Empty delimiters: return the bare name without the ':' wrappers that
    # demojize puts around it by default.
    return [emoji.demojize(symbol, delimiters=("", "")) for symbol in row['emojis']]

def remove_emojis(row: pd.Series) -> str:
    """Return a row's headline with all emojis deleted.

    Args:
        row: One record of the dataset, as handed over by
            ``DataFrame.apply(..., axis=1)``; only ``row['headlines']``
            is read.

    Returns:
        The headline with every emoji replaced by the empty string.
        Whitespace around a removed emoji is left as it was.
    """
    # Spell out the replacement (it is also the library default) so that
    # the intent "delete, do not substitute" is visible at the call site.
    return emoji.replace_emoji(row['headlines'], replace="")

def preprocess_emojis(dataset: pd.DataFrame) -> pd.DataFrame:
    """Move emojis out of the headlines into an 'emojis' column.

    The frame is modified in place: an 'emojis' column is added, holding
    the text names of the emojis found in each headline, and 'headlines'
    is overwritten with the emoji-free text.

    Args:
        dataset: Frame with a text column named 'headlines'.

    Returns:
        The same ``dataset`` object, after modification.
    """
    # Order matters: emojis are collected first, then renamed (the second
    # step reads the 'emojis' column written by the first), and only then
    # removed from the text.
    steps = (
        ('emojis', find_emojis),
        ('emojis', interpret_emojis),
        ('headlines', remove_emojis),
    )
    for column, row_function in steps:
        dataset[column] = dataset.apply(row_function, axis=1)

    return dataset

In [None]:
# Extract emojis into an 'emojis' column (as text names) and remove them
# from the headlines.
dataset = preprocess_emojis(dataset)

In [None]:
# Display the frame to check the 'emojis' column and the emoji-free headlines.
dataset

### 6. Sentiment analysis

In [None]:
def define_sentiment(row: pd.Series) -> str:
    """Map a row's polarity score to a sentiment label.

    Args:
        row: One record of the dataset, as handed over by
            ``DataFrame.apply(..., axis=1)``; only ``row['polarity']``
            is read.

    Returns:
        ``'negative'`` for a polarity below zero, ``'positive'`` for a
        polarity above zero, ``'neutral'`` otherwise (exactly zero, or a
        value such as NaN that compares neither below nor above zero).
    """
    polarity = row['polarity']

    if polarity < 0:
        return 'negative'
    if polarity > 0:
        return 'positive'
    return 'neutral'

def analyze_sentiment(dataset: pd.DataFrame) -> pd.DataFrame:
    """Score every headline with TextBlob and label its sentiment.

    The frame is modified in place. Three columns are written:
    'polarity' and 'subjectivity' (the corresponding fields of each
    headline's ``TextBlob(...).sentiment``) and 'sentiment' (the label
    that ``define_sentiment`` derives from the polarity).

    Args:
        dataset: Frame with a text column named 'headlines'.

    Returns:
        The same ``dataset`` object, after modification.
    """
    # Build one TextBlob per headline up front so both score columns are
    # read from the same objects.
    headline_blobs = tuple(
        TextBlob(headline) for headline in dataset['headlines'].tolist()
    )

    for score_name in ('polarity', 'subjectivity'):
        dataset[score_name] = [
            getattr(blob.sentiment, score_name) for blob in headline_blobs
        ]

    # The label is derived last because it reads the 'polarity' column.
    dataset['sentiment'] = dataset.apply(define_sentiment, axis=1)

    return dataset

In [None]:
# Score each preprocessed headline and add the 'polarity', 'subjectivity'
# and 'sentiment' columns.
dataset = analyze_sentiment(dataset)

In [None]:
# Display the final frame, including the sentiment columns.
dataset