# Big Data Project

In [28]:
import re
import string

import numpy as np
import pandas as pd

from nltk.corpus import stopwords

# Data Preparation

In [16]:
# Loading the dataset
# Notice delimiter and encoding
df = pd.read_csv('data/Corona_NLP_train.csv', delimiter=',', encoding='latin-1')
print(f'Original number of rows: {df.shape[0]} - Original number of columns: {df.shape[1]}')

# To create a text classifier, we just need corpus and target variable
df = df[['OriginalTweet', 'Sentiment']]

display(df.head())

Original number of rows: 41157 - Original number of columns: 6


Unnamed: 0,OriginalTweet,Sentiment
0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,advice Talk to your neighbours family to excha...,Positive
2,Coronavirus Australia: Woolworths to give elde...,Positive
3,My food stock is not the only one which is emp...,Positive
4,"Me, ready to go at supermarket during the #COV...",Extremely Negative


## Preprocessing Steps

- Remove URL
- Remove HTML escape characters
- Remove punctuation
- Remove stopwords
- Remove double space
- Maybe: remove mention @?

In [27]:
def preprocess(text):
    """
    Preprocess a text applying the above steps.
    """
    # Lowercase
    text = text.lower()
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove HTML tags
    text = re.sub(r'<[^>]*>', '', text)
    # Remove new line breaks \n 1r
    text = " ".join(text.splitlines())
    # Remove punctuation
    characters_to_remove = string.punctuation.replace('@', '').replace('#', '')
    text = "".join([char for char in text if char not in characters_to_remove])
    # Remove stop words
    
    text = "".join([])
    
    
    return text

preprocess(df['OriginalTweet'][3])

'my food stock is not the only one which is empty    please dont panic there will be enough food for everyone if you do not take more than you need   stay calm stay safe    #covid19france #covid19 #covid19 #coronavirus #confinement #confinementotal #confinementgeneral '

In [29]:

stop_words = stopwords.words('english')
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

'!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~'