In [121]:
import pandas as pd
import numpy as np
import plotly.express as px
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob
from unidecode import unidecode
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import numpy as np

In [122]:
# Record the NumPy version of the running kernel so the environment can be reproduced
print(np.__version__)

2.2.1


In [123]:
# NOTE(review): looks like a leftover kernel smoke test; it only prints and
# nothing later in the notebook uses it -- consider deleting this cell.
print("Hello World")

Hello World


In [124]:
# Read the training CSV into a DataFrame. The file is decoded as ISO-8859-1
# (Latin-1) instead of pandas' default UTF-8.
data = pd.read_csv(
    r'data/Corona_NLP_train.csv',
    encoding='ISO-8859-1',
)

In [125]:
# Summarise the columns: dtypes and non-null counts, to spot missing values
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41157 entries, 0 to 41156
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   UserName       41157 non-null  int64 
 1   ScreenName     41157 non-null  int64 
 2   Location       32567 non-null  object
 3   TweetAt        41157 non-null  object
 4   OriginalTweet  41157 non-null  object
 5   Sentiment      41157 non-null  object
dtypes: int64(2), object(4)
memory usage: 1.9+ MB


In [126]:
# Filter the data to contain only the tweet text and its sentiment label.
# .copy() makes `data` an independent DataFrame, so adding columns to it later
# (e.g. 'cleaned_tweet') does not trigger pandas' SettingWithCopyWarning.
data = data[['OriginalTweet', 'Sentiment']].copy()
data.head()

Unnamed: 0,OriginalTweet,Sentiment
0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,advice Talk to your neighbours family to excha...,Positive
2,Coronavirus Australia: Woolworths to give elde...,Positive
3,My food stock is not the only one which is emp...,Positive
4,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [127]:
# Plot the sentiment distribution with Plotly: a histogram over the 'Sentiment' column.
# The figure is the cell's last expression, so the notebook renders it inline.
px.histogram(data, x='Sentiment', title='Sentiment Distribution')

In [128]:
# The sentiment classes look reasonably balanced, so we can now proceed to clean the data

In [129]:
# Shared text-processing resources, built once here and reused by preprocess_tweet
lemmatizer = WordNetLemmatizer()
tokenizer = TweetTokenizer()
# Held as a set so the per-token stop-word check is a constant-time lookup
stop_words = {*stopwords.words('english')}

In [130]:
# function to clean the data
def preprocess_tweet(tweet):
    """
    Normalise one raw tweet into a cleaned, space-separated string of tokens.

    Steps, in order: transliterate with ``unidecode`` and lower-case; delete
    URLs and @mentions; replace '#' with a space; delete every character in
    ``string.punctuation``; tokenize with the module-level ``tokenizer``; drop
    tokens found in ``stop_words``; lemmatize the rest with ``lemmatizer``;
    re-join on single spaces.

    Parameters:
    - tweet (str): The raw tweet text.

    Returns:
    - str: The cleaned tweet (empty if no token survives).
    """
    # Transliterate accented characters, then fold case
    text = unidecode(tweet).lower()

    # Strip links first, then @mentions; '#' becomes a space so the hashtag word is kept
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#', ' ', text)

    # Delete all punctuation characters.
    # NOTE(review): this runs before stop-word filtering, so "don't" becomes
    # "dont" -- presumably no longer matching the stop-word list; confirm intended.
    punctuation_table = str.maketrans('', '', string.punctuation)
    text = text.translate(punctuation_table)

    # Tokenize, skip stop words, lemmatize whatever is left
    kept = []
    for token in tokenizer.tokenize(text):
        if token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))

    # Re-join, collapse any whitespace runs and trim both ends
    return re.sub(r'\s+', ' ', ' '.join(kept)).strip()

In [131]:
# Apply preprocess_tweet to every value of the 'OriginalTweet' column and store
# the cleaned text in a new 'cleaned_tweet' column
data['cleaned_tweet'] = data['OriginalTweet'].apply(preprocess_tweet)

In [None]:
def extract_features(method, data, **kwargs):
    """
    Extract features from text data using the specified method.

    Parameters:
    - method (str): The feature extraction method: 'tfidf', 'bow' or 'spacy'.
    - data (iterable of str): The text documents to be transformed.
    - kwargs: For 'tfidf' and 'bow', forwarded unchanged to TfidfVectorizer /
      CountVectorizer. For 'spacy', the optional keyword ``nlp`` supplies the
      callable pipeline used to embed each document (its result must expose a
      ``.vector`` attribute); any other keyword is ignored.

    Returns:
    - tuple: ``(features, extra)``.
      * 'tfidf' / 'bow': the matrix returned by ``fit_transform`` and the
        feature names from ``get_feature_names_out()``.
      * 'spacy': a NumPy array holding one ``.vector`` per document, and the
        pipeline that produced it.

    Raises:
    - ValueError: If ``method`` is not one of the supported names.
    - NameError: If ``method`` is 'spacy' and no pipeline is available, i.e.
      ``nlp`` was neither passed as a keyword nor defined at module level.
    """
    if method == 'spacy':
        # Prefer an explicitly passed pipeline; fall back to a module-level
        # `nlp` so notebooks that define it in another cell keep working.
        nlp = kwargs.pop('nlp', None)
        if nlp is None:
            nlp = globals().get('nlp')
        if nlp is None:
            # Same exception type as the bare global lookup used to raise,
            # but with a message that says how to fix it.
            raise NameError(
                "method='spacy' needs a loaded pipeline: pass nlp=<pipeline> "
                "or define a module-level `nlp` first"
            )
        # One embedding vector per document
        features = np.array([nlp(tweet).vector for tweet in data])
        return features, nlp

    # The two vectorizer methods differ only in the class they instantiate
    if method == 'tfidf':
        vectorizer = TfidfVectorizer(**kwargs)
    elif method == 'bow':
        vectorizer = CountVectorizer(**kwargs)
    else:
        raise ValueError("Method must be 'tfidf', 'bow', or 'spacy'")

    features = vectorizer.fit_transform(data)
    return features, vectorizer.get_feature_names_out()