In [None]:
from pathlib import Path

import nltk
import pandas as pd
import numpy as np
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

### Load data

In [2]:
# Dataset split to preprocess: "train" or "test". Change this single value
# instead of commenting/uncommenting whole path lines.
DATA_SPLIT = "test"
path_to_data = f"../data/interim/{DATA_SPLIT}/interim_data_v0.1.csv"
df = pd.read_csv(path_to_data, encoding="utf-8")

### Feature engineering

In [3]:
# Preview the first five rows of the loaded frame
df.head(n=5)

Unnamed: 0,review,rating
0,Its ice cream cone is the best part of this br...,4
1,"The drivethru line at this location is, had be...",4
2,Stopped here after picking up my brother from ...,5
3,Service was quick despite longer line. Order h...,4
4,Fries are always good. Ice cream is ALWAYS not...,3


In [4]:
# Drop neutral reviews (rating == 3): only positive/negative classes are kept.
# .copy() turns the filtered slice into an independent frame, so the column
# assignments in later cells do not write into a slice of the original
# (which is what triggers pandas' SettingWithCopyWarning).
df = df[df['rating'] != 3].copy()

In [5]:
# Label each review by its rating: positive (1) or negative (-1)
def _rating_to_sentiment(rating):
    """Return 1 for a rating above 3, -1 for a rating below 3, NaN otherwise."""
    if rating > 3:
        return 1
    if rating < 3:
        return -1
    return np.nan

df['sentiment'] = df['rating'].apply(_rating_to_sentiment)

In [6]:
# Show the first two rows to check the new 'sentiment' column
df.head(n=2)

Unnamed: 0,review,rating,sentiment
0,Its ice cream cone is the best part of this br...,4,1
1,"The drivethru line at this location is, had be...",4,1


In [7]:
# Report how many reviews (rows) are currently in df
reviews_num = len(df)
print(f"Number of reviews: {reviews_num}")

Number of reviews: 2880


In [26]:
# Lowercase the review text; values that are not strings pass through unchanged
def _lower_if_str(value):
    """Return `value` lowercased when it is a str, otherwise return it untouched."""
    if isinstance(value, str):
        return value.lower()
    return value

df['review'] = df['review'].apply(_lower_if_str)

In [27]:
# Strip every character that is neither a word character nor whitespace
punctuation_pattern = r'[^\w\s]'
df['review'] = df['review'].replace(regex=punctuation_pattern, value='')

In [28]:
# Delete all digit characters from the review text
digit_pattern = r'\d'
df['review'] = df['review'].replace(regex=digit_pattern, value='')

In [29]:
# Tokenization

# word_tokenize needs the Punkt tokenizer data; fetch it here (as is done for
# the lemmatizer resources) so the notebook runs on a fresh environment.
nltk.download('punkt_tab', quiet=True)

# Missing reviews (NaN) would make word_tokenize raise; treat them as empty
# text so they become an empty token list instead.
df['review'] = df['review'].fillna('').apply(word_tokenize)

In [30]:
# Remove stopwords

# The English stopword list is an NLTK corpus that must be available locally;
# download it here, consistent with the other NLTK resources in this notebook.
nltk.download('stopwords', quiet=True)

# A set gives constant-time membership checks while filtering tokens
stop_words = set(stopwords.words('english'))
df['review'] = df['review'].apply(lambda x: [word for word in x if word not in stop_words])

In [31]:
# Stemming

# Single Porter stemmer instance reused for every review
stemmer = PorterStemmer()

def stem_words(words):
    """Return a list holding the Porter stem of each token in `words`."""
    return list(map(stemmer.stem, words))

# Stem the tokenized 'review' column into a new 'review_stemmed' column
df['review_stemmed'] = df['review'].apply(stem_words)

In [32]:
# Lemmatization

nltk.download('wordnet', quiet=True)
nltk.download('averaged_perceptron_tagger_eng', quiet=True)

lemmatizer = WordNetLemmatizer()

# First letter of the POS tag returned by nltk.pos_tag -> WordNet POS constant
_TAG_TO_WORDNET_POS = {"J": wordnet.ADJ,
                       "N": wordnet.NOUN,
                       "V": wordnet.VERB,
                       "R": wordnet.ADV}

def lemmatize_tokens(tokens):
    """Lemmatize a list of tokens using POS-aware WordNet lemmatization.

    The whole token list is tagged in a single nltk.pos_tag call, so the
    tagger sees neighbouring tokens and is invoked once per review rather
    than once per token.

    Args:
        tokens: list of token strings (may be empty).

    Returns:
        List of lemmas, same length and order as `tokens`.
    """
    tagged_tokens = nltk.pos_tag(tokens)

    # Tags whose first letter is not in the mapping fall back to NOUN;
    # tag[:1] also tolerates an empty tag string.
    return [
        lemmatizer.lemmatize(token, _TAG_TO_WORDNET_POS.get(tag[:1].upper(), wordnet.NOUN))
        for token, tag in tagged_tokens
    ]

# apply lemmatization function to the tokenized 'review' column
df['review_lemma'] = df['review'].apply(lemmatize_tokens)

In [33]:
# Inspect the first five rows with the stemmed and lemmatized columns
df.head(n=5)

Unnamed: 0,review,rating,sentiment,review_stemmed,review_lemma
0,"[ice, cream, cone, best, part, branch]",4,1,"[ice, cream, cone, best, part, branch]","[ice, cream, cone, best, part, branch]"
1,"[drivethru, line, location, always, broken, do...",4,1,"[drivethru, line, locat, alway, broken, dont, ...","[drivethru, line, location, always, broken, do..."
2,"[stopped, picking, brother, school, bit, line,...",5,1,"[stop, pick, brother, school, bit, line, move,...","[stop, pick, brother, school, bit, line, move,..."
3,"[service, quick, despite, longer, line, order,...",4,1,"[servic, quick, despit, longer, line, order, a...","[service, quick, despite, longer, line, order,..."
5,[mcdonalds],4,1,[mcdonald],[mcdonalds]


In [34]:
# Save processed data
# The output path mirrors the input path with every "interim" replaced by "processed".
processed_path = Path(path_to_data.replace("interim", "processed"))

# to_csv does not create missing directories; make sure the target folder exists
processed_path.parent.mkdir(parents=True, exist_ok=True)

# NOTE: the token-list columns are written as their string representation;
# they need to be parsed back into lists when this CSV is loaded.
df.to_csv(processed_path, index=False)