In [94]:
import nltk
import nltk.corpus
from nltk.corpus import stopwords
import pandas as pd
import re
from typing import List
from functools import reduce
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from pathlib import Path
from typing import Iterable, Any, Union
from sklearn.feature_extraction.text import TfidfVectorizer

In [72]:
import os
# NLTK data is downloaded into, and looked up from, the notebook's working directory.
nb_full_path = os.getcwd()

# Register the directory with NLTK's search path. The guard keeps repeated runs
# of this cell from appending the same entry again.
if nb_full_path not in nltk.data.path:
    nltk.data.path.append(nb_full_path)

# Resources used further down in this notebook:
#   wordnet   -> WordNetLemmatizer
#   stopwords -> stopwords.words("english")
#   punkt     -> nltk.sent_tokenize / nltk.word_tokenize
# NOTE(review): newer NLTK releases may look for "punkt_tab" instead of "punkt" —
# confirm against the installed NLTK version.
for resource in ("wordnet", "stopwords", "punkt"):
    nltk.download(resource, download_dir=nb_full_path)

[nltk_data] Downloading package wordnet to /home/somi/ampba/taba/taba-
[nltk_data]     assignment/src...
[nltk_data]   Package wordnet is already up-to-date!


In [73]:
# Location of the reviews CSV, relative to the notebook; point this at your own dataset.
PATH = "../data/uber_reviews_itune.csv"

# NLTK helpers built once and shared by the cells below.
# `lemmatizer` is the default lemmatizer of tokenize_and_stem and STOPWORDS is
# the default stop list of remove_stopwords. STEMMER is defined here but is not
# referenced by any cell shown in this section.
lemmatizer = WordNetLemmatizer()
STEMMER = SnowballStemmer("english")
STOPWORDS = stopwords.words("english")

In [4]:
# Read the raw reviews: comma-separated file decoded as cp1252 (Windows-1252).
df = pd.read_csv(PATH, sep=",", encoding="cp1252")

### Text cleaning 
1. Cleaning is performed with the following steps in mind:
   1. Normalize Text
   2. Remove unicode characters
   3. Remove Stopwords
   4. Perform Stemming and Lemmatization

In [91]:

def normalize_text(text: str) -> str:
    """Return ``text`` in lowercase.

    Values that cannot be lowercased (anything whose ``.lower()`` call raises,
    e.g. ``None`` or a float) are reported on stdout and replaced by an empty
    string, so a single bad row does not abort the whole cleaning pass.

    Args:
        text: The raw text to normalize.

    Returns:
        The lowercased text, or ``""`` when lowercasing failed.
    """
    try:
        return text.lower()
    except Exception as e :
        print(f"Error while normalizing text `{text}` , due to : {e}")
    # Only reached when lowercasing failed.
    return ""

def remove_unicode_characters(text:str) -> str:
    """Strip mentions, URLs and every character outside ``[0-9A-Za-z \\t]``.

    Despite the name, this removes more than non-ASCII characters: punctuation
    and newlines are dropped too, because only ASCII letters, digits, spaces
    and tabs are kept. Matches are deleted rather than replaced by a space, so
    words separated only by punctuation are joined ("you.good" -> "yougood").

    Args:
        text: Text to clean. The ``^rt`` alternative is lowercase-only, so it
            only fires on text that has already been lowercased.

    Returns:
        ``text`` with all matched spans removed.
    """
    noise_pattern = (
        # @mentions. The opening bracket must NOT be escaped, otherwise this
        # alternative only matches a literal "@[A-Za-z0-9]" and real mentions
        # keep their handle in the output.
        r"(@[A-Za-z0-9]+)"
        # Any single character that is not an ASCII letter, digit, space or tab.
        r"|([^0-9A-Za-z \t])"
        # scheme://... style URLs.
        r"|(\w+:\/\/\S+)"
        # "rt" at the very start of the text.
        r"|^rt"
        # "http" plus the single character that follows it (non-greedy).
        r"|http.+?"
    )
    return re.sub(noise_pattern, "", text)

def remove_stopwords(text: str, stops: List[str] = STOPWORDS) -> str:
    """Drop every whitespace-separated word of ``text`` that appears in ``stops``.

    The comparison is exact and case-sensitive. Runs of whitespace collapse to
    single spaces, because the text is split and re-joined.

    Args:
        text: Text to filter.
        stops: Words to remove; defaults to the module-level ``STOPWORDS``.

    Returns:
        The remaining words joined by single spaces.
    """
    kept_words = (token for token in text.split() if token not in stops)
    return " ".join(kept_words)

def tokenize_and_stem(text, lemmatizer = lemmatizer) -> List[str]:
    """Tokenize ``text`` and lemmatize every token that contains a letter.

    Note that, despite the name, no stemming happens here: each surviving
    token is passed through ``lemmatizer.lemmatize``.

    Args:
        text: Text to tokenize.
        lemmatizer: Object exposing ``lemmatize(token)``; defaults to the
            module-level ``lemmatizer``.

    Returns:
        The lemmatized tokens, in their original order.
    """
    has_letter = re.compile('[a-zA-Z]')
    lemmas = []
    # Split into sentences first and then into words, so punctuation ends up
    # as separate tokens.
    for sentence in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sentence):
            # Skip tokens without any ASCII letter (pure numbers, raw punctuation).
            if has_letter.search(word):
                lemmas.append(lemmatizer.lemmatize(word))
    return lemmas

In [114]:
def compose_func_chain(*functions):
    """Compose single-argument callables into one left-to-right pipeline.

    Args:
        *functions: Callables applied in the order given; each receives the
            previous one's return value.

    Returns:
        A callable ``pipeline(x)`` that feeds ``x`` through every function in
        turn. With no functions it returns ``x`` unchanged.
    """
    def pipeline(value):
        for step in functions:
            value = step(value)
        return value

    return pipeline

# Text-cleaning pipeline. Steps run in the order listed: lowercase the text,
# strip URLs and characters outside [0-9A-Za-z space/tab], then drop stopwords.
clean_text = compose_func_chain(
    *(normalize_text, remove_unicode_characters, remove_stopwords)
)

def clean_text_wrapper(iterable:Iterable[Union[str, Any]], compose_func) -> List[Any]:
    """Apply ``compose_func`` to every item of ``iterable`` and collect the results.

    Args:
        iterable: Items to clean (e.g. a column of review texts). Items need
            not be strings, as long as ``compose_func`` accepts them.
        compose_func: Single-argument callable applied to each item.

    Returns:
        A list with one entry per input item, in input order, holding whatever
        ``compose_func`` returned for it (a plain string per item when used
        with a string-returning cleaner, not a list of tokens).
    """
    return [compose_func(item) for item in iterable]

# Run the cleaning pipeline over the "Review" column: one cleaned string per row.
corpus = clean_text_wrapper(df["Review"], clean_text)

In [115]:
# TF-IDF configuration: tokens come from tokenize_and_stem (which lemmatizes),
# and scikit-learn's built-in English stop-word list is applied as well.
tfidf_vectorizer = TfidfVectorizer(
    tokenizer=tokenize_and_stem,
    stop_words='english',
    use_idf=True,
)

# Learn the vocabulary and build the document-term matrix in one step
# (the original author timed this at about 6.05 s; use %time to re-measure).
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)

# (number of documents, number of vocabulary terms)
print(tfidf_matrix.shape)



(490, 2890)


In [122]:
# Vocabulary learned by the vectorizer, in column order of tfidf_matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer get_feature_names_out() when it exists. It returns an ndarray, hence
# .tolist() to keep `terms` a plain list of strings as before.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    terms = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    terms = tfidf_vectorizer.get_feature_names()
print(terms[-30:])

['write', 'writing', 'wrong', 'wrote', 'x', 'xoxo', 'ya', 'yall', 'yard', 'yea', 'yeah', 'year', 'yeeesh', 'yelling', 'yes', 'yesterday', 'yk', 'yo', 'york', 'youd', 'yougood', 'youll', 'young', 'youre', 'youve', 'youworst', 'yu', 'zero', 'zone', 'zoom']


matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])