# Libraries

In [53]:
import pandas as pd
import re
from nltk.stem import WordNetLemmatizer, PorterStemmer
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('wordnet')

from typing import List

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\htc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\htc\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Data

In [3]:
# Yelp "tip" file, read as JSON Lines (lines=True: one JSON record per line).
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
json_file_path = r"E:\DATA SCIENCE\NLP-Tea\Data\yelp_academic_dataset_tip.json\yelp_academic_dataset_tip.json"
df = pd.read_json(path_or_buf=json_file_path, lines=True)

# Preview the first five rows as the cell output.
df.head(5)

Unnamed: 0,user_id,business_id,text,date,compliment_count
0,AGNUgVwnZUey3gcPCJ76iw,3uLgwr0qeCNMjKenHJwPGQ,Avengers time with the ladies.,2012-05-18 02:17:21,0
1,NBN4MgHP9D3cw--SnauTkA,QoezRbYQncpRqyrLH6Iqjg,They have lots of good deserts and tasty cuban...,2013-02-05 18:35:10,0
2,-copOvldyKh1qr-vzkDEvw,MYoRNLb5chwjQe3c_k37Gg,It's open even when you think it isn't,2013-08-18 00:56:08,0
3,FjMQVZjSqY8syIO-53KFKw,hV-bABTK-glh5wj31ps_Jw,Very decent fried chicken,2017-06-27 23:05:38,0
4,ld0AperBXk1h6UbqmM80zw,_uN0OudeJ3Zl_tf6nxg5ww,Appetizers.. platter special for lunch,2012-10-06 19:43:09,0


In [4]:
# Report the frame's dimensions as a (rows, columns) tuple.
n_rows, n_cols = df.shape
print((n_rows, n_cols))

(908915, 5)


In [5]:
# Keep only the text of the first 1000 rows, as a plain Python list.
text_data = df["text"].iloc[:1000].tolist()
text_data[:5]

['Avengers time with the ladies.',
 'They have lots of good deserts and tasty cuban sandwiches',
 "It's open even when you think it isn't",
 'Very decent fried chicken',
 'Appetizers.. platter special for lunch']

# Preprocessing

In [54]:
# One entry of the sample is used as the running example in the cells below.
SAMPLE_INDEX = 101
test_text = text_data[SAMPLE_INDEX]
print(test_text)

Self serve onions, relish, mayo?  And FREE caramelized onions?  Yes!


## Case Normalization (lowercase)


In [55]:
# Lowercase the sample so that e.g. "FREE" and "free" compare equal later on.
text_lower = str.lower(test_text)
print(
    f"original text : {test_text}",
    f"lowercase text: {text_lower}",
    sep="\n",
)


original text : Self serve onions, relish, mayo?  And FREE caramelized onions?  Yes!
lowercase text: self serve onions, relish, mayo?  and free caramelized onions?  yes!


## Remove punctuation and digits

In [56]:
text_lower = test_text.lower()

# The upper-case range must be A-Z. The range "A-z" also covers the ASCII
# characters that sit between "Z" and "a" ([ \ ] ^ _ `), so with it those
# symbols were silently kept instead of being removed.
# Two variants are shown; each gets its own name so neither overwrites the other.
text_letters_only = re.sub(r'[^a-zA-Z\s]', '', text_lower)  # keep only letters and whitespace
text_no_punct = re.sub(r'[^a-zA-Z\s0-9]', '', text_lower)  # same, but digits are kept too


print(f"original text : {test_text}")
print(f"preprocessed  : {text_no_punct}")

original text : Self serve onions, relish, mayo?  And FREE caramelized onions?  Yes!
preprocessed  : self serve onions relish mayo  and free caramelized onions  yes


In [66]:
# Compare a pattern that keeps apostrophes with one that strips them.
# The upper-case range is A-Z: "A-z" would also match [ \ ] ^ _ ` and keep them.
text_no_punct1 = re.sub(r'[^a-zA-Z\s\']', '', "don't")  # apostrophe allowed
text_no_punct2 = re.sub(r'[^a-zA-Z\s]', '', "don't")    # apostrophe removed

print("original text : don't")
print(f"with \\'        : {text_no_punct1}")
print(f"with out \\'    : {text_no_punct2}")

original text : don't
with \'        : don't
with out \'    : dont


## Tokens

In [57]:
text_lower = test_text.lower()
# A-Z (not A-z): the wider range would also keep the symbols [ \ ] ^ _ `.
text_no_punct = re.sub(r'[^a-zA-Z\s]', '', text_lower)
# Splitting on whitespace runs yields an empty string when the text starts or
# ends with whitespace, so empty pieces are dropped.
tokens = [piece for piece in re.split(r"\s+", text_no_punct) if piece]

# or
tokens_v1 = word_tokenize(text_no_punct)

print(f"original text : {test_text}")
print(f"preprocessed  : {tokens}")
print(f"preprocessed_1: {tokens_v1}")

original text : Self serve onions, relish, mayo?  And FREE caramelized onions?  Yes!
preprocessed  : ['self', 'serve', 'onions', 'relish', 'mayo', 'and', 'free', 'caramelized', 'onions', 'yes']
preprocessed_1: ['self', 'serve', 'onions', 'relish', 'mayo', 'and', 'free', 'caramelized', 'onions', 'yes']


## Remove stop words


In [None]:
text_lower = test_text.lower()
# A-Z (not A-z): the wider range would also keep the symbols [ \ ] ^ _ `.
text_no_punct = re.sub(r'[^a-zA-Z\s]', '', text_lower)
# Drop the empty strings re.split produces for leading/trailing whitespace.
tokens = [piece for piece in re.split(r"\s+", text_no_punct) if piece]

# A set gives constant-time membership checks while filtering.
stop_words = set(stopwords.words('english'))
tokens = [token for token in tokens if token not in stop_words]



print(f"original text : {test_text}")
print(f"preprocessed  : {tokens}")

original text : Self serve onions, relish, mayo?  And FREE caramelized onions?  Yes!
preprocessed  : ['self', 'serve', 'onions', 'relish', 'mayo', 'free', 'caramelized', 'onions', 'yes']


## Stemming and lemmatization

In [None]:
# Reduce every remaining token to its Porter stem.
stemmer = PorterStemmer()
stem_tokens = list(map(stemmer.stem, tokens))

print(f"original text : {test_text}")
print(f"preprocessed  : {stem_tokens}")

# Stems are not necessarily dictionary words, e.g.:
#   serve =>> serv
#   yes   =>> ye

original text : Self serve onions, relish, mayo?  And FREE caramelized onions?  Yes!
preprocessed  : ['self', 'serv', 'onion', 'relish', 'mayo', 'free', 'caramel', 'onion', 'ye']


In [None]:
# Map every remaining token to its WordNet lemma.
lemmatizer = WordNetLemmatizer()
lemma_tokens = list(map(lemmatizer.lemmatize, tokens))

print(f"original text : {test_text}")
print(f"preprocessed  : {lemma_tokens}")


original text : Self serve onions, relish, mayo?  And FREE caramelized onions?  Yes!
preprocessed  : ['self', 'serve', 'onion', 'relish', 'mayo', 'free', 'caramelized', 'onion', 'yes']


In [50]:
# Side-by-side comparison of the stemmer and the lemmatizer on a few words.
# lemmatize() is called without a part-of-speech argument here; in the output
# below, verb forms such as "running" and "served" come back unchanged.
words = ["running", "better", "flies", "cities", "served", "children"]

for word in words:
    stemmed, lemmatized = stemmer.stem(word), lemmatizer.lemmatize(word)
    print(f"Word: {word}")
    print(f"  Stemmed:     {stemmed}")
    print(f"  Lemmatized:  {lemmatized}")
    print()

Word: running
  Stemmed:     run
  Lemmatized:  running

Word: better
  Stemmed:     better
  Lemmatized:  better

Word: flies
  Stemmed:     fli
  Lemmatized:  fly

Word: cities
  Stemmed:     citi
  Lemmatized:  city

Word: served
  Stemmed:     serv
  Lemmatized:  served

Word: children
  Stemmed:     children
  Lemmatized:  child



## Full preprocessing pipeline

In [None]:
import pandas as pd
import re
from nltk.stem import WordNetLemmatizer, PorterStemmer
import nltk
import emoji
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('wordnet')

from typing import List

In [None]:
def preprocessing(text: str, use_stemmer: bool = False) -> List[str]:
    """
    Clean a raw text and return its normalized tokens.

    Steps, in order:
    1. Lowercase the text.
    2. Remove every character that is not an ASCII letter, whitespace or an
       apostrophe (punctuation, digits and emoji are therefore dropped).
    3. Tokenize with ``word_tokenize``.
    4. Remove English stop words.
    5. Lemmatize each token (default) or stem it (``use_stemmer=True``).

    Args:
        text (str): The raw input text.
        use_stemmer (bool): When True, apply the Porter stemmer instead of
            the WordNet lemmatizer. Defaults to False (lemmatization).

    Returns:
        List[str]: The cleaned tokens, in their original order.

    Note:
        Apostrophes are kept by step 2, and ``word_tokenize`` then splits
        contractions, so fragments such as "'s" and "n't" can appear in the
        result (see the example).

    Example:
        >>> preprocessing("It's open even when you think it isn't")
        ["'s", 'open', 'even', 'think', "n't"]
    """
    stop_words = set(stopwords.words('english'))

    # 1. Convert text to lowercase (normalization)
    text_lower = text.lower()

    # 2. Remove punctuation; \' keeps apostrophes (e.g. don't, it's)
    text_no_punct = re.sub(r'[^a-zA-Z\s\']', '', text_lower)

    # 3. Tokens (a plain whitespace split was computed here before and then
    #    immediately overwritten; only the nltk tokenizer result was ever used)
    tokens = word_tokenize(text_no_punct)

    # 4. Stop word removal
    filtered_tokens = [token for token in tokens if token not in stop_words]

    # 5. Stemming or lemmatization: only the requested normalizer is built and run
    if use_stemmer:
        stemmer = PorterStemmer()
        return [stemmer.stem(token) for token in filtered_tokens]

    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in filtered_tokens]

# Number of rows run through the pipeline in this demo.
N_TEXTS = 100
# NOTE: this rebinds `text_data` to the first N_TEXTS rows only (the earlier
# cell kept the first 1000), so index-based lookups past N_TEXTS will fail if
# earlier cells are re-run after this one.
text_data = list(df["text"][:N_TEXTS])
preprocessed_text = [preprocessing(text) for text in text_data]
print(preprocessed_text[:5])

[['avenger', 'time', 'lady'], ['lot', 'good', 'desert', 'tasty', 'cuban', 'sandwich'], ["'s", 'open', 'even', 'think', "n't"], ['decent', 'fried', 'chicken'], ['appetizer', 'platter', 'special', 'lunch']]


In [None]:
print(preprocessing("I love Python! 😊 It's awesome 👍"))
# Turn each emoji into its textual name, using a single space as both
# delimiters, then replace the underscores inside those names with spaces.
text = re.sub(
    r'_',
    ' ',
    emoji.demojize("I love 🍕 and 😄!", delimiters=(" ", " ")),
)
text

'I love  pizza  and  grinning face with smiling eyes !'

In [None]:
#  Remove URLs, emails, and Twitter mentions
# Each match is replaced by a single space. The patterns run in this order so
# that e-mail addresses are blanked as a whole before the bare "@name" pattern
# could cut them in half.
for pattern in (
    r'(https?://\S+|www\.\S+)',  # URLs
    r'\S+@\S+',                  # Email addresses
    r'@\w+',                     # Mentions
):
    text = re.sub(pattern, ' ', text)