In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.tokenize import word_tokenize
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import joblib
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

# Fetch the NLTK data packages this notebook relies on: the Punkt tokenizer
# data, the stopword lists, and WordNet. The two Punkt downloads keep NLTK's
# default progress output; the other two are silenced with quiet=True.
for resource, download_options in (
    ('punkt', {}),
    ('punkt_tab', {}),
    ('stopwords', {'quiet': True}),
    ('wordnet', {'quiet': True}),
):
    nltk.download(resource, **download_options)

In [None]:
# Read the review dataset from CSV; the path is relative to the directory the
# notebook is run from.
file_path = './Datasets/Yelp Amazon Mixed Reviews.csv'
df = pd.read_csv(file_path)

# Quick look at the data: the first rows, then the list of column names.
print(df.head(), df.columns.tolist(), sep='\n')

In [None]:
# Cache for the stopword set and lemmatizer so they are built once, not once
# per review.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the shared (stop_words, lemmatizer) pair, building it on first use."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Normalize, tokenize, stopword-filter and lemmatize one piece of text.

    Parameters
    ----------
    text : object
        The raw text. Non-string values are converted with ``str()``.
        Missing values (``None`` or a float NaN) are treated as empty text
        instead of being turned into the literal words ``'none'`` / ``'nan'``.

    Returns
    -------
    tuple
        ``(normalized, tokens, filtered_tokens, lemmatized_tokens)`` where
        ``normalized`` is the lower-cased string with whitespace runs collapsed
        to single spaces, ``tokens`` is the ``word_tokenize`` output,
        ``filtered_tokens`` is ``tokens`` without English stopwords, and
        ``lemmatized_tokens`` is ``filtered_tokens`` after lemmatization.
    """
    # `text != text` is only true for NaN; the isinstance guard keeps the
    # comparison away from array-like inputs.
    is_missing = text is None or (isinstance(text, float) and text != text)
    raw_text = '' if is_missing else str(text)

    normalized = re.sub(r'\s+', ' ', raw_text).strip().lower()

    tokens = word_tokenize(normalized)

    stop_words, lemmatizer = _get_preprocess_resources()
    filtered_tokens = [token for token in tokens if token not in stop_words]
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]

    return normalized, tokens, filtered_tokens, lemmatized_tokens


# Preview every preprocessing stage on the first five reviews before running
# the full dataset. preprocess_text returns a 4-tuple; each element becomes
# its own column on a copy of the sample rows.
sample_df = df.head(5).copy()
preprocessed_results = [preprocess_text(text) for text in sample_df['ReviewText']]
sample_df['normalized_text'] = [result[0] for result in preprocessed_results]
sample_df['tokens'] = [result[1] for result in preprocessed_results]
sample_df['tokens_no_stopwords'] = [result[2] for result in preprocessed_results]
sample_df['lemmatized_tokens'] = [result[3] for result in preprocessed_results]

for i, row in sample_df.iterrows():
    print(f"\nIndex {i}:")
    # str() guards the slice: a non-string review (e.g. a missing value read
    # as a float NaN) cannot be sliced directly and would raise TypeError.
    print(f"Original: {str(row['ReviewText'])[:50]}...")
    print(f"Normalized: {row['normalized_text'][:50]}...")
    print(f"First 5 tokens: {row['tokens'][:5]}")
    print(f"First 5 tokens without stopwords: {row['tokens_no_stopwords'][:5]}")
    print(f"First 5 lemmatized tokens: {row['lemmatized_tokens'][:5]}")

# Run the preprocessing over every review and attach the results to a copy of
# the original frame, leaving `df` itself untouched.
df_processed = df.copy()
preprocessed_results = list(map(preprocess_text, df['ReviewText']))

# The last element of each result tuple is the lemmatized token list; store it
# both space-joined (input for the vectorizer) and as the raw list.
df_processed['preprocessed_text'] = [' '.join(lemmas) for *_, lemmas in preprocessed_results]
df_processed['lemmatized_tokens'] = [lemmas for *_, lemmas in preprocessed_results]

print(df_processed[['ReviewText', 'preprocessed_text']].head(3))

# Next: TF-IDF features over unigrams and bigrams (n-grams)

In [None]:
# Turn the preprocessed review text into a sparse TF-IDF matrix.
# NOTE(review): the vectorizer is fitted on the whole dataset here; if a
# train/test split happens downstream, confirm that fitting on all rows is
# intended.
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),  # unigrams and bigrams
    min_df=5,            # ignore terms that appear in fewer than 5 documents
    max_df=0.95,         # ignore terms that appear in more than 95% of documents
    sublinear_tf=True,   # scale term frequency as 1 + log(tf)
)
X_tfidf = tfidf_vectorizer.fit_transform(df_processed['preprocessed_text'])
feature_names = tfidf_vectorizer.get_feature_names_out()

# A feature name containing a space is a two-word (bigram) term.
bigrams = list(filter(lambda name: ' ' in name, feature_names))

# Column indices of the non-zero TF-IDF entries for the first document.
sample_idx = 0
sample_vector = X_tfidf[sample_idx]
non_zero_indices = sample_vector.nonzero()[1]

# Target labels, with a printed class-count summary.
y = df_processed['Label']
print(f"\nLabels distribution:", y.value_counts(), sep='\n')

# Bundle everything a later training step needs into a single object.
prepared_data = dict(
    X_tfidf=X_tfidf,
    y=y,
    feature_names=feature_names,
    vectorizer=tfidf_vectorizer,
)

In [None]:
# Where the prepared bundle is written; the directory is created on demand.
save_dir = '../Experiment/PreprocessedData'
save_path = os.path.join(save_dir, 'Yelp&AmazonPrepared.joblib')

os.makedirs(save_dir, exist_ok=True)

# Serialize the features, labels, feature names and fitted vectorizer together.
joblib.dump(prepared_data, save_path)