In [None]:
!pip install nltk
!pip install numpy
!pip install pandas
!pip install textblob
!pip install scikit-learn
!pip install IPython

In [None]:
import re
import nltk
import numpy as np
import pandas as pd
from textblob import TextBlob 
from multiprocessing import Pool
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from IPython.display import display, HTML


nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

In [51]:
df = pd.read_csv('data/dataset.csv')
df = df.rename(columns={'Review': 'review', 'Rating': 'rating'})

In [52]:
df.head(5)

Unnamed: 0,review,rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


In [61]:
def preprocess_text(text):
    # Convert to lowercase
    text = str(text).lower()
    
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Basic spell correction using TextBlob
    text = str(TextBlob(text).correct())
    
    # Tokenization
    tokens = word_tokenize(text)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    # Join tokens back to text
    return ' '.join(tokens)

def parallel_preprocess(texts, n_cores=4):
    with Pool(n_cores) as pool:
        processed_texts = pool.map(preprocess_text, texts)
    return processed_texts

In [62]:
df['review'][0]

'nice hotel expensive parking got good deal stay hotel anniversary, arrived late evening took advice previous reviews did valet parking, check quick easy, little disappointed non-existent view room room clean nice size, bed comfortable woke stiff neck high pillows, not soundproof like heard music room night morning loud bangs doors opening closing hear people talking hallway, maybe just noisy neighbors, aveda bath products nice, did not goldfish stay nice touch taken advantage staying longer, location great walking distance shopping, overall nice experience having pay 40 parking night,  '

In [63]:
preprocess_text(df['review'][0])

'nice hotel expensive parking got good deal stay hotel anniversary arrived late evening took advice previous reviews valet parking check quick easy little disappointed nonexistent view room room clean nice size bed comfortable woke stiff neck high pillows soundproof like heard music room night morning loud banks doors opening closing hear people talking halfway maybe noisy neighbors area bath products nice goldfish stay nice touch taken advantage staying longer location great walking distance shopping overall nice experience pay parking night'

In [64]:
df['review'][2]

"nice rooms not 4* experience hotel monaco seattle good hotel n't 4* level.positives large bathroom mediterranean suite comfortable bed pillowsattentive housekeeping staffnegatives ac unit malfunctioned stay desk disorganized, missed 3 separate wakeup calls, concierge busy hard touch, did n't provide guidance special requests.tv hard use ipod sound dock suite non functioning. decided book mediterranean suite 3 night weekend stay 1st choice rest party filled, comparison w spent 45 night larger square footage room great soaking tub whirlpool jets nice shower.before stay hotel arrange car service price 53 tip reasonable driver waiting arrival.checkin easy downside room picked 2 person jacuzi tub no bath accessories salts bubble bath did n't stay, night got 12/1a checked voucher bottle champagne nice gesture fish waiting room, impression room huge open space felt room big, tv far away bed chore change channel, ipod dock broken disappointing.in morning way asked desk check thermostat said 6

In [65]:
preprocess_text(df['review'][2])

'nice rooms experience hotel monarch seattle good hotel levelpositives large bathroom mediterranean suite comfortable bed pillowsattentive housekeeping staffnegatives ac unit malfunctioned stay desk disorganized missed separate wake calls converge busy hard touch provide guidance special requests hard use upon sound dock suite non sanctioning decided book mediterranean suite night weekend stay st choice rest party filled comparison w spent night larger square forage room great soaking tub whirlpool jets nice showerbefore stay hotel arrange car service price tip reasonable driver waiting arrivalcheckin easy downside room picked person jacuzi tub bath accessories salts bubble bath stay night got checked voucher bottle champagne nice gesture fish waiting room impression room huge open space felt room big far away bed chose change channel upon dock broken disappointing morning way asked desk check thermostat said f degrees warm try cover face night bright blue light kept got room night st 

In [None]:
# 3. Apply preprocessing
df['cleaned_review'] = parallel_preprocess(df['review'])

In [None]:
df.to_csv('data/cleaned_dataset.csv', index=False)

In [49]:

# 1. Simple side-by-side comparison
def show_comparison(original, cleaned, n_samples=5):
    comparison_df = pd.DataFrame({
        'Original Text': original,
        'Cleaned Text': cleaned
    })
    
    # Show random samples
    display(comparison_df.sample(n=n_samples))

# 2. Highlighting differences
def highlight_differences(text1, text2):
    """Highlight words that are different"""
    words1 = set(text1.split())
    words2 = set(text2.split())
    
    different_words = words1.symmetric_difference(words2)
    
    html = f"""
    <div style="display: flex; gap: 20px;">
        <div style="flex: 1;">
            <h4>Original:</h4>
            <p>{''.join([f'<span style="background-color: yellow">{w}</span> ' if w in different_words else f'{w} ' for w in text1.split()])}</p>
        </div>
        <div style="flex: 1;">
            <h4>Cleaned:</h4>
            <p>{''.join([f'<span style="background-color: yellow">{w}</span> ' if w in different_words else f'{w} ' for w in text2.split()])}</p>
        </div>
    </div>
    """
    return HTML(html)

In [None]:
# 4. Create feature matrix using TF-IDF
tfidf = TfidfVectorizer(
    max_features=5000,  # Limit vocabulary size
    min_df=5,          # Minimum document frequency
    max_df=0.95        # Maximum document frequency
)

X = tfidf.fit_transform(df['cleaned_review'])
y = df['rating']

# 5. Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 6. Train model
model = LogisticRegression(
    multi_class='multinomial',
    max_iter=1000
)
model.fit(X_train, y_train)

# 7. Evaluate
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))