<a href="https://colab.research.google.com/github/BastienCherel/TripAdvisor-Recommendation-Challenge/blob/main/NLP_Notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Natural Language Processing  : First Project
## TripAdvisor Recommendation Challenge
Beating BM25


### Installing BM25

In [3]:
!pip install rank_bm25
!pip install kagglehub



### Download data

In [4]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("joebeachcapital/hotel-reviews")

print("Path to dataset files:", path)

Path to dataset files: /root/.cache/kagglehub/datasets/joebeachcapital/hotel-reviews/versions/2


### Model versus

In [10]:
import pandas as pd
import numpy as np
import re
import os
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
import warnings
import json
import pandas as pd
import numpy as np
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from rank_bm25 import BM25Okapi

# Ignore warnings
warnings.filterwarnings('ignore')

# Ensure all necessary NLTK data is downloaded
def download_nltk_data():
    try:
        nltk.download('punkt')
        nltk.download('stopwords')
        nltk.download('punkt_tab')
        nltk.download('averaged_perceptron_tagger')
        nltk.download('maxent_ne_chunker')
        nltk.download('words')
        print("NLTK data download complete")
    except Exception as e:
        print(f"NLTK data download failed: {e}")

# Download NLTK data
download_nltk_data()

def load_data(base_path):
    """Load data"""
    try:
        reviews_path = os.path.join(base_path, "reviews.csv")
        reviews = pd.read_csv(reviews_path)
        print(f"Successfully loaded {len(reviews)} reviews")

        # Parse ratings string to dictionary
        reviews['ratings'] = reviews['ratings'].apply(lambda x: json.loads(x.replace("'", '"')))
        return reviews
    except Exception as e:
        print(f"Data loading failed: {e}")
        return False


def load_sampled_data(file_path):
    print(f"\nLoading cleaned data from {file_path}...")
    data = pd.read_csv(file_path)
    print(f"Successfully loaded {len(data)} reviews")
    return data.sample(n=100, random_state=42)  # Sample 100 rows for testing

def clean_text(text):
    """Text cleaning"""
    try:
        # Basic cleaning
        text = re.sub(r'<.*?>', '', str(text))  # Remove HTML tags
        text = re.sub(r'[^a-zA-Z\s]', ' ', text)  # Remove non-alphabetic characters
        text = text.lower()  # Convert to lowercase

        # Tokenization and stopword removal
        words = text.split()  # Use simple tokenization
        stop_words = set(stopwords.words('english'))
        words = [word for word in words if word not in stop_words]

        return ' '.join(words)
    except Exception as e:
        print(f"Text cleaning failed: {e}")
        return ""

def preprocess_data(reviews , base_path):
    """Data preprocessing"""
    try:
        print("Starting data preprocessing...")
        cleaned_texts = []

        # Use tqdm to show progress
        for text in tqdm(reviews['text'], desc="Cleaning text"):
            cleaned_texts.append(clean_text(text))

        reviews['cleaned_text'] = cleaned_texts

        # Save preprocessed data
        cleaned_path = os.path.join(base_path, "cleaned_reviews.csv")
        reviews.to_csv(cleaned_path, index=False)
        print("Data preprocessing complete")
        return True
    except Exception as e:
        print(f"Data preprocessing failed: {e}")
        return False

# Generate embeddings using SentenceTransformer
def generate_embeddings(data, batch_size=16):
    print("\nStarting embedding generation...")
    model = SentenceTransformer('all-mpnet-base-v2', device='cpu')  # Use CPU
    embeddings = []

    for i in tqdm(range(0, len(data), batch_size), desc="Generating embeddings"):
        batch = data['cleaned_text'].iloc[i:i + batch_size].tolist()
        batch_embeddings = model.encode(batch, show_progress_bar=False)
        embeddings.extend(batch_embeddings)

    data['embedding'] = embeddings
    print("Embedding generation complete.")
    return data

# Compute semantic similarity
def compute_similarity(data, query):
    print(f"\nProcessing query: '{query}'")
    model = SentenceTransformer('all-mpnet-base-v2', device='cpu')  # Use CPU
    query_embedding = model.encode(query, show_progress_bar=False)
    candidate_embeddings = np.array(list(data['embedding']))
    similarities = cosine_similarity([query_embedding], candidate_embeddings)[0]
    data['semantic_similarity'] = similarities
    print("Similarity computation completed.")
    return data

# Prepare data for BM25
def prepare_for_bm25(data):
    print("\nPreparing data for BM25...")
    data = data.dropna(subset=['cleaned_text'])  # Drop rows with missing cleaned_text
    data['cleaned_text'] = data['cleaned_text'].astype(str)  # Ensure all texts are strings
    print(f"BM25 preparation complete. Total valid rows: {len(data)}")
    return data

# BM25 implementation
def run_bm25(data, query, top_k=5):
    print("\nRunning BM25...")
    tokenized_corpus = [doc.split() for doc in data['cleaned_text']]
    bm25 = BM25Okapi(tokenized_corpus)
    tokenized_query = query.split()
    scores = bm25.get_scores(tokenized_query)
    data['bm25_score'] = scores
    sorted_data = data.sort_values(by='bm25_score', ascending=False).head(top_k)
    print("BM25 computation completed.")
    return data, sorted_data

# Calculate Mean Squared Error (MSE)
def calculate_mse(model_scores, bm25_scores):
    scaler = MinMaxScaler()
    model_scores_normalized = scaler.fit_transform(np.array(model_scores).reshape(-1, 1)).flatten()
    bm25_scores_normalized = scaler.fit_transform(np.array(bm25_scores).reshape(-1, 1)).flatten()
    mse = mean_squared_error(model_scores_normalized, bm25_scores_normalized)
    return mse

# Main function
def main():
    try:
        # File path for cleaned reviews
        file_path = path + "/cleaned_reviews.csv"
        reviews = load_data(path)
        preprocess_data(reviews , path)
        # Load cleaned data and sample 100 rows
        sampled_data = load_sampled_data(file_path)

        # Generate embeddings
        sampled_data = generate_embeddings(sampled_data)

        # Query for similarity comparison
        query = "I enjoyed the cozy atmosphere and excellent service."

        # Compute semantic similarity
        sampled_data = compute_similarity(sampled_data, query)

        # Prepare data for BM25
        sampled_data = prepare_for_bm25(sampled_data)

        # Run BM25
        sampled_data, bm25_results = run_bm25(sampled_data, query)

        # Calculate MSE between semantic similarity and BM25 scores
        mse_value = calculate_mse(sampled_data['semantic_similarity'], sampled_data['bm25_score'])
        print(f"\nMSE between the semantic model and BM25: {mse_value}")

        # Display top 5 results for both models
        print("\nSemantic Model Top 5 Recommendations:")
        print(sampled_data.sort_values(by='semantic_similarity', ascending=False)[['cleaned_text', 'semantic_similarity']].head(5))

        print("\nBM25 Top 5 Recommendations:")
        print(bm25_results[['cleaned_text', 'bm25_score']])

    except Exception as e:
        print(f"An error occurred: {e}")

if __name__ == "__main__":
    main()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


NLTK data download complete
Successfully loaded 878561 reviews
Starting data preprocessing...


Cleaning text: 100%|██████████| 878561/878561 [03:17<00:00, 4437.44it/s]


Data preprocessing complete

Loading cleaned data from /root/.cache/kagglehub/datasets/joebeachcapital/hotel-reviews/versions/2/cleaned_reviews.csv...
Successfully loaded 878561 reviews

Starting embedding generation...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Generating embeddings: 100%|██████████| 7/7 [01:55<00:00, 16.44s/it]


Embedding generation complete.

Processing query: 'I enjoyed the cozy atmosphere and excellent service.'
Similarity computation completed.

Preparing data for BM25...
BM25 preparation complete. Total valid rows: 99

Running BM25...
BM25 computation completed.

MSE between the semantic model and BM25: 0.23815760503968111

Semantic Model Top 5 Recommendations:
                                             cleaned_text  semantic_similarity
828869  friendly helpful informative staff clean brigh...             0.693619
867791  went friend evening room luxurious loved ameni...             0.658303
563727  great location center shopping district great ...             0.637995
257070  beautiful time staff service location excellen...             0.591770
35227   great location rooms clean beds comfortable cl...             0.588958

BM25 Top 5 Recommendations:
                                             cleaned_text  bm25_score
169919  looked deciding belvedere read many reviews ar...    5.007