In [3]:
import pandas as pd
import re
import nltk
import os
import json
from datetime import datetime

# Setup NLTK
nltk_data_path = os.path.join(os.path.expanduser('~'), 'nltk_data')
os.environ['NLTK_DATA'] = nltk_data_path

try:
    nltk.download('punkt', force=True)
    nltk.download('punkt_tab', force=True)
except:
    print("Automatic download failed - proceeding to manual installation")

from nltk.tokenize import sent_tokenize

print("Loading dataset...")
df = pd.read_csv("tripadvisor_hotel_reviews.csv")

print(f"Original dataset shape: {df.shape}")
print(f"Dataset columns: {df.columns.tolist()}")
print(f"Missing values per column:\n{df.isnull().sum()}")
print(f"Data types:\n{df.dtypes}")

# Handle NaN values strategically
print("\nHandling missing values...")

# For reviews: replace NaN with empty string to preserve rating data
df['Review'] = df['Review'].fillna('')

# For ratings: replace NaN with 0 (neutral) to preserve review data
df['Rating'] = df['Rating'].fillna(0)

# Create a flag to track which entries had missing data
df['missing_review'] = df['Review'] == ''
df['missing_rating'] = df['Rating'] == 0

print(f"After handling NaNs - shape: {df.shape}")
print(f"Entries with missing reviews: {df['missing_review'].sum()}")
print(f"Entries with missing ratings: {df['missing_rating'].sum()}")

# Enhanced cleaning function
def clean_text(text):
    if pd.isna(text) or text == '':
        return ''
    
    # Remove HTML tags if any
    text = re.sub(r'<[^>]+>', '', str(text))
    
    # Remove URLs
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
    
    # Remove email addresses
    text = re.sub(r'\S+@\S+', '', text)
    
    # Keep alphanumeric, spaces, and basic punctuation
    text = re.sub(r'[^\w\s.,!?-]', ' ', text)
    
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text.lower()

# Apply cleaning
print("Cleaning text...")
df['cleaned_review'] = df['Review'].apply(clean_text)

# Remove completely empty reviews after cleaning
df = df[df['cleaned_review'] != '']
print(f"After removing empty reviews - shape: {df.shape}")

# Enhanced chunking function
def create_chunks(text, max_tokens=500):
    if not text or text.strip() == '':
        return []
    
    try:
        sentences = sent_tokenize(text)
    except:
        # Fallback to simple split if nltk fails
        sentences = text.split('. ')
    
    chunks = []
    current_chunk = []
    current_length = 0
    
    for sentence in sentences:
        sentence = sentence.strip()
        if not sentence:
            continue
            
        sentence_length = len(sentence.split())
        
        if current_length + sentence_length <= max_tokens and current_length > 0:
            current_chunk.append(sentence)
            current_length += sentence_length
        else:
            if current_chunk:
                chunks.append(' '.join(current_chunk))
            current_chunk = [sentence]
            current_length = sentence_length
    
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    
    return chunks

# Create chunks with enhanced metadata
print("Creating chunks...")
chunks_data = []
total_chunks = 0

for idx, row in df.iterrows():
    if idx % 1000 == 0:
        print(f"Processing row {idx}...")
    
    chunks = create_chunks(row['cleaned_review'])
    
    for i, chunk in enumerate(chunks):
        chunk_length = len(chunk.split())
        if chunk_length < 5:  # Skip very short chunks
            continue
            
        chunks_data.append({
            'review_id': idx,
            'chunk_id': i,
            'text': chunk,
            'rating': int(row['Rating']) if not pd.isna(row['Rating']) else 0,
            'has_rating': not row['missing_rating'],
            'chunk_length': chunk_length,
            'source': 'tripadvisor_hotel_reviews.csv',
            'created_at': datetime.now().isoformat(),
            'category': 'hotel_review'
        })
        total_chunks += 1

print(f"Created {total_chunks} chunks from {len(df)} reviews")

# Create DataFrame and save
chunks_df = pd.DataFrame(chunks_data)

# Add some statistics
print(f"\nChunk Statistics:")
print(f"Total chunks: {len(chunks_df)}")
print(f"Average chunk length: {chunks_df['chunk_length'].mean():.2f} words")
print(f"Rating distribution:\n{chunks_df['rating'].value_counts().sort_index()}")
print(f"Chunks with ratings: {chunks_df['has_rating'].sum()}")

# Save processed data
print("Saving processed data...")
chunks_df.to_csv('processed_chunks.csv', index=False)

# Also save as JSON for easier loading in other scripts
chunks_json = chunks_df.to_dict('records')
with open('processed_chunks.json', 'w') as f:
    json.dump(chunks_json, f, indent=2)

# Save metadata
metadata = {
    'total_reviews': len(df),
    'total_chunks': len(chunks_df),
    'average_chunk_length': float(chunks_df['chunk_length'].mean()),
    'rating_distribution': chunks_df['rating'].value_counts().to_dict(),
    'processing_date': datetime.now().isoformat(),
    'source_file': 'tripadvisor_hotel_reviews.csv'
}

with open('processing_metadata.json', 'w') as f:
    json.dump(metadata, f, indent=2)

print("Data ingestion complete!")
print(f"Files created:")
print(f"- processed_chunks.csv ({len(chunks_df)} rows)")
print(f"- processed_chunks.json")
print(f"- processing_metadata.json")

# Display sample data
print(f"\nSample processed chunks:")
print(chunks_df[['text', 'rating', 'chunk_length']].head())

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


Loading dataset...
Original dataset shape: (20491, 2)
Dataset columns: ['Review', 'Rating']
Missing values per column:
Review    0
Rating    0
dtype: int64
Data types:
Review    object
Rating     int64
dtype: object

Handling missing values...
After handling NaNs - shape: (20491, 4)
Entries with missing reviews: 0
Entries with missing ratings: 0
Cleaning text...
After removing empty reviews - shape: (20491, 5)
Creating chunks...
Processing row 0...
Processing row 1000...
Processing row 2000...
Processing row 3000...
Processing row 4000...
Processing row 5000...
Processing row 6000...
Processing row 7000...
Processing row 8000...
Processing row 9000...
Processing row 10000...
Processing row 11000...
Processing row 12000...
Processing row 13000...
Processing row 14000...
Processing row 15000...
Processing row 16000...
Processing row 17000...
Processing row 18000...
Processing row 19000...
Processing row 20000...
Created 20517 chunks from 20491 reviews

Chunk Statistics:
Total chunks: 205