In [1]:
from table_bert import TableBertModel

model = TableBertModel.from_pretrained(
    "tabert_base_k1/model.bin",
)

print("Model initialized")


  from .autonotebook import tqdm as notebook_tqdm
  return torch._C._cuda_getDeviceCount() > 0


Model initialized


In [2]:
# Cell 5: Load your mobile reviews dataset
import pandas as pd
import numpy as np
from tqdm import tqdm

# Load your dataset
df = pd.read_csv("Mobile Reviews Sentiment.csv")  # Update this path
df = df.head(5000)  # Use 5000 rows as you mentioned

print(f"üìä Dataset Shape: {df.shape}")
print(f"üìã Columns: {df.columns.tolist()}")
print(f"\n‚úÖ Dataset loaded successfully!")

üìä Dataset Shape: (5000, 25)
üìã Columns: ['review_id', 'customer_name', 'age', 'brand', 'model', 'price_usd', 'price_local', 'currency', 'exchange_rate_to_usd', 'rating', 'review_text', 'sentiment', 'country', 'language', 'review_date', 'verified_purchase', 'battery_life_rating', 'camera_rating', 'performance_rating', 'design_rating', 'display_rating', 'review_length', 'word_count', 'helpful_votes', 'source']

‚úÖ Dataset loaded successfully!


In [17]:
df

Unnamed: 0,review_id,customer_name,age,brand,model,price_usd,price_local,currency,exchange_rate_to_usd,rating,...,verified_purchase,battery_life_rating,camera_rating,performance_rating,design_rating,display_rating,review_length,word_count,helpful_votes,source
0,1,Aryan Maharaj,45,Realme,Realme 12 Pro,337.31,‚Çπ27996.73,INR,83.00,2,...,True,1,1,3,2,1,46,7,1,Amazon
1,2,Davi Miguel Sousa,18,Realme,Realme 12 Pro,307.78,R$1754.35,BRL,5.70,4,...,True,3,2,4,3,2,74,12,5,Flipkart
2,3,Pahal Balay,27,Google,Pixel 6,864.53,‚Çπ71755.99,INR,83.00,4,...,True,3,5,3,2,4,55,11,8,AliExpress
3,4,David Guzman,19,Xiaomi,Redmi Note 13,660.94,ÿØ.ÿ•2425.65,AED,3.67,3,...,False,1,3,2,1,2,66,11,3,Amazon
4,5,Yago Le√£o,38,Motorola,Edge 50,792.13,R$4515.14,BRL,5.70,3,...,True,3,3,2,2,1,73,12,0,BestBuy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,4996,Leslie Lewis,19,Apple,iPhone 14,906.48,C$1250.94,CAD,1.38,2,...,True,2,1,1,1,1,68,14,4,Amazon
4996,4997,Henry Hughes,37,Apple,iPhone SE,1409.27,¬£1099.23,GBP,0.78,1,...,False,1,1,1,1,1,54,12,2,Amazon
4997,4998,Vansha Acharya,33,Samsung,Galaxy Z Flip,419.13,‚Çπ34787.79,INR,83.00,3,...,True,3,2,4,4,3,56,9,4,Flipkart
4998,4999,Christopher Bray,55,Realme,Realme 12 Pro,279.90,C$386.26,CAD,1.38,3,...,True,2,1,3,2,2,72,10,2,Amazon


In [3]:
# Cell 6: Helper functions to convert DataFrame rows to TaBERT Tables

from table_bert import Table, Column

def dataframe_row_to_tabert_table(row, include_columns=None):
    """
    Convert a pandas DataFrame row into TaBERT Table object
    
    Args:
        row: pandas Series (single row)
        include_columns: list of column names to include
    
    Returns:
        TaBERT Table object
    """
    if include_columns is None:
        # Select important columns for table representation
        include_columns = [
            'brand', 'model', 'price_usd', 'rating',
            'battery_life_rating', 'camera_rating', 
            'performance_rating', 'design_rating', 'display_rating',
            'verified_purchase', 'sentiment', 'country'
        ]
    
    # Define which columns are numeric (type='real') vs text
    numeric_columns = {
        'price_usd', 'rating', 'battery_life_rating', 
        'camera_rating', 'performance_rating', 
        'design_rating', 'display_rating'
    }
    
    # Create Table columns
    columns = []
    data_row = []
    
    for col_name in include_columns:
        if col_name in row.index:
            value = row[col_name]
            
            # Handle missing/null values
            if pd.isna(value):
                value = 'unknown'
            
            # Convert boolean to string
            if isinstance(value, bool):
                value = 'yes' if value else 'no'
            
            # Determine column type (real for numeric, text for others)
            col_type = 'real' if col_name in numeric_columns else 'text'
            
            # Create Column object
            columns.append(
                Column(
                    name=col_name,
                    type=col_type,
                    sample_value=str(value)
                )
            )
            
            # Add to data row
            data_row.append(str(value))
    
    # Create Table object with K=1 (single row snapshot)
    table = Table(
        id=f"review_{row['review_id']}",
        header=columns,
        data=[data_row]
    ).tokenize(model.tokenizer)
    
    return table

# Test the conversion
print("üß™ Testing Table conversion with first row:\n")
sample_row = df.iloc[0]
sample_table = dataframe_row_to_tabert_table(sample_row)

print(f"‚úÖ Table ID: {sample_table.id}")
print(f"üìä Number of columns: {len(sample_table.header)}")
print(f"üìã Column names: {[col.name for col in sample_table.header]}")
print(f"\nüîç First 3 columns:")
for i, col in enumerate(sample_table.header[:3]):
    print(f"  {i+1}. {col.name} ({col.type}): {col.sample_value}")

üß™ Testing Table conversion with first row:

‚úÖ Table ID: review_1
üìä Number of columns: 12
üìã Column names: ['brand', 'model', 'price_usd', 'rating', 'battery_life_rating', 'camera_rating', 'performance_rating', 'design_rating', 'display_rating', 'verified_purchase', 'sentiment', 'country']

üîç First 3 columns:
  1. brand (text): Realme
  2. model (text): Realme 12 Pro
  3. price_usd (real): 337.31


In [4]:
# Cell 7: Function to generate embeddings

import torch

def get_tabert_embedding(context_text, table):
    """
    Generate TaBERT embeddings for text + table
    
    Args:
        context_text: Natural language text (review_text)
        table: TaBERT Table object
    
    Returns:
        embedding: numpy array of embedding vector
    """
    # Tokenize context
    context_tokens = model.tokenizer.tokenize(context_text)
    
    # Encode with TaBERT
    with torch.no_grad():
        context_encoding, column_encoding, info_dict = model.encode(
            contexts=[context_tokens],
            tables=[table]
        )
    
    # context_encoding shape: [batch_size, seq_len, hidden_size]
    # Use mean pooling over sequence length for final embedding
    context_embedding = context_encoding[0].mean(dim=0).cpu().numpy()
    
    return context_embedding

# Test embedding generation
print("üß™ Generating test embedding...")
test_table = dataframe_row_to_tabert_table(df.iloc[0])
test_embedding = get_tabert_embedding(
    df.iloc[0]['review_text'], 
    test_table
)

print(f"‚úÖ Embedding generated!")
print(f"üìä Embedding shape: {test_embedding.shape}")
print(f"üìä Embedding dimension: {len(test_embedding)}")
print(f"üìä Sample values (first 10): {test_embedding[:10]}")

üß™ Generating test embedding...
‚úÖ Embedding generated!
üìä Embedding shape: (768,)
üìä Embedding dimension: 768
üìä Sample values (first 10): [ 0.1119225  -0.04385044  0.12691893  0.21343757  0.13927068  0.2099418
 -0.04488863  0.08520498  0.11729547 -0.05301265]


In [5]:
# Cell 8: Generate embeddings for entire dataset

def generate_all_tabert_embeddings(dataframe, batch_size=16):
    """
    Generate TaBERT embeddings for all rows
    
    Args:
        dataframe: pandas DataFrame
        batch_size: Not really used for batching (kept for compatibility)
    
    Returns:
        embeddings_array: numpy array of embeddings
        metadata: list of metadata dicts
    """
    embeddings = []
    metadata = []
    
    print(f"üöÄ Generating TaBERT embeddings for {len(dataframe)} rows...")
    print(f"‚è±Ô∏è  This may take a while on CPU...")
    
    for idx, row in tqdm(dataframe.iterrows(), total=len(dataframe)):
        try:
            # Convert row to TaBERT Table
            table = dataframe_row_to_tabert_table(row)
            
            # Generate embedding
            embedding = get_tabert_embedding(
                row['review_text'],
                table
            )
            
            embeddings.append(embedding)
            
            # Store metadata for retrieval
            metadata.append({
                'index': idx,
                'review_id': int(row['review_id']),
                'brand': row['brand'],
                'model': row['model'],
                'price_usd': float(row['price_usd']),
                'rating': int(row['rating']),
                'sentiment': row['sentiment'],
                'camera_rating': int(row['camera_rating']),
                'battery_life_rating': int(row['battery_life_rating'])
            })
            
        except Exception as e:
            print(f"\n‚ö†Ô∏è  Error processing row {idx}: {str(e)}")
            continue
    
    embeddings_array = np.array(embeddings).astype('float32')
    
    print(f"\n‚úÖ Generated {len(embeddings)} embeddings")
    print(f"üìä Embeddings shape: {embeddings_array.shape}")
    
    return embeddings_array, metadata

# Generate all embeddings
embeddings, metadata = generate_all_tabert_embeddings(df)

üöÄ Generating TaBERT embeddings for 5000 rows...
‚è±Ô∏è  This may take a while on CPU...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5000/5000 [12:37<00:00,  6.60it/s]


‚úÖ Generated 5000 embeddings
üìä Embeddings shape: (5000, 768)





In [6]:
# Cell 9: Install FAISS (if not already installed)
# Run this in terminal: pip install faiss-cpu

import faiss

def create_faiss_index(embeddings_array):
    """
    Create FAISS index for fast similarity search
    """
    dimension = embeddings_array.shape[1]
    
    print(f"üìè Embedding dimension: {dimension}")
    
    # Create index - using IndexFlatIP for cosine similarity
    # (Inner Product after L2 normalization = cosine similarity)
    index = faiss.IndexFlatIP(dimension)
    
    # Normalize embeddings for cosine similarity
    faiss.normalize_L2(embeddings_array)
    
    # Add embeddings to index
    index.add(embeddings_array)
    
    print(f"‚úÖ FAISS index created")
    print(f"üìä Total vectors in index: {index.ntotal}")
    
    return index

# Create FAISS index
faiss_index = create_faiss_index(embeddings)

# Save to disk
print("\nüíæ Saving to disk...")
faiss.write_index(faiss_index, "tabert_embeddings.index")
pd.DataFrame(metadata).to_pickle("tabert_metadata.pkl")
df.to_pickle("dataframe.pkl")

print("‚úÖ Saved: tabert_embeddings.index")
print("‚úÖ Saved: tabert_metadata.pkl")
print("‚úÖ Saved: dataframe.pkl")

üìè Embedding dimension: 768
‚úÖ FAISS index created
üìä Total vectors in index: 5000

üíæ Saving to disk...
‚úÖ Saved: tabert_embeddings.index
‚úÖ Saved: tabert_metadata.pkl
‚úÖ Saved: dataframe.pkl


In [23]:
# Cell 10: RAG Query System

def query_tabert_rag(query_text, top_k=5):
    """
    RAG system using TaBERT embeddings
    
    Args:
        query_text: Natural language query
        top_k: Number of results to return
    
    Returns:
        DataFrame with matching rows (unedited original data)
    """
    print(f"üîç Query: '{query_text}'")
    print("="*80)
    
    # Step 1: Encode query with a minimal dummy table
    dummy_table = Table(
        id='query',
        header=[Column('query', 'text', sample_value='search')],
        data=[['search']]
    ).tokenize(model.tokenizer)
    
    # Encode query
    query_embedding = get_tabert_embedding(query_text, dummy_table)
    query_embedding = query_embedding.astype('float32').reshape(1, -1)
    
    # Normalize for cosine similarity
    faiss.normalize_L2(query_embedding)
    
    # Step 2: Search FAISS index
    scores, indices = faiss_index.search(query_embedding, top_k)
    
    print(f"üìä Found {len(indices[0])} results\n")
    
    # Step 3: Retrieve original rows
    result_indices = indices[0]
    result_df = df.iloc[result_indices].copy()
    
    # Add similarity scores (higher is better for cosine similarity)
    result_df['similarity_score'] = scores[0]
    
    return result_df

def display_results(query, results):
    """Pretty print search results"""
    print(f"üîç Query: {query}")
    print(f"{'='*80}\n")
    
    for i, (idx, row) in enumerate(results.iterrows(), 1):
        print(row)

        # print(f"Result #{i} | Brand: {row['brand']} | Model: {row['model']}")
        # print(f"Price: ${row['price_usd']:.2f} | Rating: {row['rating']}/5 ({row['sentiment']})")
        # print(f"Camera: {row['camera_rating']}/5 | Battery: {row['battery_life_rating']}/5")
        # print(f"Performance: {row['performance_rating']}/5 | Design: {row['design_rating']}/5")
        # print(f"Review: {row['review_text'][:]}... | Similarity: {row['similarity_score']:.4f}")

        print()

# Test the RAG system
query = "affordable Xiaomi phones with excellent camera"
results = query_tabert_rag(query, top_k=5)
display_results(query, results)

üîç Query: 'affordable Xiaomi phones with excellent camera'
üìä Found 5 results

üîç Query: affordable Xiaomi phones with excellent camera

review_id                                                            1105
customer_name                                                Vivaan Basak
age                                                                    30
brand                                                             OnePlus
model                                                         OnePlus 11R
price_usd                                                          748.73
price_local                                                     ‚Çπ62144.59
currency                                                              INR
exchange_rate_to_usd                                                   83
rating                                                                  5
review_text             Absolutely love this phone! The camera is next...
sentiment                                

In [24]:
# Cell 11: Test various queries

# Query 1: Price + Feature
query1 = "best mobiles from Amazon as per the dataset"
results1 = query_tabert_rag(query1, top_k=3)
display_results(query1, results1)



üîç Query: 'best mobiles from Amazon as per the dataset'
üìä Found 3 results

üîç Query: best mobiles from Amazon as per the dataset

review_id                                                            2494
customer_name                                                 Jagat Patel
age                                                                    18
brand                                                              Google
model                                                             Pixel 6
price_usd                                                          850.09
price_local                                                     ‚Çπ70557.47
currency                                                              INR
exchange_rate_to_usd                                                   83
rating                                                                  3
review_text             Fast charging is a lifesaver. Best purchase of...
sentiment                                      

In [25]:

# Query 2: Brand + Quality
query2 = "reviews of Aryan Maharaj"
results2 = query_tabert_rag(query2, top_k=3)
display_results(query2, results2)


üîç Query: 'reviews of Aryan Maharaj'
üìä Found 3 results

üîç Query: reviews of Aryan Maharaj

review_id                                                            2987
customer_name                                             Yashawini Konda
age                                                                    37
brand                                                              Google
model                                                            Pixel 7a
price_usd                                                          788.19
price_local                                                     ‚Çπ65419.77
currency                                                              INR
exchange_rate_to_usd                                                   83
rating                                                                  3
review_text             Software updates are delayed sometimes. Averag...
sentiment                                                         Neutral
country    

In [26]:

# Query 3: Performance focus
query3 = "all reviews mentioning battery or screen issues"
results3 = query_tabert_rag(query3, top_k=3)
display_results(query3, results3)

üîç Query: 'all reviews mentioning battery or screen issues'
üìä Found 3 results

üîç Query: all reviews mentioning battery or screen issues

review_id                                                            3161
customer_name                                       Jo√£o Vitor Cavalcanti
age                                                                    37
brand                                                             Samsung
model                                                          Galaxy S24
price_usd                                                          592.59
price_local                                                     R$3377.76
currency                                                              BRL
exchange_rate_to_usd                                                  5.7
rating                                                                  2
review_text             Software updates are delayed sometimes. Averag...
sentiment                               