In [1]:
# --- 1. CORE LIBRARIES ---
import os
import sys
import warnings

import folium  # For interactive mapping

# --- 3. GEOSPATIAL LIBRARIES ---
import geopy.geocoders
import numpy as np

# --- 2. DATA HANDLING & ANALYSIS ---
import pandas as pd
from geopy.extra.rate_limiter import RateLimiter  # To avoid spamming the server
from geopy.geocoders import Nominatim  # A popular free geocoding service
# --- 1. CORE LIBRARIES ---
import os
import sys
import warnings

# In your main setup cell, replace the old NLTK section with this:
import nltk
import numpy as np

# --- 2. DATA HANDLING & ANALYSIS ---
import pandas as pd
from nltk.corpus import stopwords

# --- 3. NATURAL LANGUAGE PROCESSING (NLP) ---
from textblob import TextBlob  # For easy sentiment analysis

# This will now work because you've manually downloaded the data.
STOPWORDS = set(stopwords.words("english"))

# --- 4. UTILITIES ---
from loguru import logger
from tqdm.auto import tqdm

tqdm.pandas()

# ===================================================================
#                      CONFIGURATION
# ===================================================================
# (Your standard, excellent configuration settings)
pd.set_option("display.max_columns", None)
# ... etc. ...

logger.remove()
logger.add(
    sys.stdout,
    colorize=True,
    format=(
        "<green>{time:YYYY-MM-DD HH:mm:ss}</green> | "
        "<level>{level: <8}</level> | "
        "<level>{message}</level>"
    ),
)

logger.info("✅ All libraries imported and configurations set successfully!")

# --- Load the NLP Dataset ---
DATA_PATH = "../data/processed/zomato_nlp.parquet"
try:
    df_nlp = pd.read_parquet(DATA_PATH)
    logger.success(f"Successfully loaded the NLP dataset from '{DATA_PATH}'.")
    logger.info(f"DataFrame shape: {df_nlp.shape}")
except FileNotFoundError:
    logger.error(
        f"FATAL: The file was not found at '{DATA_PATH}'. Please ensure the path is correct."
    )

df_nlp.head()
# --- 4. UTILITIES ---
from loguru import logger
from tqdm.auto import tqdm

tqdm.pandas()  # Enable progress bars for pandas apply

# ===================================================================
#                      CONFIGURATION
# ===================================================================
# (Same pandas, plotting, and loguru settings as before)
pd.set_option("display.max_columns", None)
# ... (rest of your standard config) ...

logger.remove()
logger.add(
    sys.stdout,
    colorize=True,
    format=(
        "<green>{time:YYYY-MM-DD HH:mm:ss}</green> | "
        "<level>{level: <8}</level> | "
        "<level>{message}</level>"
    ),
)

logger.info("✅ All libraries imported and configurations set successfully!")

# --- Load the Geo Dataset ---
DATA_PATH = "../data/processed/zomato_geo.parquet"
try:
    df_geo = pd.read_parquet(DATA_PATH)
    logger.success(f"Successfully loaded the Geo dataset from '{DATA_PATH}'.")
    logger.info(f"DataFrame shape: {df_geo.shape}")
except FileNotFoundError:
    logger.error(f"FATAL: The file was not found at '{DATA_PATH}'.")

df_geo.head()

[32m2025-09-16 19:40:15[0m | [1mINFO    [0m | [1m✅ All libraries imported and configurations set successfully![0m
[32m2025-09-16 19:40:16[0m | [32m[1mSUCCESS [0m | [32m[1mSuccessfully loaded the NLP dataset from '../data/processed/zomato_nlp.parquet'.[0m
[32m2025-09-16 19:40:16[0m | [1mINFO    [0m | [1mDataFrame shape: (45187, 7)[0m
[32m2025-09-16 19:40:16[0m | [1mINFO    [0m | [1m✅ All libraries imported and configurations set successfully![0m
[32m2025-09-16 19:40:16[0m | [32m[1mSUCCESS [0m | [32m[1mSuccessfully loaded the Geo dataset from '../data/processed/zomato_geo.parquet'.[0m
[32m2025-09-16 19:40:16[0m | [1mINFO    [0m | [1mDataFrame shape: (45187, 5)[0m


Unnamed: 0,name,address,rate,location,listed_in_city
0,Jalsa,"942, 21st Main Road, 2nd Stage, Banashankari, ...",4.1,Banashankari,Banashankari
1,Spice Elephant,"2nd Floor, 80 Feet Road, Near Big Bazaar, 6th ...",4.1,Banashankari,Banashankari
2,San Churro Cafe,"1112, Next to KIMS Medical College, 17th Cross...",3.8,Banashankari,Banashankari
3,Addhuri Udupi Bhojana,"1st Floor, Annakuteera, 3rd Stage, Banashankar...",3.7,Banashankari,Banashankari
4,Grand Village,"10, 3rd Floor, Lakshmi Associates, Gandhi Baza...",3.8,Basavanagudi,Banashankari


In [15]:
import json
import os
import time

import pandas as pd
import requests
from azure.core.credentials import AzureKeyCredential
from azure.maps.search import MapsSearchClient
from dotenv import load_dotenv
from loguru import logger
from tqdm.auto import tqdm

# --- STEP 1: SETUP & CONFIGURATION ---
logger.info("--- Setting up the environment ---")
load_dotenv()
AZURE_MAPS_KEY = os.getenv("AZURE_MAPS_KEY")
if not AZURE_MAPS_KEY:
    logger.error("FATAL: AZURE_MAPS_KEY not found.")
    maps_search_client = None
else:
    maps_search_client = MapsSearchClient(credential=AzureKeyCredential(AZURE_MAPS_KEY))


# --- BATCH GEOCODING FUNCTION (UNCHANGED) ---
def geocode_with_azure_batch(df_to_geocode: pd.DataFrame, cache: dict) -> dict:
    if not AZURE_MAPS_KEY:
        return cache
    logger.info("--- Starting Azure Maps Asynchronous Batch Geocoding ---")
    addresses_to_geocode_new = df_to_geocode[
        ~df_to_geocode["address"].isin(cache.keys())
    ]
    if addresses_to_geocode_new.empty:
        logger.success("All sample addresses are already in the cache.")
        return cache
    logger.info(
        f"Preparing a batch of {len(addresses_to_geocode_new)} new addresses..."
    )
    batch_items = []
    for index, row in addresses_to_geocode_new.iterrows():
        query_text = f"{row['name']}, {row['address']}"
        batch_query = f"?query={requests.utils.quote(query_text)}&countrySet=IN"
        batch_items.append({"query": batch_query})
    submit_url = f"https://atlas.microsoft.com/search/fuzzy/batch/json?api-version=1.0&subscription-key={AZURE_MAPS_KEY}"
    logger.info(f"Submitting {len(batch_items)} queries...")
    try:
        response = requests.post(
            submit_url, json={"batchItems": batch_items}, timeout=30
        )
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        logger.error(f"Failed to submit batch job: {e}")
        return cache
    if response.status_code == 202:
        status_url = response.headers["Location"]
        logger.success("Batch job accepted.")
    else:
        logger.error(
            f"Batch submission failed with status {response.status_code}: {response.text}"
        )
        return cache
    logger.info("Polling for results...")
    while True:
        try:
            result_response = requests.get(status_url, timeout=30)
            result_response.raise_for_status()
            if result_response.status_code == 202:
                logger.info("...processing, waiting 5s...")
                time.sleep(5)
            elif result_response.status_code == 200:
                logger.success("Processing complete.")
                batch_result = result_response.json()
                break
            else:
                logger.error(
                    f"Polling failed with status {result_response.status_code}"
                )
                return cache
        except requests.exceptions.RequestException as e:
            logger.error(f"Error while polling: {e}")
            return cache
    logger.info("Parsing results and updating cache...")
    for i, item in enumerate(batch_result["batchItems"]):
        original_address = addresses_to_geocode_new.iloc[i]["address"]
        if item["statusCode"] == 200 and item["response"].get("results"):
            first_result = item["response"]["results"][0]
            lat = first_result["position"]["lat"]
            lon = first_result["position"]["lon"]
            cache[original_address] = {"latitude": lat, "longitude": lon}
        else:
            cache[original_address] = {"latitude": None, "longitude": None}
    logger.success("Cache updated.")
    return cache


def verify_geocoding_with_rest_api(df_with_coords: pd.DataFrame) -> pd.DataFrame:
    """
    Performs a BATCH reverse geocode using direct REST API calls,
    following the official documentation precisely.
    """
    if not AZURE_MAPS_KEY:
        logger.warning("Azure key not available. Skipping verification.")
        df_with_coords["verified_address"] = "Verification Skipped"
        return df_with_coords

    logger.info(
        "--- Verifying coordinates with Manual REST API Batch Reverse Geocoding ---"
    )

    df_to_verify = df_with_coords.dropna(subset=["latitude", "longitude"])
    if df_to_verify.empty:
        df_with_coords["verified_address"] = "Original geocode failed"
        return df_with_coords

    # --- 1. Construct the POST Body ---
    # As per the documentation, the query is "?query=lat,lon"
    batch_items = []
    for index, row in df_to_verify.iterrows():
        query_str = f"?query={row['latitude']},{row['longitude']}"
        batch_items.append({"query": query_str})

    # --- 2. Submit the Asynchronous Job ---
    submit_url = f"https://atlas.microsoft.com/search/address/reverse/batch/json?api-version=1.0&subscription-key={AZURE_MAPS_KEY}"
    try:
        response = requests.post(
            submit_url, json={"batchItems": batch_items}, timeout=30
        )
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        logger.error(f"Failed to submit reverse geocode batch job: {e}")
        df_with_coords["verified_address"] = "Error on Submit"
        return df_with_coords

    if response.status_code == 202:
        status_url = response.headers["Location"]
    else:
        logger.error(
            f"Reverse geocode submission failed with status {response.status_code}"
        )
        df_with_coords["verified_address"] = "Error on Submit"
        return df_with_coords

    # --- 3. Poll for Results ---
    while True:
        try:
            result_response = requests.get(status_url, timeout=30)
            result_response.raise_for_status()
            if result_response.status_code == 200:
                batch_result = result_response.json()
                break
            time.sleep(2)  # Wait 2 seconds between polls
        except requests.exceptions.RequestException as e:
            logger.error(f"Error while polling reverse geocode results: {e}")
            df_with_coords["verified_address"] = "Error on Poll"
            return df_with_coords

    # --- 4. Parse Results ---
    verified_addresses = {}
    for i, item in enumerate(batch_result["batchItems"]):
        original_index = df_to_verify.index[i]
        if item["statusCode"] == 200 and item["response"]["addresses"]:
            verified_addresses[original_index] = item["response"]["addresses"][0][
                "address"
            ]["freeformAddress"]
        else:
            verified_addresses[original_index] = "Reverse geocode failed"

    df_with_coords["verified_address"] = df_with_coords.index.map(
        verified_addresses
    ).fillna("Not Verified (No Coords)")
    return df_with_coords


# --- SCRIPT EXECUTION ---
logger.info("Step 1: Identifying unique restaurants and loading cache...")
df_unique_restaurants = (
    df_geo[["name", "address", "location"]].drop_duplicates().reset_index(drop=True)
)
CACHE_FILE = "azure_batch_geocache_test.json"
if os.path.exists(CACHE_FILE):
    with open(CACHE_FILE, "r") as f:
        cache = json.load(f)
else:
    cache = {}

logger.info("Step 2: Creating a sample of 10 for the test run...")
df_sample_to_process = df_unique_restaurants.sample(n=10, random_state=42)

logger.info("Step 3: Running the batch geocoding function on the sample...")
updated_cache = geocode_with_azure_batch(df_sample_to_process, cache)

with open(CACHE_FILE, "w") as f:
    json.dump(updated_cache, f, indent=2)
logger.success(f"Cache saved to '{CACHE_FILE}'.")

# --- Step 4: Prepare the sample results for verification ---
logger.info("Step 4: Preparing sample results for verification...")
df_sample_results = df_sample_to_process.copy()
lat_map = {addr: data.get("latitude") for addr, data in updated_cache.items()}
lon_map = {addr: data.get("longitude") for addr, data in updated_cache.items()}
df_sample_results["latitude"] = df_sample_results["address"].map(lat_map)
df_sample_results["longitude"] = df_sample_results["address"].map(lon_map)

# --- Step 5: Run the NEW, DOCUMENTATION-DRIVEN VERIFICATION function ---
df_verified_results = verify_geocoding_with_rest_api(df_sample_results)

# --- FINAL REPORT ---
logger.info("--- Geocoding Verification Report ---")
display(df_verified_results[["address", "verified_address", "latitude", "longitude"]])

[32m2025-09-11 00:09:53[0m | [1mINFO    [0m | [1m--- Setting up the environment ---[0m
[32m2025-09-11 00:09:53[0m | [1mINFO    [0m | [1mStep 1: Identifying unique restaurants and loading cache...[0m
[32m2025-09-11 00:09:53[0m | [1mINFO    [0m | [1mStep 2: Creating a sample of 10 for the test run...[0m
[32m2025-09-11 00:09:53[0m | [1mINFO    [0m | [1mStep 3: Running the batch geocoding function on the sample...[0m
[32m2025-09-11 00:09:53[0m | [1mINFO    [0m | [1m--- Starting Azure Maps Asynchronous Batch Geocoding ---[0m
[32m2025-09-11 00:09:53[0m | [32m[1mSUCCESS [0m | [32m[1mAll sample addresses are already in the cache.[0m
[32m2025-09-11 00:09:53[0m | [32m[1mSUCCESS [0m | [32m[1mCache saved to 'azure_batch_geocache_test.json'.[0m
[32m2025-09-11 00:09:53[0m | [1mINFO    [0m | [1mStep 4: Preparing sample results for verification...[0m
[32m2025-09-11 00:09:53[0m | [1mINFO    [0m | [1m--- Verifying coordinates with Manual REST API 

Unnamed: 0,address,verified_address,latitude,longitude
10347,"63, SH 35, Vinayaka Layout, Bengaluru, Karnata...","819, Whitefield Main Road, Vinayaka Layout, Wh...",12.969086,77.749974
6070,"303, 5th Main, 100 Feet Road, Indiranagar, Ban...","32, 5th Main Road, Indira Nagar I Stage, Benga...",12.979962,77.640713
10109,"Arya Hub Mall, ITPL Main Road, Whitefield, Ban...","Whitefield Main Road, Prashanth Extension, Whi...",12.983661,77.752186
7637,"602, 4th Cross, 2nd Block, HRBR Layout, Kalyan...","2, 4th D Cross Road, HRBR Layout 3rd Block, Ka...",13.025616,77.63358
9839,"5, 1st Main, 2nd Block, 3rd Stage, Basaveshwar...","Vinayaka Layout Road, Kodanda Reddy Layout, Du...",13.011113,77.670645
33,"80, BDA Complex, 2nd Stage, Banashankari, Bang...","645, 24th Cross Road, Banashankari Stage 2, Be...",12.924435,77.565474
9281,"3, Outer Ring Road, Opposite More Megastore, M...","Mahadevapura Outer Ring Road, Konadas Pura, Ma...",12.989707,77.688775
2970,"36, Vittal Mallya Road, Lavelle Road, Bangalore","38-2, Vittal Mallya Road, Shanthala Nagar, Ben...",12.971137,77.597785
9129,"Nageshwar Rao Building, 3rd Crass Kamadhenu, L...","159, 3rd Cross Road, Indira Nagar Layout, Maha...",12.996587,77.686737
4734,"2nd cross, Behind Anjaneya Temple, Opposite Pr...","6-14, 2nd Cross Road, Madivala, BTM Layout I S...",12.922288,77.618638


In [1]:
import torch

x = torch.rand(5, 3)
print(x)

tensor([[0.5875, 0.2518, 0.1725],
        [0.5259, 0.8692, 0.8414],
        [0.5028, 0.5129, 0.8821],
        [0.4729, 0.2335, 0.6795],
        [0.7349, 0.3373, 0.9296]])


In [4]:
import transformers

In [5]:
transformers.pipeline

<function transformers.pipelines.pipeline(task: Optional[str] = None, model: Union[str, ForwardRef('PreTrainedModel'), ForwardRef('TFPreTrainedModel'), NoneType] = None, config: Union[str, transformers.configuration_utils.PretrainedConfig, NoneType] = None, tokenizer: Union[str, transformers.tokenization_utils.PreTrainedTokenizer, ForwardRef('PreTrainedTokenizerFast'), NoneType] = None, feature_extractor: Union[str, ForwardRef('SequenceFeatureExtractor'), NoneType] = None, image_processor: Union[str, transformers.image_processing_utils.BaseImageProcessor, NoneType] = None, processor: Union[str, transformers.processing_utils.ProcessorMixin, NoneType] = None, framework: Optional[str] = None, revision: Optional[str] = None, use_fast: bool = True, token: Union[str, bool, NoneType] = None, device: Union[int, str, ForwardRef('torch.device'), NoneType] = None, device_map: Union[str, dict[str, Union[int, str]], NoneType] = None, dtype: Union[str, ForwardRef('torch.dtype'), NoneType] = 'auto', 

In [1]:
import triton

In [2]:
dir(triton)

['CompilationError',
 'Config',
 'InterpreterError',
 'JITFunction',
 'KernelInterface',
 'MockTensor',
 'OutOfResources',
 'TensorWrapper',
 'TritonError',
 '_C',
 '__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '__version__',
 'autotune',
 'backends',
 'cdiv',
 'compile',
 'compiler',
 'errors',
 'heuristics',
 'jit',
 'language',
 'next_power_of_2',
 'reinterpret',
 'runtime',
 'testing',
 'tools']

In [1]:
import torch
from loguru import logger

logger.info("--- PyTorch GPU Diagnostic ---")

# 1. Check if PyTorch is installed
logger.info(f"PyTorch Version: {torch.__version__}")

# 2. Check if CUDA is available TO PYTORCH
is_available = torch.cuda.is_available()
logger.info(f"Is CUDA available to PyTorch? -> {is_available}")

if is_available:
    # 3. If it is available, get details about the GPU
    gpu_count = torch.cuda.device_count()
    gpu_name = torch.cuda.get_device_name(0)
    cuda_version_pytorch = torch.version.cuda

    logger.success("GPU DETECTED SUCCESSFULLY!")
    logger.info(f"Number of GPUs: {gpu_count}")
    logger.info(f"GPU Name: {gpu_name}")
    logger.info(f"PyTorch was compiled with CUDA Version: {cuda_version_pytorch}")
else:
    logger.error("GPU NOT DETECTED BY PYTORCH.")
    logger.warning(
        "This is likely due to an installation mismatch between your NVIDIA driver, the CUDA toolkit version PyTorch was compiled with, and the PyTorch version itself."
    )
    logger.info(
        "Recommendation: Re-run the official PyTorch installation command from their website for your specific CUDA version."
    )

[32m2025-09-16 18:57:08.623[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1m--- PyTorch GPU Diagnostic ---[0m
[32m2025-09-16 18:57:08.624[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m7[0m - [1mPyTorch Version: 2.8.0+cu129[0m
[32m2025-09-16 18:57:08.735[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m11[0m - [1mIs CUDA available to PyTorch? -> True[0m
[32m2025-09-16 18:57:08.771[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36m<module>[0m:[36m19[0m - [32m[1mGPU DETECTED SUCCESSFULLY![0m
[32m2025-09-16 18:57:08.771[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m20[0m - [1mNumber of GPUs: 1[0m
[32m2025-09-16 18:57:08.772[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m21[0m - [1mGPU Name: NVIDIA GeForce RTX 3060 Laptop GPU[0m
[32m2025-09-16 18:57:08.773[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m22[0m - [1mPyTorch was compiled wit

In [5]:
import pandas as pd
import numpy as np
from transformers import pipeline
from loguru import logger
from tqdm.auto import tqdm
tqdm.pandas()

pipe = pipeline("text-classification", model="cardiffnlp/twitter-xlm-roberta-base-sentiment")

def generate_transformer_sentiment(df: pd.DataFrame, review_col: str = 'reviews_list') -> pd.DataFrame:
    """
    Calculates the average sentiment score using a Transformer model.
    Includes the 'trust_remote_code' fix.
    """
    logger.info("--- Initializing the Advanced Multilingual Sentiment Model (XLM-RoBERTa) ---")
    
    try:
        sentiment_pipeline = pipeline(
            "sentiment-analysis", 
            model="cardiffnlp/twitter-xlm-roberta-base-sentiment",
            device=0, # Use GPU
        )
        logger.success("Multilingual sentiment model initialized successfully ON GPU.")
    except Exception as e:
        logger.error(f"Failed to initialize sentiment pipeline on GPU. Error: {e}")
        logger.warning("Falling back to CPU. This will be very slow.")
        sentiment_pipeline = pipeline(
            "sentiment-analysis", 
            model="cardiffnlp/twitter-xlm-roberta-base-sentiment",
            device=-1,
        )

    df_out = df.copy()
    
    # The get_avg_sentiment sub-function remains the same
    def get_avg_sentiment(review_array):
        if review_array is None or not hasattr(review_array, '__iter__'):
            return 0.069
        # Step 2: Now that we know it's iterable, we can safely check its length.
        if len(review_array) == 0:
            return 0.067
        review_texts = [review[1] for review in review_array if len(review) == 2 and isinstance(review[1], str)]
        if not review_texts: return 0.999
        sentiments = []
        try:
            results = sentiment_pipeline(review_texts, truncation=True, max_length=512)
            for result in results:
                if result['label'] == 'Positive': sentiments.append(1 * result['score'])
                elif result['label'] == 'Negative': sentiments.append(-1 * result['score'])
                else: sentiments.append(0)
        except Exception as e:
            logger.error(f"Could not process a batch of reviews. Error: {e}")
            return 0.0
        return np.mean(sentiments) if sentiments else 0.107

    # Apply the function to the entire DataFrame
    df_out['transformer_sentiment'] = df_out[review_col].progress_apply(get_avg_sentiment)
    logger.success("Advanced sentiment analysis complete.")
    return df_out

# --- Execute Transformer Analysis ---
logger.info("--- Loading Data from VADER Sentiment Checkpoint for Transformer ---")
CHECKPOINT_PATH = "../data/processed/zomato_nlp_with_sentiment.parquet"
df_for_transformer = pd.read_parquet(CHECKPOINT_PATH)
logger.success("Checkpoint loaded.")

# We run this on the ORIGINAL 'reviews_list' column for maximum accuracy
df_with_transformer = generate_transformer_sentiment(df_for_transformer)

# --- Verification ---
print("\n--- Verification of Transformer Sentiment ---")
display(df_with_transformer[['name', 'rate', 'transformer_sentiment']].head())

[32m2025-09-16 19:27:19.728[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m62[0m - [1m--- Loading Data from VADER Sentiment Checkpoint for Transformer ---[0m
[32m2025-09-16 19:27:20.104[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36m<module>[0m:[36m65[0m - [32m[1mCheckpoint loaded.[0m
[32m2025-09-16 19:27:20.105[0m | [1mINFO    [0m | [36m__main__[0m:[36mgenerate_transformer_sentiment[0m:[36m15[0m - [1m--- Initializing the Advanced Multilingual Sentiment Model (XLM-RoBERTa) ---[0m
[32m2025-09-16 19:27:23.162[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mgenerate_transformer_sentiment[0m:[36m23[0m - [32m[1mMultilingual sentiment model initialized successfully ON GPU.[0m


  0%|          | 0/45187 [00:00<?, ?it/s]

[32m2025-09-16 19:27:23.204[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mgenerate_transformer_sentiment[0m:[36m58[0m - [32m[1mAdvanced sentiment analysis complete.[0m



--- Verification of Transformer Sentiment ---


Unnamed: 0,name,rate,transformer_sentiment
0,Jalsa,4.1,0.069
1,Spice Elephant,4.1,0.069
2,San Churro Cafe,3.8,0.069
3,Addhuri Udupi Bhojana,3.7,0.069
4,Grand Village,3.8,0.069


In [8]:
df_with_transformer["reviews_list"]

0        None
1        None
2        None
3        None
4        None
         ... 
45182    None
45183    None
45184    None
45185    None
45186    None
Name: reviews_list, Length: 45187, dtype: object

In [5]:
import pandas as pd
import numpy as np
from transformers import pipeline
from loguru import logger
from tqdm.auto import tqdm
import os
tqdm.pandas()

# --- STEP 1: LOAD THE ORIGINAL, TRUSTWORTHY DATA ---
logger.info("--- Starting from a Clean Slate ---")
DATA_PATH = "../data/processed/zomato_nlp.parquet"
try:
    df_nlp_source = pd.read_parquet(DATA_PATH)
    logger.success(f"Successfully loaded the ORIGINAL NLP dataset from '{DATA_PATH}'.")
    logger.info(f"DataFrame shape: {df_nlp_source.shape}")
except FileNotFoundError:
    logger.error(f"FATAL: The file was not found at '{DATA_PATH}'. Cannot proceed.")
    # In a real script, we would stop here.
    # For the notebook, create an empty df to prevent further errors.
    df_nlp_source = pd.DataFrame()


# --- STEP 2: THE DEFINITIVE SENTIMENT ANALYSIS FUNCTION ---
def generate_transformer_sentiment_definitive(df: pd.DataFrame, review_col: str = 'reviews_list') -> pd.DataFrame:
    """
    The final, definitive, robust version for calculating Transformer sentiment.
    It loads the model online and handles all known data structure issues.
    """
    if df.empty:
        logger.warning("Input DataFrame is empty. Skipping sentiment analysis.")
        return df

    logger.info("--- Initializing the Advanced Multilingual Sentiment Model (Online) ---")
    
    try:
        # Load the model directly from Hugging Face Hub
        sentiment_pipeline = pipeline(
            "sentiment-analysis", 
            model="cardiffnlp/twitter-xlm-roberta-base-sentiment",
            device=0 # Use GPU
        )
        logger.success("Multilingual sentiment model initialized successfully ON GPU.")
    except Exception as e:
        logger.error(f"Failed to initialize sentiment pipeline on GPU. Error: {e}")
        logger.warning("Falling back to CPU. This will be very slow.")
        sentiment_pipeline = pipeline(
            "sentiment-analysis", 
            model="cardiffnlp/twitter-xlm-roberta-base-sentiment",
            device=-1
        )

    df_out = df.copy()
    
    def get_avg_sentiment(review_array):
        # The most robust guard clause
        if review_array is None or not hasattr(review_array, '__len__') or len(review_array) == 0:
            return 0.696969
        
        # Correctly parse the array-of-arrays structure
        review_texts = [review[1] for review in review_array if len(review) == 2 and isinstance(review[1], str)]
        if not review_texts: 
            return 0.06789
        
        sentiments = []
        try:
            # Use a larger batch size for GPU efficiency
            results = sentiment_pipeline(review_texts, truncation=True, max_length=512, batch_size=16)
            for result in results:
                if result['label'] == 'Positive': sentiments.append(1 * result['score'])
                elif result['label'] == 'Negative': sentiments.append(-1 * result['score'])
                else: sentiments.append(0)
        except Exception as e:
            logger.error(f"Could not process a batch of reviews. Error: {e}")
            return 0.012345
            
        return np.mean(sentiments) if sentiments else 0.0234567

    logger.info("Applying sentiment analysis to the full dataset. This will take time...")
    df_out['transformer_sentiment'] = df_out[review_col].progress_apply(get_avg_sentiment)
    logger.success("Advanced sentiment analysis complete.")
    return df_out


# --- STEP 3: EXECUTE THE FULL PIPELINE ---
# We run this on our clean, freshly loaded source DataFrame.
# We will run the full dataset this time.
df_with_transformer = generate_transformer_sentiment_definitive(df_nlp_source)


# --- STEP 4: VERIFICATION & FINAL EXPORT ---
if 'transformer_sentiment' in df_with_transformer.columns:
    print("\n--- Verification of Transformer Sentiment ---")
    display(df_with_transformer[['name', 'rate', 'transformer_sentiment']].head())
    
    # Let's save this hard-earned result!
    FINAL_NLP_PATH = "../data/processed/zomato_nlp_features_final.parquet"
    logger.info(f"Exporting final NLP features to '{FINAL_NLP_PATH}'...")
    df_with_transformer.to_parquet(FINAL_NLP_PATH, index=False)
    logger.success("Final NLP features saved successfully.")
else:
    logger.error("Sentiment analysis did not complete successfully. No file was saved.")

[32m2025-09-16 19:58:23[0m | [1mINFO    [0m | [1m--- Starting from a Clean Slate ---[0m
[32m2025-09-16 19:58:24[0m | [32m[1mSUCCESS [0m | [32m[1mSuccessfully loaded the ORIGINAL NLP dataset from '../data/processed/zomato_nlp.parquet'.[0m
[32m2025-09-16 19:58:24[0m | [1mINFO    [0m | [1mDataFrame shape: (45187, 7)[0m
[32m2025-09-16 19:58:24[0m | [1mINFO    [0m | [1m--- Initializing the Advanced Multilingual Sentiment Model (Online) ---[0m




[32m2025-09-16 19:58:31[0m | [32m[1mSUCCESS [0m | [32m[1mMultilingual sentiment model initialized successfully ON GPU.[0m
[32m2025-09-16 19:58:31[0m | [1mINFO    [0m | [1mApplying sentiment analysis to the full dataset. This will take time...[0m


  0%|          | 0/45187 [00:00<?, ?it/s]

--- Logging error ---
Traceback (most recent call last):
  File "/home/puneet/tools/miniconda3/envs/newAge/lib/python3.10/logging/__init__.py", line 1100, in emit
    msg = self.format(record)
  File "/home/puneet/tools/miniconda3/envs/newAge/lib/python3.10/logging/__init__.py", line 943, in format
    return fmt.format(record)
  File "/home/puneet/tools/miniconda3/envs/newAge/lib/python3.10/logging/__init__.py", line 678, in format
    record.message = record.getMessage()
  File "/home/puneet/tools/miniconda3/envs/newAge/lib/python3.10/logging/__init__.py", line 368, in getMessage
    msg = msg % self.args
TypeError: not all arguments converted during string formatting
Call stack:
  File "/home/puneet/tools/miniconda3/envs/newAge/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/home/puneet/tools/miniconda3/envs/newAge/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/home/puneet/tools/

[32m2025-09-16 21:06:23[0m | [32m[1mSUCCESS [0m | [32m[1mAdvanced sentiment analysis complete.[0m

--- Verification of Transformer Sentiment ---


Unnamed: 0,name,rate,transformer_sentiment
0,Jalsa,4.1,0.0
1,Spice Elephant,4.1,0.0
2,San Churro Cafe,3.8,0.0
3,Addhuri Udupi Bhojana,3.7,0.0
4,Grand Village,3.8,0.0


[32m2025-09-16 21:06:24[0m | [1mINFO    [0m | [1mExporting final NLP features to '../data/processed/zomato_nlp_features_final.parquet'...[0m
[32m2025-09-16 21:06:27[0m | [32m[1mSUCCESS [0m | [32m[1mFinal NLP features saved successfully.[0m


In [4]:
df_nlp_source["reviews_list"]

0        None
1        None
2        None
3        None
4        None
         ... 
45182    None
45183    None
45184    None
45185    None
45186    None
Name: reviews_list, Length: 45187, dtype: object