# Filtering the Noise: ML for Trustworthy Location Reviews
## 24-Hour Hackathon Solution

**Team:** [Your Team Name]  
**Date:** August 27, 2025  
**Challenge:** Design and implement an ML-based system to evaluate the quality and relevancy of Google location reviews

### Problem Statement
- **Gauge review quality**: Detect spam, advertisements, irrelevant content, and rants
- **Assess relevancy**: Determine if review content is genuinely related to the location
- **Enforce policies**: Automatically flag reviews violating predefined policies

## 🔨 Setup

In [1]:
# ⚠️ Run this cell only if fresh runtime or first time setup

# Install required packages
%pip install transformers torch datasets pandas numpy scikit-learn matplotlib seaborn plotly
%pip install huggingface-hub accelerate
%pip install nltk spacy wordcloud
%pip install kaggle
%python -m spacy download en_core_web_sm
print("All packages installed successfully!")

Collecting transformers
  Using cached transformers-4.55.4-py3-none-any.whl.metadata (41 kB)
Collecting torch
  Using cached torch-2.8.0-cp312-cp312-win_amd64.whl.metadata (30 kB)
Collecting datasets
  Using cached datasets-4.0.0-py3-none-any.whl.metadata (19 kB)
Collecting pandas
  Using cached pandas-2.3.2-cp312-cp312-win_amd64.whl.metadata (19 kB)
Collecting numpy
  Using cached numpy-2.3.2-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting scikit-learn
  Using cached scikit_learn-1.7.1-cp312-cp312-win_amd64.whl.metadata (11 kB)
Collecting matplotlib
  Using cached matplotlib-3.10.5-cp312-cp312-win_amd64.whl.metadata (11 kB)
Collecting seaborn
  Using cached seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting plotly
  Using cached plotly-6.3.0-py3-none-any.whl.metadata (8.5 kB)
Collecting filelock (from transformers)
  Using cached filelock-3.19.1-py3-none-any.whl.metadata (2.1 kB)
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Using cached huggingface_hub

UsageError: Line magic function `%python` not found (But cell magic `%%python` exists, did you mean that instead?).


In [3]:
# ⚠️ Run this cell only if fresh runtime or first time setup

# Import essential libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# NLP and ML libraries
import nltk
import spacy
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Data processing
import re
import string
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Terminal commands
import os
from pathlib import Path
import shutil

# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('vader_lexicon')

print("All imports successful!")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jotha\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


All imports successful!


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jotha\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\jotha\AppData\Roaming\nltk_data...


In [7]:
# ⚠️ Run this cell only if fresh runtime or first time setup

from kaggle.api.kaggle_api_extended import KaggleApi

# Kaggle API Setup & Downloading of Dataset to ./kaggle_data directory
def config_kaggle_api_token():
    # kaggle_dir = Path.home() / '.config' / 'kaggle'
    kaggle_dir = Path.home() / '.kaggle'
    kaggle_dir.mkdir(exist_ok=True)

    shutil.copy('./kaggle.json', kaggle_dir / 'kaggle.json')
    os.chmod(kaggle_dir / 'kaggle.json', 0o600)

def download_kaggle_dataset(path='./kaggle_data', dataset_name="denizbilginn/google-maps-restaurant-reviews"):
    api = KaggleApi()
    api.authenticate()
    dataset_name="denizbilginn/google-maps-restaurant-reviews"
    api.dataset_download_files(dataset_name,
                            path=path,
                            unzip=True)

## 📊 Data Collection & Loading

We'll use the provided Google Local Reviews dataset. You can also supplement with additional data sources.

In [8]:
# ⚠️ Run this cell only if fresh runtime or first time setup

# Download Kaggle Dataset
config_kaggle_api_token()
download_kaggle_dataset()

Dataset URL: https://www.kaggle.com/datasets/denizbilginn/google-maps-restaurant-reviews


In [11]:
# Data Loading Functions

def load_dataset(file_path):
    """Load dataset from local CSV file"""
    try:
        if os.path.exists(file_path):
            df = pd.read_csv(file_path)
            print(f"✅ Loaded {len(df)} rows from {file_path}")
            df = standardize_columns(df)
            return df
        else:
            print(f"❌ File not found: {file_path}")
            return None
    except Exception as e:
        print(f"❌ Error loading local file: {e}")
        return None

def standardize_columns(df):
    """Standardize column names to match our expected format"""
    # Common column mappings
    column_mappings = {
        'text': 'review_text',
        'review': 'review_text',
        'comment': 'review_text',
        'content': 'review_text',
        'review_text': 'review_text',

        'rating': 'rating',
        'stars': 'rating',
        'score': 'rating',
        'star_rating': 'rating',

        'business': 'business_name',
        'restaurant': 'business_name',
        'place_name': 'business_name',
        'name': 'business_name',

        'user': 'user_id',
        'user_name': 'user_id',
        'reviewer': 'user_id',

        'date': 'timestamp',
        'time': 'timestamp',
        'created_at': 'timestamp',
        'review_date': 'timestamp'
    }

    # Convert column names to lowercase for matching
    df_columns_lower = [col.lower() for col in df.columns]

    # Apply mappings
    new_columns = []
    for col in df.columns:
        col_lower = col.lower()
        if col_lower in column_mappings:
            new_columns.append(column_mappings[col_lower])
        else:
            new_columns.append(col)

    df.columns = new_columns

    # Ensure we have required columns
    required_columns = ['review_text', 'rating']
    for col in required_columns:
        if col not in df.columns:
            if col == 'review_text':
                # Try to find any text column
                text_cols = [c for c in df.columns if 'text' in c.lower() or 'review' in c.lower() or 'comment' in c.lower()]
                if text_cols:
                    df['review_text'] = df[text_cols[0]]
                else:
                    print(f"⚠️ Could not find text column, creating placeholder")
                    df['review_text'] = "Sample review text"
            elif col == 'rating':
                # Try to find any rating column
                rating_cols = [c for c in df.columns if 'rating' in c.lower() or 'star' in c.lower() or 'score' in c.lower()]
                if rating_cols:
                    df['rating'] = df[rating_cols[0]]
                else:
                    print(f"⚠️ Could not find rating column, creating placeholder")
                    df['rating'] = 3  # Default neutral rating

    # Add missing optional columns
    if 'business_name' not in df.columns:
        df['business_name'] = 'Unknown Business'
    if 'user_id' not in df.columns:
        df['user_id'] = [f'user_{i}' for i in range(len(df))]
    if 'timestamp' not in df.columns:
        df['timestamp'] = pd.date_range('2024-01-01', periods=len(df), freq='D')

    return df

In [None]:
# Data Cleanup

def _find_col(df, aliases, required=True):
    """Return the first matching column from aliases; None if not found and required=False."""
    cols_lower = {c.lower(): c for c in df.columns}
    for a in aliases:
        if a.lower() in cols_lower:
            return cols_lower[a.lower()]
    if required:
        raise KeyError(f"None of the aliases {aliases} found in columns: {list(df.columns)}")
    return None

def clean_reviews_dataset(df):
    """
    Keep rows that have ALL of the following (non-empty, non-NaN):
      - business_name
      - author_name
      - text
      - rating
    Allow missing: photo, rating_category
    Preserve output columns in original schema.
    """

    # Resolve columns even if earlier steps renamed them
    col_business = _find_col(df, ["business_name", "restaurant", "place_name", "name"])
    col_author   = _find_col(df, ["author_name", "user", "user_name", "reviewer"])
    col_text     = _find_col(df, ["text", "review_text", "comment", "content"])
    col_rating   = _find_col(df, ["rating", "stars", "score", "star_rating"])

    # Optional columns may or may not exist
    col_photo          = _find_col(df, ["photo"], required=False)
    col_rating_category= _find_col(df, ["rating_category"], required=False)

    # Work on a copy
    d = df.copy()

    # Normalize whitespace for string fields (only if they exist)
    for c in [col_business, col_author, col_text]:
        d[c] = d[c].astype(str).str.strip()

    # Coerce rating to numeric
    d[col_rating] = pd.to_numeric(d[col_rating], errors="coerce")

    # Drop rows with missing/empty required fields
    before = len(d)
    d = d.dropna(subset=[col_business, col_author, col_text, col_rating])
    # Remove empty-string rows in required text columns
    for c in [col_business, col_author, col_text]:
        d = d[d[c] != ""]
    # Optionally enforce valid rating range (comment out if you want raw)
    d = d[(d[col_rating] >= 1) & (d[col_rating] <= 5)]

    removed = before - len(d)
    print(f"🧹 Cleaned dataset: {before} → {len(d)} rows (removed {removed})")

    # Rebuild output with your target column names in the same format
    out = pd.DataFrame({
        "business_name":    d[col_business],
        "author_name":      d[col_author],
        "text":             d[col_text],
        "rating":           d[col_rating],
    })

    # Attach optional columns if present; else create with NaN
    out["photo"] = d[col_photo] if col_photo in d.columns else pd.Series([pd.NA]*len(d))
    out["rating_category"] = d[col_rating_category] if col_rating_category in d.columns else pd.Series([pd.NA]*len(d))

    # Keep any extra columns? If you want to strictly keep only the six, return `out` as is.
    return out

✅ Loaded 1100 rows from ./kaggle_data/reviews.csv
🔧 Introduced a missing value in row 5 (text column)

🧹 Cleaned dataset: 1100 → 1099 rows (removed 1)

📋 Dataset Info (Cleaned):
<class 'pandas.core.frame.DataFrame'>
Index: 1099 entries, 0 to 1099
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   business_name    1099 non-null   object
 1   author_name      1099 non-null   object
 2   text             1099 non-null   object
 3   rating           1099 non-null   int64 
 4   photo            1099 non-null   object
 5   rating_category  1099 non-null   object
dtypes: int64(1), object(5)
memory usage: 60.1+ KB
None

🔍 First 5 rows after cleaning:
                     business_name    author_name text  rating  \
0  Haci'nin Yeri - Yigit Lokantasi    Gulsum Akar  nan       5   
1  Haci'nin Yeri - Yigit Lokantasi  Oguzhan Cetin  nan       4   
2  Haci'nin Yeri - Yigit Lokantasi     Yasin Kuyu  nan       3   
3  Haci'n

In [None]:
# Load the dataset
df = load_dataset('./kaggle_data/reviews.csv')

# 👇 Simulate a bad row (make the 5th row's text missing)
df.loc[4, "text"] = ""   # or "" to test empty-string removal
print("🔧 Introduced a missing value in row 5 (text column)\n")

df = clean_reviews_dataset(df)

print("\n📋 Cleaned Dataset Info:")
print(df.info())
print(f"\n📊 Dataset shape: {df.shape}")
print("\n🔍 First 5 reviews:")
print(df[['review_text', 'rating', 'business_name']].head())

# Display data quality info
print(f"\n✅ Data Quality Check:")
print(f"- Total reviews: {len(df)}")
print(f"- Unique businesses: {df['business_name'].nunique()}")
print(f"- Rating distribution: {dict(df['rating'].value_counts().sort_index())}")
print(f"- Missing values: {df.isnull().sum().sum()}")
print(f"- Average review length: {df['review_text'].str.len().mean():.1f} characters")