In [2]:
!pip install pandas geojson




[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import pandas as pd
import geojson
import sys

# Using the GeoJson to convert Csv to geoJson

In [36]:
def csv_to_geojson(csv_path, output_path):
    # Read CSV file
    df = pd.read_csv(csv_path, encoding='latin1')

    # Ensure latitude and longitude columns exist
    if 'Latitude' not in df.columns or 'Longitude' not in df.columns:
        print("CSV must have 'latitude' and 'longitude' columns.")
        sys.exit(1)

    # Create GeoJSON features
    features = []
    for idx, row in df.iterrows():
        point = geojson.Point((row['Longitude'], row['Latitude']))
        properties = row.drop(['Latitude', 'Longitude']).to_dict()
        feature = geojson.Feature(geometry=point, properties=properties)
        features.append(feature)
        print(f"\nFeature {idx+1}:\n", geojson.dumps(feature, indent=2))

    # Create FeatureCollection
    feature_collection = geojson.FeatureCollection(features)

    # Write to GeoJSON file
    with open(output_path, 'w') as f:
        geojson.dump(feature_collection, f, indent=2)
    print(f"GeoJSON saved to: {output_path}")

# Example usage
csv_path = "updated_dataset_with_coords.csv"        # Replace with your CSV file path
geojson_path = "restaurants_of_mumbai.geojson"  # Desired output file
csv_to_geojson(csv_path, geojson_path)


CSV must have 'latitude' and 'longitude' columns.


SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [16]:
df = pd.read_csv("zomato.csv", encoding='latin1')

In [17]:
df.head()

Unnamed: 0,Restaurant ID,Restaurant Name,Country Code,City,Address,Locality,Locality Verbose,Longitude,Latitude,Cuisines,...,Currency,Has Table booking,Has Online delivery,Is delivering now,Switch to order menu,Price range,Aggregate rating,Rating color,Rating text,Votes
0,6317637,Le Petit Souffle,162,Makati City,"Third Floor, Century City Mall, Kalayaan Avenu...","Century City Mall, Poblacion, Makati City","Century City Mall, Poblacion, Makati City, Mak...",121.027535,14.565443,"French, Japanese, Desserts",...,Botswana Pula(P),Yes,No,No,No,3,4.8,Dark Green,Excellent,314
1,6304287,Izakaya Kikufuji,162,Makati City,"Little Tokyo, 2277 Chino Roces Avenue, Legaspi...","Little Tokyo, Legaspi Village, Makati City","Little Tokyo, Legaspi Village, Makati City, Ma...",121.014101,14.553708,Japanese,...,Botswana Pula(P),Yes,No,No,No,3,4.5,Dark Green,Excellent,591
2,6300002,Heat - Edsa Shangri-La,162,Mandaluyong City,"Edsa Shangri-La, 1 Garden Way, Ortigas, Mandal...","Edsa Shangri-La, Ortigas, Mandaluyong City","Edsa Shangri-La, Ortigas, Mandaluyong City, Ma...",121.056831,14.581404,"Seafood, Asian, Filipino, Indian",...,Botswana Pula(P),Yes,No,No,No,4,4.4,Green,Very Good,270
3,6318506,Ooma,162,Mandaluyong City,"Third Floor, Mega Fashion Hall, SM Megamall, O...","SM Megamall, Ortigas, Mandaluyong City","SM Megamall, Ortigas, Mandaluyong City, Mandal...",121.056475,14.585318,"Japanese, Sushi",...,Botswana Pula(P),No,No,No,No,4,4.9,Dark Green,Excellent,365
4,6314302,Sambo Kojin,162,Mandaluyong City,"Third Floor, Mega Atrium, SM Megamall, Ortigas...","SM Megamall, Ortigas, Mandaluyong City","SM Megamall, Ortigas, Mandaluyong City, Mandal...",121.057508,14.58445,"Japanese, Korean",...,Botswana Pula(P),Yes,No,No,No,4,4.8,Dark Green,Excellent,229


In [18]:
df.columns

Index(['Restaurant ID', 'Restaurant Name', 'Country Code', 'City', 'Address',
       'Locality', 'Locality Verbose', 'Longitude', 'Latitude', 'Cuisines',
       'Average Cost for two', 'Currency', 'Has Table booking',
       'Has Online delivery', 'Is delivering now', 'Switch to order menu',
       'Price range', 'Aggregate rating', 'Rating color', 'Rating text',
       'Votes'],
      dtype='object')

# Using Groq to generate geoJson

In [9]:
from groq import Groq

In [22]:
import json

In [24]:
# Initialize Groq client with your API key
client = Groq(api_key="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")  # ← Replace with your actual key

# Choose model: mixtral-8x7b-32768 or llama3-70b-8192
model_name = "llama3-70b-8192"

## Prompt Section along with input

In [37]:
def generate_prompt(row):
    return f"""
Given this restaurant information, generate a valid GeoJSON Feature object:

- Name: {row['name']}
- Address: {row['address']}
- Latitude: {row['latitude']}
- Longitude: {row['longitude']}
Only return the GeoJSON Feature object.
"""

## Initializing Groq model

In [38]:
def call_groq_llm(prompt):
    response = client.chat.completions.create(
        model=model_name,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.2
    )
    content = response.choices[0].message.content.strip()
    print(f"\n📥 Raw LLM Response:\n{content}\n")  # Add this line for debugging
    return content


In [39]:
import re

def extract_json_block(text):
    # Remove markdown-style code fencing if present
    match = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", text, re.DOTALL)
    if match:
        return match.group(1)
    else:
        return text.strip()  # fallback if no code fencing


## Final Part

In [56]:
def csv_to_geojson_with_groq(csv_path, output_geojson_path, max_rows=1337):
    df = pd.read_csv(csv_path, encoding='latin1')
    features = []

    for i, row in df.iterrows():
        if i >= max_rows:
            break  # Prevent excessive usage during testing

        prompt = generate_prompt(row)
        print(f"🔄 Processing row {i+1}...")

        try:
            geojson_str = call_groq_llm(prompt)
            geojson_str = extract_json_block(geojson_str)
            geojson_obj = json.loads(geojson_str)
            feature = json.loads(geojson_str)
            features.append(feature)
            print(f"✅ Added row {i+1}")
        except json.JSONDecodeError as e:
            print(f"❌ Failed on row {i+1}: {e}")

    feature_collection = {
        "type": "FeatureCollection",
        "features": features
    }

    with open(output_geojson_path, "w") as f:
        json.dump(feature_collection, f, indent=2)

    print(f"\n🎉 GeoJSON file saved to: {output_geojson_path}")

# Run the function
csv_to_geojson_with_groq("osm_data.csv", "osm_data.geojson", max_rows=1337)

🔄 Processing row 1...

📥 Raw LLM Response:
{
  "type": "Feature",
  "geometry": {
    "type": "Point",
    "coordinates": [null, null]
  },
  "properties": {
    "name": null,
    "address": null
  }
}

✅ Added row 1
🔄 Processing row 2...

📥 Raw LLM Response:
Here is the GeoJSON Feature object:

```
{
  "type": "Feature",
  "geometry": {
    "type": "Point",
    "coordinates": [72.8240813, 19.0517569]
  },
  "properties": {
    "name": "Good Luck Restaurant",
    "address": "M Kahn Chowk, Mumbai, 400050"
  }
}
```

✅ Added row 2
🔄 Processing row 3...

📥 Raw LLM Response:
Here is the GeoJSON Feature object:

```
{
  "type": "Feature",
  "geometry": {
    "type": "Point",
    "coordinates": [72.8258922, 19.0548735]
  },
  "properties": {
    "name": "Wonton House",
    "address": "Mumbai, 400050"
  }
}
```

✅ Added row 3
🔄 Processing row 4...

📥 Raw LLM Response:
Here is the GeoJSON Feature object:

```
{
  "type": "Feature",
  "geometry": {
    "type": "Point",
    "coordinates": [72.82

KeyboardInterrupt: 

## Json to Csv file

In [55]:
import json
import pandas as pd

# Load the JSON file
with open("osm_data.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Prepare a list of clean records
records = []

# Flatten only the inner dictionaries (skip lists and other types)
for entry in data.values():
    if isinstance(entry, dict):
        records.append(entry)
    elif isinstance(entry, list):
        # If it's a list, loop through and pick only dict items
        records.extend([item for item in entry if isinstance(item, dict)])

# Convert to DataFrame
df = pd.DataFrame(records)

# Save to CSV
df.to_csv("osm_data.csv", index=False)


## Merging all geoJson File

In [57]:
geojson_files = ["updated_justdial_data.geojson", "oldwikipedia_data.geojson", "restaurants_of_mumbai.geojson","overpass_mumbai.geojson"]
# Initialize an empty list to hold all features
all_features = []

# Loop through each file and collect features
for file in geojson_files:
    with open(file, "r", encoding="utf-8") as f:
        data = json.load(f)
        if data["type"] == "FeatureCollection":
            all_features.extend(data["features"])

# Build the concatenated GeoJSON
concatenated_geojson = {
    "type": "FeatureCollection",
    "features": all_features
}

# Save to a new file
with open("concatenated.geojson", "w", encoding="utf-8") as f:
    json.dump(concatenated_geojson, f, ensure_ascii=False, indent=2)

## Weights assigning

In [74]:
# Input files and corresponding weights
geojson_sources = {
    "overpass_mumbai.geojson": 4,
    "updated_justdial_data.geojson": 3,
    "restaurants_of_mumbai.geojson": 2,
    "oldwikipedia_data.geojson": 1
}

In [60]:
import collections

In [75]:
# Dictionary to hold weighted address counts
address_weights = collections.defaultdict(int)

## 1st Try

In [62]:
# Set to hold all unique address-like entries
unique_addresses = set()

In [76]:
def extract_address(feature):
    props = feature.get("properties", {})
    for key in ["address", "name", "road", "street", "location"]:
        value = props.get(key)
        if isinstance(value, str) and value.strip():
            return value.strip().lower()
    return None


In [66]:
# Collect unique addresses
for file in geojson_files:
    with open(file, "r", encoding="utf-8") as f:
        data = json.load(f)
        if data["type"] == "FeatureCollection":
            for feature in data["features"]:
                address = extract_address(feature)
                if address:
                    unique_addresses.add(address)

In [69]:
import random

In [70]:
# Assign base weights and randomly increment
address_weights = collections.defaultdict(int)
for address in unique_addresses:
    base_weight = random.randint(1, 3)  # Base random weight
    extra_boost = random.randint(0, 5)  # Random increment
    address_weights[address] = base_weight + extra_boost

# Save results
with open("random_weighted_addresses.json", "w", encoding="utf-8") as f:
    json.dump(address_weights, f, indent=2, ensure_ascii=False)

print("Randomly weighted addresses saved to 'random_weighted_addresses.json'")

Randomly weighted addresses saved to 'random_weighted_addresses.json'


## Bas Weight Wala check

In [77]:
# Process each GeoJSON file
for file, weight in geojson_sources.items():
    with open(file, "r", encoding="utf-8") as f:
        data = json.load(f)
        if data["type"] == "FeatureCollection":
            for feature in data["features"]:
                address = extract_address(feature)
                if address:
                    address_weights[address] += weight

In [78]:
# Save the aggregated weights to a JSON file
with open("weighted_address_aggregation.json", "w", encoding="utf-8") as f:
    json.dump(address_weights, f, indent=2, ensure_ascii=False)

print("Weighted address aggregation saved to 'weighted_address_aggregation.json'")

Weighted address aggregation saved to 'weighted_address_aggregation.json'


## Final Confidence percentage

In [80]:
# Calculate confidence percentages
max_weight = max(address_weights.values(), default=1)
address_confidences = {}

In [81]:

for address, weight in address_weights.items():
    if weight < 4:
        confidence = 50
    else:
        # Scale between 90% to 98% excluding 100%
        confidence = int(90 + (weight / max_weight) * 8)
        confidence = min(confidence, 98)
    address_confidences[address] = {
        "weight": weight,
        "confidence_percent": confidence
    }

In [82]:
# Save result
with open("weighted_address_confidence.json", "w", encoding="utf-8") as f:
    json.dump(address_confidences, f, indent=2, ensure_ascii=False)

print("Confidence-based address aggregation saved to 'weighted_address_confidence.json'")

Confidence-based address aggregation saved to 'weighted_address_confidence.json'


## Now merging the confidence with level with the geoJson wala file for dusra LLM

In [84]:
import json

# Load address weights and confidence
with open("weighted_address_confidence.json", "r", encoding="utf-8") as f:
    address_confidences = json.load(f)

def extract_address(props):
    for key in ["address", "name", "road", "street", "location"]:
        value = props.get(key)
        if isinstance(value, str):
            return value.strip().lower()
    return None

# Load the concatenated GeoJSON
with open("concatenated.geojson", "r", encoding="utf-8") as f:
    geojson_data = json.load(f)

# Update features with weight and confidence
for feature in geojson_data.get("features", []):
    props = feature.get("properties", {})
    address = extract_address(props)
    if address and address in address_confidences:
        props["weight"] = address_confidences[address]["weight"]
        props["confidence_percent"] = address_confidences[address]["confidence_percent"]
    else:
        props["weight"] = 0
        props["confidence_percent"] = 0

# Save updated GeoJSON
with open("combined_weighted.geojson", "w", encoding="utf-8") as f:
    json.dump(geojson_data, f, indent=2, ensure_ascii=False)

print("Updated GeoJSON with weights and confidence saved to 'combined_weighted.geojson'")


Updated GeoJSON with weights and confidence saved to 'combined_weighted.geojson'


## Final leaning 

In [88]:
import json

# Load the GeoJSON
with open("combined_weighted.geojson", "r", encoding="utf-8") as f:
    geojson_data = json.load(f)

# Function to check if any property is null, empty string, or only whitespace
def has_any_null_or_empty(props):
    return any(v is None or str(v).strip() == "" for v in props.values())

# Filter features
filtered_features = []
for feature in geojson_data.get("features", []):
    props = feature.get("properties", {})
    if not has_any_null_or_empty(props):
        filtered_features.append(feature)

# Save cleaned GeoJSON
geojson_data["features"] = filtered_features
with open("cleaned_combined.geojson", "w", encoding="utf-8") as f:
    json.dump(geojson_data, f, ensure_ascii=False, indent=2)

print(f"Cleaned GeoJSON saved with {len(filtered_features)} features.")


Cleaned GeoJSON saved with 1339 features.
