In [21]:
# -*- coding: utf-8 -*-
"""Chatbot.ipynb

Updated to read local document content (Corpus_Content.txt) for general Q&A,
and to include tweet validation, blockchain push functionality, and workarounds for NLTK tokenizer errors.
"""

import io
import os
import re              # Needed for regular expression matching.
import random
import string        # For processing standard Python strings.
import warnings
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
warnings.filterwarnings('ignore')

# Point NLTK at its data directory through the environment; this runs before
# the `import nltk` further down in this cell.
os.environ["NLTK_DATA"] = "/root/nltk_data"

# The chatbot corpus comes from a local text file rather than the Google Docs API.
# Change "Corpus_Content.txt" to your actual file name.
DOCUMENT_FILENAME = "Corpus_Content.txt"
if not os.path.exists(DOCUMENT_FILENAME):
    # No corpus file available: fall back to a short built-in text.
    raw = "hello, this is a fallback text for the chatbot. feel free to ask me anything about our system."
    print("Using fallback text.\n")
else:
    # The whole corpus is lower-cased once here, at load time.
    with open(DOCUMENT_FILENAME, "r", encoding="utf-8") as corpus_file:
        raw = corpus_file.read().lower()
    print("Local document content loaded from '{}'.\n".format(DOCUMENT_FILENAME))

# Install nltk if not already installed.
!pip install -q nltk

import nltk
# Download required NLTK resources.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

from nltk.stem import WordNetLemmatizer

def _load_punkt_tokenizer():
    """Load the English Punkt tokenizer, downloading 'punkt_tab' and retrying once on LookupError."""
    resource = 'tokenizers/punkt/english.pickle'
    try:
        return nltk.data.load(resource)
    except LookupError as e:
        print("Error loading tokenizer:", e)
        print("Attempting to download 'punkt_tab' as suggested by the error message...")
        nltk.download('punkt_tab')
        # Second attempt; a further LookupError is allowed to propagate.
        return nltk.data.load(resource)


# Attempt to load the English Punkt tokenizer.
tokenizer = _load_punkt_tokenizer()

# Sentence-level and word-level views of the corpus.
sent_tokens = tokenizer.tokenize(raw)
word_tokens = nltk.word_tokenize(raw)

# Shared WordNet lemmatizer instance used by LemTokens.
lemmer = WordNetLemmatizer()
def LemTokens(tokens):
    """Return a list with the lemmatized form of every token, in input order."""
    return list(map(lemmer.lemmatize, tokens))
# Translation table mapping every ASCII punctuation code point to None (i.e. delete it).
remove_punct_dict = {ord(symbol): None for symbol in string.punctuation}
def LemNormalize(text):
    """Lower-case `text`, strip punctuation, tokenize into words and lemmatize them."""
    lowered = text.lower()
    without_punct = lowered.translate(remove_punct_dict)
    return LemTokens(nltk.word_tokenize(without_punct))

GREETING_INPUTS = ("hello", "hi", "greetings", "sup", "what's up", "hey",)
GREETING_RESPONSES = ["hi", "hey", "*nods*", "hi there", "hello", "I am glad! You are talking to me"]
def greeting(sentence):
    """
    Return a random greeting reply if `sentence` contains a known greeting.

    Matching is case-insensitive and works on whole words, so "hi" does not
    match inside "history". Punctuation attached to a word ("hello!") is
    ignored, and multi-word greetings such as "what's up" are recognised.

    Returns one entry of GREETING_RESPONSES, or None when no greeting is found.
    """
    # Strip leading/trailing punctuation from each word; inner apostrophes
    # (as in "what's") are preserved.
    words = [word.strip(string.punctuation).lower() for word in sentence.split()]
    # Pad with spaces so that phrase containment only matches on word boundaries.
    padded = " " + " ".join(word for word in words if word) + " "
    for phrase in GREETING_INPUTS:
        if " " + phrase + " " in padded:
            return random.choice(GREETING_RESPONSES)
    return None

def response(user_response):
    """
    Return the corpus sentence most similar to `user_response`.

    The query is appended to the module-level `sent_tokens` list, the whole
    list is vectorised with TF-IDF (using LemNormalize as tokenizer), and the
    sentence with the highest cosine similarity to the query, other than the
    query's own top-ranked entry, is returned.

    Side effect: `user_response` stays appended to `sent_tokens` when this
    function returns; the main loop removes it again after printing the answer.

    Returns the matching sentence, or an apology string when nothing in the
    corpus shares a term with the query (or the corpus has no sentences).
    """
    not_understood = "I am sorry! I don't understand you"
    sent_tokens.append(user_response)
    # With an empty corpus the list holds only the query, so there is no
    # "second best" entry to pick; previously this raised IndexError.
    if len(sent_tokens) < 2:
        return not_understood
    TfidfVec = TfidfVectorizer(tokenizer=LemNormalize, stop_words='english')
    tfidf = TfidfVec.fit_transform(sent_tokens)
    # Similarity of the query (last row) against every sentence, itself included.
    vals = cosine_similarity(tfidf[-1], tfidf)
    # The top-ranked entry is expected to be the query itself, so take the runner-up.
    idx = vals.argsort()[0][-2]
    req_tfidf = vals[0, idx]
    if req_tfidf == 0:
        return not_understood
    return sent_tokens[idx]

##############################
# Tweet Validation Integration
##############################

import requests
import datetime as dt
import urllib3
# push_tweet() posts with verify=False (TLS certificate checks disabled);
# silence the InsecureRequestWarning that urllib3 issues for such requests.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Blockchain ledger endpoint. The built-in default can be overridden without
# editing the notebook by setting the LEDGER_RPC environment variable.
# A trailing slash is dropped so SUBMIT_URL never contains "//submit".
LEDGER_RPC = os.environ.get("LEDGER_RPC", "https://4c7f-209-129-88-187.ngrok-free.app").rstrip("/")
SUBMIT_URL  = f"{LEDGER_RPC}/submit"

# Load sensor data. The file is read from the filesystem root
# ("/final_sensors_zones.csv"); if it is not there, a copy in the current
# working directory ("final_sensors_zones.csv") is used instead.
import pandas as pd
SENSOR_CSV_PATH = "/final_sensors_zones.csv"
if not os.path.exists(SENSOR_CSV_PATH) and os.path.exists("final_sensors_zones.csv"):
    SENSOR_CSV_PATH = "final_sensors_zones.csv"
sensors = pd.read_csv(SENSOR_CSV_PATH)
# Parse timestamps once so they can be compared against tweet times.
sensors["timestamp"] = pd.to_datetime(sensors["timestamp"])

# Hazard keyword patterns and helper functions from your tweet classification notebook.
# Hazard label -> regular expression that signals it. Dict order is the match priority.
HAZARD_PATTERNS = {
    "fire":       r"\bfire\b",
    "earthquake": r"\bearthquake\b|\bquake\b",
    "flood":      r"\bflood\b",
    "evacuate":   r"\bevacuate\b|\bevacuation\b|\bemergency\b"
}

def extract_hazard(text):
    """
    Return the first hazard label whose pattern occurs in `text`.

    `text` is converted with str() and searched case-insensitively; patterns
    are tried in HAZARD_PATTERNS order. Returns "unknown" when none matches.
    """
    haystack = str(text)
    hits = (
        label
        for label, regex in HAZARD_PATTERNS.items()
        if re.search(regex, haystack, flags=re.IGNORECASE)
    )
    return next(hits, "unknown")

def map_hazard_to_sensors(hazard):
    """Return the list of sensor types relevant to `hazard` (empty list if the hazard is not mapped)."""
    lookup = dict(
        flood=["flood", "humidity"],
        fire=["temp", "humidity"],
        earthquake=["seismic"],
        evacuate=["flood", "temp", "seismic", "humidity"],
    )
    try:
        return lookup[hazard]
    except KeyError:
        return []

VALID_THRESHOLD = 60   # Sensor threshold (1-100 scale); readings >= this count as "high"
TIME_WINDOW = "20min"  # ± window for sensor data around the tweet time
MIN_ACTIVE = 1         # Minimum number of rows with status "active" for a "valid" verdict
HIGH_PCT_REQ = 20      # Percent threshold to consider hazard real
MEAN_REQ = 20          # Minimum mean reading for a "valid" verdict

def validate_tweet(row):
    """
    Validate a tweet based on sensor readings.

    Expects `row` with keys: tweet_hazard, zone_id, and timestamp. Sensor rows
    are taken from the module-level `sensors` frame where the zone matches,
    the sensor type is relevant to the hazard, and the timestamp lies within
    TIME_WINDOW of the tweet time.

    Returns a tuple: (verdict, pct_high, mean_read, n_active)
      verdict   -- "ignore" (hazard unknown), "fake", "uncertain" or "valid"
      pct_high  -- percentage of selected readings >= VALID_THRESHOLD
      mean_read -- mean of the selected readings
      n_active  -- number of selected rows whose status is "active"
    The numeric values are plain Python float/int (not NumPy scalars) so the
    tuple can be placed directly into a JSON payload.
    """
    hazard = row["tweet_hazard"]
    if hazard == "unknown":
        return "ignore", 0, 0, 0

    zone = row["zone_id"]
    tweet_time = row["timestamp"]

    window = sensors[
        (sensors["zone_id"] == zone) &
        (sensors["sensor_type"].isin(map_hazard_to_sensors(hazard))) &
        (abs(sensors["timestamp"] - tweet_time) <= pd.Timedelta(TIME_WINDOW))
    ]

    if window.empty:
        return "fake", 0, 0, 0

    # Convert pandas/NumPy scalars to built-in types: a numpy.int64 count is
    # rejected by the json encoder when the result is posted to the ledger.
    mean_read = float(window["reading_value"].mean())
    pct_high = float((window["reading_value"] >= VALID_THRESHOLD).mean() * 100)
    n_active = int((window["status"].str.lower() == "active").sum())

    # If every reading in the window is missing the mean is NaN, which is not
    # valid JSON; report 0.0 like the other "no data" results. The verdict is
    # unaffected because pct_high is 0 in that case.
    if pd.isna(mean_read):
        mean_read = 0.0

    if pct_high >= HIGH_PCT_REQ and mean_read >= MEAN_REQ and n_active >= MIN_ACTIVE:
        verdict = "valid"
    elif pct_high < 5:
        verdict = "fake"
    else:
        verdict = "uncertain"
    return verdict, pct_high, mean_read, n_active

def push_tweet(payload):
    """
    POST `payload` as JSON to SUBMIT_URL (the blockchain ledger endpoint).

    The request uses a 10-second timeout and is sent with TLS certificate
    verification disabled (verify=False).

    Returns the HTTP status code of the ledger's reply.
    """
    reply = requests.post(
        SUBMIT_URL,
        json=payload,
        timeout=10,
        verify=False,
    )
    return reply.status_code

def process_chatbot_tweet(tweet_text, zone_id, ts_str):
    """
    Process a tweet from the chatbot:
      1. Convert timestamp string to a datetime object.
      2. Extract hazard from tweet text.
      3. Validate tweet using sensor data.
      4. Build payload and push it to the blockchain.

    Parameters:
      tweet_text -- raw tweet text to classify
      zone_id    -- zone identifier used to select sensor rows
      ts_str     -- tweet timestamp as a string parseable by pandas

    Returns a response string describing either the validation result or the
    reason the tweet could not be processed (bad timestamp, network failure,
    or a non-200 reply from the ledger).
    """
    try:
        tweet_timestamp = pd.to_datetime(ts_str)
    except Exception as e:
        return f"Invalid timestamp format: {e}"
    # An empty string (or None) does not raise; it parses to NaT/None, which
    # would otherwise be validated and pushed as if it were a real time.
    if pd.isna(tweet_timestamp):
        return f"Invalid timestamp format: {ts_str!r} is not a usable date/time"

    tweet_hazard = extract_hazard(tweet_text)
    tweet_dict = {
        "text": tweet_text,
        "zone_id": zone_id,
        "timestamp": tweet_timestamp,
        "tweet_hazard": tweet_hazard
    }
    verdict, pct_high, mean_read, n_active = validate_tweet(tweet_dict)

    # Force built-in numeric types: NumPy integer scalars coming out of pandas
    # aggregations cannot be serialised by the json encoder.
    pct_high = float(pct_high)
    mean_read = float(mean_read)
    n_active = int(n_active)

    payload = {
        "tweet_id": "manual-" + dt.datetime.now().strftime("%Y%m%d%H%M%S"),
        "zone_id": zone_id,
        "hazard": tweet_hazard,
        "status": verdict,
        "timestamp": tweet_timestamp.isoformat(),
        "pct_high": pct_high,
        "mean_read": mean_read,
        "n_active": n_active
    }
    try:
        push_result = push_tweet(payload)
    except requests.RequestException as e:
        # Connection errors and timeouts are reported to the user instead of
        # propagating out of the chat loop.
        return f"Error sending tweet to the ledger: {e}"

    if push_result == 200:
        return (f"Tweet validated as '{verdict}'. Mean sensor reading: {mean_read:.2f}, "
                f"percent high: {pct_high:.1f}%, active sensors: {n_active}. "
                f"Successfully pushed to blockchain.")
    else:
        return f"Error sending tweet to the ledger. Status code: {push_result}"

##############################
# Chatbot Main Loop
##############################
# Interactive loop: reads one line per turn until the user types 'cola',
# 'thanks' or 'thank you'.
flag = True
print("My name is Freestyle Genie. I will answer your queries and validate tweets. To exit, type 'cola'!")
while flag:
    user_response = input()
    user_response = user_response.lower()

    # If the user enters "validate tweet", enter tweet validation mode.
    if "validate tweet" in user_response:
        tweet_text = input("Enter tweet text: ")
        zone_id = input("Enter zone (e.g., ZoneA): ")
        ts_str = input("Enter tweet timestamp (YYYY-MM-DD HH:MM:SS): ")
        try:
            result = process_chatbot_tweet(tweet_text, zone_id, ts_str)
        except Exception as exc:
            # Report the failure and keep the chat session alive rather than
            # letting one bad validation end the whole loop.
            result = f"Tweet validation failed: {exc}"
        print("Freestyle Genie:", result)
    elif user_response == 'cola':
        flag = False
        print("Freestyle Genie: Bye! take care..")
    elif user_response in ['thanks', 'thank you']:
        flag = False
        print("You are welcome..")
    else:
        # Call greeting() once: it picks a random reply, so a second call
        # would be a separate random draw.
        greeting_reply = greeting(user_response)
        if greeting_reply is not None:
            print("Freestyle Genie:", greeting_reply)
        else:
            try:
                answer = response(user_response)
            finally:
                # response() leaves the query appended to sent_tokens. Drop
                # that trailing entry even if response() raised, and do it by
                # position so an identical corpus sentence is never removed.
                if sent_tokens and sent_tokens[-1] == user_response:
                    sent_tokens.pop()
            print("Freestyle Genie:", answer)


Local document content loaded from 'Corpus_Content.txt'.



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


My name is Freestyle Genie. I will answer your queries and validate tweets. To exit, type 'cola'!
zone 1 summary
Freestyle Genie: zone 0.0 summary:
total events: 1105.
most common risk level: low.
how many flood in zone 2
Freestyle Genie: most common disaster type: flood.
zone 5 most common disater
Freestyle Genie: zone 0.0 summary:
total events: 1105.
most common risk level: low.
what is this dashboard
Freestyle Genie: * explanatory note:
 this dashboard is part of a broader crisis management project designed for smart city applications.
exit
Freestyle Genie: I am sorry! I don't understand you
cola
Freestyle Genie: Bye! take care..


In [None]:
# Mount the user's Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
