In [1]:
import os
import pandas as pd

# Force change to D drive
os.chdir("D:/GroundwaterChatbot")
print(f"Working in: {os.getcwd()}")

class FastGroundwaterBot:
    def __init__(self):
        self.data = None
        self.location_col = None
        self.load_small_csv()
    
    def load_small_csv(self):
        """Load CSV with limits to prevent hanging"""
        print("Loading CSV...")
        
        csv_files = [f for f in os.listdir('.') if f.endswith('.csv')]
        
        if not csv_files:
            print("No CSV files found")
            return False
        
        csv_file = csv_files[0]
        print(f"Found CSV: {csv_file}")
        
        try:
            # Load only first 100 rows to test
            print("Loading first 100 rows...")
            self.data = pd.read_csv(csv_file, nrows=100)
            print(f"Loaded {len(self.data)} rows successfully")
            
            # Show columns
            print(f"Columns: {list(self.data.columns)[:5]}...")
            
            # Find location column
            location_keywords = ['district', 'block', 'location', 'place']
            location_cols = [col for col in self.data.columns 
                           if any(word in col.lower() for word in location_keywords)]
            
            if location_cols:
                self.location_col = location_cols[0]
                print(f"Using location column: {self.location_col}")
                
                # Show sample locations
                sample_locs = self.data[self.location_col].dropna().unique()[:3]
                print(f"Sample locations: {list(sample_locs)}")
                
                return True
            else:
                print("No location column found, using first column")
                self.location_col = self.data.columns[0]
                return True
                
        except Exception as e:
            print(f"CSV loading error: {e}")
            return False
    
    def find_data(self, location):
        """Simple data finder"""
        if self.data is None:
            return "No data loaded"
        
        print(f"Searching for: {location}")
        
        # Simple search
        location_upper = location.upper()
        
        # Check if location exists
        matches = self.data[self.data[self.location_col].str.upper().str.contains(location_upper, na=False)]
        
        if matches.empty:
            return f"No data found for {location}"
        
        # Get first match
        row = matches.iloc[0]
        location_name = row[self.location_col]
        
        # Show first few data points
        result = [f"Data for {location_name}:"]
        
        for col in self.data.columns[:5]:  # Show first 5 columns only
            value = row[col]
            if pd.notna(value):
                result.append(f"  {col}: {value}")
        
        return "\n".join(result)
    
    def ask(self, question):
        """Simple ask function"""
        print(f"Processing question: {question}")
        
        # Extract location (simple approach)
        words = question.lower().split()
        
        # Look for location after common words
        location = ""
        for i, word in enumerate(words):
            if word in ['in', 'of', 'for'] and i < len(words) - 1:
                location = words[i + 1]
                break
        
        if not location and words:
            location = words[-1]  # Use last word
        
        return self.find_data(location)

# Test the fast bot
print("="*40)
print("FAST GROUNDWATER BOT TEST")
print("="*40)

try:
    bot = FastGroundwaterBot()
    
    if bot.data is not None:
        print("\n‚úÖ Bot ready!")
        
        # Quick test
        print("\nTesting...")
        test_q = "groundwater in SHEOPUR"
        result = bot.ask(test_q)
        print(f"Q: {test_q}")
        print(f"A: {result}")
        
        print("\n" + "="*40)
        print("INTERACTIVE MODE")
        
        while True:
            question = input("\nQuestion (or 'quit'): ").strip()
            
            if question.lower() in ['quit', 'exit', '']:
                break
                
            answer = bot.ask(question)
            print(f"Answer:\n{answer}")
    
    else:
        print("‚ùå Bot failed to initialize")

except Exception as e:
    print(f"Error: {e}")
    
    # Emergency fallback
    print("\nüÜò EMERGENCY MODE - Manual CSV check")
    
    files = os.listdir('.')
    csv_files = [f for f in files if f.endswith('.csv')]
    
    print(f"Files found: {files[:10]}...")
    print(f"CSV files: {csv_files}")
    
    if csv_files:
        print(f"\nTrying to read {csv_files[0]} header only...")
        try:
            sample = pd.read_csv(csv_files[0], nrows=1)
            print(f"Columns: {list(sample.columns)}")
            print("CSV structure looks OK")
        except Exception as e2:
            print(f"CSV read error: {e2}")


Working in: D:\GroundwaterChatbot
FAST GROUNDWATER BOT TEST
Loading CSV...
Found CSV: Rainwater_data.Ingres_Data.csv
Loading first 100 rows...


  self.data = pd.read_csv(csv_file, nrows=100)


Loaded 100 rows successfully
Columns: ['_id', 'locationName', 'area.non_recharge_worthy.commandArea', 'area.non_recharge_worthy.nonCommandArea', 'area.non_recharge_worthy.poorQualityArea']...
Using location column: locationName
Sample locations: ['NORTH 24 PARGANAS', 'DAKSHIN DINAJPUR', 'UTTAR DINAJPUR']

‚úÖ Bot ready!

Testing...
Processing question: groundwater in SHEOPUR
Searching for: sheopur
Q: groundwater in SHEOPUR
A: Data for SHEOPUR:
  _id: 68c43357f5a6ddc1f765cd28
  locationName: SHEOPUR

INTERACTIVE MODE



Question (or 'quit'):  y


Processing question: y
Searching for: y
Answer:
Data for PURULIYA:
  _id: 68c43357f5a6ddc1f765cd14
  locationName: PURULIYA
  area.non_recharge_worthy.commandArea: 0.0
  area.non_recharge_worthy.nonCommandArea: 0.0
  area.non_recharge_worthy.poorQualityArea: 0.0



Question (or 'quit'):  PURI


Processing question: PURI
Searching for: puri
Answer:
Data for SHIVPURI:
  _id: 68c43357f5a6ddc1f765cd58
  locationName: SHIVPURI



Question (or 'quit'):  ANJAW


Processing question: ANJAW
Searching for: anjaw
Answer:
No data found for anjaw



Question (or 'quit'):  SIDHI


Processing question: SIDHI
Searching for: sidhi
Answer:
Data for SIDHI:
  _id: 68c43357f5a6ddc1f765cd3c
  locationName: SIDHI



Question (or 'quit'):  quit


In [9]:
import os
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

os.chdir("D:/GroundwaterChatbot")

class HumanReadableGroundwaterBot:
    def __init__(self):
        self.csv_file = 'Rainwater_data.Ingres_Data.csv'
        self.data = None
        self.load_sample_data()
    
    def load_sample_data(self):
        """Load small sample for human-readable responses"""
        print("Loading groundwater data...")
        
        try:
            # Load just 50 rows to keep it manageable
            self.data = pd.read_csv(self.csv_file, nrows=50)
            locations = self.data['locationName'].dropna().unique()
            print(f"Ready! Loaded data for {len(locations)} locations")
            print(f"Available: {', '.join(locations[:3])}...")
            
        except Exception as e:
            print(f"Loading failed: {e}")
            # Create sample data as fallback
            self.create_sample_data()
    
    def create_sample_data(self):
        """Create sample data if CSV fails"""
        print("Creating sample groundwater data...")
        
        sample_data = {
            'locationName': ['SHEOPUR', 'BHOPAL', 'INDORE', 'GWALIOR', 'UJJAIN'],
            'groundwater_level_meters': [15.2, 8.5, 12.1, 18.7, 10.3],
            'water_quality': ['Poor', 'Good', 'Moderate', 'Good', 'Poor'],
            'criticality_status': ['Critical', 'Safe', 'Semi-Critical', 'Safe', 'Critical'],
            'annual_rainfall_mm': [800, 1200, 950, 750, 880]
        }
        
        self.data = pd.DataFrame(sample_data)
        print("Sample data ready!")
    
    def find_location_data(self, location_name):
        """Find data for specific location"""
        if self.data is None:
            return None
        
        location_upper = location_name.upper()
        
        # Try exact match
        matches = self.data[self.data['locationName'].str.upper() == location_upper]
        
        if matches.empty:
            # Try partial match
            matches = self.data[self.data['locationName'].str.upper().str.contains(location_upper, na=False)]
        
        return matches.iloc[0] if not matches.empty else None
    
    def format_human_response(self, location_data, question_type):
        """Convert data to human-readable response"""
        if location_data is None:
            return "Sorry, I don't have data for that location. Try asking about SHEOPUR, BHOPAL, or INDORE."
        
        location = location_data['locationName']
        
        # Different responses based on what user asked
        if 'level' in question_type:
            # Look for level data
            level_cols = [col for col in location_data.index if 'level' in str(col).lower()]
            if level_cols:
                level_value = location_data[level_cols[0]]
                return f"The groundwater level in {location} is {level_value} meters below surface."
            else:
                return f"I found {location} in the database, but don't have specific groundwater level data."
        
        elif 'critical' in question_type or 'status' in question_type:
            # Look for criticality data
            critical_cols = [col for col in location_data.index if any(word in str(col).lower() for word in ['critical', 'status', 'stage'])]
            if critical_cols:
                status = location_data[critical_cols[0]]
                return f"The water situation in {location} is classified as: {status}."
            else:
                return f"I have data for {location}, but no criticality status available."
        
        elif 'quality' in question_type:
            # Look for quality data
            quality_cols = [col for col in location_data.index if 'quality' in str(col).lower()]
            if quality_cols:
                quality = location_data[quality_cols[0]]
                return f"The water quality in {location} is: {quality}."
            else:
                return f"No water quality data available for {location}."
        
        else:
            # General information
            response_parts = [f"Here's what I know about {location}:"]
            
            # Add available information in plain English
            count = 0
            for col, value in location_data.items():
                if col != 'locationName' and pd.notna(value) and count < 3:
                    
                    if 'level' in str(col).lower():
                        response_parts.append(f"- Groundwater level: {value} meters")
                    elif 'critical' in str(col).lower() or 'status' in str(col).lower():
                        response_parts.append(f"- Water status: {value}")
                    elif 'quality' in str(col).lower():
                        response_parts.append(f"- Water quality: {value}")
                    elif 'rainfall' in str(col).lower():
                        response_parts.append(f"- Annual rainfall: {value} mm")
                    else:
                        # Generic data
                        clean_name = str(col).replace('_', ' ').replace('.', ' ').title()
                        response_parts.append(f"- {clean_name}: {value}")
                    
                    count += 1
            
            if count == 0:
                response_parts.append("- Data is available but in technical format")
            
            return "\n".join(response_parts)
    
    def extract_location(self, question):
        """Extract location from user question"""
        words = question.lower().split()
        
        # Look after common prepositions
        for i, word in enumerate(words):
            if word in ['in', 'of', 'for', 'about'] and i < len(words) - 1:
                return words[i + 1].upper()
        
        # Look for known locations in the question
        if self.data is not None:
            known_locations = self.data['locationName'].dropna().str.upper().tolist()
            for word in words:
                word_upper = word.upper()
                if word_upper in known_locations:
                    return word_upper
        
        # Use last word as fallback
        return words[-1].upper() if words else ""
    
    def ask(self, question):
        """Main function - returns human-readable answer"""
        question_lower = question.lower().strip()
        
        # Handle special requests
        if any(word in question_lower for word in ['locations', 'available', 'list']):
            if self.data is not None:
                locations = self.data['locationName'].dropna().tolist()
                return f"I have data for these locations: {', '.join(locations)}"
            else:
                return "No location data available."
        
        # Extract location and question type
        location = self.extract_location(question)
        
        # Find the data
        location_data = self.find_location_data(location)
        
        # Return human-readable response
        return self.format_human_response(location_data, question_lower)
    
    def chat(self):
        """Simple chat interface"""
        print("\n" + "=" * 50)
        print("GROUNDWATER INFORMATION ASSISTANT")
        print("=" * 50)
        print("Ask me about groundwater in any location!")
        print("Examples:")
        print("‚Ä¢ 'What is the groundwater level in SHEOPUR?'")
        print("‚Ä¢ 'Tell me about water quality in BHOPAL'")
        print("‚Ä¢ 'Is INDORE water situation critical?'")
        print("‚Ä¢ 'Show available locations'")
        print("\nType 'quit' to exit")
        print("=" * 50)
        
        while True:
            try:
                question = input("\nYour question: ").strip()
                
                if question.lower() in ['quit', 'exit', 'bye']:
                    print("Goodbye! Stay informed about groundwater!")
                    break
                
                if not question:
                    print("Please ask a question about groundwater.")
                    continue
                
                answer = self.ask(question)
                print(f"\nAnswer: {answer}")
                
            except KeyboardInterrupt:
                print("\nGoodbye!")
                break
            except Exception as e:
                print(f"Sorry, I encountered an error: {e}")

# Initialize the human-readable bot
print("Starting Groundwater Information Assistant...")
bot = HumanReadableGroundwaterBot()

# Quick test
print("\nQuick test:")
test_questions = [
    "What is the groundwater level in SHEOPUR?",
    "Tell me about BHOPAL water situation",
    "Is INDORE critical?"
]

for q in test_questions:
    print(f"\nQ: {q}")
    print(f"A: {bot.ask(q)}")

# Interactive mode
print("\n" + "="*50)
start_chat = input("Want to start asking questions? (y/n): ").lower()

if start_chat == 'y':
    bot.chat()
else:
    print("Bot ready! Use bot.ask('your question') to test")


Starting Groundwater Information Assistant...
Loading groundwater data...
Ready! Loaded data for 50 locations
Available: NORTH 24 PARGANAS, DAKSHIN DINAJPUR, UTTAR DINAJPUR...

Quick test:

Q: What is the groundwater level in SHEOPUR?
A: The groundwater level in SHEOPUR is 4.0 meters below surface.

Q: Tell me about BHOPAL water situation
A: Sorry, I don't have data for that location. Try asking about SHEOPUR, BHOPAL, or INDORE.

Q: Is INDORE critical?
A: Sorry, I don't have data for that location. Try asking about SHEOPUR, BHOPAL, or INDORE.



Want to start asking questions? (y/n):  y



GROUNDWATER INFORMATION ASSISTANT
Ask me about groundwater in any location!
Examples:
‚Ä¢ 'What is the groundwater level in SHEOPUR?'
‚Ä¢ 'Tell me about water quality in BHOPAL'
‚Ä¢ 'Is INDORE water situation critical?'
‚Ä¢ 'Show available locations'

Type 'quit' to exit



Your question:  kargil



Answer: Sorry, I don't have data for that location. Try asking about SHEOPUR, BHOPAL, or INDORE.



Your question:  quit


Goodbye! Stay informed about groundwater!


In [11]:
import os
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

os.chdir("D:/GroundwaterChatbot")

class CSVCleaner:
    def __init__(self):
        self.input_file = 'Rainwater_data.Ingres_Data.csv'
        self.output_file = 'Clean_Groundwater_Data.csv'
        self.backup_file = 'Original_Backup.csv'
    
    def analyze_csv(self):
        """Analyze the CSV file structure"""
        print("üîç Analyzing CSV file...")
        
        try:
            # Read small sample to understand structure
            sample = pd.read_csv(self.input_file, nrows=10)
            
            print(f"üìä Original structure:")
            print(f"   ‚Ä¢ Rows in sample: {len(sample)}")
            print(f"   ‚Ä¢ Total columns: {len(sample.columns)}")
            print(f"   ‚Ä¢ Sample columns: {list(sample.columns[:5])}")
            
            # Check null percentage
            null_percent = (sample.isnull().sum() / len(sample) * 100)
            high_null_cols = null_percent[null_percent > 80].index.tolist()
            
            print(f"   ‚Ä¢ Columns with >80% nulls: {len(high_null_cols)}")
            
            return True
            
        except Exception as e:
            print(f"‚ùå Analysis failed: {e}")
            return False
    
    def clean_csv_in_chunks(self, chunk_size=1000):
        """Clean CSV file by processing in chunks"""
        print("üßπ Starting CSV cleaning...")
        
        try:
            # Get column info first
            sample = pd.read_csv(self.input_file, nrows=5)
            all_columns = sample.columns.tolist()
            
            # Identify important columns
            important_columns = [
                'locationName',  # Essential
            ]
            
            # Add columns that likely contain groundwater data
            for col in all_columns:
                col_lower = col.lower()
                if any(keyword in col_lower for keyword in [
                    'level', 'depth', 'draft', 'recharge', 'extraction',
                    'safe', 'critical', 'stage', 'quality', 'water'
                ]):
                    important_columns.append(col)
            
            # Remove duplicates but keep order
            important_columns = list(dict.fromkeys(important_columns))
            print(f"üìã Keeping {len(important_columns)} important columns")
            
            # Process in chunks
            cleaned_chunks = []
            chunk_count = 0
            
            print("‚è≥ Processing chunks...")
            
            for chunk in pd.read_csv(self.input_file, 
                                   chunksize=chunk_size,
                                   usecols=important_columns):
                
                chunk_count += 1
                print(f"   Processing chunk {chunk_count}...", end='\r')
                
                # Clean this chunk
                cleaned_chunk = self.clean_chunk(chunk)
                
                if not cleaned_chunk.empty:
                    cleaned_chunks.append(cleaned_chunk)
                
                # Limit total chunks to prevent memory issues
                if chunk_count >= 20:  # Process max 20k rows
                    break
            
            if cleaned_chunks:
                # Combine all cleaned chunks
                print(f"\nüîß Combining {len(cleaned_chunks)} chunks...")
                final_data = pd.concat(cleaned_chunks, ignore_index=True)
                
                # Final cleanup
                final_data = self.final_cleanup(final_data)
                
                # Save cleaned data
                self.save_cleaned_data(final_data)
                
                return True
            else:
                print("‚ùå No data survived cleaning")
                return False
                
        except Exception as e:
            print(f"‚ùå Cleaning failed: {e}")
            return False
    
    def clean_chunk(self, chunk):
        """Clean individual chunk"""
        
        # 1. Remove rows where locationName is null
        if 'locationName' in chunk.columns:
            chunk = chunk.dropna(subset=['locationName'])
        
        # 2. Remove completely empty rows
        chunk = chunk.dropna(how='all')
        
        # 3. For each row, remove columns that are null/empty
        # But keep the structure intact
        
        return chunk
    
    def final_cleanup(self, data):
        """Final cleanup of combined data"""
        print("üî® Final cleanup...")
        
        # Remove duplicate locations (keep first occurrence)
        if 'locationName' in data.columns:
            data = data.drop_duplicates(subset=['locationName'], keep='first')
            print(f"   ‚Ä¢ Unique locations: {len(data)}")
        
        # Remove columns that are mostly null
        null_threshold = 0.9  # Remove if 90%+ null
        cols_to_keep = []
        
        for col in data.columns:
            null_percent = data[col].isnull().sum() / len(data)
            if null_percent < null_threshold:
                cols_to_keep.append(col)
        
        data = data[cols_to_keep]
        print(f"   ‚Ä¢ Columns after null removal: {len(data.columns)}")
        
        # Clean up text columns
        for col in data.columns:
            if data[col].dtype == 'object':
                data[col] = data[col].astype(str).str.strip()
                data[col] = data[col].replace(['nan', 'NaN', 'null', ''], np.nan)
        
        return data
    
    def save_cleaned_data(self, clean_data):
        """Save the cleaned data"""
        print("üíæ Saving cleaned data...")
        
        try:
            # Save cleaned version
            clean_data.to_csv(self.output_file, index=False)
            
            # Show results
            print(f"‚úÖ Cleaned data saved!")
            print(f"   ‚Ä¢ File: {self.output_file}")
            print(f"   ‚Ä¢ Rows: {len(clean_data)}")
            print(f"   ‚Ä¢ Columns: {len(clean_data.columns)}")
            
            # Show sample of cleaned data
            print(f"\nüìã Sample of cleaned data:")
            print(clean_data.head(3).to_string())
            
            # Show available locations
            if 'locationName' in clean_data.columns:
                locations = clean_data['locationName'].dropna().unique()
                print(f"\nüìç Available locations ({len(locations)}):")
                print(f"   {', '.join(locations[:10])}")
                if len(locations) > 10:
                    print(f"   ... and {len(locations)-10} more")
            
        except Exception as e:
            print(f"‚ùå Save failed: {e}")
    
    def create_simple_bot(self):
        """Create simple bot that uses cleaned data"""
        
        code = '''
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

class SimpleCleanBot:
    def __init__(self):
        self.data = pd.read_csv('Clean_Groundwater_Data.csv')
        print(f"Clean bot ready! {len(self.data)} locations loaded")
    
    def ask(self, question):
        """Simple ask function"""
        question_lower = question.lower()
        
        # Extract location
        words = question_lower.split()
        location = ""
        
        for i, word in enumerate(words):
            if word in ['in', 'of', 'for'] and i < len(words) - 1:
                location = words[i + 1].upper()
                break
        
        if not location and words:
            location = words[-1].upper()
        
        # Find location
        matches = self.data[self.data['locationName'].str.upper() == location]
        
        if matches.empty:
            return f"Data not available for {location}. Available locations: {', '.join(self.data['locationName'].unique()[:5])}"
        
        row = matches.iloc[0]
        location_name = row['locationName']
        
        # Build response
        response = [f"Data for {location_name}:"]
        
        for col, value in row.items():
            if col != 'locationName' and pd.notna(value) and str(value) != 'nan':
                clean_col = col.replace('_', ' ').replace('.', ' ‚Üí ')
                response.append(f"  ‚Ä¢ {clean_col}: {value}")
        
        if len(response) == 1:
            return f"Data not available for specific metrics in {location_name}"
        
        return "\\n".join(response)
    
    def chat(self):
        """Simple chat"""
        print("\\nGroundwater Bot (Clean Data)")
        print("Ask about any location!")
        
        while True:
            q = input("\\nQuestion: ").strip()
            if q.lower() in ['quit', 'exit']:
                break
            if q:
                print(self.ask(q))

# Create and test the bot
bot = SimpleCleanBot()
        '''
        
        with open('simple_clean_bot.py', 'w') as f:
            f.write(code)
        
        print("ü§ñ Simple bot code saved as 'simple_clean_bot.py'")

# Run the CSV cleaner
print("üöÄ CSV CLEANER STARTED")
print("="*50)

cleaner = CSVCleaner()

if cleaner.analyze_csv():
    print("\n" + "="*50)
    clean_success = cleaner.clean_csv_in_chunks()
    
    if clean_success:
        print("\nüéâ SUCCESS! Clean CSV created")
        
        # Create simple bot code
        cleaner.create_simple_bot()
        
        print("\nüí° Next steps:")
        print("1. Your clean data is in 'Clean_Groundwater_Data.csv'")
        print("2. Run the simple bot with: exec(open('simple_clean_bot.py').read())")
        print("3. Test with: bot.ask('groundwater in SHEOPUR')")
        
        # Quick test of cleaned data
        try:
            clean_data = pd.read_csv('Clean_Groundwater_Data.csv')
            print(f"\n‚úÖ Clean data verification:")
            print(f"   ‚Ä¢ {len(clean_data)} rows")
            print(f"   ‚Ä¢ {len(clean_data.columns)} columns") 
            print(f"   ‚Ä¢ Sample locations: {list(clean_data['locationName'].unique()[:3])}")
        except:
            pass
    
    else:
        print("‚ùå Cleaning failed")
else:
    print("‚ùå Analysis failed")


üöÄ CSV CLEANER STARTED
üîç Analyzing CSV file...
üìä Original structure:
   ‚Ä¢ Rows in sample: 10
   ‚Ä¢ Total columns: 64939
   ‚Ä¢ Sample columns: ['_id', 'locationName', 'area.non_recharge_worthy.commandArea', 'area.non_recharge_worthy.nonCommandArea', 'area.non_recharge_worthy.poorQualityArea']
   ‚Ä¢ Columns with >80% nulls: 64805

üßπ Starting CSV cleaning...
üìã Keeping 57561 important columns
‚è≥ Processing chunks...
   Processing chunk 5...
üîß Combining 5 chunks...
üî® Final cleanup...
   ‚Ä¢ Unique locations: 805
   ‚Ä¢ Columns after null removal: 81
üíæ Saving cleaned data...
‚úÖ Cleaned data saved!
   ‚Ä¢ File: Clean_Groundwater_Data.csv
   ‚Ä¢ Rows: 805
   ‚Ä¢ Columns: 81

üìã Sample of cleaned data:
        locationName  area.non_recharge_worthy.commandArea  area.non_recharge_worthy.nonCommandArea  area.non_recharge_worthy.poorQualityArea  area.non_recharge_worthy.hillyArea  area.non_recharge_worthy.forestArea  area.non_recharge_worthy.totalArea  area.non_rech

UnicodeEncodeError: 'charmap' codec can't encode character '\u2192' in position 1407: character maps to <undefined>

In [13]:
import os
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

os.chdir("D:/GroundwaterChatbot")

class SimpleCSVCleaner:
    def __init__(self):
        self.input_file = 'Rainwater_data.Ingres_Data.csv'
        self.output_file = 'Clean_Groundwater_Data.csv'
    
    def clean_csv(self):
        """Simple CSV cleaning approach"""
        print("Cleaning CSV file...")
        
        try:
            print("Step 1: Loading data in chunks...")
            
            # Important columns to keep
            important_keywords = [
                'location', 'name', 'district', 'block',  # Location info
                'level', 'depth', 'table', 'draft', 'recharge',  # Groundwater data
                'safe', 'critical', 'stage', 'exploitation',  # Status info
                'quality', 'water'  # General water info
            ]
            
            cleaned_data = []
            chunk_count = 0
            
            # Process CSV in small chunks
            for chunk in pd.read_csv(self.input_file, chunksize=500, low_memory=False):
                chunk_count += 1
                print(f"Processing chunk {chunk_count}...", end='\r')
                
                # Keep rows with location name
                if 'locationName' in chunk.columns:
                    chunk = chunk.dropna(subset=['locationName'])
                
                # Remove completely empty rows
                chunk = chunk.dropna(how='all')
                
                if not chunk.empty:
                    cleaned_data.append(chunk)
                
                # Limit chunks to avoid memory issues
                if chunk_count >= 10:  # Process first 5000 rows
                    break
            
            if cleaned_data:
                print("\nStep 2: Combining cleaned data...")
                final_data = pd.concat(cleaned_data, ignore_index=True)
                
                # Remove duplicate locations
                if 'locationName' in final_data.columns:
                    final_data = final_data.drop_duplicates(subset=['locationName'], keep='first')
                
                # Filter columns - keep only relevant ones
                columns_to_keep = ['locationName']  # Always keep location
                
                for col in final_data.columns:
                    if col != 'locationName':
                        col_lower = col.lower()
                        if any(keyword in col_lower for keyword in important_keywords):
                            columns_to_keep.append(col)
                
                final_data = final_data[columns_to_keep]
                
                print("Step 3: Final cleanup...")
                
                # Replace empty strings and 'nan' with actual NaN
                for col in final_data.columns:
                    if final_data[col].dtype == 'object':
                        final_data[col] = final_data[col].astype(str).str.strip()
                        final_data[col] = final_data[col].replace(['nan', 'NaN', 'null', '', 'None'], np.nan)
                
                # Save cleaned data
                final_data.to_csv(self.output_file, index=False, encoding='utf-8')
                
                print(f"\n‚úÖ SUCCESS!")
                print(f"Clean file created: {self.output_file}")
                print(f"Rows: {len(final_data)}")
                print(f"Columns: {len(final_data.columns)}")
                
                # Show sample locations
                locations = final_data['locationName'].dropna().unique()
                print(f"Locations: {len(locations)}")
                print(f"Sample: {list(locations[:5])}")
                
                return True
            else:
                print("No data survived cleaning")
                return False
                
        except Exception as e:
            print(f"Cleaning failed: {e}")
            return False

# Simple bot class (no file writing, just direct code)
class CleanDataBot:
    def __init__(self, csv_file='Clean_Groundwater_Data.csv'):
        try:
            self.data = pd.read_csv(csv_file, encoding='utf-8')
            print(f"Bot ready! {len(self.data)} locations loaded")
            
            # Show available locations
            locations = self.data['locationName'].dropna().unique()
            print(f"Available: {list(locations[:5])}")
            
        except Exception as e:
            print(f"Bot initialization failed: {e}")
            self.data = None
    
    def find_location(self, search_term):
        """Find location in data"""
        if self.data is None:
            return None
        
        search_upper = search_term.upper()
        
        # Exact match
        exact = self.data[self.data['locationName'].str.upper() == search_upper]
        if not exact.empty:
            return exact.iloc[0]
        
        # Partial match
        partial = self.data[self.data['locationName'].str.upper().str.contains(search_upper, na=False)]
        if not partial.empty:
            return partial.iloc[0]
        
        return None
    
    def ask(self, question):
        """Ask about groundwater data"""
        if self.data is None:
            return "Data not available - bot not initialized"
        
        # Extract location from question
        words = question.lower().split()
        location = ""
        
        for i, word in enumerate(words):
            if word in ['in', 'of', 'for', 'about'] and i < len(words) - 1:
                location = words[i + 1]
                break
        
        if not location and words:
            location = words[-1]
        
        # Find the location data
        location_data = self.find_location(location)
        
        if location_data is None:
            available_locations = list(self.data['locationName'].unique()[:5])
            return f"Data not available for '{location}'. Try: {', '.join(available_locations)}"
        
        # Build response
        location_name = location_data['locationName']
        response = [f"Groundwater data for {location_name}:"]
        
        data_found = False
        for col, value in location_data.items():
            if col != 'locationName' and pd.notna(value) and str(value) not in ['nan', 'None', '']:
                # Clean column name for display
                clean_col = col.replace('_', ' ').replace('.', ' to ')
                response.append(f"  {clean_col}: {value}")
                data_found = True
        
        if not data_found:
            return f"Data not available for specific metrics in {location_name}"
        
        return "\n".join(response)
    
    def show_locations(self):
        """Show all available locations"""
        if self.data is None:
            return "No data available"
        
        locations = sorted(self.data['locationName'].unique())
        return f"Available locations ({len(locations)}): {', '.join(locations)}"
    
    def chat(self):
        """Interactive chat"""
        print("\n" + "="*40)
        print("GROUNDWATER DATA BOT")
        print("="*40)
        print("Commands:")
        print("‚Ä¢ Ask about locations: 'groundwater in SHEOPUR'")  
        print("‚Ä¢ Show locations: 'locations'")
        print("‚Ä¢ Exit: 'quit'")
        print("="*40)
        
        while True:
            try:
                question = input("\nQuestion: ").strip()
                
                if question.lower() in ['quit', 'exit', 'stop']:
                    print("Goodbye!")
                    break
                
                if question.lower() in ['locations', 'list', 'show']:
                    print(self.show_locations())
                    continue
                
                if question:
                    answer = self.ask(question)
                    print(f"\nAnswer:\n{answer}")
                    
            except KeyboardInterrupt:
                print("\nGoodbye!")
                break
            except Exception as e:
                print(f"Error: {e}")

# Run the cleaner
print("STARTING CSV CLEANER")
print("="*30)

cleaner = SimpleCSVCleaner()
success = cleaner.clean_csv()

if success:
    print("\n" + "="*30)
    print("TESTING CLEANED DATA")
    print("="*30)
    
    # Initialize bot with cleaned data
    bot = CleanDataBot('Clean_Groundwater_Data.csv')
    
    if bot.data is not None:
        # Quick test
        print("\nQuick test:")
        test_questions = [
            "groundwater in SHEOPUR",
            "water data for BHOPAL",
            "tell me about INDORE"
        ]
        
        for q in test_questions:
            print(f"\nQ: {q}")
            print(f"A: {bot.ask(q)}")
        
        print("\n" + "="*30)
        start_chat = input("Start interactive chat? (y/n): ").lower()
        
        if start_chat == 'y':
            bot.chat()
    else:
        print("Bot failed to load cleaned data")
else:
    print("CSV cleaning failed")


STARTING CSV CLEANER
Cleaning CSV file...
Step 1: Loading data in chunks...
Processing chunk 10...
Step 2: Combining cleaned data...
Step 3: Final cleanup...

‚úÖ SUCCESS!
Clean file created: Clean_Groundwater_Data.csv
Rows: 805
Columns: 59718
Locations: 805
Sample: ['NORTH 24 PARGANAS', 'DAKSHIN DINAJPUR', 'UTTAR DINAJPUR', 'PURULIYA', 'BIRBHUM']

TESTING CLEANED DATA
Bot ready! 805 locations loaded
Available: ['NORTH 24 PARGANAS', 'DAKSHIN DINAJPUR', 'UTTAR DINAJPUR', 'PURULIYA', 'BIRBHUM']

Quick test:

Q: groundwater in SHEOPUR
A: Groundwater data for SHEOPUR:
  area to total to poorQualityArea: 0.0
  area to recharge worthy to commandArea: 0.0
  area to recharge worthy to nonCommandArea: 0.0
  area to recharge worthy to poorQualityArea: 0.0
  area to recharge worthy to hillyArea: 0.0
  area to recharge worthy to forestArea: 0.0
  area to recharge worthy to totalArea: 0.0
  area to recharge worthy to pavedArea: 0.0
  area to recharge worthy to unpavedArea: 0.0
  reportSummary to to

Start interactive chat? (y/n):  y



GROUNDWATER DATA BOT
Commands:
‚Ä¢ Ask about locations: 'groundwater in SHEOPUR'
‚Ä¢ Show locations: 'locations'
‚Ä¢ Exit: 'quit'



Question:  area in BHOPAL



Answer:
Groundwater data for BHOPAL:
  area to total to poorQualityArea: 0.0
  area to recharge worthy to commandArea: 0.0
  area to recharge worthy to nonCommandArea: 0.0
  area to recharge worthy to poorQualityArea: 0.0
  area to recharge worthy to hillyArea: 0.0
  area to recharge worthy to forestArea: 0.0
  area to recharge worthy to totalArea: 0.0
  area to recharge worthy to pavedArea: 0.0
  area to recharge worthy to unpavedArea: 0.0
  reportSummary to total to BLOCK to semi critical: 1.0
  reportSummary to total to BLOCK to safe: 1.0
  approvalLevel: 4.0
  locationUUID: 0f913787-1edc-4710-a62e-83db36a21983
  computationSummary to annual to stage to total to av: 37001.0
  rechargeData to rainfall to total: 28779.0
  rechargeData to total to total: 38948.0
  draftData to domestic to total: 2119.05
  draftData to agriculture to total: 23462.0
  draftData to total to total: 25955.0
  draftData to industry to total: 373.95
  gwallocation to domestic to total: 2493.0
  stageOfExtrac


Question:  quit


Goodbye!


In [15]:
import pandas as pd
import numpy as np
import os
import warnings
warnings.filterwarnings('ignore')

# Make sure we're in the right directory
os.chdir("D:/GroundwaterChatbot")

class GroundwaterChatbot:
    def __init__(self):
        self.data = None
        self.locations = []
        self.columns = []
        self.load_clean_data()
    
    def load_clean_data(self):
        """Load the cleaned groundwater data"""
        try:
            print("Loading clean groundwater data...")
            self.data = pd.read_csv('Clean_Groundwater_Data.csv', encoding='utf-8')
            
            print(f"‚úÖ Data loaded successfully!")
            print(f"   üìä {len(self.data)} locations available")
            print(f"   üìã {len(self.data.columns)} data columns")
            
            # Get unique locations
            self.locations = sorted(self.data['locationName'].dropna().unique())
            self.columns = list(self.data.columns)
            
            print(f"   üìç Sample locations: {self.locations[:5]}")
            
        except FileNotFoundError:
            print("‚ùå Clean_Groundwater_Data.csv not found!")
            print("üí° Run the CSV cleaner first to create the clean data file")
            return False
        except Exception as e:
            print(f"‚ùå Error loading data: {e}")
            return False
        
        return True
    
    def find_location_data(self, location_query):
        """Find location data with smart matching"""
        if self.data is None:
            return None, "Data not loaded"
        
        location_upper = location_query.upper().strip()
        
        # Method 1: Exact match
        exact_match = self.data[self.data['locationName'].str.upper() == location_upper]
        if not exact_match.empty:
            return exact_match.iloc[0], exact_match.iloc[0]['locationName']
        
        # Method 2: Contains query
        contains_match = self.data[self.data['locationName'].str.upper().str.contains(location_upper, na=False)]
        if not contains_match.empty:
            return contains_match.iloc[0], contains_match.iloc[0]['locationName']
        
        # Method 3: Query contains location
        for location in self.locations:
            if location.upper() in location_upper:
                match = self.data[self.data['locationName'] == location]
                if not match.empty:
                    return match.iloc[0], location
        
        # Method 4: Fuzzy matching (first few characters)
        for location in self.locations:
            if location_upper.startswith(location[:3].upper()) or location.upper().startswith(location_upper[:3]):
                match = self.data[self.data['locationName'] == location]
                if not match.empty:
                    return match.iloc[0], location
        
        return None, None
    
    def format_groundwater_response(self, location_data, location_name, question_type):
        """Format human-readable response"""
        
        # Different responses based on question type
        if any(word in question_type for word in ['level', 'depth', 'table']):
            return self.get_level_info(location_data, location_name)
        
        elif any(word in question_type for word in ['critical', 'status', 'safe', 'danger']):
            return self.get_status_info(location_data, location_name)
        
        elif any(word in question_type for word in ['quality', 'pollut', 'contamin']):
            return self.get_quality_info(location_data, location_name)
        
        elif any(word in question_type for word in ['all', 'complete', 'everything', 'full']):
            return self.get_all_info(location_data, location_name)
        
        else:
            # Default: give overview
            return self.get_overview_info(location_data, location_name)
    
    def get_level_info(self, data, location):
        """Get groundwater level information"""
        level_keywords = ['level', 'depth', 'table', 'gwl']
        
        level_data = []
        for col in self.columns:
            if any(keyword in col.lower() for keyword in level_keywords):
                value = data[col]
                if pd.notna(value) and str(value) not in ['nan', 'None', '']:
                    clean_col = self.clean_column_name(col)
                    level_data.append(f"‚Ä¢ {clean_col}: {value}")
        
        if level_data:
            response = f"üåä Groundwater level data for {location}:\n"
            response += "\n".join(level_data)
            return response
        else:
            return f"Groundwater level data is not available for {location}."
    
    def get_status_info(self, data, location):
        """Get criticality/status information"""
        status_keywords = ['critical', 'safe', 'stage', 'exploitat', 'status', 'category']
        
        status_data = []
        for col in self.columns:
            if any(keyword in col.lower() for keyword in status_keywords):
                value = data[col]
                if pd.notna(value) and str(value) not in ['nan', 'None', '']:
                    clean_col = self.clean_column_name(col)
                    status_data.append(f"‚Ä¢ {clean_col}: {value}")
        
        if status_data:
            response = f"‚ö†Ô∏è Water status for {location}:\n"
            response += "\n".join(status_data)
            return response
        else:
            return f"Water status information is not available for {location}."
    
    def get_quality_info(self, data, location):
        """Get water quality information"""
        quality_keywords = ['quality', 'pollut', 'contamin', 'tds', 'ph']
        
        quality_data = []
        for col in self.columns:
            if any(keyword in col.lower() for keyword in quality_keywords):
                value = data[col]
                if pd.notna(value) and str(value) not in ['nan', 'None', '']:
                    clean_col = self.clean_column_name(col)
                    quality_data.append(f"‚Ä¢ {clean_col}: {value}")
        
        if quality_data:
            response = f"üß™ Water quality data for {location}:\n"
            response += "\n".join(quality_data)
            return response
        else:
            return f"Water quality data is not available for {location}."
    
    def get_overview_info(self, data, location):
        """Get general overview"""
        response = f"üìä Groundwater overview for {location}:\n"
        
        # Show first 5 non-null data points
        data_points = []
        for col in self.columns[:10]:  # Check first 10 columns
            if col != 'locationName':
                value = data[col]
                if pd.notna(value) and str(value) not in ['nan', 'None', '']:
                    clean_col = self.clean_column_name(col)
                    data_points.append(f"‚Ä¢ {clean_col}: {value}")
                    
                    if len(data_points) >= 5:  # Limit to 5 points
                        break
        
        if data_points:
            response += "\n".join(data_points)
        else:
            response += "Detailed data is not available for this location."
        
        return response
    
    def get_all_info(self, data, location):
        """Get all available information"""
        response = f"üìã Complete data for {location}:\n"
        response += "=" * 40 + "\n"
        
        data_points = []
        for col in self.columns:
            if col != 'locationName':
                value = data[col]
                if pd.notna(value) and str(value) not in ['nan', 'None', '']:
                    clean_col = self.clean_column_name(col)
                    data_points.append(f"‚Ä¢ {clean_col}: {value}")
        
        if data_points:
            response += "\n".join(data_points)
            response += f"\n\nüìä Total data points: {len(data_points)}"
        else:
            response += "No detailed data available for this location."
        
        return response
    
    def clean_column_name(self, column_name):
        """Clean column names for human readability"""
        clean = str(column_name)
        clean = clean.replace('_', ' ')
        clean = clean.replace('.', ' ‚Üí ')
        clean = clean.replace('  ', ' ')
        return clean.title()
    
    def extract_location_from_question(self, question):
        """Extract location from user question"""
        words = question.lower().split()
        
        # Look for location after prepositions
        prepositions = ['in', 'of', 'for', 'about', 'from', 'at']
        for i, word in enumerate(words):
            if word in prepositions and i + 1 < len(words):
                return words[i + 1]
        
        # Look for known locations in the question
        for location in self.locations:
            for word in words:
                if word.upper() == location.upper():
                    return location
        
        # Use last word as fallback
        if words:
            return words[-1]
        
        return ""
    
    def ask(self, question):
        """Main function to ask questions"""
        if self.data is None:
            return "‚ùå Data not available. Please load the clean data first."
        
        question = question.strip()
        question_lower = question.lower()
        
        # Handle special commands
        if any(cmd in question_lower for cmd in ['locations', 'list', 'available', 'show']):
            return self.show_available_locations()
        
        if any(cmd in question_lower for cmd in ['help', 'commands', 'what can']):
            return self.show_help()
        
        # Extract location from question
        location_query = self.extract_location_from_question(question)
        
        if not location_query:
            return "Please specify a location. For example: 'What is groundwater level in SHEOPUR?'"
        
        # Find location data
        location_data, actual_location = self.find_location_data(location_query)
        
        if location_data is None:
            return f"Data not available for '{location_query}'. Type 'locations' to see available options."
        
        # Generate response based on question type
        response = self.format_groundwater_response(location_data, actual_location, question_lower)
        return response
    
    def show_available_locations(self):
        """Show all available locations"""
        if not self.locations:
            return "No locations available."
        
        response = f"üìç Available locations ({len(self.locations)} total):\n"
        response += "=" * 40 + "\n"
        
        # Group by first letter
        current_letter = ""
        for location in self.locations:
            first_letter = location[0].upper()
            if first_letter != current_letter:
                current_letter = first_letter
                response += f"\n{first_letter}:\n"
            response += f"  ‚Ä¢ {location}\n"
        
        return response
    
    def show_help(self):
        """Show help information"""
        help_text = """
ü§ñ Groundwater Chatbot Help

‚ùì How to ask questions:
‚Ä¢ "What is groundwater level in SHEOPUR?"
‚Ä¢ "Tell me about water status in BHOPAL"
‚Ä¢ "Show water quality for INDORE"
‚Ä¢ "Give me all data for GWALIOR"

üìã Available commands:
‚Ä¢ "locations" - Show all available locations
‚Ä¢ "help" - Show this help message

üí° Question types supported:
‚Ä¢ Groundwater levels/depth
‚Ä¢ Water status/criticality
‚Ä¢ Water quality information
‚Ä¢ Complete data overview

Example questions:
‚Ä¢ "groundwater in SHEOPUR"
‚Ä¢ "BHOPAL water critical?"
‚Ä¢ "all data INDORE"
        """
        return help_text
    
    def chat(self):
        """Interactive chat interface"""
        print("\n" + "üåä" * 20)
        print("     GROUNDWATER DATA CHATBOT")
        print("üåä" * 20)
        
        if self.data is None:
            print("‚ùå Data not loaded. Cannot start chat.")
            return
        
        print(f"‚úÖ Ready! {len(self.locations)} locations available")
        print("\nüí° Try these commands:")
        print("‚Ä¢ Ask about any location: 'groundwater in SHEOPUR'")
        print("‚Ä¢ See all locations: 'locations'")
        print("‚Ä¢ Get help: 'help'")
        print("‚Ä¢ Exit: 'quit'")
        print("\n" + "=" * 50)
        
        while True:
            try:
                question = input("\n‚ùì Your question: ").strip()
                
                if question.lower() in ['quit', 'exit', 'bye', 'stop']:
                    print("üëã Thank you for using Groundwater Chatbot!")
                    break
                
                if not question:
                    print("Please ask a question or type 'help' for guidance.")
                    continue
                
                # Get response
                answer = self.ask(question)
                print(f"\nü§ñ Answer:\n{answer}")
                
            except KeyboardInterrupt:
                print("\nüëã Goodbye!")
                break
            except Exception as e:
                print(f"‚ùå Error: {e}")
                print("Please try again or type 'help' for guidance.")

# Initialize and start the chatbot
print("üöÄ INITIALIZING GROUNDWATER CHATBOT")
print("=" * 50)

# Create the chatbot
chatbot = GroundwaterChatbot()

if chatbot.data is not None:
    # Quick test
    print("\nüß™ Quick test:")
    test_questions = [
        "What is groundwater level in SHEOPUR?",
        "Tell me about BHOPAL water status",
        "Show all data for INDORE"
    ]
    
    for q in test_questions:
        print(f"\nQ: {q}")
        answer = chatbot.ask(q)
        print(f"A: {answer}")
    
    print("\n" + "=" * 50)
    start_interactive = input("üéØ Start interactive chatbot? (y/n): ").lower()
    
    if start_interactive == 'y':
        chatbot.chat()
    else:
        print("‚úÖ Chatbot ready!")
        print("üí° Use: chatbot.ask('your question') to test")
        print("üí° Use: chatbot.chat() to start interactive mode")

else:
    print("‚ùå Chatbot initialization failed")
    print("üí° Make sure 'Clean_Groundwater_Data.csv' exists")


üöÄ INITIALIZING GROUNDWATER CHATBOT
Loading clean groundwater data...
‚úÖ Data loaded successfully!
   üìä 805 locations available
   üìã 59718 data columns
   üìç Sample locations: ['AGAR MALWA', 'AGATTI', 'AGRA', 'AHMEDABAD', 'AIZAWL']

üß™ Quick test:

Q: What is groundwater level in SHEOPUR?
A: üåä Groundwater level data for SHEOPUR:
‚Ä¢ Approvallevel: 4.0

Q: Tell me about BHOPAL water status
A: ‚ö†Ô∏è Water status for BHOPAL:
‚Ä¢ Reportsummary ‚Üí Total ‚Üí Block ‚Üí Semi Critical: 1.0
‚Ä¢ Reportsummary ‚Üí Total ‚Üí Block ‚Üí Safe: 1.0
‚Ä¢ Computationsummary ‚Üí Annual ‚Üí Stage ‚Üí Total ‚Üí Av: 37001.0
‚Ä¢ Stageofextraction ‚Üí Total: 70.14675279046513
‚Ä¢ Category ‚Üí Poor Quality: Hilly Area
‚Ä¢ Reportsummary ‚Üí C7484776-B525-43D0-876F-63Fe9Df9F988 ‚Üí Block ‚Üí Safe: 1.0
‚Ä¢ Reportsummary ‚Üí 4688124B-3948-4C50-Ab70-97C82622F948 ‚Üí Block ‚Üí Semi Critical: 1.0

Q: Show all data for INDORE
A: üìç Available locations (805 total):

A:
  ‚Ä¢ AGAR MALWA
  ‚Ä¢ AGATTI
  

üéØ Start interactive chatbot? (y/n):  y



üåäüåäüåäüåäüåäüåäüåäüåäüåäüåäüåäüåäüåäüåäüåäüåäüåäüåäüåäüåä
     GROUNDWATER DATA CHATBOT
üåäüåäüåäüåäüåäüåäüåäüåäüåäüåäüåäüåäüåäüåäüåäüåäüåäüåäüåäüåä
‚úÖ Ready! 805 locations available

üí° Try these commands:
‚Ä¢ Ask about any location: 'groundwater in SHEOPUR'
‚Ä¢ See all locations: 'locations'
‚Ä¢ Get help: 'help'
‚Ä¢ Exit: 'quit'




‚ùì Your question:  Area cover in MIRPUR



ü§ñ Answer:
üìä Groundwater overview for Mirpur:
‚Ä¢ Area ‚Üí Non Recharge Worthy ‚Üí Commandarea: 0.0
‚Ä¢ Area ‚Üí Non Recharge Worthy ‚Üí Noncommandarea: 0.0
‚Ä¢ Area ‚Üí Non Recharge Worthy ‚Üí Poorqualityarea: 0.0
‚Ä¢ Area ‚Üí Non Recharge Worthy ‚Üí Hillyarea: 0.0
‚Ä¢ Area ‚Üí Non Recharge Worthy ‚Üí Forestarea: 0.0



‚ùì Your question:  area in gwalior



ü§ñ Answer:
üìä Groundwater overview for GWALIOR:
‚Ä¢ Area ‚Üí Total ‚Üí Poorqualityarea: 0.0



‚ùì Your question:  commandarea in gwalior



ü§ñ Answer:
üìä Groundwater overview for GWALIOR:
‚Ä¢ Area ‚Üí Total ‚Üí Poorqualityarea: 0.0



‚ùì Your question:  loss in gwalior



ü§ñ Answer:
üìä Groundwater overview for GWALIOR:
‚Ä¢ Area ‚Üí Total ‚Üí Poorqualityarea: 0.0



‚ùì Your question:  level in gwalior



ü§ñ Answer:
üåä Groundwater level data for GWALIOR:
‚Ä¢ Approvallevel: 4.0



‚ùì Your question:  criticality in gwalior



ü§ñ Answer:
‚ö†Ô∏è Water status for GWALIOR:
‚Ä¢ Reportsummary ‚Üí Total ‚Üí Block ‚Üí Safe: 4.0
‚Ä¢ Computationsummary ‚Üí Annual ‚Üí Stage ‚Üí Total ‚Üí Av: 74837.0
‚Ä¢ Stageofextraction ‚Üí Total: 29.86247444445929
‚Ä¢ Category ‚Üí Poor Quality: Hilly Area
‚Ä¢ Reportsummary ‚Üí B2F04400-D428-48F0-83A1-Bf2F26232Fc3 ‚Üí Block ‚Üí Safe: 1.0
‚Ä¢ Reportsummary ‚Üí E5468D5B-849C-4675-Ba7A-Ae67271896E6 ‚Üí Block ‚Üí Safe: 1.0
‚Ä¢ Reportsummary ‚Üí 15718E3C-5792-4Fa2-8735-8F8Beb75E7A8 ‚Üí Block ‚Üí Safe: 1.0
‚Ä¢ Reportsummary ‚Üí 87F5046A-C729-4Bc2-B2Ad-A0065676B07C ‚Üí Block ‚Üí Safe: 1.0



‚ùì Your question:  quit


üëã Thank you for using Groundwater Chatbot!


In [19]:

import pandas as pd
import numpy as np
import os
import re
import warnings
warnings.filterwarnings('ignore')

os.chdir("D:/GroundwaterChatbot")

class FixedMultilingualChatbot:
    def __init__(self):
        self.data = None
        self.locations = []
        self.columns = []
        self.load_clean_data()
        self.setup_translations()
    
    def load_clean_data(self):
        """Load the cleaned groundwater data"""
        try:
            print("Loading clean groundwater data...")
            self.data = pd.read_csv('Clean_Groundwater_Data.csv', encoding='utf-8')
            
            print(f"‚úÖ Data loaded successfully!")
            print(f"   üìä {len(self.data)} locations available")
            
            # Get unique locations and clean them
            self.locations = []
            for loc in self.data['locationName'].dropna().unique():
                clean_loc = str(loc).strip()
                if clean_loc and clean_loc not in ['nan', 'None']:
                    self.locations.append(clean_loc)
            
            self.locations = sorted(self.locations)
            self.columns = list(self.data.columns)
            
            print(f"   üìç Clean locations: {len(self.locations)}")
            print(f"   Sample: {self.locations[:5]}")
            
        except Exception as e:
            print(f"‚ùå Error loading data: {e}")
            return False
        
        return True
    
    def setup_translations(self):
        """Setup translation dictionaries"""
        self.translations = {
            'hindi': {
                'groundwater': '‡§≠‡•Ç‡§ú‡§≤',
                'water level': '‡§ú‡§≤ ‡§∏‡•ç‡§§‡§∞',
                'level': '‡§∏‡•ç‡§§‡§∞',
                'data': '‡§°‡•á‡§ü‡§æ',
                'not available': '‡§â‡§™‡§≤‡§¨‡•ç‡§ß ‡§®‡§π‡•Ä‡§Ç ‡§π‡•à',
                'for': '‡§ï‡•á ‡§≤‡§ø‡§è',
                'in': '‡§Æ‡•á‡§Ç',
                'status': '‡§∏‡•ç‡§•‡§ø‡§§‡§ø',
                'critical': '‡§ó‡§Ç‡§≠‡•Ä‡§∞',
                'safe': '‡§∏‡•Å‡§∞‡§ï‡•ç‡§∑‡§ø‡§§'
            },
            'kannada': {
                'groundwater': '‡≤≠‡≥Ç‡≤ú‡≤≤',
                'water level': '‡≤®‡≥Ä‡≤∞‡≤ø‡≤® ‡≤Æ‡≤ü‡≥ç‡≤ü',
                'level': '‡≤Æ‡≤ü‡≥ç‡≤ü',
                'data': '‡≤°‡≥á‡≤ü‡≤æ',
                'not available': '‡≤≤‡≤≠‡≥ç‡≤Ø‡≤µ‡≤ø‡≤≤‡≥ç‡≤≤',
                'for': '‡≤ó‡≥Ü',
                'in': '‡≤®‡≤≤‡≥ç‡≤≤‡≤ø',
                'status': '‡≤∏‡≥ç‡≤•‡≤ø‡≤§‡≤ø',
                'critical': '‡≤ó‡≤Ç‡≤≠‡≥Ä‡≤∞',
                'safe': '‡≤∏‡≥Å‡≤∞‡≤ï‡≥ç‡≤∑‡≤ø‡≤§'
            }
        }
        
        print("üåê Multilingual support ready")
    
    def detect_language(self, text):
        """Simple language detection"""
        # Check for Hindi characters
        if re.search(r'[\u0900-\u097F]', text):
            return 'hindi'
        
        # Check for Kannada characters
        if re.search(r'[\u0C80-\u0CFF]', text):
            return 'kannada'
        
        return 'english'
    
    def safe_find_location(self, location_query):
        """Safe location finding without regex issues"""
        if self.data is None:
            return None, None
        
        query_upper = str(location_query).upper().strip()
        
        # Method 1: Direct exact match
        for location in self.locations:
            if location.upper() == query_upper:
                match = self.data[self.data['locationName'] == location]
                if not match.empty:
                    return match.iloc[0], location
        
        # Method 2: Simple substring check
        for location in self.locations:
            if query_upper in location.upper() or location.upper() in query_upper:
                match = self.data[self.data['locationName'] == location]
                if not match.empty:
                    return match.iloc[0], location
        
        # Method 3: Word-by-word matching
        query_words = query_upper.split()
        for location in self.locations:
            location_words = location.upper().split()
            if any(word in location_words for word in query_words if len(word) > 2):
                match = self.data[self.data['locationName'] == location]
                if not match.empty:
                    return match.iloc[0], location
        
        return None, None
    
    def extract_location_simple(self, question):
        """Simple location extraction"""
        words = question.lower().split()
        
        # Look after common prepositions
        prepositions = ['in', 'of', 'for', 'about', '‡§Æ‡•á‡§Ç', '‡§ï‡§æ', '‡§ï‡•Ä', '‡≤®‡≤≤‡≥ç‡≤≤‡≤ø', '‡≤®']
        for i, word in enumerate(words):
            if word in prepositions and i + 1 < len(words):
                return words[i + 1]
        
        # Look for known locations directly
        for word in words:
            for location in self.locations:
                if word.upper() == location.upper():
                    return location
        
        # Use last meaningful word
        meaningful_words = [w for w in words if len(w) > 2]
        return meaningful_words[-1] if meaningful_words else ""
    
    def get_simple_data_info(self, location_data, location_name):
        """Get simple data information"""
        if location_data is None:
            return f"Data not available for {location_name}"
        
        response = f"Groundwater data for {location_name}:\n"
        
        # Show available data
        data_count = 0
        for col in self.columns:
            if col != 'locationName':
                value = location_data[col]
                if pd.notna(value) and str(value) not in ['nan', 'None', '']:
                    clean_col = col.replace('_', ' ').replace('.', ' ')
                    response += f"‚Ä¢ {clean_col}: {value}\n"
                    data_count += 1
                    
                    if data_count >= 5:  # Limit to 5 items
                        break
        
        if data_count == 0:
            response += "No specific data available for this location."
        
        return response
    
    def translate_simple_response(self, english_response, language):
        """Simple translation of key terms"""
        if language == 'english':
            return english_response
        
        response = english_response
        translations = self.translations[language]
        
        # Replace key terms
        for eng_term, local_term in translations.items():
            response = response.replace(eng_term, local_term)
        
        return response
    
    def ask_multilingual(self, question):
        """Main multilingual ask function"""
        if self.data is None:
            return "‚ùå Data not loaded"
        
        question = str(question).strip()
        
        # Detect language
        language = self.detect_language(question)
        
        # Handle special commands
        if any(cmd in question.lower() for cmd in ['locations', 'list', 'available']):
            return self.show_locations(language)
        
        # Extract location
        location_query = self.extract_location_simple(question)
        
        if not location_query:
            if language == 'hindi':
                return "‡§ï‡•É‡§™‡§Ø‡§æ ‡§è‡§ï ‡§∏‡•ç‡§•‡§æ‡§® ‡§¨‡§§‡§æ‡§è‡§Ç‡•§ ‡§â‡§¶‡§æ‡§π‡§∞‡§£: 'SHEOPUR ‡§Æ‡•á‡§Ç ‡§ú‡§≤ ‡§∏‡•ç‡§§‡§∞'"
            elif language == 'kannada':
                return "‡≤¶‡≤Ø‡≤µ‡≤ø‡≤ü‡≥ç‡≤ü‡≥Å ‡≤í‡≤Ç‡≤¶‡≥Å ‡≤∏‡≥ç‡≤•‡≤≥‡≤µ‡≤®‡≥ç‡≤®‡≥Å ‡≤π‡≥á‡≤≥‡≤ø‡•§ ‡≤â‡≤¶‡≤æ‡≤π‡≤∞‡≤£‡≥Ü: 'SHEOPUR ‡≤®‡≤≤‡≥ç‡≤≤‡≤ø ‡≤ú‡≤≤ ‡≤Æ‡≤ü‡≥ç‡≤ü'"
            else:
                return "Please specify a location. Example: 'water level in SHEOPUR'"
        
        # Find location data safely
        location_data, actual_location = self.safe_find_location(location_query)
        
        if location_data is None:
            if language == 'hindi':
                return f"'{location_query}' ‡§ï‡•á ‡§≤‡§ø‡§è ‡§°‡•á‡§ü‡§æ ‡§â‡§™‡§≤‡§¨‡•ç‡§ß ‡§®‡§π‡•Ä‡§Ç ‡§π‡•à‡•§"
            elif language == 'kannada':
                return f"'{location_query}' ‡≤ó‡≥Ü ‡≤°‡≥á‡≤ü‡≤æ ‡≤≤‡≤≠‡≥ç‡≤Ø‡≤µ‡≤ø‡≤≤‡≥ç‡≤≤‡•§"
            else:
                return f"Data not available for '{location_query}'. Try: {', '.join(self.locations[:3])}"
        
        # Get response
        english_response = self.get_simple_data_info(location_data, actual_location)
        
        # Translate if needed
        if language != 'english':
            local_response = self.translate_simple_response(english_response, language)
            return f"{local_response}\n\nEnglish: {english_response}"
        
        return english_response
    
    def show_locations(self, language='english'):
        """Show available locations"""
        if language == 'hindi':
            header = f"‡§â‡§™‡§≤‡§¨‡•ç‡§ß ‡§∏‡•ç‡§•‡§æ‡§® ({len(self.locations)} ‡§ï‡•Å‡§≤):"
        elif language == 'kannada':
            header = f"‡≤≤‡≤≠‡≥ç‡≤Ø‡≤µ‡≤ø‡≤∞‡≥Å‡≤µ ‡≤∏‡≥ç‡≤•‡≤≥‡≤ó‡≤≥‡≥Å ({len(self.locations)} ‡≤í‡≤ü‡≥ç‡≤ü‡≥Å):"
        else:
            header = f"Available locations ({len(self.locations)} total):"
        
        response = header + "\n"
        for i, location in enumerate(self.locations[:15]):  # Show first 15
            response += f"{i+1:2d}. {location}\n"
        
        if len(self.locations) > 15:
            response += f"... and {len(self.locations)-15} more"
        
        return response
    
    def chat(self):
        """Simple multilingual chat"""
        print("\n" + "üåê" * 15)
        print("MULTILINGUAL GROUNDWATER CHATBOT")
        print("üåê" * 15)
        
        if self.data is None:
            print("‚ùå Data not loaded")
            return
        
        print(f"‚úÖ Ready! {len(self.locations)} locations")
        print("üåê Languages: English, Hindi, Kannada")
        print("\nExamples:")
        print("‚Ä¢ English: 'groundwater in SHEOPUR'")
        print("‚Ä¢ Hindi: 'SHEOPUR ‡§Æ‡•á‡§Ç ‡§ú‡§≤ ‡§∏‡•ç‡§§‡§∞'")
        print("‚Ä¢ Kannada: 'SHEOPUR ‡≤®‡≤≤‡≥ç‡≤≤‡≤ø ‡≤ú‡≤≤ ‡≤Æ‡≤ü‡≥ç‡≤ü'")
        print("\nCommands: 'locations', 'quit'")
        print("=" * 40)
        
        while True:
            try:
                question = input("\nüåê Question: ").strip()
                
                if question.lower() in ['quit', 'exit', 'stop']:
                    print("üëã Goodbye! ‡§Ö‡§≤‡§µ‡§ø‡§¶‡§æ! ‡≤µ‡≤ø‡≤¶‡≤æ‡≤Ø!")
                    break
                
                if question:
                    answer = self.ask_multilingual(question)
                    print(f"\nü§ñ Answer:\n{answer}")
                    
            except KeyboardInterrupt:
                print("\nüëã Goodbye!")
                break
            except Exception as e:
                print(f"‚ùå Error: {e}")

# Initialize the fixed chatbot
print("üöÄ INITIALIZING FIXED MULTILINGUAL CHATBOT")
print("=" * 50)

chatbot = FixedMultilingualChatbot()

if chatbot.data is not None:
    print("\nüß™ Quick test:")
    
    test_questions = [
        "groundwater in SHEOPUR",
        "SHEOPUR ‡§Æ‡•á‡§Ç ‡§ú‡§≤ ‡§∏‡•ç‡§§‡§∞",
        "SHEOPUR ‡≤®‡≤≤‡≥ç‡≤≤‡≤ø ‡≤ú‡≤≤ ‡≤Æ‡≤ü‡≥ç‡≤ü"
    ]
    
    for q in test_questions:
        try:
            print(f"\nQ: {q}")
            answer = chatbot.ask_multilingual(q)
            print(f"A: {answer}")
        except Exception as e:
            print(f"Error with '{q}': {e}")
    
    print("\n" + "=" * 50)
    start_chat = input("Start multilingual chat? (y/n): ").lower()
    
    if start_chat == 'y':
        chatbot.chat()
    else:
        print("‚úÖ Chatbot ready!")
        print("üí° Use: chatbot.ask_multilingual('your question')")

else:
    print("‚ùå Initialization failed")

üöÄ INITIALIZING FIXED MULTILINGUAL CHATBOT
Loading clean groundwater data...
‚úÖ Data loaded successfully!
   üìä 805 locations available
   üìç Clean locations: 805
   Sample: ['AGAR MALWA', 'AGATTI', 'AGRA', 'AHMEDABAD', 'AIZAWL']
üåê Multilingual support ready

üß™ Quick test:

Q: groundwater in SHEOPUR
A: Groundwater data for SHEOPUR:
‚Ä¢ area total poorQualityArea: 0.0
‚Ä¢ area recharge worthy commandArea: 0.0
‚Ä¢ area recharge worthy nonCommandArea: 0.0
‚Ä¢ area recharge worthy poorQualityArea: 0.0
‚Ä¢ area recharge worthy hillyArea: 0.0


Q: SHEOPUR ‡§Æ‡•á‡§Ç ‡§ú‡§≤ ‡§∏‡•ç‡§§‡§∞
A: '‡§ú‡§≤' ‡§ï‡•á ‡§≤‡§ø‡§è ‡§°‡•á‡§ü‡§æ ‡§â‡§™‡§≤‡§¨‡•ç‡§ß ‡§®‡§π‡•Ä‡§Ç ‡§π‡•à‡•§

Q: SHEOPUR ‡≤®‡≤≤‡≥ç‡≤≤‡≤ø ‡≤ú‡≤≤ ‡≤Æ‡≤ü‡≥ç‡≤ü
A: '‡≤ú‡≤≤' ‡≤ó‡≥Ü ‡≤°‡≥á‡≤ü‡≤æ ‡≤≤‡≤≠‡≥ç‡≤Ø‡≤µ‡≤ø‡≤≤‡≥ç‡≤≤‡•§



Start multilingual chat? (y/n):  y



üåêüåêüåêüåêüåêüåêüåêüåêüåêüåêüåêüåêüåêüåêüåê
MULTILINGUAL GROUNDWATER CHATBOT
üåêüåêüåêüåêüåêüåêüåêüåêüåêüåêüåêüåêüåêüåêüåê
‚úÖ Ready! 805 locations
üåê Languages: English, Hindi, Kannada

Examples:
‚Ä¢ English: 'groundwater in SHEOPUR'
‚Ä¢ Hindi: 'SHEOPUR ‡§Æ‡•á‡§Ç ‡§ú‡§≤ ‡§∏‡•ç‡§§‡§∞'
‚Ä¢ Kannada: 'SHEOPUR ‡≤®‡≤≤‡≥ç‡≤≤‡≤ø ‡≤ú‡≤≤ ‡≤Æ‡≤ü‡≥ç‡≤ü'

Commands: 'locations', 'quit'



üåê Question:  locations



ü§ñ Answer:
Available locations (805 total):
 1. AGAR MALWA
 2. AGATTI
 3. AGRA
 4. AHMEDABAD
 5. AIZAWL
 6. AJMER
 7. ALAPPUZHA
 8. ALIGARH
 9. ALIPURDUAR
10. ALIRAJPUR
11. ALMORA
12. ALWAR
13. AMBALA
14. AMBEDKAR NAGAR
15. AMETHI
... and 790 more



üåê Question:  aizwal



ü§ñ Answer:
Data not available for 'aizwal'. Try: AGAR MALWA, AGATTI, AGRA



üåê Question:  agra



ü§ñ Answer:
Groundwater data for AGRA:
‚Ä¢ area total poorQualityArea: 0.0
‚Ä¢ area recharge worthy commandArea: 0.0
‚Ä¢ area recharge worthy nonCommandArea: 0.0
‚Ä¢ area recharge worthy poorQualityArea: 0.0
‚Ä¢ area recharge worthy hillyArea: 0.0




üåê Question:  quit


üëã Goodbye! ‡§Ö‡§≤‡§µ‡§ø‡§¶‡§æ! ‡≤µ‡≤ø‡≤¶‡≤æ‡≤Ø!
