In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import re
import string
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from sklearn.metrics import classification_report, confusion_matrix

# Load the Dataset (Make sure the file is in the same folder)
df = pd.read_csv("complaints_master.csv")

# Convert Timestamp to DateTime object immediately
df['timestamp'] = pd.to_datetime(df['timestamp'])

print("Data Loaded Successfully!")
print(f"Total Complaints: {df.shape[0]}")
display(df.head(3))

Data Loaded Successfully!
Total Complaints: 1000


Unnamed: 0,complaint_id,complaint_text,timestamp,area,issue_category,issue_type,severity_level,complaint_channel,user_repeat_flag,affected_duration_hours,resolution_status,priority_label
0,1,Too many potholes near the main junction.,2026-02-08 10:15:00,Ahmedabad West,road,pothole,high,app,no,12,open,high
1,2,Garbage truck did not come since 3 days.,2026-02-08 11:20:00,Ahmedabad East,garbage,garbage_not_collected,medium,web,yes,72,in_progress,medium
2,3,Dirty water coming from tap.,2026-02-08 06:45:00,Ahmedabad North,water,dirty_water,high,call,no,2,open,high


In [2]:
# Function to clean text
def clean_text(text):
    text = str(text).lower()  # Convert to lowercase
    text = re.sub(r'\[.*?\]', '', text)  # Remove text in brackets
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text) # Remove punctuation
    text = re.sub(r'\w*\d\w*', '', text) # Remove words containing numbers
    return text

# Apply cleaning
df['cleaned_text'] = df['complaint_text'].apply(clean_text)

print("Text Cleaning Complete. Example:")
print(df[['complaint_text', 'cleaned_text']].iloc[0])

Text Cleaning Complete. Example:
complaint_text    Too many potholes near the main junction.
cleaned_text       too many potholes near the main junction
Name: 0, dtype: object


In [3]:
# 1. Convert Text to Numbers (TF-IDF)
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(df['cleaned_text'])
y = df['issue_category']  # This is our target (what we want to predict)

# 2. Split Data (80% for training, 20% for testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 3. Train Model (Logistic Regression is great for text)
model = LogisticRegression()
model.fit(X_train, y_train)

# 4. Evaluate
y_pred = model.predict(X_test)
print("Model Accuracy Score:", model.score(X_test, y_test))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Model Accuracy Score: 0.985

Classification Report:
               precision    recall  f1-score   support

    drainage       1.00      1.00      1.00        17
 electricity       1.00      0.95      0.98        22
     garbage       0.98      1.00      0.99        40
        road       0.97      0.97      0.97        30
street_light       0.96      1.00      0.98        27
     traffic       1.00      1.00      1.00        27
       water       1.00      0.97      0.99        37

    accuracy                           0.98       200
   macro avg       0.99      0.98      0.99       200
weighted avg       0.99      0.98      0.98       200



In [4]:
# We use K-Means to find 5 main clusters of issues
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans.fit(X)

# Assign the cluster label to the original dataframe
df['cluster_label'] = kmeans.labels_

# Show what each cluster looks like (Top terms)
print("Top terms per cluster:")
order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
terms = tfidf.get_feature_names_out()

for i in range(5):
    print(f"Cluster {i}: ", end="")
    for ind in order_centroids[i, :6]: # Top 6 words per cluster
        print(f"'{terms[ind]}' ", end="")
    print()

Top terms per cluster:
Cluster 0: 'garbage' 'since' 'no' 'supply' 'water' 'society' 
Cluster 1: 'water' 'in' 'society' 'block' 'power' 'is' 
Cluster 2: 'light' 'pole' 'is' 'signal' 'street' 'broken' 
Cluster 3: 'road' 'near' 'on' 'main' 'the' 'blocked' 
Cluster 4: 'tonight' 'morning' 'today' 'area' 'side' 'exit' 


In [5]:
# Extract Hour from timestamp for hourly analysis
df['hour'] = df['timestamp'].dt.hour
df['date'] = df['timestamp'].dt.date

# Count complaints per Area per Hour
spike_df = df.groupby(['date', 'hour', 'area']).size().reset_index(name='complaint_count')

# Define a Threshold (e.g., if > 5 complaints in one hour in one area)
THRESHOLD = 5
spike_df['is_spike'] = spike_df['complaint_count'] > THRESHOLD

print("Spike Detection Analysis:")
spikes = spike_df[spike_df['is_spike'] == True]
if not spikes.empty:
    display(spikes)
else:
    print("No massive spikes detected with current threshold.")

Spike Detection Analysis:


Unnamed: 0,date,hour,area,complaint_count,is_spike
1,2026-02-08,5,Ahmedabad South,6,True
2,2026-02-08,5,Bangalore,9,True
7,2026-02-08,6,Ahmedabad North,9,True
8,2026-02-08,6,Ahmedabad South,6,True
9,2026-02-08,6,Bangalore,25,True
...,...,...,...,...,...
107,2026-02-08,21,Ahmedabad South,7,True
110,2026-02-08,21,Hyderabad,10,True
112,2026-02-08,21,Pune,18,True
115,2026-02-08,22,Hyderabad,7,True


In [6]:
# Map text severity to numbers
severity_map = {'high': 3, 'medium': 2, 'low': 1}
df['severity_score'] = df['severity_level'].map(severity_map)

# Priority Formula:
# Score = (Severity * 2) + (1 if User Repeated else 0) + (1 if Duration > 24hrs)
df['ai_priority_score'] = (df['severity_score'] * 2) + \
                          (df['user_repeat_flag'].apply(lambda x: 1 if x == 'yes' else 0)) + \
                          (df['affected_duration_hours'].apply(lambda x: 1 if x > 24 else 0))

# Sort by Priority
priority_queue = df.sort_values(by='ai_priority_score', ascending=False)

print("Top 5 Most Urgent Complaints:")
display(priority_queue[['complaint_id', 'area', 'issue_category', 'ai_priority_score']].head(5))

Top 5 Most Urgent Complaints:


Unnamed: 0,complaint_id,area,issue_category,ai_priority_score
958,959,Hyderabad,garbage,8
60,61,Ahmedabad East,garbage,8
375,376,Bangalore,garbage,8
584,585,Bangalore,garbage,8
374,375,Bangalore,garbage,8


In [7]:
# This is a PLACEHOLDER function.
# In the real project, you insert your Azure OpenAI Key here.

def generate_explanation(area, issue, count, spike_status):
    # This is the prompt we WOULD send to GPT-4
    prompt = f"""
    Act as a city infrastructure assistant.
    Analyze this data:
    - Area: {area}
    - Issue Type: {issue}
    - Complaint Count: {count}
    - Is Spike?: {spike_status}

    Write a 1-sentence summary for the mayor explaining the situation.
    """
    
    # Mock response for now (Pretending AI generated this)
    if spike_status:
        return f"URGENT: {area} is experiencing a severe surge in {issue} reports ({count} total). Immediate dispatch required."
    else:
        return f"Routine maintenance required in {area} for reported {issue} issues."

# Test the function on the top priority item
top_issue = priority_queue.iloc[0]
explanation = generate_explanation(
    area=top_issue['area'], 
    issue=top_issue['issue_category'], 
    count=1, 
    spike_status="Yes"
)

print(f"AI Generated Explanation for Complaint #{top_issue['complaint_id']}:")
print(explanation)

AI Generated Explanation for Complaint #959:
URGENT: Hyderabad is experiencing a severe surge in garbage reports (1 total). Immediate dispatch required.


In [8]:
import pickle

# Save the vectorizer and the model
with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf, f)

with open('complaint_classifier.pkl', 'wb') as f:
    pickle.dump(model, f)

print("Model and Vectorizer saved!")

Model and Vectorizer saved!
