In [20]:
import sys
!{sys.executable} -m pip install mysql-connector-python




In [21]:
import mysql.connector
print("✓ Working!")

✓ Working!


In [22]:
import mysql.connector
from mysql.connector import Error

config = {
    'host': 'localhost',
    'port': 3306,
    'user': 'root',
    'password': 'akhi@333'
}

try:
    conn = mysql.connector.connect(**config)
    print("✓ MySQL Connected!")
    print(f"Server info: {conn.get_server_info()}")
    conn.close()
except Error as e:
    print(f"✗ Error: {e}")


✓ MySQL Connected!
Server info: 8.0.39


In [23]:
import pandas as pd
import mysql.connector
from mysql.connector import Error
from pathlib import Path
import warnings

warnings.filterwarnings('ignore')

# MySQL Connection Configuration
mysql_config = {
    'host': 'localhost',
    'user': 'root',
    'password': 'akhi@333',  # ← CHANGE THIS TO YOUR MYSQL PASSWORD
    'database': 'stree_shakti_analytics'
}

def create_connection(config):
    """Create MySQL connection"""
    try:
        connection = mysql.connector.connect(**config)
        if connection.is_connected():
            print("✓ Connected to MySQL database successfully!")
            return connection
    except Error as e:
        print(f"✗ Error while connecting to MySQL: {e}")
        print("Make sure your password is correct!")
        return None

# Test connection
conn = create_connection(mysql_config)
if conn:
    cursor = conn.cursor()
    cursor.execute("SELECT DATABASE();")
    database = cursor.fetchone()
    print(f"  Currently using database: {database}")
    cursor.close()
else:
    print("Connection failed!")


✓ Connected to MySQL database successfully!
  Currently using database: ('stree_shakti_analytics',)


In [24]:
print("\n" + "="*60)
print("LOADING CLEANED DATA")
print("="*60)

# Read cleaned data
data_path = Path('../data/processed/stree_shakti_trips_cleaned.csv')
df = pd.read_csv(data_path)

print(f"\n✓ Data loaded: {len(df):,} records")
print(f"  Columns: {len(df)}")

# Convert time column to TIME format for MySQL
df['time'] = pd.to_datetime(df['time'], format='%H:%M:%S').dt.time

# Select columns for database
db_columns = [
    'trip_id', 'date', 'time', 'hour', 'day_of_week', 'bus_id',
    'route_category', 'distance_km', 'passenger_gender', 'passenger_type',
    'age_group', 'normal_fare', 'revenue_loss', 'occupancy_pct',
    'month', 'week', 'day', 'is_weekend', 'time_period',
    'occupancy_category', 'beneficiary_trip', 'concessional_trip'
]

df_db = df[db_columns].copy()
print(f"\nColumns for database: {len(df_db.columns)}")
print(f"✓ Data prepared and ready for loading")



LOADING CLEANED DATA

✓ Data loaded: 1,000,000 records
  Columns: 1000000

Columns for database: 22
✓ Data prepared and ready for loading


In [25]:
print("\n" + "="*60)
print("INSERTING DATA INTO MYSQL")
print("="*60)

# Build insert statement
insert_query = """
INSERT INTO trips (
    trip_id, date, time, hour, day_of_week, bus_id,
    route_category, distance_km, passenger_gender, passenger_type,
    age_group, normal_fare, revenue_loss, occupancy_pct,
    month, week, day, is_weekend, time_period,
    occupancy_category, beneficiary_trip, concessional_trip
) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
"""

# Prepare data for insertion
data_tuples = [tuple(row) for row in df_db.values]

# Insert in batches
batch_size = 10000
total_batches = len(data_tuples) // batch_size + (1 if len(data_tuples) % batch_size else 0)

print(f"Total records: {len(data_tuples):,}")
print(f"Batch size: {batch_size:,}")
print(f"Total batches: {total_batches}\n")

try:
    cursor = conn.cursor()
    
    for batch_num in range(total_batches):
        start_idx = batch_num * batch_size
        end_idx = min((batch_num + 1) * batch_size, len(data_tuples))
        batch_data = data_tuples[start_idx:end_idx]
        
        cursor.executemany(insert_query, batch_data)
        conn.commit()
        
        progress = (end_idx / len(data_tuples)) * 100
        remaining = total_batches - batch_num - 1
        
        print(f"[{'█' * int(progress//5):<20}] {progress:.1f}% - "
              f"Batch {batch_num + 1}/{total_batches} - "
              f"({remaining} remaining)")
    
    print(f"\n✓ All {len(data_tuples):,} records inserted successfully!")
    
except Error as e:
    print(f"✗ Error during insertion: {e}")
    conn.rollback()
finally:
    cursor.close()



INSERTING DATA INTO MYSQL
Total records: 1,000,000
Batch size: 10,000
Total batches: 100

[                    ] 1.0% - Batch 1/100 - (99 remaining)
[                    ] 2.0% - Batch 2/100 - (98 remaining)
[                    ] 3.0% - Batch 3/100 - (97 remaining)
[                    ] 4.0% - Batch 4/100 - (96 remaining)
[█                   ] 5.0% - Batch 5/100 - (95 remaining)
[█                   ] 6.0% - Batch 6/100 - (94 remaining)
[█                   ] 7.0% - Batch 7/100 - (93 remaining)
[█                   ] 8.0% - Batch 8/100 - (92 remaining)
[█                   ] 9.0% - Batch 9/100 - (91 remaining)
[██                  ] 10.0% - Batch 10/100 - (90 remaining)
[██                  ] 11.0% - Batch 11/100 - (89 remaining)
[██                  ] 12.0% - Batch 12/100 - (88 remaining)
[██                  ] 13.0% - Batch 13/100 - (87 remaining)
[██                  ] 14.0% - Batch 14/100 - (86 remaining)
[███                 ] 15.0% - Batch 15/100 - (85 remaining)
[███        

In [26]:
print("\n" + "="*60)
print("VERIFYING DATA IN DATABASE")
print("="*60)

try:
    cursor = conn.cursor()
    
    # Count records
    cursor.execute("SELECT COUNT(*) FROM trips")
    total_count_tuple = cursor.fetchone()
    total_count = total_count_tuple[0]  # Extract from tuple
    print(f"\n✓ Total records in database: {total_count:,}")
    
    # Sample records
    print("\n" + "-"*60)
    print("SAMPLE RECORDS")
    print("-"*60)
    cursor.execute("SELECT trip_id, date, passenger_type, occupancy_pct, revenue_loss FROM trips LIMIT 5")
    for row in cursor.fetchall():
        print(row)
    
    # Data distribution checks
    print("\n" + "-"*60)
    print("DATA DISTRIBUTION")
    print("-"*60)
    
    cursor.execute("SELECT passenger_type, COUNT(*) as count FROM trips GROUP BY passenger_type ORDER BY count DESC")
    print("\nPassenger Type Distribution:")
    for row in cursor.fetchall():
        print(f"  {row[0]}: {row[1]:,}")
    
    cursor.execute("SELECT route_category, COUNT(*) as count FROM trips GROUP BY route_category ORDER BY count DESC")
    print("\nRoute Category Distribution:")
    for row in cursor.fetchall():
        print(f"  {row[0]}: {row[1]:,}")
    
    cursor.execute("SELECT DATE(date) as date, COUNT(*) as trips FROM trips GROUP BY DATE(date) LIMIT 10")
    print("\nSample Daily Trip Counts:")
    for row in cursor.fetchall():
        print(f"  {row[0]}: {row[1]:,}")
    
    print("\n✓ Data verification complete!")
    
except Error as e:
    print(f"✗ Error during verification: {e}")
finally:
    cursor.close()



VERIFYING DATA IN DATABASE

✓ Total records in database: 1,000,000

------------------------------------------------------------
SAMPLE RECORDS
------------------------------------------------------------
('TRIP_00000001', datetime.date(2025, 8, 27), 'Paid', 43, 0)
('TRIP_00000002', datetime.date(2025, 9, 15), 'Senior_Citizen', 67, 22)
('TRIP_00000003', datetime.date(2025, 8, 30), 'Stree_Shakti', 98, 20)
('TRIP_00000004', datetime.date(2025, 10, 8), 'Stree_Shakti', 36, 20)
('TRIP_00000005', datetime.date(2025, 11, 12), 'Student', 47, 10)

------------------------------------------------------------
DATA DISTRIBUTION
------------------------------------------------------------

Passenger Type Distribution:
  Stree_Shakti: 390,434
  Paid: 261,809
  Student: 208,162
  Senior_Citizen: 100,497
  Other: 39,098

Route Category Distribution:
  urban: 500,340
  peri-urban: 349,917
  rural: 149,743

Sample Daily Trip Counts:
  2025-08-15: 7,150
  2025-08-16: 7,366
  2025-08-17: 7,241
  2025-08-

In [27]:
print("\n" + "="*60)
print("POPULATING AGGREGATE TABLES")
print("="*60)

try:
    cursor = conn.cursor()
    
    # Populate daily_summary
    print("\n[1/3] Populating daily_summary...")
    daily_query = """
    INSERT INTO daily_summary (summary_date, total_trips, daily_revenue_loss, 
                               avg_occupancy, buses_operated, female_trips, stree_shakti_trips)
    SELECT 
        DATE(date) as summary_date,
        COUNT(*) as total_trips,
        SUM(revenue_loss) as daily_revenue_loss,
        ROUND(AVG(occupancy_pct), 2) as avg_occupancy,
        COUNT(DISTINCT bus_id) as buses_operated,
        SUM(CASE WHEN passenger_gender = 'F' THEN 1 ELSE 0 END) as female_trips,
        SUM(CASE WHEN passenger_type = 'Stree_Shakti' THEN 1 ELSE 0 END) as stree_shakti_trips
    FROM trips
    GROUP BY DATE(date)
    """
    cursor.execute(daily_query)
    conn.commit()
    
    cursor.execute("SELECT COUNT(*) FROM daily_summary")
    daily_count = cursor.fetchone()
    print(f"✓ {daily_count} daily summaries created")
    
    # Populate hourly_summary
    print("\n[2/3] Populating hourly_summary...")
    hourly_query = """
    INSERT INTO hourly_summary (summary_date, hour, trip_count, revenue_loss, 
                                avg_occupancy, avg_distance, female_percentage)
    SELECT 
        DATE(date) as summary_date,
        hour,
        COUNT(*) as trip_count,
        SUM(revenue_loss) as revenue_loss,
        ROUND(AVG(occupancy_pct), 2) as avg_occupancy,
        ROUND(AVG(distance_km), 1) as avg_distance,
        ROUND(100.0 * SUM(CASE WHEN passenger_gender = 'F' THEN 1 ELSE 0 END) / COUNT(*), 2) as female_percentage
    FROM trips
    GROUP BY DATE(date), hour
    """
    cursor.execute(hourly_query)
    conn.commit()
    
    cursor.execute("SELECT COUNT(*) FROM hourly_summary")
    hourly_count = cursor.fetchone()
    print(f"✓ {hourly_count} hourly summaries created")
    
    # Populate route_summary
    print("\n[3/3] Populating route_summary...")
    route_query = """
    INSERT INTO route_summary (route_category, total_trips, avg_distance, 
                               avg_occupancy, total_revenue_loss, unique_buses)
    SELECT 
        route_category,
        COUNT(*) as total_trips,
        ROUND(AVG(distance_km), 1) as avg_distance,
        ROUND(AVG(occupancy_pct), 2) as avg_occupancy,
        SUM(revenue_loss) as total_revenue_loss,
        COUNT(DISTINCT bus_id) as unique_buses
    FROM trips
    GROUP BY route_category
    """
    cursor.execute(route_query)
    conn.commit()
    
    cursor.execute("SELECT COUNT(*) FROM route_summary")
    route_count = cursor.fetchone()
    print(f"✓ {route_count} route summaries created")
    
    print("\n✓✓✓ DATABASE SETUP COMPLETE ✓✓✓")
    
except Error as e:
    print(f"✗ Error: {e}")
    conn.rollback()
finally:
    cursor.close()

# Close connection
if conn.is_connected():
    conn.close()
    print("\n✓ MySQL connection closed")



POPULATING AGGREGATE TABLES

[1/3] Populating daily_summary...
✓ (139,) daily summaries created

[2/3] Populating hourly_summary...
✓ (2085,) hourly summaries created

[3/3] Populating route_summary...
✓ (3,) route summaries created

✓✓✓ DATABASE SETUP COMPLETE ✓✓✓

✓ MySQL connection closed


In [35]:
import pandas as pd
import mysql.connector

conn = mysql.connector.connect(
    host='localhost', database='stree_shakti_analytics',
    user='root', password='[powerbi123]'
)

# Export all tables
tables = ['trips', 'daily_summary', 'hourly_summary', 'route_summary']
for table in tables:
    df = pd.read_sql(f"SELECT * FROM {table}", conn)
    df.to_csv(f"{table}.csv", index=False)
    print(f"Exported {table}: {len(df):,} rows")

conn.close()


ProgrammingError: 1045 (28000): Access denied for user 'root'@'localhost' (using password: YES)