In [6]:
import pandas as pd
import os

# --- Folder containing your CSVs ---
# NOTE(review): absolute, machine-specific path; consider making it configurable.
folder_path = "C:/Users/Akshiya George/OneDrive/Desktop/Data Science/Amazon"

# --- List of years to combine (2015 through 2025 inclusive) ---
years = range(2015, 2026)

# --- Build full file paths ---
csv_files = [os.path.join(folder_path, f"amazon_india_{year}.csv") for year in years]

# Fail early, naming every missing input at once, instead of stopping on the
# first unreadable file halfway through the reads.
missing_files = [path for path in csv_files if not os.path.isfile(path)]
if missing_files:
    raise FileNotFoundError(f"Missing yearly CSV file(s): {missing_files}")

# --- Read and combine all CSVs ---
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, so a column cannot end up with mixed types.
# (The warning echoed under this cell points at this read; presumably it is a
# mixed-type DtypeWarning - confirm.)
combined_df = pd.concat(
    [pd.read_csv(file, low_memory=False) for file in csv_files],
    ignore_index=True,
)

# --- Save to a single CSV file ---
output_path = os.path.join(folder_path, "amazon_india_combined_2015_2025.csv")
combined_df.to_csv(output_path, index=False)

print(f"✅ Combined CSV saved to: {output_path}")

  combined_df = pd.concat([pd.read_csv(file) for file in csv_files], ignore_index=True)


✅ Combined CSV saved to: C:/Users/Akshiya George/OneDrive/Desktop/Data Science/Amazon\amazon_india_combined_2015_2025.csv


In [23]:
import os
from getpass import getpass

import mysql.connector
import pandas as pd

# --- Path to Combined CSV ---
combined_path = "C:/Users/Akshiya George/OneDrive/Desktop/Data Science/Amazon/amazon_india_combined_2015_2025.csv"

# --- Connect to MySQL ---
# Connection settings are read from environment variables so that no secret is
# stored in the notebook. Host, user and database fall back to the values used
# previously; the password is prompted for interactively when MYSQL_PASSWORD
# is not set.
conn = mysql.connector.connect(
    host=os.environ.get("MYSQL_HOST", "localhost"),
    user=os.environ.get("MYSQL_USER", "root"),
    password=os.environ.get("MYSQL_PASSWORD") or getpass("MySQL password: "),
    database=os.environ.get("MYSQL_DATABASE", "amazon"),
    allow_local_infile=True  # client-side switch required for LOAD DATA LOCAL INFILE
)
cursor = conn.cursor()
# Server-side switch for LOAD DATA LOCAL.
# NOTE(review): SET GLOBAL needs elevated privileges - confirm the account has them.
cursor.execute("SET GLOBAL local_infile = 1;")

# --- Function to Save Temp CSV and Load into MySQL ---
def load_table(df, table_name):
    """Bulk-load a DataFrame into a MySQL table through a temporary CSV file.

    The frame is written to ``<table_name>_temp.csv`` in the current working
    directory, loaded with ``LOAD DATA LOCAL INFILE`` using the module-level
    ``cursor``, committed on the module-level ``conn``, and the temporary file
    is removed afterwards, whether or not the load succeeded.

    Args:
        df: DataFrame to load. Its column names are used as the target column
            list, so they must match the table's column names.
        table_name: Name of the destination table. It is interpolated into the
            SQL text as-is, so pass trusted identifiers only.

    Raises:
        Whatever ``cursor.execute`` / ``conn.commit`` raise; the temporary file
        is still cleaned up in that case.
    """
    temp_path = f"{table_name}_temp.csv"
    # Computed outside the f-string: a backslash inside an f-string expression
    # is a SyntaxError on Python versions before 3.12.
    infile_path = temp_path.replace("\\", "/")
    # Name the target columns explicitly. Without a column list MySQL assigns
    # CSV fields to table columns by position, which silently misaligns data
    # whenever the frame has fewer columns than the table or a different order.
    column_list = ", ".join(
        "`" + str(col).replace("`", "``") + "`" for col in df.columns
    )
    query = f"""
    LOAD DATA LOCAL INFILE '{infile_path}'
    INTO TABLE {table_name}
    FIELDS TERMINATED BY ','
    ENCLOSED BY '"'
    LINES TERMINATED BY '\n'
    IGNORE 1 ROWS
    ({column_list});
    """
    try:
        # Force LF line endings so the file matches the LINES TERMINATED BY
        # clause; the pandas default follows the OS and is CRLF on Windows,
        # which would leave a stray carriage return in the last field.
        df.to_csv(temp_path, index=False, lineterminator="\n")
        cursor.execute(query)
        conn.commit()
    finally:
        # Never leave the temporary file behind, even when the load fails.
        if os.path.exists(temp_path):
            os.remove(temp_path)
    print(f"✅ Loaded {table_name} with {len(df):,} rows")

# --- Read and Clean Combined CSV ---
# Reuse combined_path instead of repeating the path literal, and let pandas
# infer each column's dtype from the whole file (low_memory=False) so columns
# do not end up with mixed types.
df = pd.read_csv(combined_path, low_memory=False)
df.columns = df.columns.str.strip()
# Unparseable dates become NaT (errors='coerce') and are dropped below.
df['order_date'] = pd.to_datetime(df['order_date'], errors='coerce')
unparsed_dates = int(df['order_date'].isna().sum())
if unparsed_dates:
    # Surface the row loss rather than discarding records silently.
    print(f"⚠️ Dropping {unparsed_dates:,} rows with missing or unparseable order_date")
df = df.dropna(subset=['order_date'])

# --- Extract Transactions ---
# Columns copied from the combined frame into the transactions extract.
transaction_columns = [
    'transaction_id', 'order_date', 'customer_id', 'product_id',
    'original_price_inr', 'discount_percent', 'discounted_price_inr',
    'quantity', 'subtotal_inr', 'delivery_charges', 'final_amount_inr',
    'payment_method', 'payment_method_cleaned', 'delivery_days',
    'delivery_type', 'is_prime_member', 'is_festival_sale', 'festival_name',
    'customer_rating', 'return_status', 'order_month', 'order_year',
    'order_quarter', 'dup_key', 'dup_count', 'dup_status', 'flag_for_review',
]
# Rows identical across all selected columns are collapsed to one.
transactions = df.loc[:, transaction_columns].drop_duplicates()

# --- Extract Products ---
# Columns copied from the combined frame into the products extract.
product_columns = [
    'product_id', 'product_name', 'category', 'subcategory', 'brand',
    'product_weight_kg', 'is_prime_eligible', 'product_rating',
]
# De-duplication is on the full column combination, not on product_id alone.
products = df.loc[:, product_columns].drop_duplicates()

# --- Extract Customers ---
# Column names were already stripped right after the CSV was read, so no
# second strip is needed here.

# Customer attributes we would like to extract; the data may not contain all of them.
expected_cols = ['customer_id', 'customer_name', 'customer_state', 'customer_tier', 'customer_points', 'customer_age_group']

# Keep the expected columns that are present and report the absent ones,
# rather than dropping them without a trace.
valid_cols = [col for col in expected_cols if col in df.columns]
missing_cols = [col for col in expected_cols if col not in df.columns]
if missing_cols:
    print(f"⚠️ Customer columns not found in data (skipped): {missing_cols}")

# Extract and deduplicate (on the full combination of the kept columns)
customers = df[valid_cols].drop_duplicates()

print(f"✅ Extracted customers with columns: {valid_cols}")

# --- Create Time Dimension ---
# One row per distinct order date, with calendar attributes derived from it.
unique_dates = df[['order_date']].drop_duplicates()
order_dates = unique_dates['order_date']
day_names = order_dates.dt.day_name()

# Per-date festival information taken from the transaction rows: the maximum
# of is_festival_sale and the first non-null festival_name on each date.
per_date = df.groupby('order_date')
festival_flag = per_date['is_festival_sale'].max().reindex(order_dates).fillna(False)
festival_label = per_date['festival_name'].first().reindex(order_dates).fillna("")

# NOTE(review): is_weekend is a bool column; pandas writes bools to CSV as
# True/False text - confirm the destination column type accepts that.
time_dimension = unique_dates.assign(
    year=order_dates.dt.year,
    month=order_dates.dt.month,
    day=order_dates.dt.day,
    quarter=order_dates.dt.quarter,
    week=order_dates.dt.isocalendar().week,
    day_of_week=day_names,
    is_weekend=day_names.isin(['Saturday', 'Sunday']),
    # .values assigns positionally; both series are ordered like order_dates.
    is_holiday=festival_flag.values,
    holiday_name=festival_label.values,
).rename(columns={'order_date': 'date'})

# --- Load Tables into MySQL ---
# Same order as before: customers, products, time_dimension, then transactions.
for frame, target_table in (
    (customers, "customers"),
    (products, "products"),
    (time_dimension, "time_dimension"),
    (transactions, "transactions"),
):
    load_table(frame, target_table)

# --- Helper: Check if Index Exists ---
def index_exists(cursor, table, index_name):
    """Return True when ``table`` already has an index named ``index_name``.

    Args:
        cursor: Open database cursor exposing ``execute(sql, params)`` and
            ``fetchall()``.
        table: Table to inspect. It is interpolated into the SQL text because
            identifiers cannot be bound as parameters; pass trusted names only.
        index_name: Index name to look for. Bound as a query parameter so the
            driver handles quoting and escaping.

    Returns:
        bool: True if ``SHOW INDEX`` returns at least one row for that name.
    """
    cursor.execute(f"SHOW INDEX FROM {table} WHERE Key_name = %s;", (index_name,))
    # Always drain the result set so the cursor is free for the next statement.
    rows = cursor.fetchall()
    return len(rows) > 0

# --- Index Creation Queries ---
# Compact (table, index name, indexed column list) specs; the CREATE INDEX
# statements are generated from them below.
_index_specs = [
    ("transactions", "idx_order_date", "order_date"),
    ("transactions", "idx_customer_id", "customer_id"),
    ("transactions", "idx_product_id", "product_id"),
    ("transactions", "idx_payment_method", "payment_method_cleaned"),
    ("transactions", "idx_order_year_month", "order_year, order_month"),
    ("transactions", "idx_return_status", "return_status"),
    ("products", "idx_category_subcategory", "category, subcategory"),
    ("products", "idx_brand", "brand"),
    ("products", "idx_prime_eligible", "is_prime_eligible"),
    ("customers", "idx_customer_location", "customer_state"),
    ("customers", "idx_customer_tier", "customer_tier"),
    ("customers", "idx_spending_tier", "customer_points"),
    ("time_dimension", "idx_year_month", "year, month"),
    ("time_dimension", "idx_day_of_week", "day_of_week"),
    ("time_dimension", "idx_is_weekend", "is_weekend"),
    ("time_dimension", "idx_is_holiday", "is_holiday"),
]

# Each entry is (table, index name, CREATE INDEX statement).
index_queries = [
    (tbl, idx, f"CREATE INDEX {idx} ON {tbl}({cols});")
    for tbl, idx, cols in _index_specs
]

# --- Safe Index Creation ---
# Names of indexes whose creation raised a MySQL error, so the final summary
# can report them instead of claiming success unconditionally.
failed_indexes = []
try:
    for table, index_name, query in index_queries:
        try:
            if index_exists(cursor, table, index_name):
                print(f"⚠️ Index already exists: {index_name} on {table}")
            else:
                cursor.execute(query)
                print(f"✅ Index created: {index_name} on {table}")
        except mysql.connector.Error as err:
            # Best effort: record the failure and continue with the next index.
            failed_indexes.append(index_name)
            print(f"❌ Index creation failed for {index_name}: {err}")
finally:
    # --- Close Connection ---
    # Runs even if the loop raises something other than a MySQL error, so the
    # cursor and connection are never left open.
    cursor.close()
    conn.close()

if failed_indexes:
    print(f"⚠️ Data from 2015–2025 loaded, but {len(failed_indexes)} index(es) could not be created: {failed_indexes}")
else:
    print("🎉 All data from 2015–2025 loaded and indexed successfully.")



✅ Extracted customers with columns: ['customer_id', 'customer_state', 'customer_tier', 'customer_age_group']
✅ Loaded customers with 451,322 rows
✅ Loaded products with 2,004 rows
✅ Loaded time_dimension with 4,015 rows
✅ Loaded transactions with 1,127,400 rows
⚠️ Index already exists: idx_order_date on transactions
⚠️ Index already exists: idx_customer_id on transactions
⚠️ Index already exists: idx_product_id on transactions
⚠️ Index already exists: idx_payment_method on transactions
⚠️ Index already exists: idx_order_year_month on transactions
⚠️ Index already exists: idx_return_status on transactions
⚠️ Index already exists: idx_category_subcategory on products
⚠️ Index already exists: idx_brand on products
⚠️ Index already exists: idx_prime_eligible on products
⚠️ Index already exists: idx_customer_location on customers
⚠️ Index already exists: idx_customer_tier on customers
⚠️ Index already exists: idx_spending_tier on customers
⚠️ Index already exists: idx_year_month on time_dim