In [39]:
import pandas as pd
import mysql.connector
from mysql.connector import Error

# Read the source CSV into a DataFrame
file_path = "../data/final_dataset.csv"  # relative path to the input dataset
df = pd.read_csv(filepath_or_buffer=file_path)

# Show the first rows as a quick sanity check of the columns that were read
first_rows = df.head()
print(first_rows)

  InvoiceNo  Quantity          InvoiceDate  Price  CustomerID    Country
0    540267        96  2011-01-06 11:12:00   0.72     12415.0  Australia
1    567085        16  2011-09-16 12:38:00   0.83     12434.0  Australia
2    540267        36  2011-01-06 11:12:00   1.85     12415.0  Australia
3    558537        48  2011-06-30 12:06:00   1.25     12424.0  Australia
4    556917       144  2011-06-15 13:37:00   2.49     12415.0  Australia


In [40]:
# Columns that must be present for a row to be kept
required_columns = ['InvoiceNo', 'Quantity', 'Price', 'CustomerID', 'Country']

# Clean in a single chain that rebinds `df` instead of mutating it in place:
#   1. drop rows missing any required value,
#   2. reduce 'InvoiceDate' to a date-only value,
#   3. make the column dtypes consistent.
df = (
    df.dropna(subset=required_columns)
    .assign(InvoiceDate=lambda frame: pd.to_datetime(frame['InvoiceDate']).dt.date)
    .astype({'InvoiceNo': str, 'Quantity': int, 'Price': float, 'CustomerID': int})
)

# Preview the cleaned data.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() only appends a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5184 entries, 0 to 5561
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   InvoiceNo    5184 non-null   object 
 1   Quantity     5184 non-null   int64  
 2   InvoiceDate  5184 non-null   object 
 3   Price        5184 non-null   float64
 4   CustomerID   5184 non-null   int64  
 5   Country      5184 non-null   object 
dtypes: float64(1), int64(2), object(3)
memory usage: 283.5+ KB
None


In [62]:
import os
from dotenv import load_dotenv
import mysql.connector
from mysql.connector import Error

# Populate os.environ from the .env file
load_dotenv()

# Read the four database settings in one pass; os.getenv yields None for any
# variable that is not set.
db_host, db_user, db_password, db_name = (
    os.getenv(variable_name)
    for variable_name in ('DB_HOST', 'DB_USER', 'DB_PASSWORD', 'DB_NAME')
)

# Function to connect and load data into MySQL
def load_data_to_mysql(df, host, user, password, database, table_name):
    """Insert the rows of ``df`` into ``table_name`` of a MySQL database.

    Rows are inserted one at a time inside a single transaction. A row whose
    INSERT raises a connector ``Error`` is reported and skipped; the remaining
    rows are still inserted and committed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns InvoiceNo, Quantity, InvoiceDate, Price,
        CustomerID and Country.
    host, user, password, database
        Passed unchanged to ``mysql.connector.connect``.
    table_name : str
        Target table. It is interpolated directly into the SQL text (identifiers
        cannot be bound as ``%s`` parameters), so it must come from trusted code,
        never from user input.

    Returns
    -------
    None
        Progress and errors are reported with ``print``. Connector errors are
        caught and printed rather than raised.
    """
    columns = ['InvoiceNo', 'Quantity', 'InvoiceDate', 'Price', 'CustomerID', 'Country']

    # The statement is the same for every row, so build it once. Row values are
    # bound through %s placeholders by the driver.
    insert_query = f"""
        INSERT INTO {table_name} (InvoiceNo, Quantity, InvoiceDate, Price, CustomerID, Country)
        VALUES (%s, %s, %s, %s, %s, %s)
        """

    # Pre-bind both handles so the `finally` clause is safe even when connect()
    # itself raises (otherwise the cleanup would fail with UnboundLocalError and
    # hide the real connection error).
    connection = None
    cursor = None

    try:
        # Establish connection to MySQL
        connection = mysql.connector.connect(
            host=host,
            user=user,
            password=password,
            database=database
        )

        if connection.is_connected():
            print(f"Connected to MySQL database: {database}")
            cursor = connection.cursor()

            failed_rows = 0
            # itertuples keeps each column's own type; iterrows would coerce
            # every row to one common dtype.
            for data_tuple in df[columns].itertuples(index=False, name=None):
                try:
                    cursor.execute(insert_query, data_tuple)
                except Error as insert_error:
                    failed_rows += 1
                    print(f"Error inserting row {data_tuple[0]}: {insert_error}")

            # Commit the transaction
            connection.commit()
            if failed_rows:
                # Do not claim full success when some rows were skipped.
                print(f"Data loaded with {failed_rows} failed row(s) out of {len(df)}.")
            else:
                print("Data loaded successfully into the table!")

    except Error as e:
        print(f"Connection error: {e}")

    finally:
        if connection is not None and connection.is_connected():
            if cursor is not None:
                cursor.close()
            connection.close()
            print("MySQL connection closed.")

# Push the cleaned DataFrame `df` into the MySQL table, using the connection
# settings read from the environment.
load_data_to_mysql(
    df,
    db_host,
    db_user,
    db_password,
    db_name,
    table_name='e_commerce_data',
)


Connected to MySQL database: ecommerce_analysis
Data loaded successfully into the table!
MySQL connection closed.


In [63]:
# Persist the cleaned frame back to the dataset path.
# index=False keeps the row index out of the file; this is the same path the
# first cell reads, and a written index would come back as an extra
# "Unnamed: 0" column on the next run.
df.to_csv("../data/final_dataset.csv", index=False)