[Reference](https://medium.com/@bragadeeshs/streamlining-data-etl-with-apache-airflow-and-python-c46006463d5e)

In [1]:
import pandas as pd

# Path of the raw sales export that feeds the pipeline
data_file = 'sales_data.csv'

# Extract: load the entire CSV into a DataFrame
data = pd.read_csv(data_file)

# Sanity check: peek at the top rows of what was loaded
print(data.head())

# Transform: total 'Revenue' per 'Category'. as_index=False keeps 'Category'
# as a regular column, so no reset_index() call is needed afterwards.
category_revenue = data.groupby('Category', as_index=False)['Revenue'].sum()

# Show the aggregated result
print(category_revenue)

In [2]:
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
from datetime import datetime

# Arguments passed to the DAG below as its default_args
default_args = dict(
    owner='your_name',
    start_date=datetime(year=2023, month=1, day=1),
    retries=1,
)

# Build the DAG object that the tasks below attach to
dag = DAG(
    dag_id='retail_sales_etl',
    default_args=default_args,
    schedule_interval='@daily',  # one run per day
    catchup=False,
)

# Define Python functions for extraction and transformation
def extract_data():
    """Placeholder for the extract step; currently does nothing and returns None."""
    # Your data extraction logic here
    return None

def transform_data():
    """Placeholder for the transform step; currently does nothing and returns None."""
    # Your data transformation logic here
    return None

# Wrap each Python callable in an operator attached to `dag`
extract_task = PythonOperator(
    dag=dag,
    task_id='extract_data',
    python_callable=extract_data,
)

transform_task = PythonOperator(
    dag=dag,
    task_id='transform_data',
    python_callable=transform_data,
)

# Ordering: transform_data is downstream of extract_data
extract_task >> transform_task

In [3]:
pip install mysql-connector-python

Collecting mysql-connector-python
  Downloading mysql_connector_python-8.2.0-cp310-cp310-manylinux_2_17_x86_64.whl (31.6 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 31.6/31.6 MB 64.5 MB/s eta 0:00:00
Collecting protobuf<=4.21.12,>=4.21.1 (from mysql-connector-python)
  Downloading protobuf-4.21.12-cp37-abi3-manylinux2014_x86_64.whl (409 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 409.8/409.8 kB 45.7 MB/s eta 0:00:00
Installing collected packages: protobuf, mysql-connector-python
  Attempting uninstall: protobuf
    Found existing installation: protobuf 3.20.3
    Uninstalling protobuf-3.20.3:
      Successfully uninstalled protobuf-3.20.3
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-metadata 1.14.0 requires protobuf<4.21,>=3.20.3, but you have protobuf 4.21.12 which is incompatible.

In [6]:
import os

import mysql.connector

# MySQL connection settings.
# Each value is read from an environment variable when it is set
# (MYSQL_HOST, MYSQL_USER, MYSQL_PASSWORD, MYSQL_DATABASE), so real credentials
# do not have to be written into the notebook. The placeholder strings are
# kept as fallbacks when a variable is unset.
db_config = {
    'host': os.environ.get('MYSQL_HOST', 'your_host'),
    'user': os.environ.get('MYSQL_USER', 'your_user'),
    'password': os.environ.get('MYSQL_PASSWORD', 'your_password'),
    'database': os.environ.get('MYSQL_DATABASE', 'your_database'),
}

# Create a connection to MySQL
conn = mysql.connector.connect(**db_config)
try:
    # Create a cursor object to interact with the database
    cursor = conn.cursor()

    # Close the cursor when done
    cursor.close()
finally:
    # Always release the connection, even if creating the cursor failed
    conn.close()

In [7]:
# Replace 'your_table' with your actual table name
table_name = 'your_table'

# SQL statement to insert data. Only the row values are bound as parameters
# (%s); the table name is interpolated into the statement text, so
# `table_name` must be a trusted constant and never user-supplied input.
insert_query = f"INSERT INTO {table_name} (Category, Revenue) VALUES (%s, %s)"

# Prepare data for insertion (category_revenue is the DataFrame from the
# transformation step). .tolist() yields plain Python values rather than NumPy
# scalars, and zipping whole columns avoids a per-row iterrows() loop.
data_to_insert = list(zip(
    category_revenue['Category'].tolist(),
    category_revenue['Revenue'].tolist(),
))

# The connection cell closes `conn` and `cursor` at its end, so open a fresh
# connection and cursor here rather than reusing the closed ones.
conn = mysql.connector.connect(**db_config)
try:
    cursor = conn.cursor()
    try:
        # Nothing to send when the aggregation produced no rows
        if data_to_insert:
            cursor.executemany(insert_query, data_to_insert)

        # Commit the changes to the database
        conn.commit()
    except mysql.connector.Error:
        # Undo a partially applied batch, then surface the original error
        conn.rollback()
        raise
    finally:
        cursor.close()
finally:
    conn.close()