# Data Injestion

This notebook represents the initial step in the data engineering project, where CSV files are ingested from GitHub and loaded into Filess.io SQL databases. Here we are creating two different databases one is MySQL and another one is No SQL database which is Mongo DB. In each of the databases we are creating only one table inside the database.

In [1]:
!pip install mysql-connector-python

Collecting mysql-connector-python
  Downloading mysql_connector_python-9.3.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (7.2 kB)
Downloading mysql_connector_python-9.3.0-cp311-cp311-manylinux_2_28_x86_64.whl (33.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.9/33.9 MB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: mysql-connector-python
Successfully installed mysql-connector-python-9.3.0


In [None]:
import mysql.connector
from mysql.connector import Error

hostname = "kla8x.h.filess.io"
database = "olistproject_therefore"
port = "3307"
username = "olistproject_therefore"
password = "9b358f3d22ba5c9fe381a955f6d68d7eac2cf75c"

try:
    connection = mysql.connector.connect(host=hostname, database=database, user=username, password=password, port=port)
    if connection.is_connected():
        db_Info = connection.get_server_info()
        print("Connected to MySQL Server version ", db_Info)
        cursor = connection.cursor()
        cursor.execute("select database();")
        record = cursor.fetchone()
        print("You're connected to database: ", record)

except Error as e:
    print("Error while connecting to MySQL", e)
finally:
    if connection.is_connected():
        cursor.close()
        connection.close()
        print("MySQL connection is closed")



Connected to MySQL Server version  8.0.36-28
You're connected to database:  ('olistproject_therefore',)
MySQL connection is closed


    The property counterpart 'server_info' should be used instead.

  db_Info = connection.get_server_info()


In [None]:
import pandas as pd

In [None]:
order_payments = pd.read_csv("olist_order_payments_dataset.csv")
order_payments.head()

Unnamed: 0,order_id,payment_sequential,payment_type,payment_installments,payment_value
0,b81ef226f3fe1789b1e8b2acac839d17,1,credit_card,8,99.33
1,a9810da82917af2d9aefd1278f1dcfa0,1,credit_card,1,24.39
2,25e8ea4e93396b6fa0d3dd708e76c1bd,1,credit_card,1,65.71
3,ba78997921bbcdc1373bb41e913ab953,1,credit_card,8,107.78
4,42fdf880ba16b47b59251dd489d4441a,1,credit_card,2,128.45


In [None]:
import pandas as pd
import mysql.connector
from mysql.connector import Error

# Connection details
hostname = "kla8x.h.filess.io"
database = "olistproject_therefore"
port = "3307"
username = "olistproject_therefore"
password = "9b358f3d22ba5c9fe381a955f6d68d7eac2cf75c"

# CSV file path
csv_file_path = "olist_order_payments_dataset.csv"

# Table name where the data will be uploaded
table_name = "olist_order_payments"

try:
    # Step 1: Establish a connection to MySQL server
    connection = mysql.connector.connect(
        host=hostname,
        database=database,
        user=username,
        password=password,
        port=port
    )
    if connection.is_connected():
        print("Connected to MySQL Server successfully!")

        # Step 2: Create a cursor to execute SQL queries
        cursor = connection.cursor()

        # Step 3: Drop table if it already exists (for clean insertion)
        cursor.execute(f"DROP TABLE IF EXISTS {table_name};")
        print(f"Table `{table_name}` dropped if it existed.")

        # Step 4: Create a table structure to match CSV file
        create_table_query = f"""
        CREATE TABLE {table_name} (
            order_id VARCHAR(50),
            payment_sequential INT,
            payment_type VARCHAR(20),
            payment_installments INT,
            payment_value FLOAT
        );
        """
        cursor.execute(create_table_query)
        print(f"Table `{table_name}` created successfully!")

        # Step 5: Load the CSV data into pandas DataFrame
        data = pd.read_csv(csv_file_path)
        print("CSV data loaded into pandas DataFrame.")

        # Step 6: Insert data in batches of 500 records
        batch_size = 1000  # Define the batch size
        total_records = len(data)  # Get total records in the DataFrame

        print(f"Starting data insertion into `{table_name}` in batches of {batch_size} records.")
        for start in range(0, total_records, batch_size):
            end = start + batch_size
            batch = data.iloc[start:end]  # Get the current batch of records

            # Convert batch to list of tuples for MySQL insertion
            batch_records = [
                tuple(row) for row in batch.itertuples(index=False, name=None)
            ]

            # Prepare the INSERT query
            insert_query = f"""
            INSERT INTO {table_name}
            (order_id, payment_sequential, payment_type, payment_installments, payment_value)
            VALUES (%s, %s, %s, %s, %s);
            """

            # Execute the insertion query for the batch
            cursor.executemany(insert_query, batch_records)
            connection.commit()  # Commit after each batch
            print(f"Inserted records {start + 1} to {min(end, total_records)} successfully.")

        print(f"All {total_records} records inserted successfully into `{table_name}`.")

except Error as e:
    # Step 7: Handle any errors
    print("Error while connecting to MySQL or inserting data:", e)

finally:
    # Step 8: Close the cursor and connection
    if connection.is_connected():
        cursor.close()
        connection.close()
        print("MySQL connection is closed.")

Connected to MySQL Server successfully!
Table `olist_order_payments` dropped if it existed.
Table `olist_order_payments` created successfully!
CSV data loaded into pandas DataFrame.
Starting data insertion into `olist_order_payments` in batches of 1000 records.
Inserted records 1 to 1000 successfully.
Inserted records 1001 to 2000 successfully.
Inserted records 2001 to 3000 successfully.
Inserted records 3001 to 4000 successfully.
Inserted records 4001 to 5000 successfully.
Inserted records 5001 to 6000 successfully.
Inserted records 6001 to 7000 successfully.
Inserted records 7001 to 8000 successfully.
Inserted records 8001 to 9000 successfully.
Inserted records 9001 to 10000 successfully.
Inserted records 10001 to 11000 successfully.
Inserted records 11001 to 12000 successfully.
Inserted records 12001 to 13000 successfully.
Inserted records 13001 to 14000 successfully.
Inserted records 14001 to 15000 successfully.
Inserted records 15001 to 16000 successfully.
Inserted records 16001 t

In [None]:
!pip install pymongo

Collecting pymongo
  Downloading pymongo-4.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (22 kB)
Collecting dnspython<3.0.0,>=1.16.0 (from pymongo)
  Downloading dnspython-2.7.0-py3-none-any.whl.metadata (5.8 kB)
Downloading pymongo-4.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m38.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dnspython-2.7.0-py3-none-any.whl (313 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m313.6/313.6 kB[0m [31m21.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dnspython, pymongo
Successfully installed dnspython-2.7.0 pymongo-4.13.0


In [None]:
# importing module
from pymongo import MongoClient

hostname = "kpb69.h.filess.io"
database = "olistDataNoSQL_readyplain"
port = "27018"
username = "olistDataNoSQL_readyplain"
password = "fb90d98a69dea27adc3be65ab015df5f64a080c7"

uri = "mongodb://" + username + ":" + password + "@" + hostname + ":" + port + "/" + database

# Connect with the portnumber and host
client = MongoClient(uri)

# Access database
mydatabase = client[database]


In [None]:
# prompt: read the product_category csv and create a collection and upload it to above mongoDB

import pandas as pd
from pymongo import MongoClient

# Load the product_category CSV file into a pandas DataFrame
try:
  product_category_df = pd.read_csv("product_category_name_translation.csv")
except FileNotFoundError:
  print("Error: 'product_category_name_translation.csv' not found.")
  exit() # Exit the script if the file is not found


# MongoDB connection details (assuming these are already defined in your script)
hostname = "kpb69.h.filess.io"
database = "olistDataNoSQL_readyplain"
port = "27018"
username = "olistDataNoSQL_readyplain"
password = "fb90d98a69dea27adc3be65ab015df5f64a080c7"

uri = "mongodb://" + username + ":" + password + "@" + hostname + ":" + port + "/" + database

try:
    # Establish a connection to MongoDB
    client = MongoClient(uri)
    db = client[database]

    # Select the collection (or create if it doesn't exist)
    collection = db["product_categories"]  # Choose a suitable name for your collection

    # Convert the DataFrame to a list of dictionaries for insertion into MongoDB
    data_to_insert = product_category_df.to_dict(orient="records")

    # Insert the data into the collection
    collection.insert_many(data_to_insert)

    print("Data uploaded to MongoDB successfully!")

except Exception as e:
    print(f"An error occurred: {e}")

finally:
    # Close the MongoDB connection
    if client:
        client.close()

Data uploaded to MongoDB successfully!
