In [None]:
#pip install pymysql
!pip show pymysql

In [1]:
import numpy as np
import pandas as pd

## Test the database connection

In [2]:
import os

import pymysql

# Connection settings shared by the cells below. Each value can be overridden
# through an environment variable so credentials do not have to be edited into
# the notebook; the fallbacks are the values this notebook used originally.
host = os.environ.get('MYSQL_HOST', 'db')  # 'db' is the service in docker-compose; use 'localhost' from the host if the port is mapped
user = os.environ.get('MYSQL_USER', 'root')
password = os.environ.get('MYSQL_PASSWORD', 'example')

# Smoke test: open a server-level connection (no database selected), report
# the outcome, and always release the connection again.
connection = None  # sentinel so the cleanup does not have to inspect locals()
try:
    connection = pymysql.connect(host=host, user=user, password=password)
    print("Connection to MySQL DB successful")
except Exception as e:
    # Deliberately best-effort: report the problem instead of aborting the cell.
    print(f"Error: {e}")
finally:
    if connection is not None and connection.open:
        connection.close()

Connection to MySQL DB successful


In [None]:
# Install the package (run this in your Jupyter notebook cell if not already installed)
!pip install palmerpenguins

In [3]:
# Load the Palmer penguins dataset
from palmerpenguins import load_penguins

penguins = load_penguins()

# Put None wherever a value is missing, for SQL compatibility.
# NOTE: as the output below shows, the float columns still display NaN after
# this step; only the text column `sex` ends up holding None.
has_value = penguins.notna()
penguins = penguins.where(has_value, None)

print(penguins.head())

  species     island  bill_length_mm  bill_depth_mm  flipper_length_mm  \
0  Adelie  Torgersen            39.1           18.7              181.0   
1  Adelie  Torgersen            39.5           17.4              186.0   
2  Adelie  Torgersen            40.3           18.0              195.0   
3  Adelie  Torgersen             NaN            NaN                NaN   
4  Adelie  Torgersen            36.7           19.3              193.0   

   body_mass_g     sex  year  
0       3750.0    male  2007  
1       3800.0  female  2007  
2       3250.0  female  2007  
3          NaN    None  2007  
4       3450.0  female  2007  


In [6]:
# Keep only complete rows: drop every row that has a missing value in any column.
# The name `penguins` is reused because the following cells read it.
penguins = penguins.dropna(axis=0, how='any')

In [9]:
# Summary statistics for the numeric columns (describe() default selection)
numeric_summary = penguins.describe()
print(numeric_summary)

# Summary (count / unique / top / freq) for the object-typed columns
categorical_summary = penguins.describe(include=['object'])
print(categorical_summary)

       bill_length_mm  bill_depth_mm  flipper_length_mm  body_mass_g  \
count      333.000000     333.000000         333.000000   333.000000   
mean        43.992793      17.164865         200.966967  4207.057057   
std          5.468668       1.969235          14.015765   805.215802   
min         32.100000      13.100000         172.000000  2700.000000   
25%         39.500000      15.600000         190.000000  3550.000000   
50%         44.500000      17.300000         197.000000  4050.000000   
75%         48.600000      18.700000         213.000000  4775.000000   
max         59.600000      21.500000         231.000000  6300.000000   

              year  
count   333.000000  
mean   2008.042042  
std       0.812944  
min    2007.000000  
25%    2007.000000  
50%    2008.000000  
75%    2009.000000  
max    2009.000000  
       species  island   sex
count      333     333   333
unique       3       3     2
top     Adelie  Biscoe  male
freq       146     163   168


## Create database - penguin_db

In [4]:
database = 'penguin_db'  # the database you want to create

# Connect to the MySQL server without selecting a database, since the target
# database may not exist yet.
conn = pymysql.connect(
    host=host,
    user=user,
    password=password
)
try:
    with conn.cursor() as cur:
        # Create the database only if it does not exist, so re-running is safe.
        # An identifier cannot be passed as a bound query parameter, so the
        # name is interpolated; the backticks quote it as an identifier.
        cur.execute(f"CREATE DATABASE IF NOT EXISTS `{database}`;")
    print(f"Database '{database}' is ready.")
finally:
    # Release the connection even when the statement above fails.
    conn.close()

Database 'penguin_db' is ready.


## Create table - penguins

In [None]:

# Create the `penguins` table (if needed) and load the DataFrame into it.
#
# NOTE: the table has no primary key and rows are always appended, so running
# this cell again inserts the whole dataset a second time.

# Create table if not exists
create_table_query = """
CREATE TABLE IF NOT EXISTS penguins (
    species VARCHAR(20),
    island VARCHAR(20),
    bill_length_mm FLOAT,
    bill_depth_mm FLOAT,
    flipper_length_mm FLOAT,
    body_mass_g FLOAT,
    sex VARCHAR(10),
    year INT
)
"""

insert_query = """
INSERT INTO penguins (species, island, bill_length_mm, bill_depth_mm, flipper_length_mm, body_mass_g, sex, year)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
"""

# One plain tuple per DataFrame row, in column order (index excluded), which
# matches the column list of the INSERT statement.
rows = list(penguins.itertuples(index=False, name=None))

# Connect to MySQL, this time selecting the target database
conn = pymysql.connect(
    host=host,
    user=user,
    password=password,
    database=database
)
try:
    with conn.cursor() as cur:
        cur.execute(create_table_query)
        # Send all rows through a single executemany call instead of issuing
        # one execute per row.
        cur.executemany(insert_query, rows)
    # Commit only after every insert succeeded.
    conn.commit()
finally:
    # Always release the connection, even if a statement above raised.
    conn.close()

print("Penguins dataset loaded into MySQL using pymysql!")

## Fetch data from table penguins

In [8]:
# Verify that the `penguins` table exists and preview its first rows.

# Counts matching entries in the server's table catalogue; the schema and
# table names are passed as bound parameters.
table_check_query = """
    SELECT COUNT(*)
    FROM information_schema.tables
    WHERE table_schema = %s
      AND table_name = %s
"""

# Connect to MySQL
conn = pymysql.connect(
    host=host,
    user=user,
    password=password,
    database=database
)
try:
    with conn.cursor() as cur:
        cur.execute(table_check_query, (database, 'penguins'))
        table_exists = cur.fetchone()[0]  # COUNT(*) is 0 when the table is missing

        if table_exists:
            print("Table 'penguins' exists. Showing first 5 rows:")
            cur.execute("SELECT * FROM penguins LIMIT 5")
            for row in cur.fetchall():
                print(row)
        else:
            # Report the database that was actually queried (the previous
            # message hard-coded a name that did not match `database`).
            print(f"Table 'penguins' does not exist in database '{database}'.")
finally:
    # Release the connection even if a query above raised.
    conn.close()

Table 'penguins' exists. Showing first 5 rows:
('Adelie', 'Torgersen', 39.1, 18.7, 181.0, 3750.0, 'male', 2007)
('Adelie', 'Torgersen', 39.5, 17.4, 186.0, 3800.0, 'female', 2007)
('Adelie', 'Torgersen', 40.3, 18.0, 195.0, 3250.0, 'female', 2007)
('Adelie', 'Torgersen', 36.7, 19.3, 193.0, 3450.0, 'female', 2007)
('Adelie', 'Torgersen', 39.3, 20.6, 190.0, 3650.0, 'male', 2007)


## Pre-processing


In [10]:
pip show pymysql

Name: PyMySQL
Version: 1.1.1
Summary: Pure Python MySQL Driver
Home-page: 
Author: 
Author-email: Inada Naoki <songofacandy@gmail.com>, Yutaka Matsubara <yutaka.matsubara@gmail.com>
License: MIT License
Location: /opt/conda/lib/python3.11/site-packages
Requires: 
Required-by: 
Note: you may need to restart the kernel to use updated packages.
