In [4]:
import pandas as pd
from datetime import datetime

In [5]:
# Load the five source tables. Customers.csv is decoded with 'unicode_escape';
# the remaining files are read with the pandas defaults.
customers = pd.read_csv('Customers.csv', encoding='unicode_escape')
products, sales, stores, exchange_rates = map(
    pd.read_csv,
    ('Products.csv', 'Sales.csv', 'Stores.csv', 'Exchange_Rates.csv'),
)

In [4]:
# Inspect column dtypes and non-null counts of the customers dataframe
customers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15266 entries, 0 to 15265
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   CustomerKey  15266 non-null  int64 
 1   Gender       15266 non-null  object
 2   Name         15266 non-null  object
 3   City         15266 non-null  object
 4   State Code   15256 non-null  object
 5   State        15266 non-null  object
 6   Zip Code     15266 non-null  object
 7   Country      15266 non-null  object
 8   Continent    15266 non-null  object
 9   Birthday     15266 non-null  object
dtypes: int64(1), object(9)
memory usage: 1.2+ MB


In [5]:
# Display the customers whose 'State Code' was read as a missing value
customers.loc[customers['State Code'].isna()]

Unnamed: 0,CustomerKey,Gender,Name,City,State Code,State,Zip Code,Country,Continent,Birthday
5304,729681,Female,Rossana Padovesi,Polvica,,Napoli,80035,Italy,Europe,4/18/1981
5316,732289,Male,Indro Piccio,Varcaturo,,Napoli,80014,Italy,Europe,2/24/1949
5372,742042,Male,Amaranto Loggia,Casaferro,,Napoli,80034,Italy,Europe,3/14/1936
5377,742886,Female,Edmonda Capon,Terzigno,,Napoli,80040,Italy,Europe,8/6/1963
5378,743343,Female,Ambra Sagese,Pomigliano D'Arco,,Napoli,80038,Italy,Europe,1/5/1961
5485,759705,Male,Callisto Lo Duca,Casilli,,Napoli,80047,Italy,Europe,8/28/1976
5525,765589,Male,Michelino Lucchesi,Pompei Scavi,,Napoli,80045,Italy,Europe,11/13/1947
5531,766410,Male,Adelmio Beneventi,Licola,,Napoli,80078,Italy,Europe,1/13/1940
5631,781667,Female,Ilda Manna,Napoli,,Napoli,80134,Italy,Europe,5/8/1977
5695,789177,Male,Calogero Folliero,Mariglianella,,Napoli,80030,Italy,Europe,3/3/2000


In [6]:
# The state code of Napoli is 'NA', which pandas treats as a null value when
# reading the CSV, so those rows show up with a missing 'State Code'.
# Store 'NL' for them instead. The replacement is limited to the Napoli rows so
# that a state code missing for any other reason is not silently relabelled.
napoli_missing_code = customers['State Code'].isna() & customers['State'].eq('Napoli')
customers.loc[napoli_missing_code, 'State Code'] = 'NL'

In [7]:
# Re-check the non-null counts of the customers dataframe after filling
# the missing 'State Code' values
customers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15266 entries, 0 to 15265
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   CustomerKey  15266 non-null  int64 
 1   Gender       15266 non-null  object
 2   Name         15266 non-null  object
 3   City         15266 non-null  object
 4   State Code   15266 non-null  object
 5   State        15266 non-null  object
 6   Zip Code     15266 non-null  object
 7   Country      15266 non-null  object
 8   Continent    15266 non-null  object
 9   Birthday     15266 non-null  object
dtypes: int64(1), object(9)
memory usage: 1.2+ MB


In [8]:
# Check the products dataframe for null values and inspect the column dtypes
products.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2517 entries, 0 to 2516
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   ProductKey      2517 non-null   int64 
 1   Product Name    2517 non-null   object
 2   Brand           2517 non-null   object
 3   Color           2517 non-null   object
 4   Unit Cost USD   2517 non-null   object
 5   Unit Price USD  2517 non-null   object
 6   SubcategoryKey  2517 non-null   int64 
 7   Subcategory     2517 non-null   object
 8   CategoryKey     2517 non-null   int64 
 9   Category        2517 non-null   object
dtypes: int64(3), object(7)
memory usage: 196.8+ KB


In [9]:
# Check the sales dataframe for null values.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() only appends a stray "None" to the output.
sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62884 entries, 0 to 62883
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Order Number   62884 non-null  int64 
 1   Line Item      62884 non-null  int64 
 2   Order Date     62884 non-null  object
 3   Delivery Date  13165 non-null  object
 4   CustomerKey    62884 non-null  int64 
 5   StoreKey       62884 non-null  int64 
 6   ProductKey     62884 non-null  int64 
 7   Quantity       62884 non-null  int64 
 8   Currency Code  62884 non-null  object
dtypes: int64(6), object(3)
memory usage: 4.3+ MB
None


In [10]:
# Number of sales rows without a delivery date
print(sales['Delivery Date'].isna().sum())



49719


In [11]:
# 49,719 of the 62,884 'Delivery Date' values (about 79%) are null, so the
# column is dropped.
# The result is rebound instead of using inplace=True, and errors='ignore'
# keeps the cell safe to re-run after the column is already gone.
sales = sales.drop(columns='Delivery Date', errors='ignore')

In [12]:
# Re-check the sales dataframe for null values after dropping 'Delivery Date'
sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62884 entries, 0 to 62883
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Order Number   62884 non-null  int64 
 1   Line Item      62884 non-null  int64 
 2   Order Date     62884 non-null  object
 3   CustomerKey    62884 non-null  int64 
 4   StoreKey       62884 non-null  int64 
 5   ProductKey     62884 non-null  int64 
 6   Quantity       62884 non-null  int64 
 7   Currency Code  62884 non-null  object
dtypes: int64(6), object(2)
memory usage: 3.8+ MB


In [13]:
# Inspect column dtypes and non-null counts of the stores dataframe
stores.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67 entries, 0 to 66
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   StoreKey       67 non-null     int64  
 1   Country        67 non-null     object 
 2   State          67 non-null     object 
 3   Square Meters  66 non-null     float64
 4   Open Date      67 non-null     object 
dtypes: float64(1), int64(1), object(3)
memory usage: 2.7+ KB


In [None]:
# The single null value in 'Square Meters' of the stores dataframe belongs to the online store.

### Changing the date format in the customers, sales, stores and exchange_rates dataframes

In [14]:
# Convert 'Birthday' in the customers dataframe from mm/dd/yyyy text to
# YYYY-MM-DD text. A single parse followed by a single strftime is enough:
# the intermediate mm-dd-yyyy string form was immediately parsed again and
# contributed nothing to the final value.
customers['Birthday'] = pd.to_datetime(customers['Birthday']).dt.strftime('%Y-%m-%d')

In [16]:
# Spot-check five randomly sampled values of the reformatted 'Birthday' column
# in the customers dataframe
customers['Birthday'].sample(5)

2869     1984-11-04
14084    1996-08-15
11057    1969-05-02
6069     1949-06-03
9519     1961-05-12
Name: Birthday, dtype: object

In [17]:
# Convert 'Order Date' in the sales dataframe from mm/dd/yyyy text to
# YYYY-MM-DD text. A single parse followed by a single strftime is enough:
# the intermediate mm-dd-yyyy string form was immediately parsed again and
# contributed nothing to the final value.
sales['Order Date'] = pd.to_datetime(sales['Order Date']).dt.strftime('%Y-%m-%d')

# Spot-check ten randomly sampled values
sales['Order Date'].sample(10)

43854    2019-10-02
42277    2019-09-07
11994    2017-09-30
30467    2019-01-18
8950     2017-05-06
38320    2019-07-01
35504    2019-05-16
27300    2018-12-15
14617    2017-12-28
7468     2017-01-21
Name: Order Date, dtype: object

In [18]:
# Convert 'Open Date' in the stores dataframe from mm/dd/yyyy text to
# YYYY-MM-DD text. A single parse followed by a single strftime is enough:
# the intermediate mm-dd-yyyy string form was immediately parsed again and
# contributed nothing to the final value.
stores['Open Date'] = pd.to_datetime(stores['Open Date']).dt.strftime('%Y-%m-%d')

# Spot-check ten randomly sampled values
stores['Open Date'].sample(10)

19    2012-12-15
45    2012-08-08
61    2018-06-03
42    2015-01-01
60    2012-12-15
2     2012-01-07
6     2007-05-07
22    2010-01-01
46    2015-04-04
52    2012-06-06
Name: Open Date, dtype: object

In [6]:
# Convert 'Date' in the exchange_rates dataframe from mm/dd/yyyy text to
# YYYY-MM-DD text. A single parse followed by a single strftime is enough:
# the intermediate mm-dd-yyyy string form was immediately parsed again and
# contributed nothing to the final value.
exchange_rates['Date'] = pd.to_datetime(exchange_rates['Date']).dt.strftime('%Y-%m-%d')

# Show the first rows of the reformatted column
exchange_rates['Date'].head()

0    2015-01-01
1    2015-01-01
2    2015-01-01
3    2015-01-01
4    2015-01-01
Name: Date, dtype: object

Removing the $ symbol and the ',' separator from the 'Unit Cost USD' and 'Unit Price USD' columns of the products dataframe

In [20]:
# Strip the literal '$' character from both USD columns of the products
# dataframe; regex=False makes '$' match as plain text rather than as a
# regular-expression anchor.
products[['Unit Cost USD', 'Unit Price USD']] = products[
    ['Unit Cost USD', 'Unit Price USD']
].apply(lambda column: column.str.replace('$', '', regex=False))

In [31]:
# Drop the ',' characters from both USD columns and cast the cleaned text
# to float so the columns become numeric.
products = products.assign(**{
    'Unit Cost USD': lambda frame: frame['Unit Cost USD'].str.replace(',', '').astype(float),
    'Unit Price USD': lambda frame: frame['Unit Price USD'].str.replace(',', '').astype(float),
})

Storing the 5 dataframes in 5 MySQL tables

In [9]:
import getpass
import os

import mysql.connector

# Connection settings are read from the environment so that no credential is
# stored in the notebook. Host and user fall back to the previous values
# ("localhost" / "root"); the password comes from MYSQL_PASSWORD or, when that
# variable is unset or empty, from an interactive prompt.
con = mysql.connector.connect(
    host=os.environ.get("MYSQL_HOST", "localhost"),
    user=os.environ.get("MYSQL_USER", "root"),
    password=os.environ.get("MYSQL_PASSWORD") or getpass.getpass("MySQL password: "),
)
cursor = con.cursor()

# Create the project database if it is missing, then select it so that the
# following statements on this cursor run against it.
query = "create database IF NOT EXISTS DATASPARK_PROJECT"
cursor.execute(query)
query = "use DATASPARK_PROJECT"
cursor.execute(query)

In [26]:
# Create the Customers table; IF NOT EXISTS makes the statement a no-op when
# the table is already present.
# The dataframe columns 'State' and 'Country' are stored here under the names
# State_Customer and Country_Customer, and Birthday is a DATE column.
query = """
CREATE TABLE IF NOT EXISTS Customers (
    CustomerKey INT,
    Gender VARCHAR(10),
    Name VARCHAR(50),
    City VARCHAR(50),
    State_Code VARCHAR(50),
    State_Customer VARCHAR(50),
    Zip_Code VARCHAR(10),
    Country_Customer VARCHAR(50),
    Continent VARCHAR(50),
    Birthday DATE
);
"""
cursor.execute(query)
con.commit()

In [33]:
# Insert every row of the customers dataframe into the Customers table.
# The table name is spelled exactly as in the CREATE TABLE statement
# ("Customers"): MySQL table names are case-sensitive on servers running with
# lower_case_table_names=0, where the lowercase "customers" does not resolve
# and every row would fail with a "table doesn't exist" error.
query = """
INSERT INTO Customers (
    CustomerKey, Gender, Name, City, State_Code, State_Customer,
    Zip_Code, Country_Customer, Continent, Birthday
) VALUES (
    %s, %s, %s, %s, %s, %s, %s, %s, %s, %s
)
"""
for index, row in customers.iterrows():
    # Values in the same order as the column list of the INSERT statement
    data = (
        row['CustomerKey'],
        row['Gender'],
        row['Name'],
        row['City'],
        row['State Code'],
        row['State'],
        row['Zip Code'],
        row['Country'],
        row['Continent'],
        row['Birthday']
    )
    try:
        cursor.execute(query, data)
    except mysql.connector.Error as err:
        # Best-effort load: report the failing row and carry on with the next one
        print(f"Error inserting row {index}: {err}")

# Persist all successfully executed inserts
con.commit()

In [27]:
# Create the Products table; IF NOT EXISTS makes the statement a no-op when
# the table is already present.
# The two USD columns are DECIMAL(10, 2) and receive the float values produced
# by the price clean-up cells.
query = """
CREATE TABLE IF NOT EXISTS Products (
    ProductKey INT,
    Product_Name VARCHAR(100),
    Brand VARCHAR(50),
    Color VARCHAR(30),
    Unit_Cost_USD DECIMAL(10, 2),
    Unit_Price_USD DECIMAL(10, 2),
    SubcategoryKey INT,
    Subcategory VARCHAR(50),
    CategoryKey INT,
    Category VARCHAR(50)
)
"""

cursor.execute(query)
con.commit()

In [34]:
# Parameterized INSERT statement for the Products table
insert_query = """
INSERT INTO Products (
    ProductKey, Product_Name, Brand, Color, Unit_Cost_USD, Unit_Price_USD,
    SubcategoryKey, Subcategory, CategoryKey, Category
) VALUES (
    %s, %s, %s, %s, %s, %s, %s, %s, %s, %s
)
"""

# Load the products dataframe row by row; the dataframe columns are listed in
# the same order as the placeholders of the statement above.
for index, row in products.iterrows():
    data = tuple(
        row[column]
        for column in (
            'ProductKey', 'Product Name', 'Brand', 'Color',
            'Unit Cost USD', 'Unit Price USD',
            'SubcategoryKey', 'Subcategory', 'CategoryKey', 'Category',
        )
    )
    try:
        cursor.execute(insert_query, data)
    except mysql.connector.Error as err:
        # A failing row is reported and skipped; the load keeps going
        print(f"Error inserting row {index}: {err}")

# Make the inserted rows permanent
con.commit()

In [28]:
# Create the Sales table; IF NOT EXISTS makes the statement a no-op when the
# table is already present.
# There is no delivery-date column: 'Delivery Date' was dropped from the sales
# dataframe before loading.
query = """
CREATE TABLE IF NOT EXISTS Sales (
    Order_Number INT,
    Line_Item INT,
    Order_Date DATE,
    CustomerKey INT,
    StoreKey INT,
    ProductKey INT,
    Quantity INT,
    Currency_Code VARCHAR(3)
    )
"""
cursor.execute(query)
con.commit()

In [35]:
# Parameterized INSERT statement for the Sales table
insert_query = """
INSERT INTO Sales (
    Order_Number, Line_Item, Order_Date, CustomerKey, StoreKey, ProductKey,
    Quantity, Currency_Code
) VALUES (
    %s, %s, %s, %s, %s, %s, %s, %s
)
"""

# Load the sales dataframe row by row; the dataframe columns are listed in
# the same order as the placeholders of the statement above.
for index, row in sales.iterrows():
    data = tuple(
        row[column]
        for column in (
            'Order Number', 'Line Item', 'Order Date', 'CustomerKey',
            'StoreKey', 'ProductKey', 'Quantity', 'Currency Code',
        )
    )
    try:
        cursor.execute(insert_query, data)
    except mysql.connector.Error as err:
        # A failing row is reported and skipped; the load keeps going
        print(f"Error inserting row {index}: {err}")

# Make the inserted rows permanent
con.commit()

In [29]:
# Create the Stores table; IF NOT EXISTS makes the statement a no-op when the
# table is already present.
# The dataframe columns 'Country' and 'State' are stored as Country_Store and
# State_Store.
# NOTE(review): Square_Meters is INT while the dataframe column is float64
# with one null — confirm the values are whole numbers.
query = """
CREATE TABLE IF NOT EXISTS Stores (
    StoreKey INT,
    Country_Store VARCHAR(50),
    State_Store VARCHAR(50),
    Square_Meters INT,
    Open_Date DATE
)
"""
cursor.execute(query)
con.commit()

In [37]:
# Parameterized INSERT statement for the Stores table; the five %s
# placeholders are filled per row when the statement is executed below.
insert_query = """
INSERT INTO Stores (
    StoreKey, Country_Store, State_Store, Square_Meters, Open_Date
) VALUES (
    %s, %s, %s, %s, %s
)
"""
# Works around the MySQL error "Unknown column 'nan' in 'field list'"
def handle_nan(value):
    """Return None for a missing value (as judged by pd.isna), else the value unchanged."""
    return None if pd.isna(value) else value

# Load the stores dataframe row by row. Every value goes through handle_nan
# first so that missing entries are sent to the database as None.
for index, row in stores.iterrows():
    data = tuple(
        handle_nan(row[column])
        for column in ('StoreKey', 'Country', 'State', 'Square Meters', 'Open Date')
    )
    try:
        cursor.execute(insert_query, data)
    except mysql.connector.Error as err:
        # A failing row is reported and skipped; the load keeps going
        print(f"Error inserting row {index}: {err}")

# Make the inserted rows permanent
con.commit()

In [30]:
# Create the Currency_exchange table that receives the exchange_rates
# dataframe; "if not exists" makes the statement a no-op when the table is
# already present.
# NOTE(review): DECIMAL(5, 4) leaves a single digit before the decimal point —
# confirm that every exchange rate in the data is below 10.
query = '''create table if not exists Currency_exchange(
    Date Date, Currency VARCHAR(3), Exchange DECIMAL(5, 4));'''
cursor.execute(query)
con.commit()

In [10]:
# Parameterized INSERT statement for the Currency_exchange table
insert_query = '''
INSERT INTO Currency_exchange (
    Date, Currency, Exchange
) VALUES (
    %s, %s, %s
)
'''
# Load the exchange_rates dataframe row by row; the dataframe columns are
# listed in the same order as the placeholders of the statement above.
for index, row in exchange_rates.iterrows():
    data = tuple(row[column] for column in ('Date', 'Currency', 'Exchange'))
    try:
        cursor.execute(insert_query, data)
    except mysql.connector.Error as err:
        # A failing row is reported and skipped; the load keeps going
        print(f"Error inserting row {index}: {err}")

# Make the inserted rows permanent
con.commit()