[Reference](https://towardsdev.com/data-engineering-project-retail-store-part-2-loading-the-data-7c15c9c387e4)

In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

class whisky_web_scraping():
    
    def scrape_html(self,base_url, page):
        '''
        Sending a GET request to https://www.thewhiskyexchange.com/ and creating a Beautiful Soup object.
        
        Args:
            base_url(String)              
            page(Int) - Which page to scrape.
            
        Returns
            soup(BeautfulSoup Object)         
        '''
        
        self.base_url = base_url
        self.page = page

        url = base_url + str(page)
        r = requests.get(url)
        soup = BeautifulSoup(r.content, 'lxml')

        return soup
    
    def get_page_content(self, soup):
        '''
        Extract from soup all the html object of type div and of class product-card__content.
        
        Args:
            soup(BeatifulSoup Object)
            
        Returns:
            proudcts_info_content(List of html objects) - List of div objects that contain the name of the beverage, the alcohol amount and percent.
        
        '''
        self.soup = soup
        
        proudcts_info_content = soup.find_all('div', class_ = 'product-card__content')
        return proudcts_info_content
        
    
    def get_page_price(self, soup):
        '''
        Extract from soup all the html object of type div and of class product-card__data.
        
        Args:
            soup(BeatifulSoup Object)
            
        Returns:
            proudcts_info_data(List of html objects) - List of div objects that contain the price of the beverage.
        
        '''
        self.soup = soup

        proudcts_info_price = soup.find_all('div', class_ = 'product-card__data')
        return proudcts_info_price

    
    def get_product_name(self, proudcts_info_content):
        '''
        Extract the name of product.
        
        Args:
            proudcts_info_content(String) - The div object of class product-card__content.
        
        Returns:
            product_name(List) - A list of names.
        '''
        
        self.proudcts_info_content = proudcts_info_content        
    
        product_name = []
        
        # Iterate through each product in the webpage 
        for product in range(len(proudcts_info_content)):
            
            # Extract the first class P - Which holds the name of the beverage
            name_p = proudcts_info_content[product].find_all('p')[0]
            
            # Extract the contents of the first paragraphs - the name of the beverage       
            alcohol_name = name_p.contents[0].strip()
            
            # Append each name to the list
            product_name.append(alcohol_name)
            
        return product_name

    def get_product_alcohol_percent(self, proudcts_info_content):
        '''
        Extract the alcohol percent of product.
        
        Args:
            proudcts_info_content(String) - The div object of class product-card__content.
        
        Returns:
            product_al_percent(List) - A list of alcohol percent.
        '''
        
        self.proudcts_info_content = proudcts_info_content        
    
        product_al_percent = []
        
        # Iterate through each product in the webpage 
        for product in range(len(proudcts_info_content)):
            
            # Extract the second class P - Which holds the alcohol values
            al_p = proudcts_info_content[product].find_all('p')[1]
            
            # Apply string manupulation to extract the alcohol percent
            alcohol_percent_str = al_p.contents[0].strip()
            start_location_percent = alcohol_percent_str.find('/ ') 
            end_location_percent = alcohol_percent_str.find('%')
            alcohol_percent = alcohol_percent_str[start_location_percent + 2:end_location_percent]
            
            # Append each alcohol percent to the list
            product_al_percent.append(alcohol_percent)
            
        return product_al_percent
    
    
    def get_product_alcohol_amount(self, proudcts_info_content):
        '''
        Extract the alcohol amount of product.
        
        Args:
            proudcts_info_content(String) - The div object of class product-card__content.
        
        Returns:
            product_al_percent(List) - A list of alcohol amount.
        '''
        self.proudcts_info_content = proudcts_info_content        

        product_al_amount = []
        
        # Iterate through each product in the webpage 
        for product in range(len(proudcts_info_content)):
            
            # Extract the second class P - Which holds the alcohol values
            al_p = proudcts_info_content[product].find_all('p')[1]
            
            # Apply string manupulation to extract the alcohol amount
            alcohol_percent_str = al_p.contents[0].strip()
            start_location_amount = 0
            end_location_amount = alcohol_percent_str.find('cl')
            alcohol_amount = alcohol_percent_str[start_location_amount:end_location_amount]
            
            # Append each alcohol amount to the list
            product_al_amount.append(alcohol_amount)
            
        return product_al_amount
    
    def get_product_price(self, proudcts_info_price):
        '''
        Extract the price of product.
        
        Args:
            proudcts_info_content(String) - The div object of class product-card__data.
        
        Returns:
            product_price(List) - A list of prices.
        '''
        self.proudcts_info_price = proudcts_info_price
    
        product_price = []
        
        # Iterate through each product in the webpage 
        for product in range(len(proudcts_info_price)):
            
            # Extract the price for each product 
            alcohol_price = proudcts_info_price[product].contents[0].contents[0].replace('£','').strip()
            
            # Append each alcohol price to the list
            product_price.append(alcohol_price)
            
        return product_price
    
    def create_df(self, names, alcohol_amount, alcohol_percent, price):
        '''
        Create a DataFrame that will hold the extracted data.
        
        Args:
            names(List) - A list of of product names.  
            alcohol_amount(List) - A list of of product alcohol amounts.  
            alcohol_percent(List) - A list of of product alcohol percent.  
            price(List) - A list of of product prices.  
        
        Returns:
            original_df(DataFrame)
        '''
        
        
        self.names = names
        self.alcohol_amount = alcohol_amount
        self.alcohol_percent = alcohol_percent
        self.price = price
        
        # Create a DataFrame
        original_df = pd.DataFrame(names, columns=['Product_Name'])
        original_df['Alcohol_Percent'] = alcohol_percent
        original_df['Alcohol_Amount'] = alcohol_amount
        original_df['Alcohol_Price'] = price
        
        return original_df
    
    def insert_to_df(self, original_df, new_df):
        '''
        Insert new data into an existing dataframe.
        
        Args:
            original_df(DataFrame) : DataFrame with data from the first page of a product.
            new_df(DataFrame) : DataFrame with data from other pages. 
            
        Returns:
            original_df(DataFrame) : DataFrame with data from original_df + new_df.
        '''
        
        self.original_df = original_df
        self.new_df = new_df
        
        # Insert new data into the DataFrame of the first page
        original_df = original_df.append(new_df,ignore_index=True, verify_integrity = True)

        return original_df
    
    def get_links(self, url = 'https://www.thewhiskyexchange.com/'):
        '''
        Generate a list of links that showcase whiskey beverages.
        
        Args:
            url(String) - The URL of the main page.
                Default - https://www.thewhiskyexchange.com/
        Returns:
            relevant_links(List) - A list of only the links that showcase a type of whiskey.
        '''
        
        self.url = url
        
        # Generate a BeautifullSoup object called soup
        url = url
        r = requests.get(url)
        soup = BeautifulSoup(r.content, 'lxml')
        
        # Collect all the html objects of type 'a'
        a_tags = soup.find_all('a', class_ ='subnav__link' )
        
        links_list = []
        
        # Collect all the hyper links of the webpage.
        for link in a_tags:
            links_list.append(link.get('href'))

        relevant_links = []
        
        # Iterate through the links and filter only the relevant ones that showcase a type of whiskey.
        for link in links_list:
            if link is not None and '/c/' in link and 'whisky' in link and '?' not in link:
                relevant_links.append(link)

        return relevant_links

    def scrape_whisky(self, url = 'https://www.thewhiskyexchange.com', number_of_pages = 5):
        '''
        1. Combining all of the methods into a single place. 
        2. Extracting a default number of pages from every page that showcases a type of whiskey.
        3. Export each data of each whiskey type to a CSV file. 
        4. Return a single DataFrame with all of the scraped whiskey data.
        
        Args:
            url(String) - The base url to extract data from.
                Default - https://www.thewhiskyexchange.com
                
						number_of_pages(Int) - The total number of pages to scrape.
							  Default = 5
        
        Returns:
            df(DataFrame) - The entire data scraped throughout this project in a single DataFrame.
        '''
        
        self.url = url
        self.number_of_pages = number_of_pages
        df = pd.DataFrame()
        # Creating a scraper object 
        s = whisky_web_scraping()
        
        # Generating the relevant links to scrape data from
        links = s.get_links()
        
        # Iterating throught each link
        for link in links:
            
            try:
                # For each page in each link, generate a DataFrame of whiskey related data
                for page in range(0,number_of_pages):
                    soup = s.scrape_html(base_url = url + link + '?pg='
                                                    ,page = page+1)

                    content_html = s.get_page_content(soup)
                    price_html = s.get_page_price(soup)

                    names = s.get_product_name(content_html)
                    alcohol_amount = s.get_product_alcohol_amount(content_html)
                    alcohol_percent = s.get_product_alcohol_percent(content_html)
                    price = s.get_product_price(price_html)
                    
                    # Create a new DataFrame for the first page of each whiskey type
                    if page == 0:
                        data = s.create_df(names,alcohol_amount, alcohol_percent, price)
                    
                    # Insert to an existing DataFrame new data.
                    data = s.insert_to_df(data, s.create_df(names,alcohol_amount, alcohol_percent, price))                  
            except:
                print('Error with the link: {}'.format(link))
            # Export data for each whiskey type to a seperate CSV file
            finally:
                start_location = link.rfind('/')+1
                end_location = len(link)
                data.to_csv(link[start_location:end_location] + '.csv')
                df = df.append(data, ignore_index = True)
                
        return df

In [2]:
# Create a scraper object
scraper = whisky_web_scraping()

# Scrape Data
product_df = scraper.scrape_whisky(number_of_pages=5)

# Export to CSV
product_df.to_csv('whiskey_data.csv')

Error with the link: /c/423/whisky-books
Error with the link: /c/423/whisky-books


# Imports and Functions


In [3]:
pip install names

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting names
  Downloading names-0.3.0.tar.gz (789 kB)
[K     |████████████████████████████████| 789 kB 5.4 MB/s 
[?25hBuilding wheels for collected packages: names
  Building wheel for names (setup.py) ... [?25l[?25hdone
  Created wheel for names: filename=names-0.3.0-py3-none-any.whl size=803699 sha256=bc9bdbf678215e771b3379414c2463895e21603ffcb25f570e8d4f13e2b1f815
  Stored in directory: /root/.cache/pip/wheels/05/ea/68/92f6b0669e478af9b7c3c524520d03050089e034edcc775c2b
Successfully built names
Installing collected packages: names
Successfully installed names-0.3.0


In [4]:
!pip install faker
!pip install pandasql

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting faker
  Downloading Faker-13.12.0-py3-none-any.whl (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 5.3 MB/s 
Installing collected packages: faker
Successfully installed faker-13.12.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pandasql
  Downloading pandasql-0.7.3.tar.gz (26 kB)
Building wheels for collected packages: pandasql
  Building wheel for pandasql (setup.py) ... [?25l[?25hdone
  Created wheel for pandasql: filename=pandasql-0.7.3-py3-none-any.whl size=26784 sha256=dd629b60c42101230f24075e8d3241b7da009b12e73efe0ad4e51286ac046a9c
  Stored in directory: /root/.cache/pip/wheels/5c/4b/ec/41f4e116c8053c3654e2c2a47c62b4fca34cc67ef7b55deb7f
Successfully built pandasql
Installing collected packages: pandasql
Successfully installed pandasql-0.7.3


In [5]:
import numpy as np
import pandas as pd
import names
from faker import Faker
faker = Faker()
import pandasql as ps
import random
import time
from datetime import datetime

def sql(query):
    return ps.sqldf(query)

# 1. Generating Product Data


In [6]:
# Loading the CSV file from part 1 into a dataframe
product_df = pd.read_csv('whiskey_data.csv',index_col='Unnamed: 0')

# Changing the Alcohol Price to float
product_df['Alcohol_Price'] = product_df.Alcohol_Price.str.replace(',','').astype('float')

In [7]:
# Generate a column of unique product ids
product_id = np.random.default_rng().choice(len(product_df.Product_Name), len(product_df.Product_Name), replace = False)

# Verify that there are as many ids as there are products
assert len(set(product_id)) == len(product_df.Product_Name)

# Verify that the new ids are unique
assert len(pd.Series(product_id).unique()) == len(product_id)

# Insert the new column into the dataframe
product_df['product_id'] = product_id

In [8]:
# Sample Output
product_df.head()

Unnamed: 0,Product_Name,Alcohol_Percent,Alcohol_Amount,Alcohol_Price,product_id
0,Deanston 18 Year Old,46.3,70.0,63.95,927
1,Balvenie 16 Year Old French Oak,47.6,70.0,123.0,2502
2,Lagavulin 16 Year Old,43.0,70.0,74.95,78
3,Longmorn 18 Year Old,48.0,70.0,79.95,1333
4,Lagavulin 2006 Distillers Edition,43.0,70.0,89.95,779


# 2. Generating Employee Data


In [9]:
# Generating 100 Employee Unique id's
employee_id = np.random.default_rng().choice(4000, 100, replace = False)

# Verify that there are as many ids as there are employees
assert len(set(employee_id)) == 100

# Verify that the new ids are unique
assert len(pd.Series(employee_id).unique()) == len(employee_id)

In [10]:
# Generating 100 Employee Data
employee_first_name = []
employee_last_name = []
employee_full_name = []
employee_email = []
employee_city = []
departments = ['Sales', 'Finance', 'Marketing', 'BI']
employee_department = []

# iterate through the employees and generate random data
for i in range(len(employee_id)):
    employee_first_name.append(names.get_first_name())
    employee_last_name.append(names.get_last_name())
    employee_full_name.append(employee_first_name[i] + ' ' + employee_last_name[i])
    employee_email.append(employee_first_name[i] + employee_last_name[i][0].lower() + '@gmail.com')
    employee_city.append(faker.city())
    employee_department.append(np.random.choice(departments, 1)[0])

In [11]:
# Create an employee dataframe
employee_df = pd.DataFrame(employee_id, columns = ['employee_id'])
employee_df['first_name'] = employee_first_name
employee_df['last_name'] = employee_last_name
employee_df['full_name'] = employee_full_name
employee_df['email'] = employee_email
employee_df['city'] = employee_city
employee_df['department'] = employee_department

In [12]:
# Sample Output
employee_df.head()

Unnamed: 0,employee_id,first_name,last_name,full_name,email,city,department
0,2708,Aurelio,Caldwell,Aurelio Caldwell,Aurelioc@gmail.com,Lovemouth,Sales
1,2092,Donna,Ryan,Donna Ryan,Donnar@gmail.com,East Caleb,Sales
2,1231,Helen,Andino,Helen Andino,Helena@gmail.com,New Joshuaview,BI
3,3377,Sherrie,Hernandez,Sherrie Hernandez,Sherrieh@gmail.com,Port Kellyview,Marketing
4,351,Kristen,Arkell,Kristen Arkell,Kristena@gmail.com,Port Jenniferport,BI


# 3. Generating Customer Data


In [13]:
# Generating 1000 Customer Unique id's
customer_id = np.random.default_rng().choice(999999, 1000, replace = False)

# Verify that there are as many ids as there are customers
assert len(set(customer_id)) == 1000

# Verify that the new ids are unique
assert len(pd.Series(customer_id).unique()) == len(customer_id)

In [14]:
# Generating 1000 Customers Data
customer_first_name = []
customer_last_name = []
customer_full_name = []
customer_email = []
customer_last_four_digits = []
customer_country = []
customer_country_code = []
customer_street = []
customer_credit_card_company = []


# iterate through the customers and generate random data
for i in range(len(customer_id)): 
    customer_first_name.append(names.get_first_name())
    customer_last_name.append(names.get_last_name())
    customer_full_name.append(customer_first_name[i] + ' ' + customer_last_name[i])
    customer_email.append(customer_first_name[i] + customer_last_name[i][0].lower() + '@gmail.com')
    customer_last_four_digits.append(np.random.randint(low = 1000, high = 9999, size = 1)[0])
    customer_country.append(faker.country())
    customer_country_code.append(customer_country[i][0:3].upper())
    customer_street.append(faker.street_address())
    customer_credit_card_company.append(faker.credit_card_provider())

In [15]:
# Create a customer dataframe
customer_df = pd.DataFrame(customer_id, columns = ['customer_id'])
customer_df['first_name'] = customer_first_name
customer_df['last_name'] = customer_last_name
customer_df['full_name'] = customer_full_name
customer_df['email'] = customer_email
customer_df['country'] = customer_country
customer_df['country_code'] = customer_country_code
customer_df['street'] = customer_street
customer_df['credit_provider'] = customer_credit_card_company
customer_df['four_digits'] = customer_last_four_digits

In [16]:
# Sample Output
customer_df.head()

Unnamed: 0,customer_id,first_name,last_name,full_name,email,country,country_code,street,credit_provider,four_digits
0,681646,Angela,Evans,Angela Evans,Angelae@gmail.com,South Africa,SOU,4219 Gabrielle Lodge Suite 223,Diners Club / Carte Blanche,4071
1,279225,Jesse,Holdman,Jesse Holdman,Jesseh@gmail.com,Northern Mariana Islands,NOR,88423 Joyce Rapids Apt. 879,Mastercard,6214
2,304461,William,Monroy,William Monroy,Williamm@gmail.com,Congo,CON,7339 Tucker Plain,Diners Club / Carte Blanche,1991
3,1368,Dennis,Cameron,Dennis Cameron,Dennisc@gmail.com,Portugal,POR,5687 Brenda Shore Suite 001,Discover,2635
4,139557,Michael,Parrott,Michael Parrott,Michaelp@gmail.com,Uzbekistan,UZB,80906 Hoffman Park Suite 331,VISA 16 digit,4252


# 3. Generating Payments Data


In [17]:
# Generating random days in the range of 1990 to 2020
date_range = pd.date_range(start = "1990-01-01", end = "2020-12-31", freq="D",)

In [18]:
# Generating Unique payment id's
payment_id = np.random.default_rng().choice(999999, len(date_range), replace = False)

# Verify that there are as many ids as there are dates
assert len(set(payment_id)) == len(date_range)

# Verify that the new ids are unique
assert len(pd.Series(payment_id).unique()) == len(payment_id)

In [19]:
# Generating payments Data
customer_id_payments = []
employee_id_payments = []
product_id_payments = []
dates = []


# iterate through the payments and generate random data
for i in range(len(payment_id)):
    dates.append(datetime.strftime(random.choice(date_range), format='%Y-%m-%d'))
    customer_id_payments.append(random.choice(customer_id))
    employee_id_payments.append(random.choice(employee_id))
    product_id_payments.append(random.choice(product_id))

In [20]:
# Create a payments dataframe
payment_df = pd.DataFrame(payment_id, columns = ['payment_id'])
payment_df['date'] = sorted(dates)
payment_df['customer_id'] = customer_id_payments
payment_df['employee_id'] = employee_id_payments
payment_df['product_id'] = product_id_payments

In [21]:
# Adding the Alcohol_price column to the table
query = '''
select p1.*, p2.Alcohol_Price as price
from payment_df p1
inner join product_df p2
on p1.product_id = p2.product_id
'''

payment_df = sql(query)

In [22]:
# Sample Output
payment_df.head()

Unnamed: 0,payment_id,date,customer_id,employee_id,product_id,price
0,346365,1990-01-01,927736,3980,2805,64.95
1,458894,1990-01-01,373927,1893,3972,62.95
2,540564,1990-01-01,531262,3672,2173,63.95
3,975705,1990-01-02,184571,1835,3990,225.0
4,387441,1990-01-03,855038,2878,3114,33.95


# Normalizing Tables


## 1. Normalizing the Customers Table


In [23]:
# Sample Output
customer_df.head()

Unnamed: 0,customer_id,first_name,last_name,full_name,email,country,country_code,street,credit_provider,four_digits
0,681646,Angela,Evans,Angela Evans,Angelae@gmail.com,South Africa,SOU,4219 Gabrielle Lodge Suite 223,Diners Club / Carte Blanche,4071
1,279225,Jesse,Holdman,Jesse Holdman,Jesseh@gmail.com,Northern Mariana Islands,NOR,88423 Joyce Rapids Apt. 879,Mastercard,6214
2,304461,William,Monroy,William Monroy,Williamm@gmail.com,Congo,CON,7339 Tucker Plain,Diners Club / Carte Blanche,1991
3,1368,Dennis,Cameron,Dennis Cameron,Dennisc@gmail.com,Portugal,POR,5687 Brenda Shore Suite 001,Discover,2635
4,139557,Michael,Parrott,Michael Parrott,Michaelp@gmail.com,Uzbekistan,UZB,80906 Hoffman Park Suite 331,VISA 16 digit,4252


In [24]:
# Creating a new table called countries
unique_countries = customer_df.country.unique()
countries_df = pd.DataFrame(unique_countries, columns = ['Country'])
countries_df['Country_Code'] = countries_df.Country.str[0:3]
countries_df['Country_Code'] = countries_df.Country_Code.str.upper()
countries_df['country_id'] = [*range(0,len(countries_df))]
countries_df.head()

Unnamed: 0,Country,Country_Code,country_id
0,South Africa,SOU,0
1,Northern Mariana Islands,NOR,1
2,Congo,CON,2
3,Portugal,POR,3
4,Uzbekistan,UZB,4


In [25]:
# Extracting the country_id column from customers
query = '''
select countries_df.country_id
from customer_df 
join countries_df
on 
    customer_df.country_code = countries_df.country_code and
    customer_df.country = countries_df.country
'''

country_ids = sql(query)


# Connecting countries to customers by adding the foregin key: country_id
customer_df['country_id'] = country_ids

In [26]:
# Dropping the column country and country_code
customer_df = customer_df.drop(['country','country_code'],axis=1)

In [27]:
# Creating a new table called customer_cc
unique_cc_providers = customer_df.credit_provider.unique()
customer_cc_df = pd.DataFrame(unique_cc_providers, columns = ['credit_provider'])
customer_cc_df['credit_provider_id'] = [*range(0,len(customer_cc_df))]
customer_cc_df.head()

Unnamed: 0,credit_provider,credit_provider_id
0,Diners Club / Carte Blanche,0
1,Mastercard,1
2,Discover,2
3,VISA 16 digit,3
4,JCB 16 digit,4


In [28]:
# Extracting the credit_provider_id column from customers
query = '''
select customer_cc_df.credit_provider_id
from customer_df 
join customer_cc_df
on 
    customer_df.credit_provider = customer_cc_df.credit_provider
'''

credit_provider_id = sql(query)

# Connecting customer_cc to customers by adding the foregin key: credit_provider_id 
customer_df['credit_provider_id'] = credit_provider_id

In [29]:
# Dropping the column credit_provider
customer_df = customer_df.drop(['credit_provider'],axis=1)

In [30]:
# Sample Output
employee_df.head()

Unnamed: 0,employee_id,first_name,last_name,full_name,email,city,department
0,2708,Aurelio,Caldwell,Aurelio Caldwell,Aurelioc@gmail.com,Lovemouth,Sales
1,2092,Donna,Ryan,Donna Ryan,Donnar@gmail.com,East Caleb,Sales
2,1231,Helen,Andino,Helen Andino,Helena@gmail.com,New Joshuaview,BI
3,3377,Sherrie,Hernandez,Sherrie Hernandez,Sherrieh@gmail.com,Port Kellyview,Marketing
4,351,Kristen,Arkell,Kristen Arkell,Kristena@gmail.com,Port Jenniferport,BI


In [31]:
# Extracting the departments from the employees table
departments = pd.Series(employee_df.department.unique()).to_list()

# Generating unique department ids
department_id = [*range(0, len(departments))]

# Creating a table called departments
department_df = pd.DataFrame(department_id, columns=['department_id'])
department_df['department'] = departments

In [32]:
department_df

Unnamed: 0,department_id,department
0,0,Sales
1,1,BI
2,2,Marketing
3,3,Finance


In [33]:
# Extracting the country_id column from customers
query = '''
select department_df.department_id
from employee_df 
join department_df
on 
    employee_df.department = department_df.department
'''

department_ids = sql(query)

# Connecting countries to customers by adding the foregin key: country_id
employee_df['department_id'] = department_ids

In [34]:
# Dropping the column department
employee_df = employee_df.drop('department',axis = 1)

In [35]:
# Sample Output
employee_df.head()

Unnamed: 0,employee_id,first_name,last_name,full_name,email,city,department_id
0,2708,Aurelio,Caldwell,Aurelio Caldwell,Aurelioc@gmail.com,Lovemouth,0
1,2092,Donna,Ryan,Donna Ryan,Donnar@gmail.com,East Caleb,0
2,1231,Helen,Andino,Helen Andino,Helena@gmail.com,New Joshuaview,1
3,3377,Sherrie,Hernandez,Sherrie Hernandez,Sherrieh@gmail.com,Port Kellyview,2
4,351,Kristen,Arkell,Kristen Arkell,Kristena@gmail.com,Port Jenniferport,1


# Step #3– Loading the data into the central RDBMS


In [37]:
!pip install pymysql

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pymysql
  Downloading PyMySQL-1.0.2-py3-none-any.whl (43 kB)
[K     |████████████████████████████████| 43 kB 1.2 MB/s 
[?25hInstalling collected packages: pymysql
Successfully installed pymysql-1.0.2


## 1. Connecting Python to MySQL


In [39]:
# # Connecting Python to MySQL
# import pymysql
# import pandas as pd

# connection = pymysql.connect(host ='localhost',port=int(3306),user='root',passwd='1365')

# # Creating a cursor object
# cursor = connection.cursor()

## 2. Creating a new Schema


In [40]:
# # Create a new schema called whiskey_shop
# cursor.execute('''
# drop schema if exists whiskey_retail_shop;
# ''')

# cursor.execute('''
# create schema whiskey_retail_shop;
# ''')

# # Use the new schema
# cursor.execute('''
# use whiskey_retail_shop;
# ''')

## 3. Generating empty tables


In [41]:
# cursor.execute('''
# DROP TABLE IF EXISTS countries;
# ''')

# cursor.execute('''
# CREATE TABLE countries (
#     Country VARCHAR(100) NOT NULL,
#     Country_Code VARCHAR(100) NOT NULL,
#     country_id INT PRIMARY KEY
#     );
# ''')

In [42]:
# cursor.execute('''
# DROP TABLE IF EXISTS customer_cc;
# ''')

# cursor.execute('''
# CREATE TABLE customer_cc (
#     credit_provider VARCHAR(100) NOT NULL,
#     credit_provider_id INT PRIMARY KEY
#     );
# ''')

In [43]:
# cursor.execute('''
# DROP TABLE IF EXISTS products;
# ''')

# cursor.execute('''
# CREATE TABLE products (
#     Product_Name VARCHAR(100) NOT NULL,
#     Alcohol_Percent FLOAT NOT NULL,
#     Alcohol_Amount FLOAT NOT NULL,
#     Alcohol_Price FLOAT NOT NULL,
#     product_id int NOT NULL PRIMARY KEY
#     );
# ''')

In [44]:
# cursor.execute('''
# DROP TABLE IF EXISTS departments;
# ''')

# cursor.execute('''
# CREATE TABLE departments (
#     department_id INT PRIMARY KEY,
#     department VARCHAR(100) NOT NULL
#     );
# ''')

In [45]:
# cursor.execute('''
# DROP TABLE IF EXISTS customers;
# ''')

# cursor.execute('''
# CREATE TABLE customers (
#     customer_id INT PRIMARY KEY NOT NULL,
#     first_name VARCHAR(100) NOT NULL,
#     last_name VARCHAR(100) NOT NULL,
#     full_name VARCHAR(100) NOT NULL,
#     email VARCHAR(100) NOT NULL,
#     street VARCHAR(100) NOT NULL,
#     four_digits INT NOT NULL,
#     country_id INT NOT NULL,
#     credit_provider_id INT NOT NULL,
    
#     FOREIGN KEY (country_id) REFERENCES countries (country_id),
#     FOREIGN KEY (credit_provider_id) REFERENCES customer_cc (credit_provider_id)
# );
# ''')

In [46]:
# cursor.execute('''
# DROP TABLE IF EXISTS employees;
# ''')

# cursor.execute('''
# CREATE TABLE employees (
#     employee_id INT PRIMARY KEY NOT NULL,
#     first_name VARCHAR(100) NOT NULL,
#     last_name VARCHAR(100) NOT NULL,
#     full_name VARCHAR(100) NOT NULL,
#     email VARCHAR(100) NOT NULL,
#     city VARCHAR(100) NOT NULL,
#     department_id INT NOT NULL,
    
#     FOREIGN KEY (department_id) REFERENCES departments(department_id)
# );
# ''')

In [47]:
# cursor.execute('''
# DROP TABLE IF EXISTS payments;
# ''')

# cursor.execute('''
# CREATE TABLE payments (
#     payment_id INT NOT NULL PRIMARY KEY,
#     date DATE NOT NULL,
#     customer_id INT NOT NULL,
#     employee_id INT NOT NULL,
#     product_id INT NOT NULL,
#     price FLOAT NOT NULL
#     );
# ''')

## 3. Populating the tables


In [48]:
# # Convert the Dataframe into a list of arrays
# records = countries_df.to_records(index=False)

# # Convert the list of arrays into a tuple of tuples
# result = tuple(records)

# for data in range(0,len(result)):
    
#     # Create a new record
#     query = "insert into countries (country, country_code, country_id) values {}".format(result[data])
    
#     # Execute the query
#     cursor.execute(query)
    
    
# # Commit the transaction
# connection.commit()

In [49]:
# # customer_cc
# # Convert the Dataframe into a list of arrays
# records = customer_cc_df.to_records(index=False)

# # Convert the list of arrays into a tuple of tuples
# result = tuple(records)

# for data in range(0,len(result)):
    
#     # Create a new record
#     query = "insert into customer_cc (credit_provider, credit_provider_id) values {}".format(result[data])
    
#     # Execute the query
#     cursor.execute(query)
    
    
# # Commit the transaction
# connection.commit()

In [50]:
# # Products
# # Convert the Dataframe into a list of arrays
# records = product_df.to_records(index=False)

# # Convert the list of arrays into a tuple of tuples
# result = tuple(records)

# for data in range(0,len(result)):
    
#     # Create a new record
#     query = "insert into products (Product_Name, Alcohol_Percent, Alcohol_Amount, Alcohol_Price,product_id) values {}".format(result[data])
    
#     # Execute the query
#     cursor.execute(query)
    
    
# # Commit the transaction
# connection.commit()

In [51]:
# # Departments
# # Convert the Dataframe into a list of arrays
# records = department_df.to_records(index=False)

# # Convert the list of arrays into a tuple of tuples
# result = tuple(records)

# for data in range(0,len(result)):
    
#     # Create a new record
#     query = "insert into departments (department_id, department) values {}".format(result[data])
    
#     # Execute the query
#     cursor.execute(query)
    
    
# # Commit the transaction
# connection.commit()

In [52]:
# # Customers
# # Convert the Dataframe into a list of arrays
# records = customer_df.to_records(index=False)

# # Convert the list of arrays into a tuple of tuples
# result = tuple(records)

# for data in range(0,len(result)):
    
#     # Create a new record
#     query = "insert into customers (customer_id, first_name, last_name, full_name, email, street, four_digits, country_id, credit_provider_id) values {}".format(result[data])
    
#     # Execute the query
#     cursor.execute(query)
    
    
# # Commit the transaction
# connection.commit()

In [53]:
# # Employees
# # Convert the Dataframe into a list of arrays
# records = employee_df.to_records(index=False)

# # Convert the list of arrays into a tuple of tuples
# result = tuple(records)

# for data in range(0,len(result)):
    
#     # Create a new record
#     query = "insert into employees (employee_id, first_name, last_name, full_name,email,city, department_id) values {}".format(result[data])
    
#     # Execute the query
#     cursor.execute(query)
    
    
# # Commit the transaction
# connection.commit()

In [54]:
# # Payments
# # Convert the Dataframe into a list of arrays
# records = payment_df.to_records(index=False)

# # Convert the list of arrays into a tuple of tuples
# result = tuple(records)

# for data in range(0,len(result)):
    
#     # Create a new record
#     query = "insert into payments (payment_id, date,customer_id,employee_id,product_id,price) values {}".format(result[data])
    
#     # Execute the query
#     cursor.execute(query)
    
    
# # Commit the transaction
# connection.commit()