<a href="https://colab.research.google.com/github/DrAdamDev/ETL-pipeline-for-UK-Employment-data/blob/main/UK_Defence_Industry_Analysis_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install mysql-connector-python

In [69]:
import requests
import pandas as pd
import sqlite3
import re
import os
import zipfile
import mysql.connector
import logging
import socket
import matplotlib.pyplot as plt

In [106]:
def load_excel_data(url, timeout=60):
    """Download an Excel workbook and load every sheet into a DataFrame.

    Args:
        url: Address of the workbook to download.
        timeout: Seconds to wait for the server to respond before giving up,
            so a stalled connection cannot hang the notebook indefinitely.

    Returns:
        A dict mapping each sheet name to its DataFrame
        (``sheet_name=None`` makes pandas load all sheets).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    # Local import keeps this function self-contained; the notebook's import
    # cell does not bring in ``io``.
    import io

    # Fetch the workbook exactly once (the download used to be issued twice).
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # Wrap the payload in a file-like object: passing raw bytes to
    # pd.read_excel is deprecated.
    return pd.read_excel(io.BytesIO(response.content), sheet_name=None)

def display_data(data_dict):
    """Print a preview of every sheet held in a workbook dictionary.

    Args:
        data_dict: Mapping of sheet name to DataFrame.

    For each entry the sheet name is printed, followed by up to the first
    25 rows of its DataFrame.
    """
    preview_rows = 25
    for title, frame in data_dict.items():
        print(f"Sheet Name: {title}")
        preview = frame.head(preview_rows)
        print(preview)

def unzip_files(url):
    """Download a ZIP archive and unpack it in the working directory.

    The archive is stored as ``archive.zip`` and its contents are extracted
    into the ``extracted_files`` folder, both relative to the current
    working directory.

    Args:
        url: Address of the ZIP archive.

    Returns:
        Absolute path of the folder holding the extracted files.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    archive_name = 'archive.zip'
    target_dir = 'extracted_files'

    reply = requests.get(url)
    reply.raise_for_status()

    # Persist the download so zipfile can open it from disk.
    with open(archive_name, 'wb') as archive_file:
        archive_file.write(reply.content)

    with zipfile.ZipFile(archive_name, 'r') as archive:
        archive.extractall(target_dir)

    return os.path.abspath(target_dir)

def get_dict_name(file_name):
    """Derive a dictionary name from a table file name.

    The file name must contain ``Table <number><letters> <description><year>``,
    e.g. ``Table 5.1a   Weekly pay - Gross 2022``. Dots are removed from the
    table number and the description is stripped and lower-cased.

    Args:
        file_name: Name of the file to parse.

    Returns:
        ``"<description>_data_dict_<table number>"``, or ``None`` when the
        file name does not follow the expected pattern.
    """
    found = re.search(r"Table\s([\d\.]+[a-z]+)\s+(.*)\d{4}", file_name)
    if found is None:
        return None

    raw_number, raw_description = found.groups()
    table_number = raw_number.replace(".", "")
    description = raw_description.strip().lower()
    return f"{description}_data_dict_{table_number}"

def load_multiple_excel_data(url):
    """Download a ZIP of Excel workbooks and load each one.

    Args:
        url: Address of the ZIP archive containing the workbooks.

    Returns:
        A dict mapping a name derived from each file name (see
        ``get_dict_name``) to that workbook's ``{sheet name: DataFrame}``
        dict. Files whose name cannot be parsed, or whose derived name is
        already taken, are keyed by their file name without extension, so
        no workbook silently overwrites another.
    """
    excel_suffixes = ('.xls', '.xlsx', '.xlsm', '.xlsb', '.ods')

    # Unzip the files and get the path to the folder
    folder_path = unzip_files(url)

    data_frames = {}
    # Sort for a deterministic load order; os.listdir order is arbitrary.
    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)

        # Skip sub-directories and anything that is not a workbook; passing
        # those to pd.read_excel would abort the whole load.
        if not os.path.isfile(file_path):
            continue
        if not filename.lower().endswith(excel_suffixes):
            continue

        dict_name = get_dict_name(filename)
        if dict_name is None or dict_name in data_frames:
            # Previously every unparseable file shared the key ``None``.
            dict_name = os.path.splitext(filename)[0]

        data_frames[dict_name] = pd.read_excel(file_path, sheet_name=None)

    return data_frames

def create_table(cursor, table_name, columns, primary_key=None, foreign_key=None, indexes=None):
    """Create a table, and optionally its indexes, if they do not exist yet.

    Args:
        cursor: Database cursor used to execute the statements.
        table_name: Name of the table to create.
        columns: Comma-separated column definitions, inserted verbatim.
        primary_key: Column(s) for the PRIMARY KEY clause, or None.
        foreign_key: Iterable of ``[column, referenced_table]`` pairs; the
            referenced column is assumed to share the local column's name.
        indexes: Iterable of column names to index, or None.

    The CREATE TABLE statement is printed after it has been executed.
    """
    clauses = [columns]

    if primary_key:
        clauses.append(f"PRIMARY KEY ({primary_key})")

    for column, referenced_table in (foreign_key or ()):
        clauses.append(f"FOREIGN KEY ({column}) REFERENCES {referenced_table} ({column})")

    statement = f"CREATE TABLE IF NOT EXISTS {table_name} ({', '.join(clauses)})"
    cursor.execute(statement)

    print(statement)

    for indexed_column in (indexes or ()):
        cursor.execute(
            f"CREATE INDEX IF NOT EXISTS idx_{table_name}_{indexed_column} ON {table_name} ({indexed_column})"
        )

def create_database():
    """Create the regional employment schema.

    Uses the module-level ``cursor`` and delegates to ``create_table``.
    Tables are created in dependency order: the ``Region`` and ``BIG``
    lookup tables, the ``Region_BIG`` link table, then one table per
    employment measure (FT, PT, FTPT, All), each keyed by ``Region_BIG_ID``.
    """

    def measure_columns(prefix):
        # All measure tables share one layout; only the column prefix differs.
        return ', '.join([
            'Region_BIG_ID int NOT NULL',
            f'{prefix}_Public float',
            f'{prefix}_Private float',
            f'{prefix}_Pub_Priv float',
        ])

    region_columns = ', '.join(['Region_ID int NOT NULL', 'Region_Name varchar(255) NOT NULL'])
    big_columns = ', '.join(['BIG_ID int NOT NULL', 'BIG_Name varchar(255) NOT NULL'])
    link_columns = ', '.join(['Region_BIG_ID int NOT NULL', 'Region_ID int NOT NULL', 'BIG_ID int NOT NULL'])

    # Lookup tables
    create_table(cursor, 'Region', region_columns, primary_key='Region_ID', indexes=['Region_Name'])
    create_table(cursor, 'BIG', big_columns, primary_key='BIG_ID', indexes=['BIG_Name'])

    # Link table joining regions to broad industry groups
    create_table(
        cursor,
        'Region_BIG',
        link_columns,
        primary_key='Region_BIG_ID',
        foreign_key=[['Region_ID', 'Region'], ['BIG_ID', 'BIG']],
    )

    # Measure tables
    measure_tables = (
        ('FT_employees', 'FT'),
        ('PT_employees', 'PT'),
        ('FTPT_employees', 'FTPT'),
        ('All_employees', 'All'),
    )
    for measure_table, prefix in measure_tables:
        create_table(
            cursor,
            measure_table,
            measure_columns(prefix),
            primary_key='Region_BIG_ID',
            foreign_key=[['Region_BIG_ID', 'Region_BIG']],
        )

In [None]:
# Source datasets, all hosted on ons.gov.uk: regional employment by broad
# industry group, workforce jobs by industry, and earnings by region/industry.
employment_data_url = 'https://www.ons.gov.uk/file?uri=/employmentandlabourmarket/peopleinwork/employmentandemployeetypes/datasets/regionbybroadindustrygroupsicbusinessregisterandemploymentsurveybrestable4/2021provisional/table42021p.xlsx'
jobs_data_url = 'https://www.ons.gov.uk/file?uri=/employmentandlabourmarket/peopleinwork/employmentandemployeetypes/datasets/workforcejobsbyindustryjobs02/current/jobs02mar2023.xls'
earnings_data_url = 'https://www.ons.gov.uk/file?uri=/employmentandlabourmarket/peopleinwork/earningsandworkinghours/datasets/regionbyindustry2digitsicashetable5/2022provisional/ashetable52022provisional.zip'

# Download the employment workbook (one DataFrame per sheet) and preview it.
employment_data_dict = load_excel_data(employment_data_url)
display_data(employment_data_dict)

# The jobs workbook is not loaded yet; uncomment to load and preview it.
# jobs_data_dict = load_excel_data(jobs_data_url)
# display_data(jobs_data_dict)

# The earnings archive is not loaded yet; uncomment to load and preview it.
# earnings_data_dict = load_multiple_excel_data(earnings_data_url)
# for data_dict in earnings_data_dict.values():
#     display_data(data_dict)

In [108]:
# Clean the per-region employment sheets and combine them into one table.
#
# Inputs:  employment_data_dict  ({sheet name: DataFrame}, one sheet per region)
# Outputs: regional_data         (one row per region / broad industry group,
#                                 with Region_ID, BIG_ID and Region_BIG_ID keys)

# Delete metadata sheets (info & contents); pop() with a default keeps the
# cell safe to re-run once they are gone.
for meta_sheet in ('Information', 'Contents'):
    employment_data_dict.pop(meta_sheet, None)

# Assign new column names
new_column_names = {
    'Table 4 - Regional level employment (thousands) by BIG (public/private sector split)': 'BIG_Name',
    'Unnamed: 1': 'FT_Public',
    'Unnamed: 2': 'FT_Private',
    'Unnamed: 3': 'FT_Pub_Priv',
    'Unnamed: 4': 'PT_Public',
    'Unnamed: 5': 'PT_Private',
    'Unnamed: 6': 'PT_Pub_Priv',
    'Unnamed: 7': 'FTPT_Public',
    'Unnamed: 8': 'FTPT_Private',
    'Unnamed: 9': 'FTPT_Pub_Priv',
    'Unnamed: 10': 'All_Public',
    'Unnamed: 11': 'All_Private',
    'Unnamed: 12': 'All_Pub_Priv',
}

# Assign new column dtypes
new_column_dtypes = {
    'BIG_Name': str,
    'FT_Public': float,
    'FT_Private': float,
    'FT_Pub_Priv': float,
    'PT_Public': float,
    'PT_Private': float,
    'PT_Pub_Priv': float,
    'FTPT_Public': float,
    'FTPT_Private': float,
    'FTPT_Pub_Priv': float,
    'All_Public': float,
    'All_Private': float,
    'All_Pub_Priv': float,
}

# Row labels of the header and footer rows that carry no data.
redundant_rows = [0, 1, 2, 21, 22, 23, 24, 25, 26, 27, 28]

regional_dfs = []

for sheet, raw_region in employment_data_dict.items():

    # Work on a renamed copy so the cached sheets stay untouched; mutating
    # them in place made a second run fail when 'Region_Name' was inserted
    # again.
    region = raw_region.rename(columns=new_column_names)
    region.insert(loc=0, column='Region_Name', value=sheet)

    # Drop redundant columns after updating column names
    leftover_columns = [column for column in region.columns if 'Unnamed' in str(column)]
    region = region.drop(columns=leftover_columns)

    # Drop redundant rows (labels missing from this sheet are ignored), then
    # renumber the remaining rows from zero.
    region = region.drop(index=redundant_rows, errors='ignore')
    region = region.reset_index(drop=True)

    # '-' becomes zero and '*' becomes missing. A single replace() is used
    # instead of the chained assignment region[column][index] = ..., which
    # pandas does not guarantee to write through.
    region = region.replace({'-': 0, '*': float('nan')})

    # Update dtypes of columns. astype() returns a new frame, so the result
    # must be kept; any other non-numeric marker raises here rather than
    # flowing into the database unnoticed.
    region = region.astype(dtype=new_column_dtypes)

    # Prepare list of regional DataFrames for concatenation
    regional_dfs.append(region)

# Concatenate regional DataFrames
regional_data = pd.concat(regional_dfs, ignore_index=True)

# Create Region, BIG, and Region_BIG IDs (numbered in order of first appearance)
regional_data['Region_ID'] = regional_data['Region_Name'].factorize()[0]
regional_data['BIG_ID'] = regional_data['BIG_Name'].factorize()[0]
regional_data['Region_BIG_ID'] = pd.MultiIndex.from_frame(regional_data[['Region_Name', 'BIG_Name']]).factorize()[0]

In [None]:
# Build the SQLite database from regional_data.
#
# The schema (keys and indexes) comes from create_database(); the data is then
# appended into those tables. Writing with if_exists='replace' would let
# pandas drop and recreate every table, discarding that schema.
conn = sqlite3.connect('regional_UK_employment.db')
cursor = conn.cursor()

try:
    # Start from a clean slate so the cell can be re-run: remove tables left
    # by an earlier run, dependent tables first.
    for stale_table in ('All_employees', 'FTPT_employees', 'PT_employees',
                        'FT_employees', 'Region_BIG', 'BIG', 'Region'):
        cursor.execute(f"DROP TABLE IF EXISTS {stale_table};")

    # Create database
    create_database()

    # Lookup tables: reuse the IDs already assigned in regional_data so they
    # agree with the keys stored in Region_BIG. Renumbering after an
    # alphabetical groupby produced different IDs.
    df_Region = regional_data[['Region_ID', 'Region_Name']].drop_duplicates('Region_ID')
    df_BIG = regional_data[['BIG_ID', 'BIG_Name']].drop_duplicates('BIG_ID')
    df_Region_BIG = regional_data[['Region_BIG_ID', 'Region_ID', 'BIG_ID']]

    # Measure tables carry Region_BIG_ID explicitly as their key column.
    df_FT_Employees = regional_data[['Region_BIG_ID', 'FT_Public', 'FT_Private', 'FT_Pub_Priv']]
    df_PT_Employees = regional_data[['Region_BIG_ID', 'PT_Public', 'PT_Private', 'PT_Pub_Priv']]
    df_FTPT_Employees = regional_data[['Region_BIG_ID', 'FTPT_Public', 'FTPT_Private', 'FTPT_Pub_Priv']]
    df_All_Employees = regional_data[['Region_BIG_ID', 'All_Public', 'All_Private', 'All_Pub_Priv']]

    print(df_Region)
    print(df_BIG)
    print(df_Region_BIG)

    # Load data into database. index=False because the key is a real column;
    # to_sql's `index` is a boolean, so index='Region_ID' wrote the key as a
    # column called "index". Table names match create_database() exactly
    # because pandas' existence check is case-sensitive.
    df_Region.to_sql('Region', conn, if_exists='append', index=False)
    df_BIG.to_sql('BIG', conn, if_exists='append', index=False)
    df_Region_BIG.to_sql('Region_BIG', conn, if_exists='append', index=False)
    df_FT_Employees.to_sql('FT_employees', conn, if_exists='append', index=False)
    df_PT_Employees.to_sql('PT_employees', conn, if_exists='append', index=False)
    df_FTPT_Employees.to_sql('FTPT_employees', conn, if_exists='append', index=False)
    df_All_Employees.to_sql('All_employees', conn, if_exists='append', index=False)

    # Query the database: list the tables that now exist. The former
    # "OR type='column'" never matched any sqlite_master row.
    cursor.execute('''
        SELECT * FROM sqlite_master WHERE type='table';
    '''
    )

    for row in cursor:
        print(row)

    # Commit changes
    conn.commit()
finally:
    # Close the connection even when a step above fails.
    conn.close()

In [None]:
# Re-open the SQLite database and print every row of the Region table.
conn = sqlite3.connect('regional_UK_employment.db')
cursor = conn.cursor()

# execute() returns the cursor itself, which yields the result rows.
region_rows = cursor.execute('''
    SELECT * FROM Region;
'''
)

for region_row in region_rows:
    print(region_row)

# Commit and release the connection
conn.commit()
conn.close()