In [None]:
# NOTES FOR MODULE 7

In [None]:
# 7.2.2 Create Tables in SQL

# Use our final ERD as a guide

# create six tables, one for each CSV file

# These table creation statements will be our first introduction to Structured Query Language. 

# A statement is a block of code that, when executed, sends a command to the database.

#  Before we create database, review Diagrams.

In [None]:
# Review Diagrams

# recreating the same tables in the diagram in our SQL database.

# With the help of the diagram, we know the structure of this table: two columns with their data types. 
# Also, the table is already named. All we need to do is transfer over the same information.

# Start by using...

In [None]:
# The Query Tool is pgAdmin's text editor, much like VSCode is for Python.

# so lets create a table

# -- Creating tables for PH-EmployeeDB
# CREATE TABLE departments (
#      dept_no VARCHAR(4) NOT NULL,
#     dept_name VARCHAR(40) NOT NULL,
#     PRIMARY KEY (dept_no),
#     UNIQUE (dept_name)
#);

In [None]:
# CREATE TABLE is the syntax required to create a new table in SQL.

# departments is the name of the table and how it will be referenced in queries.

# So the table has been named, now the structure needs to be created. The content inside the parentheses is how we'll 
# do that.

# dept_no VARCHAR(4) NOT NULL, creates a column named "dept_no" that can hold up to four varying characters, while 
# NOT NULL tells SQL that no null fields will be allowed when importing data.

# There are times when we don't want a data field to be null. For example, the dept_no column is our primary key—each 
# row has a unique number associated with it. If we didn't have the NOT NULL constraint, then there's a chance that a 
# row (or more than one row) won't have a primary key associated with the data.

In [None]:
# What do you think would happen if one of the rows didn’t have a unique identifier?

# Not all of the data would be present in every query, which would skew analysis results and provide incomplete lists.

In [None]:
# dept_name VARCHAR(40) NOT NULL, creates a column similar to the dept_no, only the varying character count has a 
# maximum of 40.

# PRIMARY KEY (dept_no), means that the dept_no column is used as the primary key for this table.

# UNIQUE (dept_name) adds the unique constraint to the dept_name column.

# The unique constraint implies that the data in that column is unique. 
# This ensures that if the table were to be updated in the future, nothing will be duplicated.



In [None]:
# The closing parenthesis and semicolon signal that the SQL CREATE TABLE statement is complete. 

# Any code added after will need to be included in a new SQL statement. 
# A statement is a command that is set up with a certain syntax. 



In [None]:
# What does it mean when the NOT NULL constraint is applied?

# Null values are not allowed in the column.

In [None]:
# Execute the Code

# To save the table to the database, we need to execute the code. 

# In the toolbar of the pgAdmin webpage, find and click the lightning bolt symbol toward the right of the bar. 

# This button runs the code and saves our work to the database.

In [2]:
# Troubleshoot Error Messages

# Any error message will also appear in the same manner. Encountering errors will happen often and troubleshooting 
# them is a large part of being a developer. 

# Thankfully, each error message encountered will tell us why the error occurred. 
# This is great because it helps us, the developers, research and fix the problem.

In [3]:
# ERROR:  relation "departments" already exists
# SQL state: 42P07

In [None]:
# This error occurs because SQL data is persistent and cannot be overwritten if the same command is run again. 
# Once a table has been committed to a database, it is there until a different command is run to delete it.


In [None]:
# Data integrity is the quality of the data we're working with. Clean data will yield better results in analysis, 
# and maintaining the data integrity ensures greater accuracy and reliability.

# Dirty data is data that contains errors such as duplicates, undefined values (i.e., not a number, or NaN), or other 
# inconsistencies. This is why the NOT NULL constraint is in place.

In [None]:
# To avoid encountering the "relation already exists" error, highlight only the code block you want to run, then 
# execute it. This tells pgAdmin to run only that code instead of re-running earlier CREATE TABLE statements.



In [None]:
# Create Additional Tables

# Create another table for Employees.

# How to start create table: 

# CREATE TABLE employees (
#     emp_no INT NOT NULL,
#     birth_date DATE NOT NULL,
#     first_name VARCHAR NOT NULL,
#     last_name VARCHAR NOT NULL,
#     gender VARCHAR NOT NULL,
#     hire_date DATE NOT NULL,
#     PRIMARY KEY (emp_no)
# );

In [None]:
# Create one more together—this time with foreign keys included. 
# Add the following code to the bottom of your query editor:

# CREATE TABLE dept_manager (
# dept_no VARCHAR(4) NOT NULL,
#     emp_no INT NOT NULL,
#     from_date DATE NOT NULL,
#     to_date DATE NOT NULL,
# FOREIGN KEY (emp_no) REFERENCES employees (emp_no),
# FOREIGN KEY (dept_no) REFERENCES departments (dept_no),
#     PRIMARY KEY (emp_no, dept_no)
# );

In [None]:
# Remember that foreign keys reference the primary key of other tables. In the two lines above we can see that:

# The FOREIGN KEY constraint tells Postgres that there is a link between two tables

# The parentheses following FOREIGN KEY specify which of the current table's columns is linked to another table

# REFERENCES table_name (column_name) tells Postgres which other table uses that column as a primary key

# The primary key is similar, but there are two keys listed this time instead of just one. 

In [None]:
# One thing to keep in mind when working with foreign keys is that it's possible that data insertion will fail if the
# foreign key isn't present. 

# This is a "foreign key constraint" and in this case, it means that the new data needs a reference point 
# (such as dept_no or emp_no) to be successfully added to the table.



In [None]:
# Let's create another table for the data in salaries.csv

# CREATE TABLE salaries (
#   emp_no INT NOT NULL,
#   salary INT NOT NULL,
#   from_date DATE NOT NULL,
#   to_date DATE NOT NULL,
#   FOREIGN KEY (emp_no) REFERENCES employees (emp_no),
#   PRIMARY KEY (emp_no)
# );

In [None]:
# This code tells Postgres that our new table is named "salaries" and we'll have columns for the emp_no, salary, 
#from_date, and to_date. 

# We also have specified that certain fields aren't allowed any null space with the NOT NULL constraint, which is 
# important because we want this data to be persistent for every employee. 

# As a final step in table creation, we've also specified primary and foreign keys.



In [None]:
# Query for Confirmation

# Confirm the tables were created successfully by running a SELECT statement, which performs a query instead of 
# constructing anything.

# Think of it as asking the database a question. For example, say we want to know how many columns are in the 
# departments table. 
# How would we ask that particular question? We would create a SELECT statement, then run the code. 
# This is called "querying the database."

In [None]:
# How to query how many columns are in the departments table

# In the editor, after the table creation statements, type 

# SELECT * FROM departments;

# The SELECT statement tells Postgres that we're about to query the database.
# The asterisk tells Postgres that we're looking for every column in a table.
# FROM departments tells pgAdmin which table to search.
# The semicolon signifies the completion of the query.
# After executing the SELECT statement, pgAdmin will automatically show the result in the Data Output tab at the bottom of the page.

In [None]:
# The information in the database is static, which means that it will always be in the database unless directly 
# altered, but the query editor is not. 

# It's similar to working in a Microsoft Word document: If something happens to your computer before you saved your 
# work, there's a good chance it'll be lost. 

# If your computer crashes mid-query, the pgAdmin editor won't hold onto your code for you during a reboot.

# Our queries are the meat and potatoes of SQL. We're finding connections between different tables and answering 
# questions with the results. 

# Even though losing a query isn't the end of the world (they can be rebuilt, after all), it does take time to 
# replicate the work already completed.


In [None]:
# 7.2.3 Import Data

# We've created a database, written our first SQL code, and created tables modeled after our ERD. 

# The next step is to import the data from the CSV files. 

# We'll make sure all of the tables we created in pgAdmin appear in the GUI first, because we'll be using the GUI to 
# import data. 

In [None]:
# SQL is very interactive. Developers are not only importing data and asking it questions through the query language, 
# but they can also update and edit the data stored in the tables as needed.

# For example, if a single row of data needs to be added to an existing table, a developer can manually add it by 
# using the INSERT statement.

# If the data in a table is small enough in scale, it can be manually inserted this way completely, instead of 
# importing a CSV file.

# Alternately, necessary edits and updates are completed manually as well. 

# We won't be manually editing or uploading data to our tables in this lesson because our datasets are too large.



In [None]:
# Prep to import csv files into PH-EmployeeDB

# In the pgAdmin window, select the dropdown menu for our PH-EmployeeDB database. To import data into the tables, first confirm all of our tables are listed:

# Find the PH-EmployeeDB collapsible menu and click it.
# Scroll down and click "Schemas" to expand the menu.
# Click "public."
# Scroll down to "Tables" and note the number in parentheses.


In [None]:
# Start Import 

# To import a CSV into Postgres with pgAdmin, follow these steps. We'll customize our options to fit our data import, 
# and then check the table to make sure the data has been imported successfully. 

# Right-click the first table, departments.

# From the menu that pops up, scroll to Import/Export. 

# Toggle the button to show "Import." 

# Click the ellipsis on the Filename field to search for your project folder.

# Select departments.csv. Make sure Format is set to "csv" and Encoding is blank. Note: By default, the Encoding 
# section is blank. 
# Encoding here refers to the file's character encoding (for example, UTF8 or LATIN1), not to a security feature. 
# If our files used a character encoding other than the default, we would need to select it before importing them 
# to Postgres. 
# We don't have to worry about this, though. 
# Also, if "Encoding" is filled in with an encoding type such as BIG5 or LATIN1, cancel the import and start over. 

# Leave the OID field as is, but toggle the Header field to "Yes" and select the comma as the Delimiter. 
# Note: If we don't specify that there is already a header included in the CSV data, then the header will be imported 
# as data. This would result in errors because headers don't always match the data types in the columns. 

# Click OK to begin importing the data. 


In [None]:
# Check the import by typing SELECT * FROM departments; at the bottom of the query editor. 
# The resulting table should mirror the CSV file:

In [None]:
# 7.2.4 Troubleshoot Imports


In [None]:
# Handle Common Errors

# What if Bobby runs into an error while he's importing the data? Below is an example of a likely error:

# DETAIL:  Key (emp_no)=(10001) is not present in table "employees".

# Because the FOREIGN KEY constraint references other tables, we need to import the data in a specific order.

# For example, the dept_emp table references the Employees table through its foreign key. 
# If there is no data in the Employees table, then there are no foreign keys to link to, and an error will occur.

In [None]:
# Handle Mismatched Data Types

# Another common scenario is when a data type in a table we've created doesn't match the CSV data. What should we do?

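# Because a table persists in the database once it has been created, fixing a typo in our original table creation 
# code and re-running it won't change the existing table (Postgres will report that the table already exists). 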

In [None]:
# If you need to update a table column to fix its data type, what is the best approach?
# Delete the table, then recreate it after updating the code.


In [None]:
# Drop a Table

# To drop the table, the following code is used:

# DROP TABLE employees CASCADE;

# DROP TABLE employees tells Postgres that we want to remove the Employees table from the database completely.
# CASCADE; indicates that we also want to remove the connections to other tables in the database.


In [None]:
# More on CASCADE

# Even without data, by adding foreign keys that reference other tables, we've created a network of data connections. 

# Not every table will need the CASCADE constraint, but it will come up when you need to drop a table that already has 
# a defined relationship with another. 

# Any table that is not referenced by another table's foreign key can be dropped without the CASCADE option.

In [None]:
# 7.3.1 Query Dates

# future-proofing the company by determining how many people will be retiring and, of those employees, who is eligible 
# for a retirement package.

# In Python, conditionals such as "if" and "else," and the logical operator "and," are similar to conditional 
# expressions used in SQL

# Bobby needs to search for folks who are retiring soon. The query he builds will include a condition involving 
# employee birthdays. We need to know when they were born to determine when they'll retire.

In [None]:
# Determine Retirement Eligibility

# anyone born between 1952 and 1955 will begin to retire. 

# The first query we need to help Bobby write will return a list of those employees.

In [None]:
# SELECT first_name, last_name
# FROM employees
# WHERE birth_date BETWEEN '1952-01-01' AND '1955-12-31';


# The SELECT statement is more specific this time. Instead of an asterisk to indicate that we want all of the records,
# we're requesting only the first and last names of the employees.

# FROM employees tells SQL in which of the six tables to look.

# The WHERE clause brings up even more specifics. We want SQL to look in the birth_date column for anyone born between
# January 1, 1952, and December 31, 1955.

# Notice how BETWEEN and AND are both capitalized in our statement? SQL keywords are not case-sensitive, but 
# capitalizing them is a common convention. It signals the conditions present and makes the code easier to read.

In [None]:
# How many employees are ready for retirement?

# 10,000 or more

In [None]:
# Pewlett Hackard has a lot of employees getting ready to age out of the program. 
# This is going to create a considerable amount of openings. Refine this list further by looking only at how many 
# employees were born in 1952. 


In [None]:
# Create another query that will search for only 1952 birth dates.

# SELECT first_name, last_name
# FROM employees
# WHERE birth_date BETWEEN '1952-01-01' AND '1952-12-31';

# This query is almost the same as the last. We've only changed a single digit: the year was switched from 1955 to 
# 1952 after the AND clause.

In [None]:
# Narrow the Search for Retirement Eligibility

# There are quite a few folks getting ready to retire. Each of those new queries has a lengthy list of people. 

# Let's see if we can narrow it down a bit more by adding another condition to the query. 


In [None]:
# Modified query 

# -- Retirement eligibility
# SELECT first_name, last_name
# FROM employees
# WHERE (birth_date BETWEEN '1952-01-01' AND '1955-12-31')
# AND (hire_date BETWEEN '1985-01-01' AND '1988-12-31');


In [None]:
# We modified this query to include a specific hiring range. This time, we're looking for employees born between 1952 
# and 1955, who were also hired between 1985 and 1988. 

# The modification is subtle, too. We're going to adjust one line of the code block and add another to the end.

# The first piece, an adjustment, is to place parentheses around the WHERE clause (without including the keyword 
# itself). We'll also remove the semicolon since the code block isn't complete yet. 

# Add the final line of code. Our current code has a single condition in place that tells Postgres to search only for 
# people born between 1952 and 1955. The next line of code is our second condition that they were also hired between 
# 1985 and 1988.

# Notice how the second condition is inside parentheses? It resembles a tuple; in Python, data can be stored inside a 
# tuple and accessed in the same way as a list. 

# In SQL, however, the parentheses in this block of code are grouping syntax rather than tuples. They place each 
# condition in a group; Postgres evaluates each group, and a row is returned only when both conditions are true.

# To recap, the query now has three parts:
# The SELECT statement, pulling data from the first and last name columns
# The FROM statement, telling Postgres from which table we're getting the data
# And two conditional statements: the dates of birth and the dates of hire


In [None]:
# Count the Queries

# -- Number of employees retiring
# SELECT COUNT(first_name)
# FROM employees
# WHERE (birth_date BETWEEN '1952-01-01' AND '1955-12-31')
# AND (hire_date BETWEEN '1985-01-01' AND '1988-12-31');


In [None]:
# Create New Tables

# This time, the change we'll make to the code is also small—we're modifying the SELECT statement into a SELECT INTO 
# statement. 
# This statement tells Postgres that instead of generating a list of results, the data is saved as a new table 
# completely. 

# Update our code to include the INTO portion of the SELECT statement.

# Insert a new, blank line between the SELECT and FROM sections of the code. 
# In this vacant space, type in INTO retirement_info. With the addition of this line, we're telling Postgres to save 
# the data into a table named "retirement_info."

In [None]:
# -- create a new table of retiring employees

# SELECT first_name, last_name
# INTO retirement_info
# FROM employees
# WHERE (birth_date BETWEEN '1952-01-01' AND '1955-12-31')
# AND (hire_date BETWEEN '1985-01-01' AND '1988-12-31');

In [None]:
# Our list of data from earlier is now an actual table that we can use with statements and functions to perform 
# analysis. 

# Additionally, if you refresh the list of tables from the dropdown menu on the left, it will now appear in the list.



In [None]:
# Export Data

# Right-click on your new table and select "Import/Export." 
# Instead of importing anything, this time we'll be exporting.

# STEPS

# Keep the Import/Export button toggled to "Export."

# Click on the ... in the Filename field to automatically select the same directory from which you imported the other 
# CSVs. Select a directory, but be sure to name the new file retirement_info.csv.

# Be sure the format is still CSV.

# Toggle the Header section to "Yes" to include column names in the new CSV files.

# Select the comma as the delimiter to maintain the same format with all CSV files.

# Click OK to start the export. After the file has been created, pgAdmin will confirm our file is ready to be viewed.