-
Notifications
You must be signed in to change notification settings - Fork 0
/
Danny's Diner Challenge 2 Data Cleaning.sql
289 lines (231 loc) · 8.53 KB
/
Danny's Diner Challenge 2 Data Cleaning.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
-- Danny Ma SQL Challenge 2: Pizza Runner
-- Data Wrangling
-- Creating Database and Creating the Tables in the database
DROP DATABASE IF EXISTS pizza_runner; -- delete the database if it exists
CREATE DATABASE pizza_runner; -- creates the database
USE pizza_runner; -- sets the database as the default database
DROP TABLE IF EXISTS runners;
CREATE TABLE runners (
runner_id INTEGER,
registration_date DATE
);
INSERT INTO runners
(runner_id, registration_date)
VALUES
(1, '2021-01-01'),
(2, '2021-01-03'),
(3, '2021-01-08'),
(4, '2021-01-15');
DROP TABLE IF EXISTS customer_orders;
CREATE TABLE customer_orders (
order_id INTEGER,
customer_id INTEGER,
pizza_id INTEGER,
exclusions VARCHAR(4),
extras VARCHAR(4),
order_time TIMESTAMP
);
INSERT INTO customer_orders
(order_id, customer_id, pizza_id, exclusions, extras, order_time)
VALUES
('1', '101', '1', '', '', '2020-01-01 18:05:02'),
('2', '101', '1', '', '', '2020-01-01 19:00:52'),
('3', '102', '1', '', '', '2020-01-02 23:51:23'),
('3', '102', '2', '', NULL, '2020-01-02 23:51:23'),
('4', '103', '1', '4', '', '2020-01-04 13:23:46'),
('4', '103', '1', '4', '', '2020-01-04 13:23:46'),
('4', '103', '2', '4', '', '2020-01-04 13:23:46'),
('5', '104', '1', 'null', '1', '2020-01-08 21:00:29'),
('6', '101', '2', 'null', 'null', '2020-01-08 21:03:13'),
('7', '105', '2', 'null', '1', '2020-01-08 21:20:29'),
('8', '102', '1', 'null', 'null', '2020-01-09 23:54:33'),
('9', '103', '1', '4', '1, 5', '2020-01-10 11:22:59'),
('10', '104', '1', 'null', 'null', '2020-01-11 18:34:49'),
('10', '104', '1', '2, 6', '1, 4', '2020-01-11 18:34:49');
DROP TABLE IF EXISTS runner_orders;
CREATE TABLE runner_orders (
order_id INTEGER,
runner_id INTEGER,
pickup_time VARCHAR(19),
distance VARCHAR(7),
duration VARCHAR(10),
cancellation VARCHAR(23)
);
INSERT INTO runner_orders
(order_id, runner_id, pickup_time, distance, duration, cancellation)
VALUES
('1', '1', '2020-01-01 18:15:34', '20km', '32 minutes', ''),
('2', '1', '2020-01-01 19:10:54', '20km', '27 minutes', ''),
('3', '1', '2020-01-03 00:12:37', '13.4km', '20 mins', NULL),
('4', '2', '2020-01-04 13:53:03', '23.4', '40', NULL),
('5', '3', '2020-01-08 21:10:57', '10', '15', NULL),
('6', '3', 'null', 'null', 'null', 'Restaurant Cancellation'),
('7', '2', '2020-01-08 21:30:45', '25km', '25mins', 'null'),
('8', '2', '2020-01-10 00:15:02', '23.4 km', '15 minute', 'null'),
('9', '2', 'null', 'null', 'null', 'Customer Cancellation'),
('10', '1', '2020-01-11 18:50:20', '10km', '10minutes', 'null');
DROP TABLE IF EXISTS pizza_names;
CREATE TABLE pizza_names (
pizza_id INTEGER,
pizza_name TEXT
);
INSERT INTO pizza_names
(pizza_id, pizza_name)
VALUES
(1, 'Meatlovers'),
(2, 'Vegetarian');
DROP TABLE IF EXISTS pizza_recipes;
CREATE TABLE pizza_recipes (
pizza_id INTEGER,
toppings TEXT
);
INSERT INTO pizza_recipes
(pizza_id, toppings)
VALUES
(1, '1, 2, 3, 4, 5, 6, 8, 10'),
(2, '4, 6, 7, 9, 11, 12');
DROP TABLE IF EXISTS pizza_toppings;
CREATE TABLE pizza_toppings (
topping_id INTEGER,
topping_name TEXT
);
INSERT INTO pizza_toppings
(topping_id, topping_name)
VALUES
(1, 'Bacon'),
(2, 'BBQ Sauce'),
(3, 'Beef'),
(4, 'Cheese'),
(5, 'Chicken'),
(6, 'Mushrooms'),
(7, 'Onions'),
(8, 'Pepperoni'),
(9, 'Peppers'),
(10, 'Salami'),
(11, 'Tomatoes'),
(12, 'Tomato Sauce');
----------------------------------------------------------------------------------------------------------------------
-- Data Wrangling
-- The first step is to visually Assess the tables to look for Data Quality
SELECT * FROM customer_orders;
SELECT * FROM pizza_names;
SELECT * FROM pizza_recipes;
SELECT * FROM pizza_toppings;
SELECT * FROM runner_orders;
SELECT * FROM runners;
-- After Assessing the data, the "runners" table, the "pizza_names" table and the "pizza toppings" table are quality
--------------------------------------------------------------------------------------------------------------
-- To clean the customer_orders table
-- Dealing with the "null" and blank values
SELECT * FROM customer_orders;
-- for the null and blank values in the extrusions column
UPDATE customer_orders
SET exclusions = CASE WHEN exclusions = '' THEN NULL
WHEN exclusions = 'null' THEN NULL
ELSE exclusions END;
-- for the null and blank values in the extras column
UPDATE customer_orders
SET extras = CASE WHEN extras = '' THEN NULL
WHEN extras = 'null' THEN NULL
ELSE extras END;
-- To Check if the changes were made
SELECT *
FROM customer_orders
WHERE exclusions IN ('null', '') OR
extras IN ('null', ''); -- 0 rows returned
-- There is the presence of commas in the exclusions and extras column, to split them into each rows
-- and store in a temporary table
-- First Step is to create a row number to retain each row number in the table
-- because this cleaning step attracts duplicate rows
DROP TEMPORARY TABLE IF EXISTS pre_customer_orders;
CREATE TEMPORARY TABLE pre_customer_orders AS
SELECT *,
ROW_NUMBER() OVER() AS row_num
FROM customer_orders;
-- to handle the commas in the exclusions column
DROP TEMPORARY TABLE IF EXISTS customer_orders_pre_clean;
CREATE TEMPORARY TABLE customer_orders_pre_clean as
select order_id,
customer_id,
pizza_id,
order_time,
extras,
SUBSTRING_INDEX(SUBSTRING_INDEX(exclusions, ',', numbers.n), ',', -1) AS exclusions,
row_num
FROM (SELECT 1 AS n UNION ALL
SELECT 2 UNION ALL
SELECT 3) numbers
right JOIN pre_customer_orders
ON CHAR_LENGTH(exclusions) - CHAR_LENGTH(replace(exclusions, ',', '')) >= numbers.n-1
order by order_id, n;
-- to handle the commas in the extras column
DROP TEMPORARY TABLE IF EXISTS customer_orders_cleaned;
CREATE TEMPORARY TABLE customer_orders_cleaned AS
select order_id,
customer_id,
pizza_id,
order_time,
exclusions,
SUBSTRING_INDEX(SUBSTRING_INDEX(extras, ',', numbers.n), ',', -1) AS extras,
row_num
FROM (SELECT 1 AS n UNION ALL
SELECT 2 UNION ALL
SELECT 3) numbers
RIGHT JOIN customer_orders_pre_clean
ON CHAR_LENGTH(extras) - CHAR_LENGTH(REPLACE(extras, ',', '')) >= numbers.n-1
order by order_id, n;
-- to check if it worked
SELECT * FROM customer_orders_cleaned;
-- to check the data type
DESCRIBE customer_orders;
DESCRIBE customer_orders_cleaned;
-----------------------------------------------------------------------------------------------
-- To clean the runner_orders table
SELECT * FROM runner_orders;
-- the pickup_time column has 'null' values
-- the distance column has the appearance of 'km', 'null' values and ' km'
-- the duration column has the appearance of 'minutes', 'null', 'mins', 'minute'
-- the cancellation column has the appearance of 'null' and ''
UPDATE runner_orders
SET pickup_time = CASE WHEN pickup_time = 'null' THEN NULL
ELSE pickup_time END;
UPDATE runner_orders
SET distance = CAST(CASE WHEN distance = 'null' THEN NULL
WHEN distance LIKE '%km' THEN TRIM(REPLACE(distance, 'km', ''))
ELSE distance END AS FLOAT);
UPDATE runner_orders
SET duration = CAST(CASE WHEN duration LIKE '%min%' THEN LEFT(duration, 2)
WHEN duration = 'null' THEN NULL
ELSE duration END AS float);
UPDATE runner_orders
SET cancellation = CASE WHEN cancellation IN ('null', '') THEN NULL
ELSE cancellation END;
-- To check for the datatypes
DESCRIBE runner_orders;
-- To change the datatype of the columns
ALTER TABLE runner_orders
MODIFY COLUMN pickup_time DATETIME;
ALTER TABLE runner_orders
MODIFY COLUMN distance INT;
ALTER TABLE runner_orders
MODIFY COLUMN duration INT;
------------------------------------------------------------------------------
-- To clean the pizza_recipes table (Optional)
SELECT * FROM pizza_recipes;
-- to handle the commas in the extras column and store in a temporary column
CREATE TEMPORARY TABLE pizza_recipes_clean as (
SELECT p.pizza_id,
REPLACE(SUBSTRING_INDEX(SUBSTRING_INDEX(p.toppings, ',', numbers.n), ',', -1), ' ', '') AS toppings
FROM (select 1 AS n UNION ALL
select 2 UNION ALL
select 3 UNION ALL
select 4 UNION ALL
select 5 UNION ALL
select 6 UNION ALL
select 7 UNION ALL
select 8 ) AS numbers
INNER JOIN pizza_recipes p
ON CHAR_LENGTH(p.toppings) - CHAR_LENGTH(REPLACE(p.toppings, ',', '')) >= numbers.n-1
ORDER BY p.pizza_id, n);
-- To Check if the changes were made in the TEMP Table
SELECT * FROM pizza_recipes_clean;