#### ID 2100

```Given the education levels and salaries of a group of individuals, find what is the average salary for each level of education.```

In [None]:
%%sql
-- Mean salary within each education level of google_salaries.
SELECT gs.education,
       AVG(gs.salary) AS avg_salary
FROM google_salaries AS gs
GROUP BY gs.education

In [None]:
df = google_salaries

# Mean salary per education level, returned as a flat two-column frame.
(
    df.groupby('education', as_index=False)['salary']
    .mean()
    .rename(columns={'salary': 'avg_salary'})
)

#### ID 2101

```Given a single column of numbers, consider all possible permutations of two numbers assuming that pairs of numbers (x,y) and (y,x) are two different permutations. Then, for each permutation, find the maximum of the two numbers. Output three columns: the first number, the second number and the maximum of the two.```

In [None]:
%%sql
-- Every ordered pair (x, y) of values from deloitte_numbers together with the
-- larger of the two. DISTINCT collapses pairs that repeat because the input
-- contains duplicate values.
SELECT DISTINCT lhs.number                       AS number1,
                rhs.number                       AS number2,
                GREATEST(lhs.number, rhs.number) AS max_number
FROM deloitte_numbers AS lhs
         CROSS JOIN deloitte_numbers AS rhs

In [None]:
# Self cross join yields every ordered pair; the suffixes disambiguate the
# overlapping column names, then the row-wise maximum of the pair is appended.
df = (
    deloitte_numbers
    .merge(deloitte_numbers, how='cross', suffixes=('1', '2'))
    .assign(max_number=lambda pairs: pairs[['number1', 'number2']].max(axis=1))
)

df

#### ID 2106

```The data engineering team at YouTube want to clean the dataset user_flags. In particular, they want to examine rows that have missing values in more than one column. List these rows.```

In [None]:
%%sql
-- Rows of user_flags that have NULL in at least two of the four columns.
-- Variant 1: count the values that survive ARRAY_REMOVE(..., NULL).
-- CARDINALITY is used rather than ARRAY_LENGTH(..., 1) because ARRAY_LENGTH
-- yields NULL for an empty array, so a row whose four columns are all NULL
-- would fail the comparison below and be silently left out.
-- NOTE(review): the ARRAY constructor presumably relies on the four columns
-- sharing a compatible type - confirm against the table definition.
WITH cte AS (SELECT user_firstname,
                    user_lastname,
                    video_id,
                    flag_id,
                    CARDINALITY(ARRAY_REMOVE(
                                        ARRAY [user_firstname, user_lastname, video_id, flag_id],
                                        NULL)) AS n_non_null
             FROM user_flags)
SELECT user_firstname, user_lastname, video_id, flag_id
FROM cte
WHERE n_non_null <= 2;

-- Variant 2: count the NULL arguments directly with num_nulls.
SELECT user_firstname,
       user_lastname,
       video_id,
       flag_id
FROM user_flags
WHERE num_nulls(user_firstname, user_lastname, video_id, flag_id) >= 2

In [None]:
df = user_flags

# Count the missing cells in each row and keep rows with two or more.
df.loc[df.isna().sum(axis=1).ge(2)]

#### ID 2107

```Write a query to return all Customers (cust_id) who are violating primary key constraints in the Customer Dimension (dim_customer) i.e. those Customers who are present more than once in the Customer Dimension. For example if cust_id 'C123' is present thrice then the query should return 'C123' | '3' as output.```

In [None]:
%%sql
-- cust_id values that occur more than once in dim_customer, with their counts.
SELECT dc.cust_id,
       COUNT(dc.cust_id) AS n_occurences
FROM dim_customer AS dc
GROUP BY dc.cust_id
HAVING COUNT(dc.cust_id) > 1

In [None]:
df = dim_customer

# Occurrences of each cust_id, restricted to ids that appear at least twice.
(
    df.groupby('cust_id', as_index=False)
    .agg(n_occurences=('cust_id', 'count'))
    .loc[lambda counts: counts['n_occurences'] >= 2]
)

#### ID 2108

```Each Employee is assigned one territory and is responsible for the Customers from this territory. There may be multiple employees assigned to the same territory. Write a query to get the Employees who are responsible for the maximum number of Customers. Output the Employee ID and the number of Customers.```

In [None]:
%%sql
-- Rows per employee after joining employees to customers on the shared
-- territory, keeping only the employee(s) whose count equals the maximum.
WITH customers_per_employee AS (SELECT empl_id, COUNT(*) AS n_customers
                                FROM map_employee_territory AS et
                                         JOIN map_customer_territory AS ct USING (territory_id)
                                GROUP BY empl_id)
SELECT empl_id, n_customers
FROM customers_per_employee
WHERE n_customers = (SELECT MAX(n_customers) FROM customers_per_employee)

In [None]:
# Pair each customer row with the employee rows that share its territory.
df = map_customer_territory.merge(map_employee_territory, how='inner', on='territory_id')

# Number of (non-null) cust_id values per employee.
df_grouped = (
    df.groupby('empl_id', as_index=False)
    .agg(n_customers=('cust_id', 'count'))
)

# Single-element list holding the highest count (empty if there are no rows).
max_n_customers = df_grouped['n_customers'].nlargest(1).to_list()

# Keep every employee tied at that highest count.
df_grouped[df_grouped['n_customers'].isin(max_n_customers)]

#### ID 2109

```Write a query to get a list of products that have not had any sales. Output the ID and market name of these products.```

In [None]:
%%sql
-- Products in dim_product that have no matching row in fct_customer_sales.
SELECT dp.prod_sku_id, dp.market_name
FROM dim_product AS dp
WHERE NOT EXISTS (SELECT 1
                  FROM fct_customer_sales AS cs
                  WHERE cs.prod_sku_id = dp.prod_sku_id)

In [None]:
# Products with no sales: keep the dim_product rows whose prod_sku_id never
# appears in fct_customer_sales. The test is made on the join key itself
# rather than on a null `cust_id` after a left merge, so a product whose sales
# rows merely lack a cust_id is not reported as unsold.
df = dim_product.loc[
    ~dim_product['prod_sku_id'].isin(fct_customer_sales['prod_sku_id']),
    ['prod_sku_id', 'market_name'],
]

# End on the frame itself so the cell displays the result.
df

#### ID 2110

```Write a query to get the list of managers whose salary is less than twice the average salary of employees reporting to them. For these managers, output their ID, salary and the average salary of employees reporting to them.```

In [None]:
%%sql
-- Managers whose salary is less than twice the average salary of the
-- employees reporting to them. Each hierarchy row is joined to the reporting
-- employee's salary (de) and to the manager's own salary (sq); the window
-- average is taken per manager, and DISTINCT collapses the one-row-per-report
-- result down to one row per manager.
WITH cte AS (SELECT eh.manager_empl_id,
                    de.empl_id,
                    manager_salary,
                    AVG(de.salary)
                    OVER (PARTITION BY eh.manager_empl_id) AS avg_employee_salary
             FROM map_employee_hierarchy eh
                      JOIN
                  dim_employee de
                  ON
                      de.empl_id = eh.empl_id
                      JOIN
                  (SELECT empl_id,
                          salary AS manager_salary
                   FROM dim_employee) AS sq
                  ON
                      sq.empl_id = eh.manager_empl_id)
SELECT DISTINCT manager_empl_id, manager_salary, avg_employee_salary
FROM cte
-- Strict comparison: the task asks for "less than twice", so a manager paid
-- exactly twice the average is excluded.
WHERE manager_salary < 2 * avg_employee_salary

In [None]:
# Salary lookup keyed by empl_id, renamed so it can sit next to the reporting
# employee's own `salary` column after the second merge.
sq = dim_employee[['empl_id', 'salary']].rename(columns={'salary': 'manager_salary'})

# One row per hierarchy link, carrying the employee's salary and the
# manager's salary; the lookup's clashing `empl_id` gets the `_manager` suffix.
df = map_employee_hierarchy.merge(dim_employee, on='empl_id') \
    .merge(sq, left_on='manager_empl_id', right_on='empl_id', suffixes=('', '_manager'))

# Average salary of the employees that share the same manager.
df['avg_employee_salary'] = df.groupby('manager_empl_id')['salary'].transform('mean')

# Strict `<`: the task asks for "less than twice", so a manager paid exactly
# twice the average is excluded.
df[['manager_empl_id', 'manager_salary', 'avg_employee_salary']].drop_duplicates().query(
    'manager_salary < 2 * avg_employee_salary')

#### ID 2113

```To remain competitive, the company you work with must reduce the number of extremely late deliveries. A delivery is flagged as extremely late if the actual delivery time is more than 20 minutes (not inclusive) after the predicted delivery time. You have been asked to calculate the percentage of orders that arrive extremely late each month. Your output should include the month in the format 'YYYY-MM' and the percentage of extremely late orders as a percentage of all orders placed in that month.```

In [None]:
%%sql
-- Per order-placement month: share of delivered orders whose actual delivery
-- came more than 20 minutes after the predicted delivery.
-- Orders without an actual_delivery_time are excluded from both the
-- numerator and the denominator.
SELECT TO_CHAR(order_placed_time, 'YYYY-MM') AS year_month,
       COUNT(delivery_id) FILTER (
           WHERE actual_delivery_time - predicted_delivery_time > INTERVAL '20 minutes'
           ) * 100.0 / COUNT(delivery_id)    AS perc_extremely_delayed
FROM delivery_orders
WHERE actual_delivery_time IS NOT NULL
GROUP BY year_month

#### ID 2116

```How many orders were shipped by Speedy Express in total?```

In [None]:
%%sql
-- Number of orders whose carrier is named 'Speedy Express'.
SELECT COUNT(so.order_id) AS n_shipped
FROM shopify_carriers AS sc
         JOIN shopify_orders AS so ON sc.id = so.carrier_id
WHERE sc.name = 'Speedy Express'

In [None]:
# Attach carrier details to every order through carrier_id -> id.
df = shopify_orders.merge(shopify_carriers, how='inner', left_on='carrier_id', right_on='id')

# Count the non-null order ids among rows whose carrier name matches.
df.loc[df['name'] == 'Speedy Express', 'order_id'].count()

#### ID 2117

```What is the last name of the employee or employees who are responsible for the most orders?```

In [None]:
%%sql
-- Last names of the employee(s) whose order count equals the highest
-- per-employee order count in shopify_orders.
WITH orders_per_employee AS (SELECT resp_employee_id,
                                    COUNT(order_id) AS n_orders
                             FROM shopify_orders
                             GROUP BY resp_employee_id)
SELECT se.last_name
FROM orders_per_employee AS ope
         JOIN shopify_employees AS se ON se.id = ope.resp_employee_id
WHERE ope.n_orders = (SELECT MAX(n_orders) FROM orders_per_employee)

In [None]:
df = shopify_orders

# Orders per responsible employee -> keep everyone tied for the top count
# (keep='all') -> look up their last names in shopify_employees.
(
    df.groupby('resp_employee_id', as_index=False)
    .agg(n_orders=('order_id', 'count'))
    .nlargest(1, 'n_orders', keep='all')
    .merge(shopify_employees, how='inner', left_on='resp_employee_id', right_on='id')
    .loc[:, 'last_name']
)