In [42]:
import polars as pl
print(pl.__version__)

0.20.31


### Find Total Time Spent by Each Employee

#### Question

DataFrame: Employees

| Column Name | Type |
|:-----------:|:----:|
| emp_id      | int  |
| event_day   | date |
| in_time     | int  |
| out_time    | int  |

(emp_id, event_day, in_time) is the primary key (combinations of columns with unique values) of this table.<br>
The table shows the employees' entries and exits in an office.<br>
event_day is the day at which this event happened, in_time is the minute at which the employee entered the office, and out_time is the minute at which they left the office.<br>
in_time and out_time are between 1 and 1440.<br>
It is guaranteed that no two events on the same day intersect in time, and in_time < out_time.
 

Write a solution to calculate the total time in minutes spent by each employee on each day at the office. Note that within one day, an employee can enter and leave more than once. The time spent in the office for a single entry is out_time - in_time.

Return the result table in any order.

The result format is in the following example.


Input:<br>
Employees dataframe:

| emp_id | event_day  | in_time | out_time |
|:------:|:----------:|:-------:|:--------:|
| 1      | 2020-11-28 | 4       | 32       |
| 1      | 2020-11-28 | 55      | 200      |
| 1      | 2020-12-03 | 1       | 42       |
| 2      | 2020-11-28 | 3       | 33       |
| 2      | 2020-12-09 | 47      | 74       |

Output: 

| day        | emp_id | total_time |
|:----------:|:------:|:----------:|
| 2020-11-28 | 1      | 173        |
| 2020-11-28 | 2      | 30         |
| 2020-12-03 | 1      | 41         |
| 2020-12-09 | 2      | 27         |

Explanation:<br>
Employee 1 has three events: two on day 2020-11-28 with a total of (32 - 4) + (200 - 55) = 173, and one on day 2020-12-03 with a total of (42 - 1) = 41.<br>
Employee 2 has two events: one on day 2020-11-28 with a total of (33 - 3) = 30, and one on day 2020-12-09 with a total of (74 - 47) = 27.

#### Testcase

In [43]:
# Test data: one inner list per office visit (emp_id, event_day, in_time, out_time).
# Every value is a string here, so the time columns must be cast before arithmetic.
data = [['1', '2020-11-28', '4', '32'], ['1', '2020-11-28', '55', '200'], ['1', '2020-12-3', '1', '42'], ['2', '2020-11-28', '3', '33'], ['2', '2020-12-9', '47', '74']]

# Create the DataFrame.
# orient='row' states explicitly that each inner list is one record, instead of
# leaving polars to infer the orientation from the dimensions of the data.
employees = pl.DataFrame(
    data,
    schema=['emp_id', 'event_day', 'in_time', 'out_time'],
    orient='row'
)

# Display the DataFrame
print(employees)

shape: (5, 4)
┌────────┬────────────┬─────────┬──────────┐
│ emp_id ┆ event_day  ┆ in_time ┆ out_time │
│ ---    ┆ ---        ┆ ---     ┆ ---      │
│ str    ┆ str        ┆ str     ┆ str      │
╞════════╪════════════╪═════════╪══════════╡
│ 1      ┆ 2020-11-28 ┆ 4       ┆ 32       │
│ 1      ┆ 2020-11-28 ┆ 55      ┆ 200      │
│ 1      ┆ 2020-12-3  ┆ 1       ┆ 42       │
│ 2      ┆ 2020-11-28 ┆ 3       ┆ 33       │
│ 2      ┆ 2020-12-9  ┆ 47      ┆ 74       │
└────────┴────────────┴─────────┴──────────┘


#### Solution

In [44]:
def total_time(employees: pl.DataFrame) -> pl.DataFrame:
    """Sum the minutes each employee spent in the office on each day.

    Args:
        employees: One row per office visit, with the columns 'emp_id',
            'event_day', 'in_time' and 'out_time'.

    Returns:
        A frame with the columns 'day', 'emp_id' and 'total_time', holding one
        row per (day, emp_id) pair. Rows come back in no particular order.
    """
    # Both time columns are cast to Int64 before subtracting, so they may
    # arrive as strings (as they do in the test data of this notebook).
    minutes_per_visit = pl.col('out_time').cast(pl.Int64) - pl.col('in_time').cast(pl.Int64)

    return (
        employees
        .rename({'event_day': 'day'})
        .with_columns(minutes_per_visit.alias('total_time'))
        .group_by('day', 'emp_id')
        .agg(pl.col('total_time').sum())
    )

# Display the result
print(total_time(employees=employees))

shape: (4, 3)
┌────────────┬────────┬────────────┐
│ day        ┆ emp_id ┆ total_time │
│ ---        ┆ ---    ┆ ---        │
│ str        ┆ str    ┆ i64        │
╞════════════╪════════╪════════════╡
│ 2020-11-28 ┆ 1      ┆ 173        │
│ 2020-11-28 ┆ 2      ┆ 30         │
│ 2020-12-9  ┆ 2      ┆ 27         │
│ 2020-12-3  ┆ 1      ┆ 41         │
└────────────┴────────┴────────────┘


### Game Play Analysis I

#### Question

DataFrame: Activity

| Column Name  | Type    |
|:------------:|:-------:|
| player_id    | int     |
| device_id    | int     |
| event_date   | date    |
| games_played | int     |

(player_id, event_date) is the primary key (combination of columns with unique values) of this table.<br>
This table shows the activity of players of some games.<br>
Each row is a record of a player who logged in and played a number of games (possibly 0) before logging out on some day using some device.<br>
 

Write a solution to find the first login date for each player.

Return the result table in any order.

The result format is in the following example.

Input:<br>
Activity dataframe:

| player_id | device_id | event_date | games_played |
|:---------:|:---------:|:----------:|:------------:|
| 1         | 2         | 2016-03-01 | 5            |
| 1         | 2         | 2016-05-02 | 6            |
| 2         | 3         | 2017-06-25 | 1            |
| 3         | 1         | 2016-03-02 | 0            |
| 3         | 4         | 2018-07-03 | 5            |

Output: 

| player_id | first_login |
|:---------:|:-----------:|
| 1         | 2016-03-01  |
| 2         | 2017-06-25  |
| 3         | 2016-03-02  |

#### Testcase

In [45]:
# Test data: one inner list per login record
# (player_id, device_id, event_date, games_played).
data = [[1, 2, '2016-03-01', 5], [1, 2, '2016-05-02', 6], [2, 3, '2017-06-25', 1], [3, 1, '2016-03-02', 0], [3, 4, '2018-07-03', 5]]

# Create the DataFrame.
# orient='row' states explicitly that each inner list is one record, instead of
# leaving polars to infer the orientation from the dimensions of the data.
activity = pl.DataFrame(
    data,
    schema=['player_id', 'device_id', 'event_date', 'games_played'],
    orient='row'
)

# Display the DataFrame
print(activity)


shape: (5, 4)
┌───────────┬───────────┬────────────┬──────────────┐
│ player_id ┆ device_id ┆ event_date ┆ games_played │
│ ---       ┆ ---       ┆ ---        ┆ ---          │
│ i64       ┆ i64       ┆ str        ┆ i64          │
╞═══════════╪═══════════╪════════════╪══════════════╡
│ 1         ┆ 2         ┆ 2016-03-01 ┆ 5            │
│ 1         ┆ 2         ┆ 2016-05-02 ┆ 6            │
│ 2         ┆ 3         ┆ 2017-06-25 ┆ 1            │
│ 3         ┆ 1         ┆ 2016-03-02 ┆ 0            │
│ 3         ┆ 4         ┆ 2018-07-03 ┆ 5            │
└───────────┴───────────┴────────────┴──────────────┘


#### Solution

In [46]:
def game_analysis(activity: pl.DataFrame) -> pl.DataFrame:
    """Find the first login date of every player.

    Args:
        activity: One row per login record, with at least the columns
            'player_id' and 'event_date'.

    Returns:
        A frame with the columns 'player_id' and 'first_login', one row per
        player. Rows come back in no particular order.
    """
    # `min` is applied to 'event_date' as-is. When the dates are strings the
    # minimum is lexicographic, which equals the chronological minimum only
    # for zero-padded ISO dates (YYYY-MM-DD).
    earliest_login = pl.col('event_date').min().alias('first_login')

    return activity.group_by('player_id').agg(earliest_login)


# Display the result
print(game_analysis(activity=activity))

shape: (3, 2)
┌───────────┬─────────────┐
│ player_id ┆ first_login │
│ ---       ┆ ---         │
│ i64       ┆ str         │
╞═══════════╪═════════════╡
│ 2         ┆ 2017-06-25  │
│ 3         ┆ 2016-03-02  │
│ 1         ┆ 2016-03-01  │
└───────────┴─────────────┘


### Number of Unique Subjects Taught by Each Teacher

#### Question

DataFrame: Teacher

| Column Name | Type |
|:-----------:|:----:|
| teacher_id  | int  |
| subject_id  | int  |
| dept_id     | int  |

(subject_id, dept_id) is the primary key (combinations of columns with unique values) of this table.<br>
Each row in this table indicates that the teacher with teacher_id teaches the subject subject_id in the department dept_id.
 

Write a solution to calculate the number of unique subjects each teacher teaches in the university.

Return the result table in any order.

The result format is shown in the following example.

Input:<br>
Teacher dataframe:

| teacher_id | subject_id | dept_id |
|:----------:|:----------:|:-------:|
| 1          | 2          | 3       |
| 1          | 2          | 4       |
| 1          | 3          | 3       |
| 2          | 1          | 1       |
| 2          | 2          | 1       |
| 2          | 3          | 1       |
| 2          | 4          | 1       |

Output:  

| teacher_id | cnt |
|:----------:|:---:|
| 1          | 2   |
| 2          | 4   |

Explanation:<br>
Teacher 1:
  - They teach subject 2 in departments 3 and 4.
  - They teach subject 3 in department 3.

Teacher 2:
  - They teach subject 1 in department 1.
  - They teach subject 2 in department 1.
  - They teach subject 3 in department 1.
  - They teach subject 4 in department 1.

#### Testcase

In [47]:
# Test data: one inner list per teaching assignment (teacher_id, subject_id, dept_id).
data = [[1, 2, 3], [1, 2, 4], [1, 3, 3], [2, 1, 1], [2, 2, 1], [2, 3, 1], [2, 4, 1]]

# Create the DataFrame.
# orient='row' states explicitly that each inner list is one record, instead of
# leaving polars to infer the orientation from the dimensions of the data.
teacher = pl.DataFrame(
    data,
    schema=['teacher_id', 'subject_id', 'dept_id'],
    orient='row'
)

# Display the DataFrame
print(teacher)

shape: (7, 3)
┌────────────┬────────────┬─────────┐
│ teacher_id ┆ subject_id ┆ dept_id │
│ ---        ┆ ---        ┆ ---     │
│ i64        ┆ i64        ┆ i64     │
╞════════════╪════════════╪═════════╡
│ 1          ┆ 2          ┆ 3       │
│ 1          ┆ 2          ┆ 4       │
│ 1          ┆ 3          ┆ 3       │
│ 2          ┆ 1          ┆ 1       │
│ 2          ┆ 2          ┆ 1       │
│ 2          ┆ 3          ┆ 1       │
│ 2          ┆ 4          ┆ 1       │
└────────────┴────────────┴─────────┘


#### Solution

In [48]:
def count_unique_subjects(teacher: pl.DataFrame) -> pl.DataFrame:
    """Count the distinct subjects taught by every teacher.

    Args:
        teacher: One row per teaching assignment, with at least the columns
            'teacher_id' and 'subject_id'.

    Returns:
        A frame with the columns 'teacher_id' and 'cnt', one row per teacher.
        Rows come back in no particular order.
    """
    # A subject taught in several departments appears on several rows, so the
    # distinct count (not the row count) is what is needed here.
    distinct_subjects = pl.col('subject_id').n_unique().alias('cnt')

    return teacher.group_by('teacher_id').agg(distinct_subjects)

# Display the result
print(count_unique_subjects(teacher=teacher))

shape: (2, 2)
┌────────────┬─────┐
│ teacher_id ┆ cnt │
│ ---        ┆ --- │
│ i64        ┆ u32 │
╞════════════╪═════╡
│ 1          ┆ 2   │
│ 2          ┆ 4   │
└────────────┴─────┘


### Classes More Than 5 Students

#### Question

DataFrame: Courses

| Column Name | Type    |
|:-----------:|:-------:|
| student     | varchar |
| class       | varchar |

(student, class) is the primary key (combination of columns with unique values) for this table.<br>
Each row of this table indicates the name of a student and the class in which they are enrolled.
 

Write a solution to find all the classes that have at least five students.

Return the result table in any order.

The result format is in the following example.

Input:<br>
Courses dataframe:

| student | class    |
|:-------:|:--------:|
| A       | Math     |
| B       | English  |
| C       | Math     |
| D       | Biology  |
| E       | Math     |
| F       | Computer |
| G       | Math     |
| H       | Math     |
| I       | Math     |

Output: 

| class   |
|:-------:|
| Math    |

Explanation:<br>
- Math has 6 students, so we include it.
- English has 1 student, so we do not include it.
- Biology has 1 student, so we do not include it.
- Computer has 1 student, so we do not include it.

#### Testcase

In [49]:
# Test data
data = [['A', 'Math'], ['B', 'English'], ['C', 'Math'], ['D', 'Biology'], ['E', 'Math'], ['F', 'Computer'], ['G', 'Math'], ['H', 'Math'], ['I', 'Math']]

# Create the DataFrame
courses = pl.DataFrame(
    data,
    schema=['student', 'class']
)

# Display the DataFrame
print(courses)

shape: (9, 2)
┌─────────┬──────────┐
│ student ┆ class    │
│ ---     ┆ ---      │
│ str     ┆ str      │
╞═════════╪══════════╡
│ A       ┆ Math     │
│ B       ┆ English  │
│ C       ┆ Math     │
│ D       ┆ Biology  │
│ E       ┆ Math     │
│ F       ┆ Computer │
│ G       ┆ Math     │
│ H       ┆ Math     │
│ I       ┆ Math     │
└─────────┴──────────┘


#### Solution

In [50]:
def find_classes(courses: pl.DataFrame) -> pl.DataFrame:
    """Return the classes that have at least five students.

    Args:
        courses: One row per enrolment, with the columns 'student' and 'class'.

    Returns:
        A single-column frame ('class') listing every class whose student
        count is 5 or more. Rows come back in no particular order.
    """
    # Number of 'student' entries per class; used only for the filter below
    # and dropped from the returned frame.
    enrolment = pl.col('student').count().alias('student_count')

    return (
        courses
        .group_by('class')
        .agg(enrolment)
        .filter(pl.col('student_count') >= 5)
        .select('class')
    )

# Display the result
print(find_classes(courses=courses))

shape: (1, 1)
┌───────┐
│ class │
│ ---   │
│ str   │
╞═══════╡
│ Math  │
└───────┘


### Customer Placing the Largest Number of Orders

#### Question

DataFrame: Orders

| Column Name     | Type     |
|:---------------:|:--------:|
| order_number    | int      |
| customer_number | int      |

order_number is the primary key (column with unique values) for this table.<br>
This table contains information about the order ID and the customer ID.
 

Write a solution to find the customer_number for the customer who has placed the largest number of orders.

The test cases are generated so that exactly one customer will have placed more orders than any other customer.

The result format is in the following example.

Input:<br>
Orders dataframe:

| order_number | customer_number |
|:------------:|:---------------:|
| 1            | 1               |
| 2            | 2               |
| 3            | 3               |
| 4            | 3               |

Output:<br>

| customer_number |
|:---------------:|
| 3               |

Explanation:<br>
The customer with number 3 has two orders, which is greater than either customer 1 or 2 because each of them only has one order. 
So the result is customer_number 3.

#### Testcase

In [51]:
# Test data: one inner list per order (order_number, customer_number).
data = [[1, 1], [2, 2], [3, 3], [4, 3]]

# Create the DataFrame.
# orient='row' states explicitly that each inner list is one record, instead of
# leaving polars to infer the orientation from the dimensions of the data.
orders = pl.DataFrame(
    data,
    schema=['order_number', 'customer_number'],
    orient='row'
)

# Display the DataFrame
print(orders)

shape: (4, 2)
┌──────────────┬─────────────────┐
│ order_number ┆ customer_number │
│ ---          ┆ ---             │
│ i64          ┆ i64             │
╞══════════════╪═════════════════╡
│ 1            ┆ 1               │
│ 2            ┆ 2               │
│ 3            ┆ 3               │
│ 4            ┆ 3               │
└──────────────┴─────────────────┘


#### Solution

In [52]:
def largest_orders(orders: pl.DataFrame) -> pl.DataFrame:
    """Return the customer number(s) with the highest number of orders.

    Args:
        orders: One row per order, with at least the column 'customer_number'.

    Returns:
        A single-column frame ('customer_number'). If several customers share
        the highest count, all of them are returned.
    """
    # Orders placed by each customer
    orders_per_customer = orders.group_by('customer_number').agg(
        pl.col('customer_number').count().alias('count')
    )

    # Keep only the rows whose count equals the largest count in the frame
    has_most_orders = pl.col('count') == pl.col('count').max()

    return orders_per_customer.filter(has_most_orders).select('customer_number')


# Display the result
print(largest_orders(orders=orders))

shape: (1, 1)
┌─────────────────┐
│ customer_number │
│ ---             │
│ i64             │
╞═════════════════╡
│ 3               │
└─────────────────┘


### Group Sold Products By The Date

#### Question

DataFrame: Activities

| Column Name | Type    |
|:-----------:|:-------:|
| sell_date   | date    |
| product     | varchar |

There is no primary key (column with unique values) for this table. It may contain duplicates.<br>
Each row of this table contains the product name and the date it was sold in a market.
 

Write a solution to find for each date the number of different products sold and their names.

The sold products names for each date should be sorted lexicographically.

Return the result table ordered by sell_date.

The result format is in the following example.

Input:<br>
Activities dataframe:

| sell_date  | product    |
|:----------:|:----------:|
| 2020-05-30 | Headphone  |
| 2020-06-01 | Pencil     |
| 2020-06-02 | Mask       |
| 2020-05-30 | Basketball |
| 2020-06-01 | Bible      |
| 2020-06-02 | Mask       |
| 2020-05-30 | T-Shirt    |

Output:<br>

| sell_date  | num_sold | products                     |
|:----------:|:--------:|:----------------------------:|
| 2020-05-30 | 3        | Basketball,Headphone,T-Shirt |
| 2020-06-01 | 2        | Bible,Pencil                 |
| 2020-06-02 | 1        | Mask                         |

Explanation:<br>
For 2020-05-30, Sold items were (Headphone, Basketball, T-Shirt), we sort them lexicographically and separate them by a comma.<br>
For 2020-06-01, Sold items were (Pencil, Bible), we sort them lexicographically and separate them by a comma.<br>
For 2020-06-02, the Sold item is (Mask), we just return it.

#### Testcase

In [53]:
# Test data: one inner list per sale (sell_date, product). Duplicates are allowed.
data = [['2020-05-30', 'Headphone'], ['2020-06-01', 'Pencil'], ['2020-06-02', 'Mask'], ['2020-05-30', 'Basketball'], ['2020-06-01', 'Bible'], ['2020-06-02', 'Mask'], ['2020-05-30', 'T-Shirt']]

# Create the DataFrame.
# orient='row' states explicitly that each inner list is one record, instead of
# leaving polars to infer the orientation from the dimensions of the data.
activities = pl.DataFrame(
    data,
    schema=['sell_date', 'product'],
    orient='row'
)

# Display the DataFrame
print(activities)

shape: (7, 2)
┌────────────┬────────────┐
│ sell_date  ┆ product    │
│ ---        ┆ ---        │
│ str        ┆ str        │
╞════════════╪════════════╡
│ 2020-05-30 ┆ Headphone  │
│ 2020-06-01 ┆ Pencil     │
│ 2020-06-02 ┆ Mask       │
│ 2020-05-30 ┆ Basketball │
│ 2020-06-01 ┆ Bible      │
│ 2020-06-02 ┆ Mask       │
│ 2020-05-30 ┆ T-Shirt    │
└────────────┴────────────┘


#### Solution

In [54]:
def categorize_products(activities: pl.DataFrame) -> pl.DataFrame:
    """Summarise the distinct products sold on each date.

    Args:
        activities: One row per sale, with the columns 'sell_date' and
            'product'. Duplicate rows are allowed.

    Returns:
        A frame ordered by 'sell_date' with the columns:
          * 'sell_date' - the grouping key,
          * 'num_sold'  - number of distinct products sold on that date,
          * 'products'  - the distinct product names, sorted and joined by ','.
    """
    result = (
        activities
        .group_by('sell_date')
        .agg([
            pl.col('product').n_unique().alias('num_sold'),
            # Deduplicate first, then sort and join the names of each group
            pl.col('product')
              .unique()
              .map_elements(lambda x: ",".join(sorted(x)), return_dtype=pl.String)
              .alias('products')
        ])
        # group_by gives no guarantee on row order, but the task requires the
        # result ordered by sell_date, so sort explicitly.
        # NOTE(review): when 'sell_date' is a string column (as in the test
        # data) this is a lexicographic sort, which is chronological only for
        # zero-padded ISO dates.
        .sort('sell_date')
    )

    return result


# Display the result
print(categorize_products(activities=activities))

shape: (3, 3)
┌────────────┬──────────┬──────────────────────────────┐
│ sell_date  ┆ num_sold ┆ products                     │
│ ---        ┆ ---      ┆ ---                          │
│ str        ┆ u32      ┆ str                          │
╞════════════╪══════════╪══════════════════════════════╡
│ 2020-05-30 ┆ 3        ┆ Basketball,Headphone,T-Shirt │
│ 2020-06-01 ┆ 2        ┆ Bible,Pencil                 │
│ 2020-06-02 ┆ 1        ┆ Mask                         │
└────────────┴──────────┴──────────────────────────────┘


### Daily Leads and Partners

#### Question

DataFrame: DailySales


| Column Name | Type    |
|:-----------:|:-------:|
| date_id     | date    |
| make_name   | varchar |
| lead_id     | int     |
| partner_id  | int     |

There is no primary key (column with unique values) for this table. It may contain duplicates.<br>
This table contains the date and the name of the product sold and the IDs of the lead and partner it was sold to.<br>
The name consists of only lowercase English letters.
 

For each date_id and make_name, find the number of distinct lead_id's and distinct partner_id's.

Return the result table in any order.

The result format is in the following example.

Input:<br>
DailySales dataframe:

| date_id   | make_name | lead_id | partner_id |
|:---------:|:---------:|:-------:|:----------:|
| 2020-12-8 | toyota    | 0       | 1          |
| 2020-12-8 | toyota    | 1       | 0          |
| 2020-12-8 | toyota    | 1       | 2          |
| 2020-12-7 | toyota    | 0       | 2          |
| 2020-12-7 | toyota    | 0       | 1          |
| 2020-12-8 | honda     | 1       | 2          |
| 2020-12-8 | honda     | 2       | 1          |
| 2020-12-7 | honda     | 0       | 1          |
| 2020-12-7 | honda     | 1       | 2          |
| 2020-12-7 | honda     | 2       | 1          |

Output: 

| date_id   | make_name | unique_leads | unique_partners |
|:---------:|:---------:|:------------:|:---------------:|
| 2020-12-8 | toyota    | 2            | 3               |
| 2020-12-7 | toyota    | 1            | 2               |
| 2020-12-8 | honda     | 2            | 2               |
| 2020-12-7 | honda     | 3            | 2               |

Explanation:<br>
For 2020-12-8, toyota gets leads = [0, 1] and partners = [0, 1, 2] while honda gets leads = [1, 2] and partners = [1, 2].<br>
For 2020-12-7, toyota gets leads = [0] and partners = [1, 2] while honda gets leads = [0, 1, 2] and partners = [1, 2].

#### Testcase

In [55]:
# Test data: one inner list per sale (date_id, make_name, lead_id, partner_id).
data = [['2020-12-8', 'toyota', 0, 1], ['2020-12-8', 'toyota', 1, 0], ['2020-12-8', 'toyota', 1, 2], ['2020-12-7', 'toyota', 0, 2], ['2020-12-7', 'toyota', 0, 1], ['2020-12-8', 'honda', 1, 2], ['2020-12-8', 'honda', 2, 1], ['2020-12-7', 'honda', 0, 1], ['2020-12-7', 'honda', 1, 2], ['2020-12-7', 'honda', 2, 1]]

# Create the DataFrame.
# orient='row' states explicitly that each inner list is one record, instead of
# leaving polars to infer the orientation from the dimensions of the data.
daily_sales = pl.DataFrame(
    data,
    schema=['date_id', 'make_name', 'lead_id', 'partner_id'],
    orient='row'
)

# Display the DataFrame
print(daily_sales)

shape: (10, 4)
┌───────────┬───────────┬─────────┬────────────┐
│ date_id   ┆ make_name ┆ lead_id ┆ partner_id │
│ ---       ┆ ---       ┆ ---     ┆ ---        │
│ str       ┆ str       ┆ i64     ┆ i64        │
╞═══════════╪═══════════╪═════════╪════════════╡
│ 2020-12-8 ┆ toyota    ┆ 0       ┆ 1          │
│ 2020-12-8 ┆ toyota    ┆ 1       ┆ 0          │
│ 2020-12-8 ┆ toyota    ┆ 1       ┆ 2          │
│ 2020-12-7 ┆ toyota    ┆ 0       ┆ 2          │
│ 2020-12-7 ┆ toyota    ┆ 0       ┆ 1          │
│ 2020-12-8 ┆ honda     ┆ 1       ┆ 2          │
│ 2020-12-8 ┆ honda     ┆ 2       ┆ 1          │
│ 2020-12-7 ┆ honda     ┆ 0       ┆ 1          │
│ 2020-12-7 ┆ honda     ┆ 1       ┆ 2          │
│ 2020-12-7 ┆ honda     ┆ 2       ┆ 1          │
└───────────┴───────────┴─────────┴────────────┘


#### Solution

In [56]:
def daily_leads_and_partners(daily_sales: pl.DataFrame) -> pl.DataFrame:
    """Count distinct leads and partners for each (date_id, make_name) pair.

    Args:
        daily_sales: One row per sale, with the columns 'date_id',
            'make_name', 'lead_id' and 'partner_id'. Duplicate rows are allowed.

    Returns:
        A frame with the columns 'date_id', 'make_name', 'unique_leads' and
        'unique_partners', one row per (date_id, make_name) pair. Rows come
        back in no particular order.
    """
    distinct_leads = pl.col('lead_id').n_unique().alias('unique_leads')
    distinct_partners = pl.col('partner_id').n_unique().alias('unique_partners')

    return (
        daily_sales
        .group_by('date_id', 'make_name')
        .agg(distinct_leads, distinct_partners)
    )


# Display the result
print(daily_leads_and_partners(daily_sales=daily_sales))

shape: (4, 4)
┌───────────┬───────────┬──────────────┬─────────────────┐
│ date_id   ┆ make_name ┆ unique_leads ┆ unique_partners │
│ ---       ┆ ---       ┆ ---          ┆ ---             │
│ str       ┆ str       ┆ u32          ┆ u32             │
╞═══════════╪═══════════╪══════════════╪═════════════════╡
│ 2020-12-8 ┆ honda     ┆ 2            ┆ 2               │
│ 2020-12-7 ┆ toyota    ┆ 1            ┆ 2               │
│ 2020-12-8 ┆ toyota    ┆ 2            ┆ 3               │
│ 2020-12-7 ┆ honda     ┆ 3            ┆ 2               │
└───────────┴───────────┴──────────────┴─────────────────┘
