# Pandas Exercises
### By: Jingyu Li

In [1]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta

## 1. Pandas walkthrough: FIFA 20 complete player dataset

#### **Context & Acknowledgements**
The dataset includes the player data for Career Mode from FIFA 20. The dataset was downloaded from https://www.kaggle.com/stefanoleone992/fifa-20-complete-player-dataset. The original data was scraped from the publicly available website https://sofifa.com.

#### **Dataset Introduction**
The dataset has been loaded. All columns are self-explanatory.

In [2]:
# Read the FIFA 20 player table from the local data folder into `df`.
df = pd.read_csv(filepath_or_buffer='data/FIFA20_player.csv')

In [3]:
# Preview the first five rows (five is also head()'s default).
df.head(n=5)

Unnamed: 0,id,short_name,long_name,age,dob,height_cm,weight_kg,nationality,club,overall_score,value_eur,wage_eur,team_position
0,158023,L. Messi,Lionel Andrés Messi Cuccittini,32.0,6/24/1987,170.0,72.0,Argentina,FC Barcelona,94.0,95500000.0,565000.0,RW
1,20801,Cristiano Ronaldo,Cristiano Ronaldo dos Santos Aveiro,34.0,2/5/1985,187.0,83.0,Portugal,Juventus,93.0,58500000.0,405000.0,LW
2,190871,Neymar Jr,Neymar da Silva Santos Junior,27.0,2/5/1992,175.0,68.0,Brazil,Paris Saint-Germain,92.0,105500000.0,290000.0,CAM
3,200389,J. Oblak,Jan Oblak,26.0,1/7/1993,188.0,87.0,Slovenia,Atlético Madrid,91.0,77500000.0,125000.0,GK
4,183277,E. Hazard,Eden Hazard,28.0,1/7/1991,175.0,74.0,Belgium,Real Madrid,91.0,90000000.0,470000.0,LW


### 1.1 Handling Missing Data

In [4]:
# Q1: count the number of missing values in each column
# Note: for all questions, either modify "df" inplace or assign the new dataframe to "df"


In [5]:
# Q2: add a new category "unknown" into feature team_position


In [6]:
# Q3: impute missing values in value_eur and wage_eur by median of the column


In [7]:
# Q4: impute missing values in overall_score by mean of all players' score in the same club


In [8]:
# Q5: drop the rows with missing value


### 1.2 Data Transformation

In [9]:
# Q6: check if there is any duplicate row in the dataset. If yes, remove the duplicate row


In [10]:
# Q7: transform nationality into uppercase


In [11]:
# Q8: transform the format of club name
# example: Real Madrid --> Real_Madrid


In [12]:
# Q9: create a feature age_bin based on age
# the bins are (0,18], (18,22], (22,28], (28, 35], (35,100] and name each bin as b1, b2, b3, b4, b5


In [13]:
# Q10: count the frequency of each age_bin


In [14]:
# Q11: create a feature height_bin based on height_cm, and cut into quantiles (0, 0.1, 0.2, ..., 0.9, 1)


In [15]:
# Q12: create dummy variables based on team_position in the DataFrame


In [16]:
# Q13: calculate BMI for each player: BMI = weight (in kg) / height (in m)**2


In [17]:
# Q14: create a variable: standard_bmi which equals 1 if BMI in [18,24] else 0


### 1.3 Data Wrangling

In [18]:
# Q15: sort the DataFrame according to weight_kg (high to low) and height_cm (if weights are same, low to high)


In [19]:
# Q16: sort the DataFrame generated in Q15 according to row index (low to high)


In [20]:
# Load the FIFA 19 player file, then print its column / dtype / non-null summary.
data19 = pd.read_csv(filepath_or_buffer='data/FIFA19_player.csv')
data19.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17770 entries, 0 to 17769
Data columns (total 4 columns):
sofifa_id        17770 non-null int64
short_name       17770 non-null object
weight_kg        17770 non-null int64
overall_score    17770 non-null int64
dtypes: int64(3), object(1)
memory usage: 555.4+ KB


In [21]:
# Build the "long" practice frame: keep the two identifier columns, unpivot every
# other column into (variable, value) pairs, order by player id and renumber rows.
practice = (
    data19
    .melt(id_vars=['sofifa_id', 'short_name'])
    .sort_values(by='sofifa_id')
    .reset_index(drop=True)  # discard the old row labels instead of keeping them as a column
)
practice.head(n=5)

Unnamed: 0,sofifa_id,short_name,variable,value
0,164,G. Pinzi,overall_score,70
1,164,G. Pinzi,weight_kg,76
2,657,D. Vaughan,overall_score,66
3,657,D. Vaughan,weight_kg,70
4,768,Felipe,overall_score,75


In [22]:
# Q17: the DataFrame "practice" is in "long" format, transform it into 'wide' format (reverse operation of pd.melt)
# then merge the players' weight_kg and overall_score into the DataFrame of FIFA2020


In [23]:
# Q18: create a variable: score_change, based on the change rate
#      r = overall_score (FIFA 20) / overall_score (FIFA 19) - 1, which equals
# - "increase" if r >= 2%
# - "decrease" if r <= -2%
# - otherwise "equal"

# then count the number of players in each category


### 1.4 Data Aggregation

In [24]:
# Q19: calculate the average players' overall_score of each club


In [25]:
# Q20: calculate the median value_eur for each team_position and age_bin (group by the two columns)
# then pivot the index of age_bin to column index


In [26]:
# Q21: calculate the following value for each team_position: 
# - wage_eur: median, max()-min()
# - BMI: standard deviation, max, min, max()-min()
# - age: mean

# then change the column index to one level and find a proper way to rename the column index, e.g. wage_eur_median


In [27]:
# Q22: find the top 5 most valuable players (by value_eur) in each team_position


In [28]:
# Q23: calculate the weighted average overall_score of each age_bin
# The weights come from a privately seeded generator, so every run of the notebook
# draws the same values (reproducible answers) and the global NumPy random state is
# not consumed. A 1-D array with one entry per row of df is drawn, so the column
# assignment does not rely on pandas squeezing an (n, 1) array.
df['personal_weight'] = np.random.RandomState(42).uniform(0, 1, df.shape[0]) # don't remove


In [29]:
# Q24: calculate the correlation coefficients between height_cm and weight_kg for each team_position


## 2. Pandas on Time Series

### 2.1 Date and Time Data Types: Basic

In [30]:
# Q25: change column dob's dtype to datetime


In [31]:
# Q26: count the number of players born in different months (01, 02, ..., 11, 12)


### 2.2 Time Series: Indexing and Slicing

#### **Context & Acknowledgements**
This is a dataset hosted by the City of New York. It includes the daily number of individuals residing in the Department of Homeless Services (DHS) shelter system. The dataset was downloaded from https://www.kaggle.com/new-york-city/nyc-dhs-daily-report. The original dataset is maintained using Socrata's API and Kaggle's API.

#### **Dataset Introduction**
The dataset has been loaded. All columns are self-explanatory. The dataset is modified for exercising purpose. It may not reflect the real statistics.

In [32]:
# Load the DHS daily report, parse the Date column to datetimes, use it as the
# row index and order the rows along that index.
dhs = (
    pd.read_csv(filepath_or_buffer='data/dhs_daily_report.csv')
    .assign(Date=lambda report: pd.to_datetime(report['Date']))
    .set_index('Date')
    .sort_index(axis=0)
)

In [33]:
# Preview the first five rows of the date-indexed report.
dhs.head(n=5)

Unnamed: 0_level_0,Total_Individuals_in_Shelter
Date,Unnamed: 1_level_1
2013-08-21,49673
2013-08-22,49690
2013-08-23,49548
2013-08-24,49617
2013-08-25,49858


In [34]:
# Q27: return the daily number during Nov, 2018


In [35]:
# Q28: return the daily number from 2018-03-12 to 2018-10-29


In [36]:
# Q29: return the daily number after 2017-01-01


In [37]:
# Q30: return the daily number of every Friday


In [38]:
# Q31: return the average number of different days of the week (Sunday, Monday, ...)


### 2.3 Time Series: Date Range and Shifting

In [39]:
# Q32: generate DatetimeIndex ranging from 2017-01-01 to 2018-01-01


In [40]:
# Q33: generate DatetimeIndex of 100 days starting from 2017-01-01


In [41]:
# Q34: generate DatetimeIndex of business days ranging from 2017-01-01 to 2018-01-01


In [42]:
# Q35: generate DatetimeIndex of the dates of each Tuesday ranging from 2017-01-01 to 2018-01-01


In [43]:
# Q36: generate DatetimeIndex of first calendar days of each month ranging from 2017-01-01 to 2019-7-31
# e.g. 2017-01-01, 2017-02-01, ...


In [44]:
# Q37: generate DatetimeIndex of last calendar days of each quarter ranging from 2017-01-01 to 2019-7-31 
# e.g. 2017-03-31, 2017-06-30, 2017-09-30, ...


In [45]:
# Q38: generate DatetimeIndex for every 4 hours ranging from 2017-01-01 00:00 to 2017-01-04 22:00


In [46]:
# Q39: return the daily number in every first day of each month (using date range)


In [47]:
# Q40: return the daily number of every Sunday (using date range)


In [48]:
# Q41: create a new column: daily_change_percent, which record the daily change rate (day2 number/day1 number - 1)


In [49]:
# Q42: return a new dataframe as described below:
# - index: Date of each month's last day
# - total_end_of_month: daily number of the corresponding day
# - total_end_of_previous_month: daily number on the last day of previous month


### 2.4 Time Series: Resampling

In [50]:
# Q43: calculate the average number in each month and change the index to corresponding PeriodIndex (downsampling)


In [51]:
# Q43 (part 2): calculate the following metrics in each quarter and change the index to corresponding PeriodIndex (downsampling)
# - mean
# - max-min
# - lastday/firstday-1


In [52]:
# The following problems use the DataFrame ex
dates = pd.to_datetime(['2019-01-01', '2019-01-02', '2019-01-02', '2019-01-03', '2019-01-05', '2019-01-08'])
ex = pd.DataFrame(np.random.randn(len(dates)), columns=['a'], index=dates)
ex

Unnamed: 0,a
2019-01-01,-0.379247
2019-01-02,1.03013
2019-01-02,-1.474037
2019-01-03,0.743005
2019-01-05,-0.297177
2019-01-08,0.427858


In [53]:
# Q44: check if duplicate dates exist. If yes, group by them using mean, assign the new DataFrame to ex


In [54]:
# Q45: suppose the ex should be daily basis, check if some dates are missing in ex.
# If yes, add the missing dates into ex and fill the missing value using the value of previous day (upsampling)


### 2.5 Time Series: Moving Window Function

In [55]:
# The following problems use the DataFrame dhs
# Q46: calculate the average number of last 7 days


In [56]:
# Q47: calculate the standard deviation of the number from day 1 up to now


## 3. Rewrite SQL in Pandas

### 3.1 Basics

In the *salary* table, the salaries of employees are recorded.
- Id: ID of the employee
- Name: Name of the employee
- Salary: Monthly salary of the employee
- DepartmentId: ID of the department that the employee belongs to

In *dp* table,
- Id: ID of the department
- Name: Name of the department

Rewrite the query to generate same table (DataFrame) by using Pandas

In [57]:
# Load the employee salary table and preview its first five rows.
salary = pd.read_csv(filepath_or_buffer='data/salary.csv')
salary.head(n=5)

Unnamed: 0,Id,Name,Salary,DepartmentId
0,1,William,1600,1
1,2,James,1140,2
2,3,Harper,4788,2
3,4,Mason,6567,2
4,5,Evelyn,1112,1


In [58]:
# Load the department lookup table and display it in full.
dp = pd.read_csv(filepath_or_buffer='data/department.csv')
dp

Unnamed: 0,Id,Name
0,1,IT
1,2,DS
2,3,Sales
3,4,HR
4,5,Finance
5,6,Product


In [59]:
# Q48: get the second highest value of salary among all employees
'''
SELECT DISTINCT Salary AS secondhigh
FROM salary
ORDER BY Salary DESC
LIMIT 1 OFFSET 1
'''


'\nSELECT DISTINCT Salary AS secondhigh\nFROM salary\nORDER BY Salary DESC\nLIMIT 1 OFFSET 1\n'

In [60]:
# Q49: query the Name and Salary of employees who satisfy the following conditions
# - salary is greater than 2000
# - in department 1 or 2
# order the results by Salary from low to high
'''
SELECT Name, Salary
FROM salary
WHERE Salary>2000 AND (DepartmentId=1 OR DepartmentId=2)
ORDER BY Salary
'''


'\nSELECT Name, Salary\nFROM salary\nWHERE Salary>2000 AND (DepartmentId=1 OR DepartmentId=2)\nORDER BY Salary\n'

In [61]:
# Q50: query the average salary of each department, the table contains two column:
# -name: name of the department
# -avg_salary: average salary of the department
# All departments in dp should be listed. If there aren't any employees belonging to a department in salary table,
# then avg_salary should be np.nan
# Order by average salary from high to low
'''
SELECT d.Name, sub.avg_salary
FROM dp d
LEFT JOIN (SELECT DepartmentId, AVG(Salary) AS avg_salary FROM salary GROUP BY DepartmentId) sub ON sub.DepartmentId=d.Id
ORDER BY sub.avg_salary DESC
'''


'\nSELECT d.Name, sub.avg_salary\nFROM dp d\nLEFT JOIN (SELECT DepartmentId, AVG(Salary) AS avg_salary FROM salary GROUP BY DepartmentId) sub ON sub.DepartmentId=d.Id\nORDER BY sub.avg_salary DESC\n'

In [62]:
# Q51: return the employee's name whose id is in the following list: [1,3,7,9,14,15,19]
'''
SELECT Name
FROM salary
WHERE Id IN (1,3,7,9,14,15,19)
'''


'\nSELECT Name\nFROM salary\nWHERE Id IN (1,3,7,9,14,15,19)\n'

In [63]:
# Q52: query the average salary of each department in which the number of employees is not less than 5
# the table contains three column:
# -Id: ID of the department
# -avg_salary: average salary of the department
# -employee_num: number of employees in that department
'''
SELECT DepartmentId AS Id, AVG(Salary) AS avg_salary, COUNT(Id) AS employee_num
FROM Salary
GROUP BY DepartmentId
HAVING COUNT(Id)>=5
'''


'\nSELECT DepartmentId AS Id, AVG(Salary) AS avg_salary, COUNT(Id) AS employee_num\nFROM Salary\nGROUP BY DepartmentId\nHAVING COUNT(Id)>=5\n'

In [64]:
# Q53: query the top 3 Salaries of each department. Only the departments that appear in the salary table should be returned
# the table contains three column:
# -department: Name of the department
# -employee: Name of the employee
# -salary: salary of the employee
# Note: rank the salary using dense rank 
# e.g. a-1000, b-800, c-800, d-600, b and c both earn the second highest salary, d earn the third highest salary
# a, b, c and d should be all included
'''
WITH sub AS(
SELECT DepartmentId, Name AS employee, Salary AS salary,
       DENSE_RANK() OVER (PARTITION BY DepartmentId ORDER BY Salary DESC) AS ranks
FROM salary
)

SELECT d.Name AS department, sub.employee, sub.salary
FROM sub
LEFT JOIN dp d ON d.Id=sub.DepartmentId
WHERE sub.ranks <= 3
'''


'\nWITH sub AS(\nSELECT DepartmentId, Name AS employee, Salary AS salary,\n       DENSE_RANK() OVER (PARTITION BY DepartmentId ORDER BY Salary DESC) AS ranks\nFROM salary\n)\n\nSELECT d.Name AS department, sub.employee, sub.salary\nFROM sub\nLEFT JOIN dp d ON d.Id=sub.DepartmentId\nWHERE sub.ranks <= 3\n'

In the *sales* table, the daily revenue of products 1, 2 and 3 from 2020-01-01 to 2020-02-29 is recorded. We can assume the dates in the table are continuous and there are no duplicates.
- Id: Product ID
- Date: Date
- Revenue: Daily revenue

Rewrite the query to generate same table (DataFrame) by using Pandas

In [65]:
# Load the daily product revenue table and preview its first five rows.
sales = pd.read_csv(filepath_or_buffer='data/sales.csv')
sales.head(n=5)

Unnamed: 0,Id,Date,Revenue
0,3,1/19/2020,718
1,1,1/22/2020,690
2,2,1/22/2020,172
3,1,2/22/2020,139
4,2,2/22/2020,707


In [66]:
# Q54: query the cumulative revenue from day 1 for each product. Order the results by Id (1,2,3) and Date (low to high)
'''
SELECT Id, Date, SUM(Revenue) OVER (PARTITION BY Id ORDER BY Date) AS cum_revenue
FROM sales
ORDER BY Id, Date
'''


'\nSELECT Id, Date, SUM(Revenue) OVER (PARTITION BY Id ORDER BY Date) AS cum_revenue\nFROM sales\nORDER BY Id, Date\n'

In [67]:
# Q55: return each product's total revenue in Jan and Feb of 2020 
'''
SELECT LEFT(Date, 7) AS month, Id, SUM(Revenue)
FROM sales
GROUP BY LEFT(Date, 7), Id
'''


'\nSELECT LEFT(Date, 7) AS month, Id, SUM(Revenue)\nFROM sales\nGROUP BY LEFT(Date, 7), Id\n'

In [68]:
# Q56: return total daily revenue from 2020-01-15 to 2020-02-03 
'''
SELECT Date, SUM(Revenue)
FROM sales
WHERE Date BETWEEN '2020-01-15' AND '2020-02-03'
GROUP BY Date
'''


"\nSELECT Date, SUM(Revenue)\nFROM sales\nWHERE Date BETWEEN '2020-01-15' AND '2020-02-03'\nGROUP BY Date\n"

In [69]:
# Q57: return each product's total revenue on each day of the week. Columns of the table are:
# - id: product ID
# - monday: total revenue on Monday
# - tuesday: total revenue on Tuesday
# ......
# - sunday: total revenue on Sunday
'''
SELECT Id,
       SUM(CASE WHEN dow=2 THEN total ELSE 0 END) 'Monday',
       SUM(CASE WHEN dow=3 THEN total ELSE 0 END) 'Tuesday',
       SUM(CASE WHEN dow=4 THEN total ELSE 0 END) 'Wednesday',
       SUM(CASE WHEN dow=5 THEN total ELSE 0 END) 'Thursday',
       SUM(CASE WHEN dow=6 THEN total ELSE 0 END) 'Friday',
       SUM(CASE WHEN dow=7 THEN total ELSE 0 END) 'Saturday',
       SUM(CASE WHEN dow=1 THEN total ELSE 0 END) 'Sunday'
FROM
(
SELECT Id,
       DAYOFWEEK(Date) AS dow,
       SUM(Revenue) AS total
FROM sales
GROUP BY Id, dow) sub
GROUP BY Id
'''


"\nSELECT Id,\n       SUM(CASE WHEN dow=2 THEN total ELSE 0 END) 'Monday',\n       SUM(CASE WHEN dow=3 THEN total ELSE 0 END) 'Tuesday',\n       SUM(CASE WHEN dow=4 THEN total ELSE 0 END) 'Wednesday',\n       SUM(CASE WHEN dow=5 THEN total ELSE 0 END) 'Thursday',\n       SUM(CASE WHEN dow=6 THEN total ELSE 0 END) 'Friday',\n       SUM(CASE WHEN dow=7 THEN total ELSE 0 END) 'Saturday',\n       SUM(CASE WHEN dow=1 THEN total ELSE 0 END) 'Sunday'\nFROM\n(\nSELECT Id,\n       DAYOFWEEK(Date) AS dow,\n       SUM(Revenue) AS total\nFROM sales\nGROUP BY Id, dow) sub\nGROUP BY Id\n"

In [70]:
# Q58: return the rolling average of daily revenue in last 14 days
# - id: product ID
# - date: date
# - rolling_avg: average revenue in last 14 days
# Note: the first 13 days shouldn't be included in the final output
# NOTE(review): the reference query below has no PARTITION BY Id, so its 14-row window
# runs over the rows of all products ordered by Date rather than over 14 days of one
# product. Its WHERE clause is also applied before the window function is evaluated,
# so the earliest rows it returns are averaged over fewer than 14 rows.
# The pandas answer presumably should follow the per-product intent described in the
# bullet list above -- TODO confirm with the exercise author.
'''
SELECT Id, Date, AVG(Revenue) OVER (ORDER BY Date ROWS BETWEEN 13 PRECEDING AND 0 FOLLOWING) AS rolling_avg
FROM sales
WHERE Date >= (SELECT DISTINCT Date FROM sales ORDER BY Date LIMIT 1 OFFSET 13)
'''


'\nSELECT Id, Date, AVG(Revenue) OVER (ORDER BY Date ROWS BETWEEN 13 PRECEDING AND 0 FOLLOWING) AS rolling_avg\nFROM sales\nWHERE Date >= (SELECT DISTINCT Date FROM sales ORDER BY Date LIMIT 1 OFFSET 13)\n'

In [71]:
# Q59: calculate the daily changes in overall revenue of three products
'''
SELECT Date, daily_total/LAG(daily_total, 1) OVER (ORDER BY Date)-1 AS change_rate
FROM(
SELECT Date, SUM(Revenue) AS daily_total
FROM sales
GROUP BY Date
ORDER BY Date) sub
'''


'\nSELECT Date, daily_total/LAG(daily_total, 1) OVER (ORDER BY Date)-1 AS change_rate\nFROM(\nSELECT Date, SUM(Revenue) AS daily_total\nFROM sales\nGROUP BY Date\nORDER BY Date) sub\n'

### 3.2 Case: Game Play Analysis

This case contains a series of problems adapted from SQL exercises on LeetCode.

In *activity* table, each row is a record of a player who logged in and played a number of games (possibly 0) before logging out on some day using some device.
- player_id: ID of the player
- device_id: ID of the device
- event_date: Date of the record
- games_played: number of games played

Rewrite the query to generate same table (DataFrame) by using Pandas

In [72]:
# Load the player activity log.
activity = pd.read_csv(r'data/activity.csv')
# Show five randomly chosen rows; the fixed random_state makes the same five rows
# appear on every run, so the displayed preview is reproducible.
activity.sample(5, random_state=42)

Unnamed: 0,player_id,device_id,event_date,games_played
862,6,186,1/16/2019,5
887,13,178,3/15/2019,71
215,43,157,1/10/2019,45
812,34,38,2/3/2019,63
313,19,120,3/9/2019,54


In [73]:
# Q60: report the first login date for each player.
'''
SELECT player_id, MIN(event_date) AS first_login
FROM Activity
GROUP BY player_id
'''


'\nSELECT player_id, MIN(event_date) AS first_login\nFROM Activity\nGROUP BY player_id\n'

In [74]:
# Q61: report the device that is first logged in for each player.
'''
SELECT player_id, device_id
FROM
(SELECT player_id, device_id, RANK() OVER (PARTITION BY player_id ORDER BY event_date) AS ranks FROM Activity) sub
WHERE ranks = 1
'''


'\nSELECT player_id, device_id\nFROM\n(SELECT player_id, device_id, RANK() OVER (PARTITION BY player_id ORDER BY event_date) AS ranks FROM Activity) sub\nWHERE ranks = 1\n'

In [75]:
# Q62: report for each player and date, how many games played so far by the player. 
# That is, the total number of games played by the player until that date.
'''
SELECT a1.player_id, a1.event_date, SUM(a2.games_played) AS games_played_so_far
FROM Activity a1
LEFT JOIN Activity a2 ON a1.player_id=a2.player_id AND a1.event_date >= a2.event_date
GROUP BY a1.player_id, a1.event_date
ORDER BY a1.player_id, a1.event_date
'''


'\nSELECT a1.player_id, a1.event_date, SUM(a2.games_played) AS games_played_so_far\nFROM Activity a1\nLEFT JOIN Activity a2 ON a1.player_id=a2.player_id AND a1.event_date >= a2.event_date\nGROUP BY a1.player_id, a1.event_date\nORDER BY a1.player_id, a1.event_date\n'

In [76]:
# Q63: report the fraction of players that logged in again on the day after the day they first logged in. 
# In other words, count the number of players that logged in for at least two consecutive days 
# starting from their first login date, then divide that number by the total number of players.
'''
WITH firstlog AS
(
SELECT player_id, event_date, RANK() OVER (PARTITION BY player_id ORDER BY event_date) AS ranks
FROM Activity
),

daytwo AS
(
SELECT a1.player_id
FROM firstlog a1, firstlog a2
WHERE a1.player_id=a2.player_id AND DATEDIFF(a2.event_date, a1.event_date) = 1 AND a1.ranks=1
)

SELECT ROUND(COUNT(DISTINCT daytwo.player_id)/COUNT(DISTINCT a.player_id),2) AS fraction
FROM daytwo, Activity a
'''


'\nWITH firstlog AS\n(\nSELECT player_id, event_date, RANK() OVER (PARTITION BY player_id ORDER BY event_date) AS ranks\nFROM Activity\n),\n\ndaytwo AS\n(\nSELECT a1.player_id\nFROM firstlog a1, firstlog a2\nWHERE a1.player_id=a2.player_id AND DATEDIFF(a2.event_date, a1.event_date) = 1 AND a1.ranks=1\n)\n\nSELECT ROUND(COUNT(DISTINCT daytwo.player_id)/COUNT(DISTINCT a.player_id),2) AS fraction\nFROM daytwo, Activity a\n'

In [77]:
# Q64: define the install date of a player to be the first login day of that player.
# define day 1 retention of some date X to be the number of players whose install date is X and 
# they logged back in on the day right after X, divided by the number of players whose install date is X.
# report for each install date, the number of players that installed the game on that day and the day 1 retention.
# - install_dt: install date
# - installs: the number of players that installed the game on that day
# - d1_retention: day 1 retention rate of the install date
'''
SELECT a1.register_date AS install_dt,
       COUNT(a1.register_date) AS installs,
       ROUND(COUNT(a2.event_date) / COUNT(a1.register_date)*1.0,2) AS d1_retention
FROM 
    (SELECT player_id, MIN(event_date) AS register_date
     FROM Activity
     GROUP BY player_id) a1
LEFT JOIN Activity a2 ON a1.player_id = a2.player_id AND DATEDIFF(a2.event_date, a1.register_date) =1
GROUP BY a1.register_date
'''


'\nSELECT a1.register_date AS install_dt,\n       COUNT(a1.register_date) AS installs,\n       ROUND(COUNT(a2.event_date) / COUNT(a1.register_date)*1.0,2) AS d1_retention\nFROM \n    (SELECT player_id, MIN(event_date) AS register_date\n     FROM Activity\n     GROUP BY player_id) a1\nLEFT JOIN Activity a2 ON a1.player_id = a2.player_id AND DATEDIFF(a2.event_date, a1.register_date) =1\nGROUP BY a1.register_date\n'