In [116]:
# Required libraries
from sqlalchemy import create_engine
import pandas as pd
import haversine

def query_to_dataframe(connection_string, query):
    """
    Query a MySQL database using SQLAlchemy and return the result as a pandas DataFrame.

    Parameters:
    - connection_string (str): The connection string for the database.
    - query (str): The SQL query to be executed.

    Returns:
    - df (pd.DataFrame): The result of the query as a pandas DataFrame.
    """
    # Create an engine
    engine = create_engine(connection_string)
    
    # Connect to the engine and execute the query
    with engine.connect() as connection:
        df = pd.read_sql(query, connection)
        
    return df




In [6]:
# Example Usage:

# Define the connection string
# Format: "mysql+<driver_name>://<username>:<password>@<host>:<port>/<database_name>"
connection_string = "mysql+mysqlconnector://arasham:Generalfuzn03?@tdt4225-10.idi.ntnu.no:3306/default_db"

In [7]:
sql_query1 = "select * from user"
sql_query2 = "select * from activity"
sql_query3 = "select * from track_point"


In [8]:
df1 = query_to_dataframe(connection_string, sql_query1)
df1

Unnamed: 0,id,has_labels
0,000,0
1,001,0
2,002,0
3,003,0
4,004,0
...,...,...
168,176,0
169,178,0
170,179,1
171,180,0


In [9]:
df2 = query_to_dataframe(connection_string, sql_query2)
df2

Unnamed: 0,id,user_id,transportation_mode,start_date_time,end_date_time
0,20000101231219163,163,,2000-01-01 23:12:19,2000-01-01 23:15:23
1,20070412093132142,142,,2007-04-12 09:31:32,2007-04-12 11:33:40
2,20070412101853161,161,,2007-04-12 10:18:53,2007-04-12 10:23:15
3,20070412102116163,163,bike,2007-04-12 10:21:16,2007-04-12 14:56:56
4,20070412102325161,161,,2007-04-12 10:23:25,2007-04-12 10:26:25
...,...,...,...,...,...
16043,20120724204557163,163,,2012-07-24 20:45:57,2012-07-24 22:01:15
16044,20120725080203163,163,,2012-07-25 08:02:03,2012-07-25 08:26:13
16045,20120725190300163,163,,2012-07-25 19:03:00,2012-07-25 19:55:08
16046,20120726080106163,163,,2012-07-26 08:01:06,2012-07-26 08:27:24


In [51]:
df2["transportation_mode"].unique()

array([None, 'bike', 'taxi', 'walk', 'car', 'bus', 'subway', 'run',
       'train', 'airplane', 'boat'], dtype=object)

In [10]:
df3 = query_to_dataframe(connection_string, sql_query3)
df3

Unnamed: 0,id,activity_id,lat,lon,altitude,date_days,date_time
0,1,20090412073303000,40.000017,116.327479,105,39915.314618,2009-04-12 07:33:03
1,2,20090412073303000,40.000168,116.327474,80,39915.314688,2009-04-12 07:33:09
2,3,20090412073303000,40.000055,116.327454,99,39915.314745,2009-04-12 07:33:14
3,4,20090412073303000,40.000021,116.327407,109,39915.314803,2009-04-12 07:33:19
4,5,20090412073303000,40.000035,116.327281,111,39915.314861,2009-04-12 07:33:24
...,...,...,...,...,...,...,...
9681751,9681752,20071208010931181,39.866483,116.415383,249,39424.298426,2007-12-08 07:09:44
9681752,9681753,20071208010931181,39.866033,116.415683,174,39424.299537,2007-12-08 07:11:20
9681753,9681754,20071208010931181,39.865550,116.415733,187,39424.300486,2007-12-08 07:12:42
9681754,9681755,20071208010931181,39.864883,116.415750,226,39424.301412,2007-12-08 07:14:02


In [16]:
#Task 1: 

sql_query1 = """
    SELECT COUNT(*) AS num_users
    FROM user;
"""

sql_query2 = """
    SELECT COUNT(*) AS num_activities 
    FROM activity;
"""

sql_query3 = """ 
    SELECT COUNT(*) AS num_trackpoints
    FROM track_point;
"""

df = query_to_dataframe(connection_string, sql_query3)
df





Unnamed: 0,num_trackpoints
0,9681756


In [82]:
# Task 2
sql_query = """
    WITH user_tp_count AS (
        SELECT user_id, count(*) AS tp_count
        FROM track_point tp JOIN activity a ON a.id = tp.activity_id
        JOIN user u ON a.user_id = u.id
        GROUP BY user_id
    )
    SELECT MIN(tp_count), MAX(tp_count), AVG(tp_count) FROM user_tp_count;
    """

# Get the data as a DataFrame
df = query_to_dataframe(connection_string, sql_query)
df

Unnamed: 0,MIN(tp_count),MAX(tp_count),AVG(tp_count)
0,17,1010325,55963.9075


In [83]:
# Task 3

sql_query = """ 
    SELECT user_id, count(a.id) AS activity_count
    FROM user u JOIN activity a ON a.user_id = u.id
    GROUP BY u.id
    ORDER BY activity_count DESC
    LIMIT 15;
"""
df = query_to_dataframe(connection_string, sql_query)
df

Unnamed: 0,user_id,activity_count
0,128,2102
1,153,1793
2,25,715
3,163,704
4,62,691
5,144,563
6,41,399
7,85,364
8,4,346
9,140,345


In [85]:
#Task 4

sql_query = """ 
    SELECT user_id, transportation_mode  
    FROM user u JOIN activity a ON a.user_id = u.id
    WHERE a.transportation_mode = "bus"; 
"""
df = query_to_dataframe(connection_string, sql_query)
df

Unnamed: 0,user_id,transportation_mode
0,091,bus
1,175,bus
2,091,bus
3,092,bus
4,010,bus
...,...,...
194,085,bus
195,085,bus
196,085,bus
197,085,bus


In [86]:
#Task 5

#This is including those that have transportation_mode = NONE
sql_query = """ 
    SELECT user_id, COUNT(DISTINCT a.transportation_mode) AS num_of_different_transport_mode
    FROM user u JOIN activity a ON a.user_id = u.id
    GROUP BY u.id
    ORDER BY num_of_different_transport_mode DESC
    LIMIT 10; 
"""
df = query_to_dataframe(connection_string, sql_query)
df


Unnamed: 0,user_id,num_of_different_transport_mode
0,128,9
1,62,7
2,85,4
3,84,3
4,58,3
5,163,3
6,78,3
7,81,3
8,112,3
9,65,2


In [81]:
#Task 5 ALternative way
sql_query1 = """ 
    SELECT transportation_mode 
    FROM activity
"""
activity_df = query_to_dataframe(connection_string, sql_query1)
trans_mode_list = activity_df["transportation_mode"].unique()

select_statement = ", ".join([f"MAX(CASE WHEN transportation_mode = '{mode}' THEN 1 ELSE 0 END) AS {mode}" for mode in trans_mode_list])

sql_query = f"""
    SELECT
        user_id, 
        {select_statement},
        COUNT(DISTINCT a.transportation_mode) as num_of_different_transport_mode
    FROM user u JOIN activity a ON a.user_id = u.id
    GROUP BY u.id
    ORDER BY num_of_different_transport_mode DESC
    LIMIT 10; 
"""
df = query_to_dataframe(connection_string, sql_query)
df


Unnamed: 0,user_id,None,bike,taxi,walk,car,bus,subway,run,train,airplane,boat,num_of_different_transport_mode
0,128,0,1,1,1,1,1,1,0,1,1,1,9
1,62,0,1,1,1,1,1,0,1,1,0,0,7
2,85,0,0,1,1,0,1,1,0,0,0,0,4
3,84,0,0,0,1,0,1,1,0,0,0,0,3
4,58,0,0,1,1,1,0,0,0,0,0,0,3
5,163,0,1,1,1,0,0,0,0,0,0,0,3
6,78,0,0,1,1,0,0,1,0,0,0,0,3
7,81,0,1,0,1,0,1,0,0,0,0,0,3
8,112,0,1,0,1,0,1,0,0,0,0,0,3
9,65,0,1,0,1,0,0,0,0,0,0,0,2


In [87]:
#Task 6

sql_query = """ 

    SELECT id, COUNT(*) AS activity_count
    FROM activity 
    GROUP BY id
    HAVING COUNT(*) > 1 
    ORDER BY activity_count DESC;
"""

df = query_to_dataframe(connection_string, sql_query)
df

Unnamed: 0,id,activity_count


In [111]:
#Task 7a 

sql_query = """

    SELECT COUNT(DISTINCT user_id) as number_of_users
    FROM activity a 
    WHERE DATE(a.start_date_time) != DATE(a.end_date_time) 
    AND TIMESTAMPDIFF(DAY, a.start_date_time, a.end_date_time) >= 1

"""

df = query_to_dataframe(connection_string, sql_query)
df

Unnamed: 0,number_of_users
0,5


In [115]:
#Task 7b 

sql_query = """

    SELECT a.transportation_mode, user_id, 
    CONCAT(
        TIMESTAMPDIFF(DAY, a.start_date_time, a.end_date_time), ' days ',
        HOUR(TIMEDIFF(a.end_date_time, a.start_date_time)), ' hours ',
        MINUTE(TIMEDIFF(a.end_date_time, a.start_date_time)), ' minutes ',
        SECOND(TIMEDIFF(a.end_date_time, a.start_date_time)), ' seconds'
    ) AS Duration
    FROM activity a
    WHERE DATE(a.start_date_time) != DATE(a.end_date_time)
    AND TIMESTAMPDIFF(DAY, a.start_date_time, a.end_date_time) >= 1

"""

df = query_to_dataframe(connection_string, sql_query)
df

Unnamed: 0,transportation_mode,user_id,Duration
0,,99,1 days 28 hours 49 minutes 19 seconds
1,,51,1 days 29 hours 1 minutes 3 seconds
2,,144,1 days 28 hours 28 minutes 34 seconds
3,,144,1 days 26 hours 28 minutes 35 seconds
4,,28,1 days 29 hours 12 minutes 45 seconds
5,,17,1 days 24 hours 32 minutes 16 seconds


In [None]:
# Task 8

sql_query = """

    SELECT COUNT(DISTINCT u.id), TIMESTAMPDIFF(SECOND,  )
"""

df = query_to_dataframe(connection_string, sql_query)
df