### Import Statements

In [1]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

### Data type constraints

<u>Useful functions and methods</u>
- `df.info()` - prints the data type of each column and the number of non-null values (the method itself returns `None`)
- `df.dtypes` - returns the data type of each column
- `df.select_dtypes()` - returns a dataframe with only the columns that are of the specified data type
- `df.describe()` - returns a dataframe with the count, mean, standard deviation, minimum, maximum, and quartiles of each numeric column
- `df.astype()` - converts the data type of a column/multiple columns to the specified data type(s)
- The `.str` accessor - allows you to apply string methods to a column that contains strings/objects

In [2]:
# Load the raw ride-sharing CSV; the path is relative to this notebook's folder.
df_ride_sharing = pd.read_csv(
    filepath_or_buffer="../datasets/ride_sharing_new.csv",
)

In [5]:
# Peek at the first three rows to see how the raw columns look.
df_ride_sharing.head(n=3)

Unnamed: 0.1,Unnamed: 0,duration,station_A_id,station_A_name,station_B_id,station_B_name,bike_id,user_type,user_birth_year,user_gender
0,0,12 minutes,81,Berry St at 4th St,323,Broadway at Kearny,5480,2,1959,Male
1,1,24 minutes,3,Powell St BART Station (Market St at 4th St),118,Eureka Valley Recreation Center,5193,2,1965,Male
2,2,8 minutes,67,San Francisco Caltrain Station 2 (Townsend St...,23,The Embarcadero at Steuart St,3652,3,1993,Male


In [4]:
# Print each column's dtype and non-null count before any cleaning.
df_ride_sharing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25760 entries, 0 to 25759
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Unnamed: 0       25760 non-null  int64 
 1   duration         25760 non-null  object
 2   station_A_id     25760 non-null  int64 
 3   station_A_name   25760 non-null  object
 4   station_B_id     25760 non-null  int64 
 5   station_B_name   25760 non-null  object
 6   bike_id          25760 non-null  int64 
 7   user_type        25760 non-null  int64 
 8   user_birth_year  25760 non-null  int64 
 9   user_gender      25760 non-null  object
dtypes: int64(6), object(4)
memory usage: 2.0+ MB


In [8]:
# The first column, "Unnamed: 0", looks like a saved row index; drop it before
# looking at the data types of the remaining columns.
# It is dropped by name (not by position 0) with errors="ignore" so the cell is
# idempotent: running it again is a no-op instead of silently removing
# "duration" (the next column in line).
df_ride_sharing = df_ride_sharing.drop(columns="Unnamed: 0", errors="ignore")

In [9]:
# Confirm the extra column is gone by looking at the first two rows.
df_ride_sharing.head(n=2)

Unnamed: 0,duration,station_A_id,station_A_name,station_B_id,station_B_name,bike_id,user_type,user_birth_year,user_gender
0,12 minutes,81,Berry St at 4th St,323,Broadway at Kearny,5480,2,1959,Male
1,24 minutes,3,Powell St BART Station (Market St at 4th St),118,Eureka Valley Recreation Center,5193,2,1965,Male


In [10]:
# "duration" is read as text such as "12 minutes"; keep the number and store it as int.
#
# The unit is removed with an anchored regex instead of str.strip(" minutes"):
# str.strip() takes a *set of characters* to trim from both ends, not a literal
# suffix, so it only happens to work because digits are not in that set.
#
# The dtype guard keeps the cell safe to re-run: once the column is numeric the
# .str accessor is unavailable and a second conversion attempt would raise.
if not pd.api.types.is_integer_dtype(df_ride_sharing["duration"]):
    df_ride_sharing["duration"] = (
        df_ride_sharing["duration"]
        .str.replace(r"\s*minutes?\s*$", "", regex=True)
        .astype(int)
    )

In [11]:
# "duration" must be an integer column from here on. Any integer width is
# accepted so the check does not depend on the platform's default "int" size,
# and the message reports the dtype actually found if the check fails.
assert pd.api.types.is_integer_dtype(df_ride_sharing["duration"]), (
    f'expected an integer "duration" column, got {df_ride_sharing["duration"].dtype}'
)

In [12]:
# Count distinct values per column; columns with few distinct values are
# candidates for the "category" dtype.
df_ride_sharing.nunique()

duration            172
station_A_id          9
station_A_name        9
station_B_id        152
station_B_name      152
bike_id            1805
user_type             3
user_birth_year      63
user_gender           3
dtype: int64

In [14]:
# "station_A_id", "station_B_id", "bike_id" and "user_type" are stored as
# integers, but they are identifiers/labels, i.e. categorical data. Summary
# statistics on them would be misleading at best and lead to wrong
# conclusions at worst, so convert them to the "category" dtype.
col_int_to_cat_map = dict.fromkeys(
    ("station_A_id", "station_B_id", "bike_id", "user_type"),
    "category",
)
df_ride_sharing = df_ride_sharing.astype(col_int_to_cat_map)

# "station_A_name" and "station_B_name" could be made categorical as well,
# although that is not strictly necessary.

In [15]:
# Re-check the dtypes to confirm the conversions took effect.
df_ride_sharing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25760 entries, 0 to 25759
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   duration         25760 non-null  int64   
 1   station_A_id     25760 non-null  category
 2   station_A_name   25760 non-null  object  
 3   station_B_id     25760 non-null  category
 4   station_B_name   25760 non-null  object  
 5   bike_id          25760 non-null  category
 6   user_type        25760 non-null  category
 7   user_birth_year  25760 non-null  int64   
 8   user_gender      25760 non-null  object  
dtypes: category(4), int64(2), object(3)
memory usage: 1.2+ MB
