# UNDERSTANDING THE DATA FOR THE BIKESHARE PROJECT

##### Importing the required library

In [1]:
import pandas as pd

### explore_chicago

In [2]:
df=pd.read_csv('chicago.csv')

##### #1. What columns are in this dataset?

In [3]:
print(df.columns)

Index(['Unnamed: 0', 'Start Time', 'End Time', 'Trip Duration',
       'Start Station', 'End Station', 'User Type', 'Gender', 'Birth Year'],
      dtype='object')


##### #2. Are there any missing values?

In [4]:
print(df.isnull().any())

Unnamed: 0       False
Start Time       False
End Time         False
Trip Duration    False
Start Station    False
End Station      False
User Type        False
Gender            True
Birth Year        True
dtype: bool


##### #3. What are the different types of values in each column?

In [5]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 9 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Unnamed: 0     300000 non-null  int64  
 1   Start Time     300000 non-null  object 
 2   End Time       300000 non-null  object 
 3   Trip Duration  300000 non-null  int64  
 4   Start Station  300000 non-null  object 
 5   End Station    300000 non-null  object 
 6   User Type      300000 non-null  object 
 7   Gender         238948 non-null  object 
 8   Birth Year     238981 non-null  float64
dtypes: float64(1), int64(2), object(6)
memory usage: 20.6+ MB
None


### filter_data

In [6]:
CITY_DATA =  {'chicago': 'chicago.csv',
          'new york city': 'new_york_city.csv',
          'washington': 'washington.csv'}

In [7]:
def load_data(city, month, day):
    day = day[0].upper()+day[1:]
    """
    Loads data for the specified city and filters by month and day if applicable.

    Args:
        (str) city - name of the city to analyze
        (str) month - name of the month to filter by, or "all" to apply no month filter
        (str) day - name of the day of week to filter by, or "all" to apply no day filter
    Returns:
        df - pandas DataFrame containing city data filtered by month and day
    """
    
    # load data file into a dataframe
    
    for key,value in CITY_DATA.items():
        if key==city:
            df=pd.read_csv(value)
    

    # convert the Start Time column to datetime
    df['Start Time'] = pd.to_datetime(df['Start Time'])

    # extract month and day of week from Start Time to create new columns
    df['month'] = pd.DatetimeIndex(df['Start Time']).month
    df['day_of_week'] = pd.DatetimeIndex(df['Start Time']).day


    # filter by month if applicable
    if month != 'all':
        # use the index of the months list to get the corresponding int
        months = ['january', 'february', 'march', 'april', 'may', 'june']
        month = months.index(month)+1
    
        # filter by month to create the new dataframe
        df = df[df['Start Time'].map(lambda x: x.month)==month]

    # filter by day of week if applicable
    if day != 'all':
        # filter by day of week to create the new dataframe
        df = df[df['Start Time'].dt.day_name()==day]
    
    return df
    

In [8]:
load_data('chicago','march','friday')

Unnamed: 0.1,Unnamed: 0,Start Time,End Time,Trip Duration,Start Station,End Station,User Type,Gender,Birth Year,month,day_of_week
37,395803,2017-03-24 15:35:55,2017-03-24 15:46:10,615,Dearborn St & Erie St,State St & Van Buren St,Subscriber,Male,1989.0,3,24
93,395735,2017-03-24 15:32:04,2017-03-24 15:52:53,1249,Sedgwick St & Webster Ave,Western Ave & Winnebago Ave,Subscriber,Female,1964.0,3,24
175,395402,2017-03-24 15:10:29,2017-03-24 15:19:44,555,Franklin St & Monroe St,Aberdeen St & Monroe St,Subscriber,Male,1987.0,3,24
190,393400,2017-03-24 12:29:30,2017-03-24 12:48:56,1166,Southport Ave & Wellington Ave,Lake Shore Dr & North Blvd,Subscriber,Female,1984.0,3,24
198,427496,2017-03-31 08:25:53,2017-03-31 08:39:09,796,Clinton St & Jackson Blvd,Racine Ave (May St) & Fulton St,Subscriber,Male,1983.0,3,31
...,...,...,...,...,...,...,...,...,...,...,...
299816,333246,2017-03-10 17:40:53,2017-03-10 17:44:59,246,Wells St & Walton St,Rush St & Cedar St,Subscriber,Female,1992.0,3,10
299839,392682,2017-03-24 11:17:50,2017-03-24 11:51:44,2034,Lake Shore Dr & Monroe St,Streeter Dr & Grand Ave,Customer,,,3,24
299860,290125,2017-03-03 12:19:29,2017-03-03 12:32:58,809,Aberdeen St & Monroe St,Clark St & 9th St (AMLI),Subscriber,Male,1975.0,3,3
299865,288513,2017-03-03 07:26:48,2017-03-03 07:31:22,274,Damen Ave & Melrose Ave,Lincoln Ave & Roscoe St,Subscriber,Female,1981.0,3,3


### popular_times

In [9]:
filename = 'chicago.csv'

# load data file into a dataframe
df = pd.read_csv(filename)

# convert the Start Time column to datetime
df['Start Time'] = pd.to_datetime(df['Start Time'])

# extract hour from the Start Time column to create an hour column
df['hour'] = pd.DatetimeIndex(df['Start Time']).hour

# find the most common hour (from 0 to 23)
popular_hour = df['hour'].mode()
    
print('Most Frequent Start Hour:', popular_hour)

Most Frequent Start Hour: 0    17
dtype: int64


### user_types

In [10]:
filename = 'chicago.csv'

# load data file into a dataframe
df = pd.read_csv(filename)

# print value counts for each user type
user_types = df['User Type'].value_counts()

print(user_types)

Subscriber    238889
Customer       61110
Dependent          1
Name: User Type, dtype: int64
