
# Cab Investment – G2M Strategy

**Jason Robinson**

To support **XYZ** in implementing its go-to-market strategy, we will examine specific features and their relationships to determine whether the investment would be profitable. The accompanying methods incorporate statistical analysis, using hypothesis (A/B) testing to measure principal features and outcomes.


In [None]:
# Basic Libraries
import os
import sys
import pandas as pd
import numpy as np

# Time Series 
import statsmodels
import statsmodels.api as sm
from statsmodels.tsa import stattools as ts
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.arima_model import ARIMAResults

# Scientific
import scipy

# Matplotlib
import matplotlib
import matplotlib.pyplot as plt
from pylab import rcParams
plt.style.use('fivethirtyeight')
%matplotlib inline

# Visualizations
import plotly
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from plotly import tools
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.figure_factory as ff

# Function to display plotly in jupyter notebook
def enable_plotly_in_cell():
    """Prepare the current output cell for offline plotly rendering.

    Emits a ``<script>`` tag that loads require.js from the notebook
    server's static path, then initialises plotly's notebook mode with
    ``connected=False``.
    """
    # Import ``display`` explicitly instead of relying on the name that
    # IPython injects into interactive namespaces, and go through the public
    # ``IPython.display`` module rather than ``IPython.core.display``.
    from IPython.display import HTML, display
    from plotly.offline import init_notebook_mode

    display(HTML('''<script src="/static/components/requirejs/require.js"></script>'''))
    init_notebook_mode(connected=False)


print('Matplotlib:',matplotlib.__version__)
print('Pandas:',pd.__version__)
print('Numpy:',np.__version__)
print('StatsModel:',statsmodels.__version__)
print('Scipy:',scipy.__version__)

In [None]:
# Import the data by reading multiple CSV files, one DataFrame per file
# NOTE(review): these are absolute paths under one user's Desktop, so the
# notebook only runs where the files exist at exactly this location.
city = pd.read_csv('/Users/jasonrobinson/Desktop/cab_dataset/City.csv')
cab = pd.read_csv('/Users/jasonrobinson/Desktop/cab_dataset/Cab_Data.csv')
customer = pd.read_csv('/Users/jasonrobinson/Desktop/cab_dataset/Customer_ID.csv')
transaction = pd.read_csv('/Users/jasonrobinson/Desktop/cab_dataset/Transaction_ID.csv')


In [None]:
print('City:',city.shape)
print('Cab:',cab.shape)
print('Customer:',customer.shape)
print('Transaction:',transaction.shape)

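We will merge our datasets according to their primary keys. In this instance we join the cab data to the transactions on Transaction ID and then to the customers on Customer ID; because `merge` is called without a `how` argument, both are inner joins (the pandas default).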

In [None]:
# Merge the datasets on primary keys
cab_data = cab.merge(transaction, on='Transaction ID').merge(customer, on='Customer ID')
print(cab_data.shape)
cab_data.sample(5)

In [None]:
cab_data.info()

In [None]:
# Check for nan values in each dataset
def check_nan(cab_data):
    """Print the percentage of null entries per column.

    Args:
        cab_data: pandas DataFrame to inspect.

    Returns:
        The same object that was passed in, unchanged.
    """
    null_counts = cab_data.isnull().sum()
    null_percent = null_counts / len(cab_data) * 100
    print(null_percent, '%', '\n')
    return cab_data

check_nan(cab_data)

In [None]:
# Iterate through each dataset and format header to lowercase and join with ('_')
def format_header(cab_data):
    """Normalise the column labels of a frame in place.

    Every label is lower-cased and its spaces are replaced with
    underscores; three of the resulting labels are then shortened.

    Args:
        cab_data: pandas DataFrame whose ``columns`` are rewritten.

    Returns:
        The same DataFrame object, with the new column labels.
    """
    short_names = {
        'cost_of_trip': 'trip_cost',
        'income_(usd/month)': 'monthly_income',
        'transaction_id': 'transact_id',
    }
    snake_case = (label.lower().replace(' ', '_') for label in cab_data.columns)
    cab_data.columns = [short_names.get(label, label) for label in snake_case]
    return cab_data


cab_data = format_header(cab_data)
cab_data.head(2)

***
Start Date Refactor

In [None]:
# Convert the day counts in date_of_travel into timestamps by adding them,
# as whole days (unit="D"), to the `begin` origin.
# NOTE(review): if date_of_travel holds spreadsheet (Excel) serial day
# numbers, the usual origin is 1899-12-30 rather than 1900-01-28 — confirm
# which origin is intended.

begin = pd.Timestamp('1900-1-28')

cab_data["travel_date"] = pd.to_datetime(cab_data["date_of_travel"], unit="D", origin=begin)
# The raw column is replaced by travel_date, so drop it
cab_data = cab_data.drop('date_of_travel', axis=1)

cab_data.head()

In [None]:

cab_data = cab_data[cab_data['travel_date'] < '2019-1-1']
cab_data['travel_date'].sort_values()

In [None]:
cab_data['travel_date'] = pd.to_datetime(cab_data['travel_date'])
cab_data = cab_data.set_index('travel_date')

In [None]:
cab_data.index

In [None]:
# Filter to 31/01/2016 to 31/12/2018
cab_column = cab_data[cab_data.index < '2019-01-01']
cab_column.index.sort_values()

In [None]:
#def check_nan(cab_daa):
#    print(cab_data.isnull().sum()/len(cab_data)*100,'%','\n')
#    return cab_data
#
#check_nan(cab_data)

***
End Date Refactor

In [None]:
cab_data = cab_data.copy()

In [None]:
# Display a distribution City in cab data
fig, ax = plt.subplots(figsize=(10,5))
sns.histplot(cab_data['city'],
             palette='Set1',
             ax=ax)
plt.show()

In [None]:
# Display a distribution hist for users in city
fig, ax = plt.subplots(figsize=(10,5))
sns.histplot(city['Users'],
            palette='Set3',
            ax=ax)
plt.show()

In [None]:
# Distribution of the travel_date
fig, ax = plt.subplots(figsize=(10,5))
sns.histplot(cab_data.index,
            palette='Set1',
            ax=ax)
plt.show()

The data show a clear seasonal pattern, with annual highs recurring consistently across the time range, which helps us better predict future activity.

### Summary Statistics

In [None]:
# Display the min, max, mean, and median of each numeric column
def min_max_mean_median(cab_data):
    """Print summary statistics for a frame and return the frame unchanged.

    Args:
        cab_data: pandas DataFrame with a ``company`` column.

    Returns:
        The DataFrame that was passed in.
    """
    # Interpolating ``cab_data.company`` directly dumps the whole column
    # into the heading; name the distinct companies instead.
    companies = ', '.join(str(name) for name in cab_data['company'].unique())
    print(f'Min, Max, Mean, Median for the {companies} Company:')
    print(cab_data.describe())
    return cab_data
#min_max_mean_median(cab_data)

#### AGE

In [None]:
# Describe the frame once and read every age statistic from that one table
age_summary = cab_data.describe()['age']
print('Mean Age:-----------------------', age_summary.at['mean'])
print('Standard Deviation Age:---------', age_summary.at['std'])
print('Age within the (25th%):---------', age_summary.at['25%'])
print('Age within the (50th%):---------', age_summary.at['50%'])
print('Age within the (75th%):---------', age_summary.at['75%'])

#### Price Charged

In [None]:
# Describe the frame once and read every price_charged statistic from it
price_summary = cab_data.describe()['price_charged']
print('Mean Price_Charged:-----------------------', price_summary.at['mean'])
print('Standard Deviation Price_Charged:---------', price_summary.at['std'])
print('Price_charged within the (25th%):---------', price_summary.at['25%'])
print('Price_charged within the (50th%):---------', price_summary.at['50%'])
print('Price_charged within the (75th%):---------', price_summary.at['75%'])

#### KM Travelled

In [None]:
# Describe the frame once and read every km_travelled statistic from it
km_summary = cab_data.describe()['km_travelled']
print('Mean KM_Traveled:-----------------------', km_summary.at['mean'])
print('Standard Deviation KM_Traveled:---------', km_summary.at['std'])
print('KM_Traveled within the (25th%):---------', km_summary.at['25%'])
print('KM_Traveled within the (50th%):---------', km_summary.at['50%'])
print('KM_Traveled within the (75th%):---------', km_summary.at['75%'])

#### Trip Cost

In [None]:
# Describe the frame once and read every trip_cost statistic from it
cost_summary = cab_data.describe()['trip_cost']
print('Mean Trip_Cost:-----------------------', cost_summary.at['mean'])
print('Standard Deviation Trip_Cost:---------', cost_summary.at['std'])
print('Trip_Cost within the (25th%):---------', cost_summary.at['25%'])
print('Trip_Cost within the (50th%):---------', cost_summary.at['50%'])
print('Trip_Cost within the (75th%):---------', cost_summary.at['75%'])

### Visualizations

Split dataset up by company for comparison of data between companies.

In [None]:
# Create an independent dataset for each company. ``.copy()`` detaches each
# subset from ``cab_data`` so that a column can be added to it later without
# pandas raising SettingWithCopyWarning.
cab_yellow = cab_data[cab_data['company'] == 'Yellow Cab'].copy()
cab_pink = cab_data[cab_data['company'] == 'Pink Cab'].copy()
print('Shape of the Yellow Cab Company:', cab_yellow.shape)
print('Shape of the Pink Cab Company:', cab_pink.shape)

In [None]:
# Display a count pink and yellow cab companies in reference to transactions
sns.countplot(x='company', 
              data=cab_data, 
              palette='Set2', 
              order=['Yellow Cab', 'Pink Cab'],
              hue='company');

### Frequency of Transactions

In [None]:
fig, ax = plt.subplots(figsize=(10,5))
sns.histplot(cab_pink.index,
            palette='Set1',
            ax=ax)
plt.show()

In [None]:
fig, ax = plt.subplots(figsize=(10,5))
sns.histplot(cab_yellow.index,
            palette='Set1',
            ax=ax)
plt.show()

The following are some of our questions for **both** companies:

1. How many transactions are there in the data?
2. What is the mean age of the users in the data?
3. What is the mean price charged for the transactions in the data?
4. What is the mean KM traveled for the transactions in the data?
5. What is the mean trip cost for the transactions in the data?

In [None]:
# Describe the combined frame once and reuse the table for every mean
overall_summary = cab_data.describe()
print('Number of transactions:', cab_data.shape[0])
print('Mean Age:--------------', overall_summary.at['mean', 'age'])
print('Mean Price_Charged:----', overall_summary.at['mean', 'price_charged'])
print('Mean KM_Traveled:------', overall_summary.at['mean', 'km_travelled'])
print('Mean Trip_Cost:--------', overall_summary.at['mean', 'trip_cost'])

The following are some of our questions for **each** company:

1. How many transactions are there in the data?
2. What is the mean age of the users in the data?
3. What is the mean price charged for the transactions in the data?
4. What is the mean KM traveled for the transactions in the data?
5. What is the mean trip cost for the transactions in the data?

In [None]:
# Describe the Yellow Cab subset once and reuse the table for every mean
yellow_summary = cab_yellow.describe()
print('Number of transactions:', cab_yellow.shape[0])
print('Mean Age:--------------', yellow_summary.at['mean', 'age'])
print('Mean Price_Charged:----', yellow_summary.at['mean', 'price_charged'])
print('Mean KM_Traveled:------', yellow_summary.at['mean', 'km_travelled'])
print('Mean Trip_Cost:--------', yellow_summary.at['mean', 'trip_cost'])

In [None]:
# Describe the Pink Cab subset once and reuse the table for every mean
pink_summary = cab_pink.describe()
print('Number of transactions:', cab_pink.shape[0])
print('Mean Age:--------------', pink_summary.at['mean', 'age'])
print('Mean Price_Charged:----', pink_summary.at['mean', 'price_charged'])
print('Mean KM_Traveled:------', pink_summary.at['mean', 'km_travelled'])
print('Mean Trip_Cost:--------', pink_summary.at['mean', 'trip_cost'])

In [None]:
# Create a function to calculate the difference between the mean of the two companies
def mean_difference(cab_data):
    """Print the Yellow Cab minus Pink Cab difference in mean customer age.

    Args:
        cab_data: pandas DataFrame with ``company`` and ``age`` columns.

    Returns:
        The DataFrame that was passed in, unchanged.
    """
    # Split the frame that was passed in instead of reading module-level
    # subsets, so the printed value always reflects the argument.
    yellow_age = cab_data.loc[cab_data['company'] == 'Yellow Cab', 'age'].mean()
    pink_age = cab_data.loc[cab_data['company'] == 'Pink Cab', 'age'].mean()
    print('Mean Difference for Age:', yellow_age - pink_age)
    return cab_data

In [None]:

# Print out the mean difference (Yellow Cab minus Pink Cab) for each feature.
# Each company's summary table is computed once and reused.
yellow_stats = cab_yellow.describe()
pink_stats = cab_pink.describe()
print('Yellow Cab Mean Difference for Age:', yellow_stats.at['mean', 'age'] - pink_stats.at['mean', 'age'])
print('Yellow Cab Mean Difference for KM Travel:', yellow_stats.at['mean', 'km_travelled'] - pink_stats.at['mean', 'km_travelled'])
print('Yellow Cab Mean Difference for Price Charged:', yellow_stats.at['mean', 'price_charged'] - pink_stats.at['mean', 'price_charged'])
# Compare trip cost with trip cost; subtracting Pink Cab's mean age here
# mixed two unrelated features.
print('Yellow Cab Mean Difference for Trip Cost:', yellow_stats.at['mean', 'trip_cost'] - pink_stats.at['mean', 'trip_cost'])
print('Yellow Cab Mean Difference for Monthly Income:', yellow_stats.at['mean', 'monthly_income'] - pink_stats.at['mean', 'monthly_income'])

In [None]:
# Print out the standard deviation difference (Yellow Cab minus Pink Cab)
# for each feature, reading both values from per-company summary tables.
yellow_spread = cab_yellow.describe()
pink_spread = cab_pink.describe()
std_headings = {
    'Yellow Cab Standard Deviation Difference for Age:': 'age',
    'Yellow Cab Standard Deviation Difference for KM Travel:': 'km_travelled',
    'Yellow Cab Standard Deviation Difference for Price Charged:': 'price_charged',
    'Yellow Cab Standard Deviation Difference for Trip Cost:': 'trip_cost',
    'Yellow Cab Standard Deviation Difference for Monthly Income:': 'monthly_income',
}
for heading, column in std_headings.items():
    print(heading, yellow_spread.at['std', column] - pink_spread.at['std', column])

#### Insights 

The Yellow Cab company accounts for more transactions overall, averaging **1.5** times the number of transactions of the Pink Cab company.

In [None]:
cab_data_sub = sns.pairplot(cab_data[['km_travelled', 'price_charged', 'trip_cost', 'monthly_income']])

### Correlation Matrix Comparison

Finding the correlation between the features of the two companies is useful to determine what features are most related to each other.  

In [None]:
# Create a Pearson correlation matrix from the numeric columns only.
# cab_data also holds text columns (e.g. company), which cannot be
# correlated and make DataFrame.corr() raise on recent pandas versions.
corr = cab_data.select_dtypes(include='number').corr()
print(corr)


# Create a heatmap of the correlation matrix, with the colour scale pinned
# to the full [-1, 1] range of a correlation coefficient
sns.heatmap(corr,
            xticklabels=corr.columns.values,
            yticklabels=corr.columns.values,
            annot=True,
            cmap='RdBu_r',
            vmin=-1,
            vmax=1)
plt.show()

From the correlation measures and the heatmap, we can determine that distance (kilometers travelled) is highly correlated with both the cost of the trip and the price charged for it; as a consequence, trip cost and price charged are also highly correlated with each other.

#### What is the overall relation between the price charged and the km travelled?

In [None]:
# What is the overall relation between the price charged and the km travelled?
# Scatter only: fit_reg=False suppresses the regression line.
# NOTE(review): hue is the same continuous column as y, so each distinct
# price becomes its own colour level; hue='company' (as used in the
# age/income plot) may have been intended — confirm.
sns.lmplot(x='km_travelled',
           y='price_charged',
           data=cab_data,
           fit_reg=False,
           hue='price_charged',
           palette='Set2',
           scatter_kws={'s': 50})
plt.show()


In [None]:
#import plotly.graph_objects as go
#
#
## Create a scatter plot of the km travelled and price charged
#fig = go.Figure(data=go.Scatter(x=cab_data['km_travelled'],
#                                y=cab_data['price_charged'],
#                                mode='markers',
#                                marker=dict(size=10,
#                                            color=cab_data['company'],
#                                            colorscale='Viridis',
#                                            showscale=True)))
#fig.update_layout(title='KM Travelled vs Price Charged',
#                    xaxis_title='KM Travelled',
#                    yaxis_title='Price Charged')
#fig.show()

#### Is there a relationship between age and income?

In [None]:
# Is there a relationship between age and income?
sns.lmplot(x='age',
              y='monthly_income',
                data=cab_data,
                fit_reg=False,
                hue='company',
                palette='Set2',
                scatter_kws={'s': 50})
plt.show()


#### What is the variance of price charged and trip cost?

In [None]:
# What is the variance of price charged and trip cost?
sns.lmplot(x='trip_cost',
                y='price_charged',
                data=cab_data,
                fit_reg=False,
                hue='gender',
                palette='Set2',
                scatter_kws={'s': 50})
plt.show()

#### Relation between travel date and trip cost? 

In [None]:
 # Relation between travel date and trip cost?
#sns.lineplot(x=cab_data.index,
#                y='trip_cost',
#                data=cab_data,
#                hue='company',
#                palette='Set2')
#plt.show()
#

In [None]:
def get_mean(cab_data):
    """Return the per-day mean of a datetime-indexed pandas object.

    Args:
        cab_data: pandas Series or DataFrame that supports
            ``resample('D')``.

    Returns:
        The result of averaging within each daily bin.
    """
    daily_bins = cab_data.resample('D')
    return daily_bins.mean()

# Copy the index into a regular travel_date column so it can be addressed
# by name (the scatter plot further down reads cab_data['travel_date']).
cab_data['travel_date'] = cab_data.index

# We can then use the .date_range to get the date range of the data.
# NOTE(review): date_range is not referenced again in the visible part of
# this notebook — confirm whether it is still needed.
date_range = pd.date_range(start='2016-01-31', end='2018-12-31')

# We can then use the .resample() method to get the mean of the data.
mean_data = get_mean(cab_data['trip_cost'])

# We can then use the .plot() method to plot the data.
mean_data.plot()
plt.show()

In [None]:
#def get_mean(cab_data):
#    return cab_data.resample('D').mean()
#
#mean_data = get_mean(cab_data['trip_cost'])
#mean_data.plot()
#plt.show()

In [None]:
# Relation between travel date and trip cost?
def travel_date_vs_trip_cost():
    """Scatter-plot trip cost against travel date.

    Reads the ``travel_date`` and ``trip_cost`` columns of the module-level
    ``cab_data`` frame, shows the figure, and returns ``None``.
    """
    plt.figure(figsize=(8, 5))
    dates = cab_data['travel_date']
    costs = cab_data['trip_cost']
    plt.scatter(dates, costs)
    plt.title('Travel Date vs Trip Cost')
    plt.xlabel('Travel Date')
    plt.ylabel('Trip Cost')
    plt.show()

travel_date_vs_trip_cost()

In [None]:
# Distribution of all features
def distribution_all():
    """Show a 50-bin histogram of the price charged.

    Only the ``price_charged`` column of the module-level ``cab_data``
    frame is plotted. Returns ``None``.
    """
    plt.figure(figsize=(10, 6))
    prices = cab_data['price_charged']
    plt.hist(prices, bins=50)
    plt.title('Price Charged Distribution')
    plt.xlabel('Price Charged')
    plt.ylabel('Frequency')
    plt.show()


distribution_all()

In [None]:
city['Population'] = city['Population'].str.replace(',', '')

In [None]:
city['Users'] = city['Users'].str.replace(',', '')

We incorporate our City dataset to answer questions about the number of users in each city and the population of the cities.

In [None]:
city['Population'] = city['Population'].astype(int)
city['Users'] = city['Users'].astype(int)

In [None]:
# The column is labelled 'Population' (capitalised), so dropping the
# lowercase label 'population' raised KeyError. The column is also still
# required further down to compute Percentage_Users, so it has to stay:
# errors='ignore' makes the drop skip the missing label instead of raising.
city = city.drop(columns=['population'], errors='ignore')

In [None]:
city.head()

In [None]:
# Distribution of population per city
def distribution_population(city):
    """Show a bar chart of the ``Users`` value for each city.

    Args:
        city: pandas DataFrame with ``City`` and ``Users`` columns.
    """
    plt.figure(figsize=(10, 6))
    sns.barplot(data=city, x='City', y='Users', palette='Set2')
    plt.xlabel('City')
    # City names are long; rotate them so they do not overlap
    plt.xticks(rotation=90)
    plt.title('Distribution of Users per Population of Each City')
    plt.show()

distribution_population(city)

In [None]:
def get_percentage_users(city):
    """Add a ``Percentage_Users`` column: users as a percentage of population.

    The column is written onto the frame that was passed in.

    Args:
        city: pandas DataFrame with ``Users`` and ``Population`` columns.

    Returns:
        The same DataFrame object, now carrying ``Percentage_Users``.
    """
    user_share = city['Users'].div(city['Population'])
    city['Percentage_Users'] = user_share.mul(100)
    return city


city = get_percentage_users(city)
city

In [None]:
# Plot
def plot_percentage_users(city):
    """Show a bar chart of ``Percentage_Users`` for each city.

    Args:
        city: pandas DataFrame with ``City`` and ``Percentage_Users``
            columns.
    """
    plt.figure(figsize=(10, 6))
    sns.barplot(data=city, x='City', y='Percentage_Users', palette='Set2')
    plt.xlabel('City')
    # City names are long; rotate them so they do not overlap
    plt.xticks(rotation=90)
    plt.title('Percentage of Users per Population of Each City')
    plt.show()


plot_percentage_users(city)

In [None]:
# Determine profit 

In [None]:
# Find total trip cost and total price charged per company.
# Grouping by company (rather than by the price/cost values themselves)
# gives both tables the same index, and summing both keeps them on the
# same scale, so one can be subtracted from the other.
total_cost = cab_data.groupby('company')[['trip_cost']].sum()
price_charged = cab_data.groupby('company')[['price_charged']].sum()

In [None]:
total_cost

In [None]:
# Determine our profit
def get_profit(total_cost, price_charged):
    """Return profit as price charged minus trip cost.

    The two columns are subtracted label-by-label on the frames' indexes.

    Args:
        total_cost: pandas DataFrame with a ``trip_cost`` column.
        price_charged: pandas DataFrame with a ``price_charged`` column.

    Returns:
        pandas Series of ``price_charged - trip_cost``.
    """
    # Revenue minus cost, matching how the ``profit`` column is computed
    # elsewhere in this notebook (cost minus price yields the negated value).
    profit = price_charged['price_charged'] - total_cost['trip_cost']
    return profit


# Build an object exposing total_cost and price_charged per company.
# NOTE(review): TotalCostPerCompany is not defined or imported in the
# visible part of this notebook; unless it is provided elsewhere this line
# raises NameError — confirm.
total_cost_per_company = TotalCostPerCompany(cab_data)


# Get the profit per company from that object's two attributes
profit = get_profit(total_cost_per_company.total_cost, total_cost_per_company.price_charged)


In [None]:
get_profit(total_cost, price_charged)

In [None]:
def get_profit_yellow(cab_yellow):
    """Return a copy of the frame with a ``profit`` column added.

    ``profit`` is ``price_charged`` minus ``trip_cost`` for each row.

    Args:
        cab_yellow: pandas DataFrame with ``price_charged`` and
            ``trip_cost`` columns.

    Returns:
        A new DataFrame; the argument itself is left untouched.
    """
    # ``assign`` builds a new frame, so a filtered subset passed in here is
    # not written to (which would trigger SettingWithCopyWarning).
    return cab_yellow.assign(
        profit=cab_yellow['price_charged'] - cab_yellow['trip_cost']
    )

get_profit_yellow(cab_yellow)

In [None]:
# Add a per-trip profit column (price charged minus trip cost) to each
# company's frame. Rebinding through ``assign`` avoids writing into a
# filtered subset of cab_data, which triggers SettingWithCopyWarning.
cab_yellow = cab_yellow.assign(profit=cab_yellow['price_charged'] - cab_yellow['trip_cost'])
cab_pink = cab_pink.assign(profit=cab_pink['price_charged'] - cab_pink['trip_cost'])

In [None]:
cab_yellow

In [None]:
cab_pink

In [None]:
def plot_profit_per_company(cab_data):
    """Show a bar chart of profit for the Pink Cab frame.

    The chart is drawn from the ``company`` and ``profit`` columns of the
    module-level ``cab_pink`` frame; the ``cab_data`` argument is accepted
    but not read. Returns ``None``.
    """
    plt.figure(figsize=(10, 6))
    pink = cab_pink
    sns.barplot(data=pink, x=pink['company'], y=pink['profit'], palette='Set2')
    plt.xlabel('Company')
    plt.title('Profit per Company')
    plt.show()


plot_profit_per_company(cab_data)

In [None]:
def plot_profit_per_company(cab_data):
    """Show a bar chart of profit for the Yellow Cab frame.

    The chart is drawn from the ``company`` and ``profit`` columns of the
    module-level ``cab_yellow`` frame; the ``cab_data`` argument is accepted
    but not read. Returns ``None``.
    """
    plt.figure(figsize=(10, 6))
    yellow = cab_yellow
    sns.barplot(data=yellow, x=yellow['company'], y=yellow['profit'], palette='Set2')
    plt.xlabel('Company')
    plt.title('Profit per Company')
    plt.show()


plot_profit_per_company(cab_data)

In [None]:
# Profit per trip is the price charged minus the cost of the trip
price_charged = np.array(cab_data["price_charged"])
travel_cost = np.array(cab_data["trip_cost"])
profit = np.subtract(price_charged, travel_cost)

# Attribute each trip's profit to the city recorded on that trip. Zipping
# the profits with city["City"] paired them with the rows of the separate
# city table instead, and zip stops at the shorter of the two inputs.
profit_df = (
    cab_data.assign(Profits=profit)
    .groupby("city")[["Profits"]]
    .sum()
    .rename_axis("City")
    .sort_values(by="Profits", ascending=False)
)
new_df = profit_df.round()

plt.figure(figsize=(14, 6))
plt.bar(new_df.index, new_df.Profits)
# rotation must be a number (or 'vertical'/'horizontal'), not the string "90"
plt.xticks(rotation=90)
plt.ylabel("Profits")
plt.title("Profits per city")
plt.show()

print(new_df)