In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## **G2M (Go-to-Market strategy) Insight for Cab Investment Firm EDA (Exploratory Data Analysis)**

#### **Problem Definition and Delivery** 
◾XYZ is a private firm in US. Due to remarkable growth in the Cab Industry in last few years and multiple key players in the market, it is planning for an investment in Cab industry and as per their Go-to-Market(G2M) strategy they want to understand the market before taking final decision.

◾XYZ is interested in using your actionable insights to help them identify the right company to make their investment.

#### **Dataset**
There have been provided 4 individual data sets. Time period of data is from **31/01/2016** to **31/12/2018**.
Below are the list of datasets which are provided for the analysis:

**Cab_Data.csv** – this file includes details of transaction for 2 cab companies

**Customer_ID.csv** – this is a mapping table that contains a unique identifier which links the customer’s demographic details

**Transaction_ID.csv** – this is a mapping table that contains transaction to customer mapping and payment mode

**City.csv** – this file contains list of US cities, their population and number of cab users

**Author : Ugur Selim Ozen**


In [2]:
# I imported following python libraries to utilize in EDA process.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import plotly.express as px
import plotly.graph_objs as go
import plotly
import plotly.graph_objects as go
import datetime

### **Data Overview**

#### In this section , I analyzed dataset's shape(entries,features) , missing values(NA or Null values) , feature's datatypes and looked at head of dataset's.

In [3]:
pathfile1 = '/kaggle/input/cabs-fare-data/Cab_Data.csv'
CabData = pd.read_csv(pathfile1)
missing_value = 0
for column in CabData.columns :
    missing_value += CabData[column].isna().sum() + CabData[column].isnull().sum()
print('CabData dataset has {} entries , {} features and {} missing values'.format(CabData.shape[0] ,CabData.shape[1] ,missing_value))
print("\nFeature's datatypes\n\n{}".format(CabData.dtypes))
CabData.head(10)

In [4]:
pathfile2 = '/kaggle/input/cabs-fare-data/City.csv'
City = pd.read_csv(pathfile2)
missing_value = 0
for column in City.columns :
    missing_value += City[column].isna().sum() + City[column].isnull().sum()
print('City dataset has {} entries , {} features and {} missing values'.format(City.shape[0] ,City.shape[1] ,missing_value))
print("\nFeature's datatypes\n\n{}".format(City.dtypes))
City.head(20)

In [5]:
pathfile3 = '/kaggle/input/cabs-fare-data/Customer_ID.csv'
CustomerID = pd.read_csv(pathfile3)
missing_value = 0
for column in CustomerID.columns :
    missing_value += CustomerID[column].isna().sum() + CustomerID[column].isnull().sum()
print('CustomerID dataset has {} entries , {} features and {} missing values'.format(CustomerID.shape[0] ,CustomerID.shape[1] ,missing_value))
print("\nFeature's datatypes\n\n{}".format(CustomerID.dtypes))
CustomerID.head(10)

In [6]:
pathfile4 = '/kaggle/input/cabs-fare-data/Transaction_ID.csv'
TransactionID = pd.read_csv(pathfile4)
missing_value = 0
for column in TransactionID.columns :
    missing_value += TransactionID[column].isna().sum() + TransactionID[column].isnull().sum()
print('TransactionID dataset has {} entries , {} features and {} missing values'.format(TransactionID.shape[0] ,TransactionID.shape[1] ,missing_value))
print("\nFeature's datatypes\n\n{}".format(TransactionID.dtypes))
TransactionID.head(10)

#### **Merging All Datasets to MasterData** 

In [7]:
# Here merged all 4 files into one MasterData  to analyze entire dataset.
MasterData = CabData.merge(TransactionID, on= 'Transaction ID').merge(CustomerID, on ='Customer ID').merge(City, on = 'City')
missing_value = 0
for column in MasterData.columns :
    missing_value += MasterData[column].isna().sum() + MasterData[column].isnull().sum()
print('MasterData dataset has {} entries , {} features and {} missing values'.format(MasterData.shape[0] ,MasterData.shape[1] ,missing_value))
print("\nFeature's datatypes\n\n{}".format(MasterData.dtypes))
MasterData.head(10)

### **Data Cleaning and Feature Engineering**

#### In this section , I transformed some features to appropriate datatypes and formats.

In [8]:
#removing ',' in population and users column values
#replacing spaces with '_' in column names 

for column in MasterData.columns:
    if ' ' in column:
        MasterData = MasterData.rename(columns={column:column.replace(' ','_')})
        
for column in ["Population","Users"] :
    
        MasterData[column] = MasterData[column].str.replace(',','')
        
MasterData.head(10)

In [9]:
# Converted some columns from object to category or int64 datatypes to work with appropriate ones.
# Converted Data of Travel column to datetime object.

MasterData['Date_of_Travel'] = pd.to_datetime(MasterData['Date_of_Travel'])

for column in ["Company", "City" , "Payment_Mode" , "Gender" ] :
    
        MasterData[column] = MasterData[column].astype('category')
        
        
for column in ["Population", "Users" ] :
    
        MasterData[column] = MasterData[column].astype('int64')
        
print("\nFeature's datatypes\n\n{}".format(MasterData.dtypes))

### **Statistical Overview and Data Analysis**

#### In this section , I analyzed statinfos , distributions , outliers , boxplots , correlations , covariance and so on of some columns of entire MasterData.

In [10]:
# Here I splitted MasterData into two dataset for every Cab Firm to analyze them individually.
Pink_MasterData = MasterData [MasterData['Company'] == 'Pink Cab'] 
Yellow_MasterData = MasterData [MasterData['Company'] == 'Yellow Cab']
selected_columns = ['KM_Travelled','Price_Charged','Cost_of_Trip', 'Age' ,'Income_(USD/Month)', 'Population' , 'Users']

In [11]:
# Getting the statinfo's of selected columns from MasterData for Pink Cab Firm
filtered_Pink_MasterData = Pink_MasterData[selected_columns]
Pink_Cab_StatsInfo = filtered_Pink_MasterData.describe()
Pink_Cab_StatsInfo

In [12]:
# Getting the statinfo's of selected columns from MasterData for Pink Cab Firm
filtered_Yellow_MasterData = Yellow_MasterData[selected_columns]
Yellow_Cab_StatsInfo = filtered_Yellow_MasterData.describe()
Yellow_Cab_StatsInfo

In [13]:
#The KDE curves and distribtion plots of selected variables with respect to Cab Firms drawn below.
fig,axes = plt.subplots(1, 3 , figsize=(26,8), sharey=True)
fig.suptitle('Distributions of Variables')
sns.histplot(ax=axes[0], data=MasterData , x='Cost_of_Trip' , kde = True , hue="Company")
sns.histplot(ax=axes[1], data=MasterData , x='Price_Charged' , kde = True , hue="Company")
sns.histplot(ax=axes[2], data=MasterData , x='Income_(USD/Month)' , kde = True , hue="Company") 

In [14]:
#The KDE curves and distribtion plots of selected variables with respect to Cab Firms drawn below.
fig,axes = plt.subplots(1, 2 ,figsize=(22,9), sharey=True)
fig.suptitle('Distributions of Variables')
sns.histplot(ax=axes[0],data=MasterData , x='Age' , kde = True , hue="Company")
sns.histplot(ax=axes[1], data=MasterData , x='KM_Travelled' , kde = True , hue="Company")

In [15]:
#The KDE curves and distribtion plots of selected variables with respect to Cab Firms drawn below.
fig,axes = plt.subplots(1, 2 , figsize=(22 ,9), sharey=True)
fig.suptitle('Distributions of Variables')
sns.histplot(ax=axes[0], data=MasterData , x='Population' , kde = True , hue="Company")
sns.histplot(ax=axes[1], data=MasterData , x='Users' , kde = True , hue="Company")

In [16]:
#Outlier points and lower-upper limits were analyzed by IQR method by filtering the data from the statistical value table of the desired variables.
def showIQRstats(dataFrame_statsinfo,column_name):
    
    mean   = dataFrame_statsinfo[column_name][1]
    median = dataFrame_statsinfo[column_name][5]
    median_mean_ratio = median/mean
    Q1     = dataFrame_statsinfo[column_name][4]
    Q3     = dataFrame_statsinfo[column_name][6]
    IQR    = Q3-Q1
    upper_band = Q3 + 1.5*IQR
    lower_band = Q1 - 1.5*IQR
    
    print('\n\n' + "mean of : " + str(column_name) + " is " + str(mean) +
          '\n' + "median of : " + str(column_name) + " is " + str(median) +
          '\n' + "median/mean ratio of : " +  str(column_name) + " is " + str(median_mean_ratio) +
          '\n' + "Q1 value of : "  + str(column_name) + " is "  + str(Q1) +
          '\n' + "Q3 value of : "  + str(column_name) + " is "  + str(Q3) + 
          '\n' + "IQR value of : " + str(column_name) + " is " + str(IQR) + 
          '\n' + "Upper and Lower Limits of " + str(column_name) + " is "  + str((lower_band,upper_band))
         )
    
    return upper_band , lower_band

In [17]:
print('Statistical infos of Pink Cab Firm :')
for column_name in selected_columns :
    upper_band , lower_band = showIQRstats(Pink_Cab_StatsInfo , column_name)
    outliers = filtered_Pink_MasterData[(filtered_Pink_MasterData[column_name] < lower_band) | (filtered_Pink_MasterData[column_name] > upper_band)][column_name].values
    print('\n' + str(column_name) + " has " + str(len(outliers)) + " outliers : "    + str(outliers))
    print('\n\n**********************************************************\n')

In [18]:
print('Statistical infos of Yellow Cab Firm :')
for column_name in selected_columns :
    upper_band , lower_band = showIQRstats(Yellow_Cab_StatsInfo , column_name)
    outliers = filtered_Yellow_MasterData[(filtered_Yellow_MasterData[column_name] < lower_band) | (filtered_Yellow_MasterData[column_name] > upper_band)][column_name].values
    print('\n' + str(column_name) + " has " + str(len(outliers)) + " outliers : "    + str(outliers))
    print('\n\n**********************************************************\n')

In [19]:
#Boxplot distributions of the desired variables were drawn.
fig,axes = plt.subplots(2, 3, figsize=(25, 8), sharey=True)
fig.suptitle('Boxplot Distributions of the Variables')

sns.boxplot(ax=axes[0,0], x='KM_Travelled' ,        data=filtered_Pink_MasterData ).set_ylabel("Pink Cab")
sns.boxplot(ax=axes[0,1], x='Price_Charged'   ,     data=filtered_Pink_MasterData  ) .set_ylabel("Pink Cab")
sns.boxplot(ax=axes[0,2], x='Cost_of_Trip' ,        data=filtered_Pink_MasterData ).set_ylabel("Pink Cab")
sns.boxplot(ax=axes[1,0], x='KM_Travelled' ,        data=filtered_Yellow_MasterData ).set_ylabel("Yellow Cab")
sns.boxplot(ax=axes[1,1], x='Price_Charged'   ,     data=filtered_Yellow_MasterData  ) .set_ylabel("Yellow Cab")
sns.boxplot(ax=axes[1,2], x='Cost_of_Trip' ,        data=filtered_Yellow_MasterData ).set_ylabel("Yellow Cab")


In [20]:
#Boxplot distributions of the desired variables were drawn.
fig,axes = plt.subplots(2, 3, figsize=(25, 8), sharey=True)
fig.suptitle('Boxplot Distributions of the Variables')
sns.boxplot(ax=axes[0,0], x='Age' ,                 data=filtered_Pink_MasterData ).set_ylabel("Pink Cab")
sns.boxplot(ax=axes[0,1], x='Income_(USD/Month)' ,  data=filtered_Pink_MasterData  ) .set_ylabel("Pink Cab")
sns.boxplot(ax=axes[0,2], x='Population' ,          data=filtered_Pink_MasterData ).set_ylabel("Pink Cab")
sns.boxplot(ax=axes[1,0], x='Age' ,                 data=filtered_Yellow_MasterData ).set_ylabel("Yellow Cab")
sns.boxplot(ax=axes[1,1], x='Income_(USD/Month)' ,  data=filtered_Yellow_MasterData  ) .set_ylabel("Yellow Cab")
sns.boxplot(ax=axes[1,2], x='Population' ,          data=filtered_Yellow_MasterData ).set_ylabel("Yellow Cab")

In [21]:
#Boxplot distributions of the desired variables were drawn.
fig,axes = plt.subplots(1, 2, figsize=(25, 8), sharey=True)
sns.boxplot(ax=axes[0], x='Users' ,    data=filtered_Pink_MasterData ).set_ylabel("Pink Cab")
sns.boxplot(ax=axes[1], x='Users'   ,  data=filtered_Yellow_MasterData  ) .set_ylabel("Yellow Cab")

In [22]:
# Here visualized the correlation of MasterData features by heatmap for Pink Cab Firm.
Pink_Cab_Corr = filtered_Pink_MasterData.corr()
plt.figure(figsize=(20,10))
sns.heatmap(Pink_Cab_Corr,annot=True,vmin=-1,vmax=1,cmap='coolwarm')

In [23]:
# Here visualized the correlation of MasterData features by heatmap for Pink Cab Firm.
Yellow_Cab_Corr = filtered_Yellow_MasterData.corr()
plt.figure(figsize=(20,10))
sns.heatmap(Yellow_Cab_Corr,annot=True,vmin=-1,vmax=1,cmap='coolwarm')

In [24]:
# Here visualized scatter plots of MasterData features to see correlations between them if exists
fig,axes = plt.subplots(1, 2, figsize=(25, 8), sharey=True)
fig.suptitle('Boxplot Distributions')
sns.scatterplot(ax=axes[0],data=MasterData, x='KM_Travelled', y='Price_Charged' , hue="Company"  ).set_title("KM_Travelled  - Price_Charged")
sns.scatterplot(ax=axes[1],data=MasterData, x='KM_Travelled', y='Cost_of_Trip' , hue="Company").set_title("KM_Travelled  - Cost_of_Trip")

In [25]:
# Here visualized scatter plots of MasterData features to see correlations between them if exists
fig,axes = plt.subplots(figsize=(16, 6), sharey=True)
fig.suptitle('Boxplot Distributions')
sns.scatterplot(data=MasterData, x='Price_Charged', y='Cost_of_Trip' , hue="Company").set_title("Price_Charged - Cost_of_Trip")

In [26]:
# Here visualized scatter plots of MasterData features to see correlations between them if exists
fig,axes = plt.subplots(figsize=(16, 6), sharey=True)
fig.suptitle('Boxplot Distributions')
sns.scatterplot(data=MasterData, x='Users', y='Population' , hue="Company"  ).set_title("Population  - Users")

In [27]:
# Here visualized scatter plots of MasterData features to see correlations between them if exists
fig,axes = plt.subplots(1, 2, figsize=(25, 8), sharey=True)
fig.suptitle('Pink Cab Firm Boxplot Distributions')
sns.scatterplot(ax=axes[0],data=MasterData, x='Population', y='Price_Charged' , hue="Company").set_title("Population  - Price_Charged")
sns.scatterplot(ax=axes[1],data=MasterData, x='Users', y='Price_Charged' , hue="Company").set_title("Users - Price_Charged")

In [28]:
# Obtained covariance values for given features.

cov1  = np.cov(MasterData['KM_Travelled'],MasterData['Price_Charged'])[0][1]
cov2  = np.cov(MasterData['KM_Travelled'],MasterData['Cost_of_Trip'])[0][1]
cov3  = np.cov(MasterData['Price_Charged'],MasterData['Cost_of_Trip'])[0][1]
cov4  = np.cov(MasterData['Users'],MasterData['Population'])[0][1]
cov5  = np.cov(MasterData['Population'],MasterData['Price_Charged'])[0][1]
cov6  = np.cov(MasterData['Users'],MasterData['Price_Charged'])[0][1]

print('Covariance for  KM_Travelled - Price_Charged : '+ str(cov1))
print('Covariance for  KM_Travelled - Cost_of_Trip : '+ str(cov2))
print('Covariance for  Price_Charged - Cost_of_Trip : '+ str(cov3))
print('Covariance for  Users - Population : '+ str(cov4))
print('Covariance for  Population - Price_Charged : '+ str(cov5))
print('Covariance for  Users - Price_Charged : '+ str(cov6))


In [29]:
# Obtained pearson correlation coefficients for given features.

pcorr_coef1 , _ = stats.pearsonr(MasterData['KM_Travelled'],MasterData['Price_Charged'])
pcorr_coef2 , _ = stats.pearsonr(MasterData['KM_Travelled'],MasterData['Cost_of_Trip'])
pcorr_coef3 , _ = stats.pearsonr(MasterData['Price_Charged'],MasterData['Cost_of_Trip'])
pcorr_coef4 , _ = stats.pearsonr(MasterData['Users'],MasterData['Population'])
pcorr_coef5 , _ = stats.pearsonr(MasterData['Population'],MasterData['Price_Charged'])
pcorr_coef6 , _ = stats.pearsonr(MasterData['Users'],MasterData['Price_Charged'])

print('pearson correlation coefficient for  KM_Travelled - Price_Charged : '+ str(pcorr_coef1))
print('pearson correlation coefficient for  KM_Travelled - Cost_of_Trip : '+ str(pcorr_coef2))
print('pearson correlation coefficient for  Price_Charged - Cost_of_Trip : '+ str(pcorr_coef3))
print('pearson correlation coefficient for  Users - Population : '+ str(pcorr_coef4))
print('pearson correlation coefficient for  Population - Price_Charged : '+ str(pcorr_coef5))
print('pearson correlation coefficient for  Users - Price_Charged : '+ str(pcorr_coef6))

In [30]:
# Obtained spearman rank correlations and p - values for given features. 
spearman_rank_coeff1   = stats.spearmanr(MasterData['KM_Travelled'],MasterData['Price_Charged'])
spearman_rank_coeff2   = stats.spearmanr(MasterData['KM_Travelled'],MasterData['Cost_of_Trip'])
spearman_rank_coeff3   = stats.spearmanr(MasterData['Price_Charged'],MasterData['Cost_of_Trip'])
spearman_rank_coeff4   = stats.spearmanr(MasterData['Users'],MasterData['Population'])
spearman_rank_coeff5   = stats.spearmanr(MasterData['Population'],MasterData['Price_Charged'])
spearman_rank_coeff6   = stats.spearmanr(MasterData['Users'],MasterData['Price_Charged'])

print('spearman rank coefficient coefficient for  KM_Travelled - Price_Charged : '+ str(spearman_rank_coeff1))
print('spearman rank coefficient coefficient for  KM_Travelled - Cost_of_Trip : '+ str(spearman_rank_coeff2))
print('spearman rank coefficient coefficient for  Price_Charged - Cost_of_Trip : '+ str(spearman_rank_coeff3))
print('spearman rank coefficient coefficient for  Users - Population : '+ str(spearman_rank_coeff4))
print('spearman rank coefficient coefficient for  Population - Price_Charged : '+ str(spearman_rank_coeff5))
print('spearman rank coefficient coefficient for  Users - Price_Charged : '+ str(spearman_rank_coeff6))

#### ⚫ From the statistical data analysis ; We see that there is **strongly positive correlation** between **KM_Travelled - Price_Charged**  , **KM_Travelled - Cost_of_Trip** , **Price_Charged - Cost_of_Trip** , **Users - Population**  and also we prove that correlations by using spearman rank correlation , pearson correlation coefficient statistical tests and zero p-values which are lower than 0.05 value to satisfy the correlation values as statistically. 

### **Inferencial Data Analysis**

#### In this section ,  I analyzed MasterData and some meaningful pieces of MasterData which are extracted from it to make a sensable decision which Cab Firm is most valueble for investing in future times by taking insights and inferences from data analysis and visualizations with different aspects.

In [31]:
data0 = MasterData.groupby("Company").count()

fig1 = px.pie(data0,
                        values=data0.Users,
                        names=data0.index ,title="Pink & Yellow Cab Firm Total Users Overview"
                        )
fig1.update_traces(
                        textposition='inside',
                        textinfo='percent+label'
                        )
fig1.update_layout( width = 1100 , height = 600 )

#### ⚫ As seen from this Pie Chart; The total number of users of **Yellow Cab** is **approximately 3 times that of Pink Cab**.

In [32]:
data1_0 = MasterData[MasterData["Company"] == "Pink Cab"].groupby("City").count()
data1_1 = MasterData[MasterData["Company"] == "Yellow Cab"].groupby("City").count()

fig = go.Figure()
fig.add_trace(go.Bar(
                    x=data1_0.index,
                    y=data1_0['Users'],
                    name='Pink Cab',
                    marker_color='indianred' 
                    ))            
fig.add_trace(go.Bar(
                    x=data1_1.index,
                    y=data1_1['Users'],
                    name='Yellow Cab',
                    marker_color='blue'
                    ))
fig.update_layout(
    yaxis_title="Users",
    title="Pink & Yellow Cab Firm Users Distribution Over City" )

#### ⚫ As seen from this Bar Chart; For the **Yellow Cab** Company, the highest number of users on a city basis are in **New York, Washington and Chicago**, while for the **Pink Cab** Company, the most are in **Los Angeles, New York and San Diego**.

In [33]:
data2_0 = MasterData.groupby("City").count()
fig1 = px.pie(data2_0,
                        values=data2_0.Users,
                        names=data2_0.index ,title="Total Users Overview by Cities"
                        )
fig1.update_traces(
                        textposition='inside',
                        textinfo='percent+label'
                        )
fig1.update_layout( width = 1100 , height = 600 )

#### ⚫ As seen from this Pie Chart; On the basis of **cities**, the **highest number of total users**  are in **New York, Chicago, Los Angeles, Washington and Boston**.

In [34]:
MasterData["User_Pop_Ratio"] = MasterData["Users"] / MasterData["Population"]
MasterData["Profit_of_Trip"] = MasterData["Price_Charged"] - MasterData["Cost_of_Trip"]
data2_1 = MasterData.groupby("City").sum()
fig1 = px.pie(data2_1,
                        values=data2_1.Profit_of_Trip,
                        names=data2_1.index ,title="Total Market Profit Share by Cities"
                        )
fig1.update_traces(
                        textposition='inside',
                        textinfo='percent+label'
                        )
fig1.update_layout( width = 1100 , height = 600 )

#### ⚫ As seen from this Pie Chart; **More than half of the total market profit share** on the basis of cities belongs to **New York**.

In [35]:
pinkData = MasterData[MasterData['Company'] == 'Pink Cab']
yellowData = MasterData[MasterData['Company'] == 'Yellow Cab']

dict1 = {"Pink Cab" : [pinkData["Profit_of_Trip"].sum()] ,"Yellow Cab" : [yellowData["Profit_of_Trip"].sum()] }

data2_2 = pd.DataFrame(dict1).T

fig1 = px.pie(data2_2,
                        values=data2_2[0],
                        names=data2_2.index ,title="Total Market Profit Share by Cab Firms"
                        )
fig1.update_traces(
                        textposition='inside',
                        textinfo='percent+label'
                        )
fig1.update_layout( width = 1100 , height = 600 )

#### ⚫ As seen from this Pie Chart; The **total market profit share** of **Yellow Cab** is **approximately 9 times that of Pink Cab**.

In [36]:
data3_0 = MasterData.groupby("Gender").count()
fig1 = px.pie(data3_0,
                        values=data3_0.Users,
                        names=data3_0.index ,title="Total Users Overview by Gender"
                        )
fig1.update_traces(
                        textposition='inside',
                        textinfo='percent+label'
                        )
fig1.update_layout( width = 1100 , height = 600 )

#### ⚫ As seen from this Pie Chart; In the **distribution of users by gender**, there is an approximate **3 to 2 ratio for men and women**.

In [37]:
data3_1 = MasterData[MasterData["Company"] == "Pink Cab"].groupby("Gender").count()
data3_2 = MasterData[MasterData["Company"] == "Yellow Cab"].groupby("Gender").count()

fig = go.Figure()
fig.add_trace(go.Bar(
                    x=data3_1.index,
                    y=data3_1['Users'],
                    name='Pink Cab',
                    marker_color='pink' 
                    ))            
fig.add_trace(go.Bar(
                    x=data3_2.index,
                    y=data3_2['Users'],
                    name='Yellow Cab',
                    marker_color='orange'
                    ))
fig.update_layout(
    yaxis_title="Users",
    title="Pink & Yellow Cab Firm Users Distribution Over Gender" )

#### ⚫ As seen from this Bar Chart; When the distribution of users by gender is analyzed on a company basis, while the **male-female ratio is 57.6% - 42.4% in Yellow Cab** Company, the **male-female ratio is 55.9% - 44.1% in Pink Cab** Company.

In [38]:
data3_3 = MasterData.groupby("Payment_Mode").count()
fig1 = px.pie(data3_3,
                        values=data3_3.Users,
                        names=data3_3.index ,title="Total Users Overview by Payment Method"
                        )
fig1.update_traces(
                        textposition='inside',
                        textinfo='percent+label'
                        )
fig1.update_layout( width = 1100 , height = 600 )

#### ⚫ As seen from this Pie Chart; Considering the payment preferences of all users, the **credit card- cash payment ratio is 3 to 2**.

In [39]:
ageGroup_Young  = MasterData[(MasterData ["Age"] >= 18) & (MasterData ["Age"] < 25)].count()
ageGroup_Middle = MasterData[(MasterData ["Age"] >= 25) & (MasterData ["Age"] < 40)].count()
ageGroup_Old    = MasterData[(MasterData ["Age"] >= 40) & (MasterData ["Age"] <= 65)].count()

dict = {"25 > Age >= 18 (YOUNG)" : ageGroup_Young ,"40 > Age >= 25 (MIDDLE)" : ageGroup_Middle , "65 >= Age >= 40 (OLD)" : ageGroup_Old }
data4_0 = pd.DataFrame(dict).T

fig1 = px.pie(data4_0 ,
                        values=data4_0.Users,
                        names=data4_0.index ,title="Total Users Overview by Age Groups"
                        )
fig1.update_traces(
                        textposition='inside',
                        textinfo='percent+label'
                        )
fig1.update_layout( width = 1100 , height = 600 ) 

 

#### ⚫ As seen from this Pie Chart; Looking at the **age distribution of all users**, it is seen that **approximately half of them** are between the ages of **18-25**.

In [40]:
ageGroup_Young_Pink    = MasterData[(MasterData ["Age"] >= 18) & (MasterData ["Age"] < 25) & (MasterData["Company"] == "Pink Cab")] .count()
ageGroup_Middle_Pink   = MasterData[(MasterData ["Age"] >= 25) & (MasterData ["Age"] < 40) & (MasterData["Company"] == "Pink Cab")].count()
ageGroup_Old_Pink      = MasterData[(MasterData ["Age"] >= 40) & (MasterData ["Age"] <= 65) & (MasterData["Company"] == "Pink Cab")].count()

ageGroup_Young_Yellow  = MasterData[(MasterData ["Age"] >= 18) & (MasterData ["Age"] < 25) & (MasterData["Company"] == "Yellow Cab")] .count()
ageGroup_Middle_Yellow = MasterData[(MasterData ["Age"] >= 25) & (MasterData ["Age"] < 40) & (MasterData["Company"] == "Yellow Cab")].count()
ageGroup_Old_Yellow    = MasterData[(MasterData ["Age"] >= 40) & (MasterData ["Age"] <= 65) & (MasterData["Company"] == "Yellow Cab")].count()

dict0 = { "25 > Age >= 18 (YOUNG)" : ageGroup_Young_Pink ,"40 > Age >= 25 (MIDDLE)" : ageGroup_Middle_Pink , "65 >= Age >= 40 (OLD)" : ageGroup_Old_Pink }

dict1 = { "25 > Age >= 18 (YOUNG)" : ageGroup_Young_Yellow ,"40 > Age >= 25 (MIDDLE)" : ageGroup_Middle_Yellow , "65 >= Age >= 40 (OLD)" : ageGroup_Old_Yellow }

data4_1 = pd.DataFrame(dict0).T
data4_2 = pd.DataFrame(dict1).T

fig = go.Figure()
fig.add_trace(go.Bar(
                    x=data4_1.index,
                    y=data4_1['Users'],
                    name='Pink Cab',
                    marker_color='pink' 
                    ))            
fig.add_trace(go.Bar(
                    x=data4_2.index,
                    y=data4_2['Users'],
                    name='Yellow Cab',
                    marker_color='orange'
                    ))
fig.update_layout(
    yaxis_title="Users",
    title="Pink & Yellow Cab Firm Users Distributions by Age Groups" )

#### ⚫ As seen from this Bar Chart; Looking at the **age distribution of all users in the basis of companies**, it is seen that both have the **same percentage distribution for every age group** .

In [41]:
data5_0 = MasterData.groupby("City")["Income_(USD/Month)"].mean()
data5_0 = pd.DataFrame(data5_0)
fig1 = px.pie(data5_0,
                        values=data5_0["Income_(USD/Month)"],
                        names=data5_0.index ,title="Average Income by Cities"
                        )
fig1.update_traces(
                        textposition='inside',
                        textinfo='percent+label'
                        )
fig1.update_layout( width = 1100 , height = 600 )


#### ⚫ As seen from this Pie Chart; The **average income of all users by city** is approximately **equal**.

In [42]:
data5_1 = MasterData.groupby("Company")["Income_(USD/Month)"].mean()
data5_1 = pd.DataFrame(data5_1)
fig1 = px.pie(data5_1,
                        values=data5_1["Income_(USD/Month)"],
                        names=data5_1.index ,title="Average Income by Cab Firm"
                        )
fig1.update_traces(
                        textposition='inside',
                        textinfo='percent+label'
                        )
fig1.update_layout( width = 1100 , height = 600 )

#### ⚫ As seen from this Pie Chart; The **average income of all users by companies** is approximately **equal**.

In [43]:
data5_2 = MasterData.groupby("City")["KM_Travelled"].sum()
data5_2 = pd.DataFrame(data5_2)
fig1 = px.pie(data5_2,
                        values=data5_2["KM_Travelled"],
                        names=data5_2.index ,title="Total KM Travelled by Cities"
                        )
fig1.update_traces(
                        textposition='inside',
                        textinfo='percent+label'
                        )
fig1.update_layout( width = 1100 , height = 600 )

#### ⚫ As seen from this Pie Chart; On the basis of **cities**, the **most travelled in KM**  are **New York, Chicago, Los Angeles, Washington and Boston**.

In [44]:
data5_3 = MasterData.groupby("Company")["KM_Travelled"].sum()
data5_3 = pd.DataFrame(data5_3)
fig1 = px.pie(data5_3,
                        values=data5_3["KM_Travelled"],
                        names=data5_3.index ,title="Total KM Travelled by Cab Firm"
                        )
fig1.update_traces(
                        textposition='inside',
                        textinfo='percent+label'
                        )
fig1.update_layout( width = 1100 , height = 600 )

#### ⚫ As seen from this Pie Chart; The **total travelled in KM** for **Yellow Cab** is **approximately 3 times that of Pink Cab**.

In [45]:
MasterData["Profit_per_KM"] = MasterData["Profit_of_Trip"].sum() / MasterData["KM_Travelled"].sum()
data5_4 = MasterData.groupby("City")["Profit_per_KM"].mean()
data5_4 = pd.DataFrame(data5_4)

fig1 = px.pie(data5_4,
                        values=data5_4["Profit_per_KM"],
                        names=data5_4.index ,title="Average Profit per KM Travelled by Cities"
                        )
fig1.update_traces(
                        textposition='inside',
                        textinfo='percent+label'
                        )
fig1.update_layout( width = 1100 , height = 600 )

#### ⚫ As seen from this Pie Chart; The **average profit per travelled in KM by cities** is approximately **equal**.

In [46]:
data5_5 = MasterData.groupby("Company")["Profit_per_KM"].mean()
data5_5 = pd.DataFrame(data5_5)

fig1 = px.pie(data5_5,
                        values=data5_5["Profit_per_KM"],
                        names=data5_5.index ,title="Average Profit per KM Travelled by Cab Firm"
                        )
fig1.update_traces(
                        textposition='inside',
                        textinfo='percent+label'
                        )
fig1.update_layout( width = 1100 , height = 600 )

#### ⚫ As seen from this Pie Chart; The **average profit per travelled in KM by companies** is approximately **equal**.

#### **Time Series Analysis**

In [47]:
MasterData['Year_of_Travel'] = MasterData['Date_of_Travel'].dt.year
MasterData['Month_of_Travel'] = MasterData['Date_of_Travel'].dt.month
MasterData['Day_of_Travel'] = MasterData['Date_of_Travel'].dt.day

MasterData_TimeSeries=MasterData.set_index('Date_of_Travel')
MasterData_TimeSeries.head(10)

In [48]:
years_list = ["2016","2017","2018"]

def year_pink():
    
    
    
    profit_2016 = round ( MasterData[(MasterData['Year_of_Travel'] == 2016) & (MasterData.Company == "Pink Cab")]["Profit_of_Trip"].sum() ,2)
    profit_2017 = round ( MasterData[(MasterData['Year_of_Travel'] == 2017) & (MasterData.Company == "Pink Cab")]["Profit_of_Trip"].sum() ,2)
    profit_2018 = round ( MasterData[(MasterData['Year_of_Travel'] == 2018) & (MasterData.Company == "Pink Cab")]["Profit_of_Trip"].sum() ,2)
    
    profits = [profit_2016, profit_2017, profit_2018]
    
    return profits

   

def year_yellow():
    
    
    profit_2016 = round ( MasterData[(MasterData['Year_of_Travel'] == 2016) & (MasterData.Company == "Yellow Cab")]["Profit_of_Trip"].sum() ,2)
    profit_2017 = round ( MasterData[(MasterData['Year_of_Travel'] == 2017) & (MasterData.Company == "Yellow Cab")]["Profit_of_Trip"].sum() ,2)
    profit_2018 = round ( MasterData[(MasterData['Year_of_Travel'] == 2018) & (MasterData.Company == "Yellow Cab")]["Profit_of_Trip"].sum() ,2)
    
    profits = [profit_2016, profit_2017, profit_2018]
    
    return profits
   

yellow = year_yellow()
pink = year_pink()
zippedy = zip(yellow,years_list)
zippedp = zip(pink,years_list)

datap = pd.DataFrame(zippedp,columns=["Profit","Year"])
datay = pd.DataFrame(zippedy,columns=["Profit","Year"])

fig = go.Figure()
fig.add_trace(go.Scatter(x=datay.Year, y=datay.Profit,
                    mode='lines+markers',
                    name='Yellow Cab'
                    ))
fig.add_trace(go.Scatter(x=datap.Year, y=datap.Profit,
                    mode='lines+markers',
                    name='Pink Cab'
                     ))
fig.update_layout(
    title="Total Profit per year by Cab Firm",
    xaxis_title="Years",
    yaxis_title="Profits",
    legend_title="Cab Companies",
)

fig.show()


#### ⚫ As seen from this Pie Chart; 
In 2016 , The **total market profit share** of **Yellow Cab** is **approximately 8.15 times that of Pink Cab**.
In 2017 , The **total market profit share** of **Yellow Cab** is **approximately 8.16 times that of Pink Cab**.
In 2018 , The **total market profit share** of **Yellow Cab** is **approximately 8.66 times that of Pink Cab**

### ⚫ **Overall Conclusion**

When we consider for both Cab Firms in terms of **total market profit share** , **total user share**  , **yearly market profit share** , **total travelled in KM by Users**  ;  we will recommend **Yellow Cab Firm** for investment. 
