In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd 
from bokeh.layouts import gridplot
from bokeh.plotting import figure, output_file, show
from bokeh.models import HoverTool,ColumnDataSource
from scipy import stats
from itertools import combinations


#I tried to run this lines to import all the year but it is too much for my 6GB ram 

#data = ["yellow_tripdata_2018-01.csv","yellow_tripdata_2018-02.csv","yellow_tripdata_2018-03.csv",
#        "yellow_tripdata_2018-04.csv","yellow_tripdata_2018-05.csv","yellow_tripdata_2018-06.csv"]
#df = pd.DataFrame()
#for i in data:
#    df1 = pd.read_csv(i)
#    df = df.append(df1)
#    del df1

#So I run only January
df = pd.read_csv("yellow_tripdata_2018-01.csv")

__*[CRQ1]*: Does the fare for mile change across NY's borough? We want to discover whether the expenses of a user that
          enjoys Taxis in one zone is different from those that uses it in another one.__
          
* Considering the fare amount:
  1. Compute the price per mile __P__ for each trip.  

In [3]:
#Create the price per mile column and append it to the dataframe

da = df['fare_amount']/df['trip_distance']
df['price_mile'] = pd.Series(da)
df['price_mile'].head()

0    9.000000
1    5.185185
2    7.500000
3    3.284314
4    5.000000
Name: price_mile, dtype: float64

In [4]:
#Download the taxi zone and select only what i need

taxi_zone = pd.read_csv("taxi_zone_lookup.csv")
taxi_zone = taxi_zone[['LocationID','Borough']]



Since now I want to merge the dataframes, and __pd.merge()__ with big data becomes really slow, I take a sample of the 'df' dataframe

In [6]:
#Choose 0.1 as fraction of the dataframe because some boroughs has low number of runs
#and less observations could ruin the analysis 
df_samp = df.sample(frac = 0.1)

#Merge the dataframe
bor = pd.merge(df_samp,taxi_zone, how = 'left', left_on = 'PULocationID', right_on = 'LocationID')

#Get a look at the 'price_mile' column that we're going to need
bor['price_mile'].describe()


count    8.758780e+05
mean              NaN
std               NaN
min              -inf
25%      4.355401e+00
50%      5.535714e+00
75%      7.042254e+00
max               inf
Name: price_mile, dtype: float64

In [7]:

#Count the NaN values
sum(bor['price_mile'].isna())

109

To adjust our column we need to get rid of the __NaN__ and __inf__ values and the __negative__ ones(due to errors in the dataset)

In [8]:
#Transform inf value in NaN
pd.options.mode.use_inf_as_na = True

#Deal with "NaN" values replacing them with 0
bor['price_mile'] = bor['price_mile'].replace(np.nan, 0)

#Transform all the negative values in 0
bor[bor['price_mile']<0] = 0

#Drop the "0" since it won't affect our analisys beacuse the zeros are all the NaN and negative values
bor = bor[bor['price_mile'] !=0]


bor['price_mile'].describe()

count    869969.000000
mean          6.995820
std          55.586562
min           0.000281
25%           4.347826
50%           5.517241
75%           7.000000
max        9200.000000
Name: price_mile, dtype: float64

Considering the fare amount:
     
  1. Compute the price per mile __P__ for each trip.  
    
  __2__. Run the __mean__ and the __standard deviation__ of the new variable for each borough. Then plot the distribution. What do you see?

Now we can start to compare every __borough__

In [10]:
#Divide the df for every borough to plot them
bor = bor[['Borough', 'price_mile']]

man = bor[bor['Borough'] == 'Manhattan']
broo = bor[bor['Borough'] == 'Brooklyn']
que = bor[bor['Borough'] == 'Queens']
bro = bor[bor['Borough'] == 'Bronx']
sta = bor[bor['Borough'] == 'Staten Island']
ewr = bor[bor['Borough'] == 'EWR']

man.head()

Unnamed: 0,Borough,price_mile
0,Manhattan,2.979452
1,Manhattan,4.893617
2,Manhattan,9.574468
3,Manhattan,8.653846
4,Manhattan,4.977376


In [11]:
#Make a list to put togheter all the dataframes
l = [man, broo, que, bro, sta, ewr]
borough_names = ['Manhattan','Brooklyn','Queens','Bronx','Staten Island','EWR']

#Iterate to find the mean and the standard deviation
for i,j in zip(l,borough_names):
    print('The mean of %s is : %.2f' % (j,i['price_mile'].mean()))
    print('The standard deviation of %s is: %.2f' % (j,i['price_mile'].std()))

    

The mean of Manhattan is : 6.73
The standard deviation of Manhattan is: 38.56
The mean of Brooklyn is : 6.10
The standard deviation of Brooklyn is: 32.55
The mean of Queens is : 6.86
The standard deviation of Queens is: 101.29
The mean of Bronx is : 6.04
The standard deviation of Bronx is: 8.07
The mean of Staten Island is : 5.91
The standard deviation of Staten Island is: 2.49
The mean of EWR is : 400.35
The standard deviation of EWR is: 1255.98


In [12]:
#I use Bokeh to make my plots
p_hist, edges = np.histogram(man['price_mile'], bins = 50, range = [0,10])

p_mile = pd.DataFrame({'pr_mile': p_hist, 'left': edges[:-1], 'right': edges[1:]})

# Add a column showing the extent of each interval
p_mile['p_interval'] = ['%.1f to %.1f $ per mile' % (left, right) for left,right in zip(p_mile['left'], p_mile['right'])]


#Create a function to make the plot
def make_plot(title, hist, edges, p_mile):
    p = figure(title=title,background_fill_color="#fafafa", x_axis_label = 'Price/mile',y_axis_label = 'Number of runs')
    
    src = ColumnDataSource(p_mile)
    
    p.quad(top='pr_mile', bottom=0, left='left', right='right',source = src,
           fill_color="navy", fill_alpha=0.75, hover_fill_alpha = 1.0, hover_fill_color = 'red')
    
    p.add_tools(HoverTool(tooltips=[('Interval', '@p_interval'),
                             ('Num of runs', '@pr_mile')]))
    
    
    p.title.text_font_size = '18pt'
    p.xaxis.axis_label_standoff = 20
    p.yaxis.axis_label_standoff = 20
    p.xaxis.axis_label_text_font_size = "16pt"
    p.yaxis.axis_label_text_font_size = "16pt"
    p.grid.grid_line_color="white"
    return p

#Recall the plot function
p_man= make_plot('Distribution of price per mile: Manhattan', p_hist, edges, p_mile)

#Change the data to make the plot for every borough
p_hist, edges = np.histogram(broo['price_mile'], bins = 50, range = [0,10])
p_mile = pd.DataFrame({'pr_mile': p_hist, 'left': edges[:-1], 'right': edges[1:]})
p_mile['p_interval'] = ['%.1f to %.1f $ per mile' % (left, right) for left,
      right in zip(p_mile['left'], p_mile['right'])]
p_broo= make_plot('Distribution of price per mile: Brooklyn', p_hist, edges, p_mile)

p_hist, edges = np.histogram(que['price_mile'], bins = 50, range = [0,10])
p_mile = pd.DataFrame({'pr_mile': p_hist, 'left': edges[:-1], 'right': edges[1:]})
p_mile['p_interval'] = ['%.1f to %.1f $ per mile' % (left, right) for left,
      right in zip(p_mile['left'], p_mile['right'])]
p_que= make_plot('Distribution of price per mile: Queens', p_hist, edges, p_mile)

p_hist, edges = np.histogram(bro['price_mile'], bins = 50, range = [0,10])
p_mile = pd.DataFrame({'pr_mile': p_hist, 'left': edges[:-1], 'right': edges[1:]})
p_mile['p_interval'] = ['%.1f to %.1f $ per mile' % (left, right) for left,
      right in zip(p_mile['left'], p_mile['right'])]
p_bro= make_plot('Distribution of price per mile: Bronx', p_hist, edges, p_mile)

p_hist, edges = np.histogram(sta['price_mile'], bins = 50, range = [0,10])
p_mile = pd.DataFrame({'pr_mile': p_hist, 'left': edges[:-1], 'right': edges[1:]})
p_mile['p_interval'] = ['%.1f to %.1f $ per mile' % (left, right) for left,
      right in zip(p_mile['left'], p_mile['right'])]
p_sta= make_plot('Distribution of price per mile: Staten Island', p_hist, edges, p_mile)

p_hist, edges = np.histogram(ewr['price_mile'], bins = 50, range = [0,10])
p_mile = pd.DataFrame({'pr_mile': p_hist, 'left': edges[:-1], 'right': edges[1:]})
p_mile['p_interval'] = ['%.1f to %.1f $ per mile' % (left, right) for left,
      right in zip(p_mile['left'], p_mile['right'])]
p_ewr= make_plot('Distribution of price per mile: EWR', p_hist, edges, p_mile)

output_file('hist.html')

#Create a grid plot with all the histograms of the distributions
show(gridplot([p_man,p_broo, p_que,p_bro, p_sta, p_ewr], ncols = 3, 
              plot_width = 600, plot_height = 600,toolbar_location = None))


What we can say about the dataset is:
* The most of the taxis are used in the borough of __Manhattan__ 
* In __Staten Island__ and __EWR__ doesn't pass almost any of the taxis(could be due to the fact that I sampled the dataset) 

Considering the fare amount:
        
  1. Compute the price per mile __P__ for each trip.  
    
  2. Run the __mean__ and the __standard deviation__ of the new variable for each borough. Then plot the distribution. What do you see?
  
  __3__. Run the __t-test__ among all the possible pairs of distribution of different boroughs.

In [13]:
#Recall the 'l' list of the boroughs without 'Staten Island' and 'EWR' because for them there are too low observations
l = [man, broo, que, bro]
c = list(combinations(l,2))

#Run the t-test
for i in c:
    print(i[0].iloc[0,0],i[1].iloc[0,0],stats.ttest_ind(i[0]['price_mile'],i[1]['price_mile'], equal_var = False), sep='\n')
            
    

Manhattan
Brooklyn
Ttest_indResult(statistic=1.8838155398163332, pvalue=0.05961875415402047)
Manhattan
Queens
Ttest_indResult(statistic=-0.28808078735936793, pvalue=0.773286041293168)
Manhattan
Bronx
Ttest_indResult(statistic=2.132024137758602, pvalue=0.03337314445027568)
Brooklyn
Queens
Ttest_indResult(statistic=-1.3612323613642636, pvalue=0.17344670125607223)
Brooklyn
Bronx
Ttest_indResult(statistic=0.1313379457290958, pvalue=0.8955184062514081)
Queens
Bronx
Ttest_indResult(statistic=1.4863742634931738, pvalue=0.1372393966767187)


Considering the fare amount:
        
  1. Compute the price per mile __P__ for each trip.  
    
  2. Run the __mean__ and the __standard deviation__ of the new variable for each borough. Then plot the distribution. What do you see?
  
  3. Run the __t-test__ among all the possible pairs of distribution of different boroughs.
  
  __4__. Can you say that __statistically significant differences__, on the averages, hold among zones? In other words, are Taxis trip in some boroughs, on average, more expensive than others?



The __p-value__ can be thought of as the probability of observing
the two data samples given the base assumption (null hypothesis)
that the two samples were drawn from a population with the same distribution.


 if __p<= 0.05__  we can reject the null hypotesis(same distribution).They have different distribution 
         
Useful information from our T-test could be taken from the borough with a valuable number of runs.
Those are __Manhattan__ and __Brooklyn__. Their p-value very small tells us that they have different distribution and we can say that the price per mile is different between the two.

The strange value of the p-value is the one between __Manhattan__ and __Queens__ that is quite high(0.78), saying that probably the price per mile between those __is the same__

The price per mile might depend on traffic the Taxi finds on its way. So we try to mitigate this effect:

 1. Likely, the duration of the trip says something about the city's congestion, especially if combined with the distances. It might be a good idea to weight the price for mile using the time $T$ needed to complete the trip.

  Thus, instead of $P$, you can use $P' = \frac{P}{T}$ , where $T$ is the time needed to complete the trip. 

In [22]:
#Select only the columns that I need
df_samp = df_samp[['tpep_pickup_datetime','tpep_dropoff_datetime', 'PULocationID', 'price_mile']]

#Merge df with taxi zone 
bor = pd.merge(df_samp,taxi_zone, how = 'left', left_on = 'PULocationID', right_on = 'LocationID')

#Operate with the date to get the duration of every trip in seconds
pu = pd.to_datetime(bor['tpep_pickup_datetime'])
do = pd.to_datetime(bor['tpep_dropoff_datetime'])
duration = (do-pu).dt.seconds
bor['duration_min'] = pd.Series(duration/60)

#Create the new column price per mile per time and append it to 'bor'
data = bor['price_mile']
dat = bor['duration_min']
da = data/dat
bor['p_mile_time'] = pd.Series(da)

#Deal with NaN and negative values as above
bor['p_mile_time'] = bor['p_mile_time'].replace(np.nan, 0)
bor[bor['p_mile_time']<0] = 0
bor = bor[bor['p_mile_time'] !=0]

bor.head()


Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,PULocationID,price_mile,LocationID,Borough,duration_min,p_mile_time
0,2018-01-09 21:08:13,2018-01-09 21:45:26,162,2.979452,162,Manhattan,37.216667,0.080057
1,2018-01-02 19:59:37,2018-01-02 20:13:33,142,4.893617,142,Manhattan,13.933333,0.351217
2,2018-01-17 22:25:18,2018-01-17 22:29:34,148,9.574468,148,Manhattan,4.266667,2.244016
3,2018-01-17 15:22:23,2018-01-17 15:26:37,164,8.653846,164,Manhattan,4.233333,2.044216
4,2018-01-12 17:49:30,2018-01-12 18:03:26,163,4.977376,163,Manhattan,13.933333,0.357228


In [15]:
bor['p_mile_time'].describe()

count    869953.000000
mean          9.775694
std         991.495625
min           0.000002
25%           0.285843
50%           0.545350
75%           1.032406
max      552000.000000
Name: p_mile_time, dtype: float64

The price per mile might depend on traffic the Taxi finds on its way. So we try to mitigate this effect:

  1. Likely, the duration of the trip says something about the city's congestion, especially if combined with the distances. It might be a good idea to weight the price for mile using the time $T$ needed to complete the trip.

  Thus, instead of $P$, you can use $P' = \frac{P}{T}$ , where $T$ is the time needed to complete the trip.
  
  __2__. Run the mean and the standard deviation of the new variable for each borough. Then plot the distribution. What do you see?

In [23]:
#Divide the df for every borough to plot them
bor = bor[['Borough', 'p_mile_time','price_mile']]

man = bor[bor['Borough'] == 'Manhattan']
broo = bor[bor['Borough'] == 'Brooklyn']
que = bor[bor['Borough'] == 'Queens']
bro = bor[bor['Borough'] == 'Bronx']
sta = bor[bor['Borough'] == 'Staten Island']
ewr = bor[bor['Borough'] == 'EWR']

#Make a list to put togheter all the dataframes
l = [man, broo, que, bro, sta, ewr]
borough_names = ['Manhattan','Brooklyn','Queens','Bronx','Staten Island','EWR']

#iterate to find the mean and the standard deviation
for i,j in zip(l,borough_names):
    print('The mean of %s is : %.2f' % (j,i['p_mile_time'].mean()))
    print('The standard deviation of %s is: %.2f' % (j,i['p_mile_time'].std()))

The mean of Manhattan is : 3.86
The standard deviation of Manhattan is: 353.39
The mean of Brooklyn is : 7.15
The standard deviation of Brooklyn is: 311.02
The mean of Queens is : 17.92
The standard deviation of Queens is: 999.09
The mean of Bronx is : 3.41
The standard deviation of Bronx is: 33.50
The mean of Staten Island is : 4.13
The standard deviation of Staten Island is: 7.23
The mean of EWR is : 362.99
The standard deviation of EWR is: 830.10


In [17]:
#I use Bokeh to make my plots
p_hist, edges = np.histogram(man['p_mile_time'], bins = 40, range = [0,4])

p_mile_time = pd.DataFrame({'pr_mile_time': p_hist, 'left': edges[:-1], 'right': edges[1:]})

# Add a column showing the extent of each interval
p_mile_time['p_interval'] = ['%.1f to %.1f $ per mile per minute' % (left, right)
                        for left,right in zip(p_mile_time['left'], p_mile_time['right'])]


#Create a function to make the plot
def make_plot(title, hist, edges, p_mile_time):
    p = figure(title=title,background_fill_color="#fafafa", 
               x_axis_label = 'Price/(mile x time)($,mile,min)',y_axis_label = 'Number of runs')
    
    src = ColumnDataSource(p_mile_time)
    
    p.quad(top='pr_mile_time', bottom=0, left='left', right='right',source = src,
           fill_color="yellow", fill_alpha=0.75, hover_fill_alpha = 1.0, hover_fill_color = 'black')
    
    p.add_tools(HoverTool(tooltips=[('Interval', '@p_interval'),
                             ('Num of runs', '@pr_mile_time')]))
    
    
    p.title.text_font_size = '16pt'
    p.xaxis.axis_label_standoff = 20
    p.yaxis.axis_label_standoff = 20
    p.xaxis.axis_label_text_font_size = "14pt"
    p.yaxis.axis_label_text_font_size = "14pt"
    return p

#Recall the plot function
p_man= make_plot('Distribution of price per mile: Manhattan', p_hist, edges, p_mile_time)

#Change the data to make the plot for every borough
p_hist, edges = np.histogram(broo['p_mile_time'], bins = 40, range = [0,4])
p_mile_time = pd.DataFrame({'pr_mile_time': p_hist, 'left': edges[:-1], 'right': edges[1:]})
p_mile_time['p_interval'] = ['%.1f to %.1f $ per mile per minute' % (left, right) for left,
      right in zip(p_mile_time['left'], p_mile_time['right'])]
p_broo= make_plot('Distribution of price per mile per minute: Brooklyn', p_hist, edges, p_mile_time)

p_hist, edges = np.histogram(que['p_mile_time'], bins = 40, range = [0,4])
p_mile_time = pd.DataFrame({'pr_mile_time': p_hist, 'left': edges[:-1], 'right': edges[1:]})
p_mile_time['p_interval'] = ['%.1f to %.1f $ per mile per minute' % (left, right) for left,
      right in zip(p_mile_time['left'], p_mile_time['right'])]
p_que= make_plot('Distribution of price per mile per minute: Queens', p_hist, edges, p_mile_time)

p_hist, edges = np.histogram(bro['p_mile_time'], bins = 40, range = [0,4])
p_mile_time = pd.DataFrame({'pr_mile_time': p_hist, 'left': edges[:-1], 'right': edges[1:]})
p_mile_time['p_interval'] = ['%.1f to %.1f $ per mile per minute' % (left, right) for left,
      right in zip(p_mile_time['left'], p_mile_time['right'])]
p_bro= make_plot('Distribution of price per mile per minute: Bronx', p_hist, edges, p_mile_time)


p_hist, edges = np.histogram(sta['p_mile_time'], bins = 40, range = [0,4])
p_mile_time = pd.DataFrame({'pr_mile_time': p_hist, 'left': edges[:-1], 'right': edges[1:]})
p_mile_time['p_interval'] = ['%.1f to %.1f $ per mile per minute' % (left, right) for left,
      right in zip(p_mile_time['left'], p_mile_time['right'])]
p_sta= make_plot('Distribution of price per mile per minute: St. Island', p_hist, edges, p_mile_time)


p_hist, edges = np.histogram(ewr['p_mile_time'], bins = 40, range = [0,4])
p_mile_time = pd.DataFrame({'pr_mile_time': p_hist, 'left': edges[:-1], 'right': edges[1:]})
p_mile_time['p_interval'] = ['%.1f to %.1f $ per mile per minute' % (left, right) for left,
      right in zip(p_mile_time['left'], p_mile_time['right'])]
p_ewr= make_plot('Distribution of price per mile per minute: EWR', p_hist, edges, p_mile_time)

output_file('hist.html')

#Create a grid plot with all the histograms of the distributions
show(gridplot([p_man,p_broo, p_que,p_bro, p_sta, p_ewr], ncols = 3,
              plot_width = 600, plot_height = 600,toolbar_location = None))


In [18]:
#Describe the variable to see why the mean and the plot doesn't coincide
man['p_mile_time'].describe()


count    793564.000000
mean          3.858789
std         353.389328
min           0.000002
25%           0.321571
50%           0.580495
75%           1.072784
max      156000.000000
Name: p_mile_time, dtype: float64

From the __mean__, the __deviation__ and the __plot__ we can see, taking from example Manhattan, that the plot seem to go toward a mean of __0.2/0.3__, versus the mean that says that is almost __4__. That is due to the fact that we sampled the dataset and that the variable has a great standard deviation, so is possibile that the plot could not coincide with the mean. Overall we can see above that the 75th percentile is 1.06 so is more than possible to have a plot that varies from the mean.

As before 'EWR' and 'Staten Island' haven't enough values to be plotted giving that bad result 

The price per mile might depend on traffic the Taxi finds on its way. So we try to mitigate this effect:

  1. Likely, the duration of the trip says something about the city's congestion, especially if combined with the distances. It might be a good idea to weight the price for mile using the time $T$ needed to complete the trip.

  Thus, instead of $P$, you can use $P' = \frac{P}{T}$ , where $T$ is the time needed to complete the trip.
  
  2. Run the mean and the standard deviation of the new variable for each borough. Then plot the distribution. What do you see?
  
  __3__.Run the __T-test__ among all the possible pairs of new distribution of different boroughs.

In [19]:
#Recall the 'l' list of the boroughs without 'Staten Island' and 'EWR' because for them there are too low observations
l = [man, broo, que, bro]
c = list(combinations(l,2))

#Run the t-test
for i in c:
    print(i[0].iloc[0,0],i[1].iloc[0,0],stats.ttest_ind(i[0]['p_mile_time'],i[1]['p_mile_time'], equal_var = False), sep='\n')

Manhattan
Brooklyn
Ttest_indResult(statistic=-1.0366191399091664, pvalue=0.29993838061544)
Manhattan
Queens
Ttest_indResult(statistic=-3.1738001322372877, pvalue=0.0015054464571046208)
Manhattan
Bronx
Ttest_indResult(statistic=0.3258345504923033, pvalue=0.7446396977817862)
Brooklyn
Queens
Ttest_indResult(statistic=-1.98437262569883, pvalue=0.047219858185060914)
Brooklyn
Bronx
Ttest_indResult(statistic=1.094929921164507, pvalue=0.27357627459457146)
Queens
Bronx
Ttest_indResult(statistic=3.1495743255609607, pvalue=0.0016363936585582458)


The price per mile might depend on traffic the Taxi finds on its way. So we try to mitigate this effect:

  1. Likely, the duration of the trip says something about the city's congestion, especially if combined with the distances. It might be a good idea to weight the price for mile using the time $T$ needed to complete the trip.

  Thus, instead of $P$, you can use $P' = \frac{P}{T}$ , where $T$ is the time needed to complete the trip.
  
  2. Run the mean and the standard deviation of the new variable for each borough. Then plot the distribution. What do you see?
  
  3. Run the __T-test__ among all the possible pairs of new distribution of different boroughs.
  
  __4__.Can you say that __statistically significant differences__, on the averages, hold among zones? In other words, are Taxis trip in some boroughs, on average, more expensive than others?

from this T-test we can say that __almost every borough__ has different price from each other but Bronx(0.74,0.27). 

The price per mile might depend on traffic the Taxi finds on its way. So we try to mitigate this effect:

  1. Likely, the duration of the trip says something about the city's congestion, especially if combined with the distances. It might be a good idea to weight the price for mile using the time $T$ needed to complete the trip.

  Thus, instead of $P$, you can use $P' = \frac{P}{T}$ , where $T$ is the time needed to complete the trip.
  
  2. Run the mean and the standard deviation of the new variable for each borough. Then plot the distribution. What do you see?
  
  3. Run the __T-test__ among all the possible pairs of new distribution of different boroughs.
  
  4. Can you say that __statistically significant differences__, on the averages, hold among zones? In other words, are Taxis trip in some boroughs, on average, more expensive than others?

 __5__.Compare the results obtained for the price per mile and the weighted price for mile. What do you think about that?

In [28]:
#Make a list to put togheter all the dataframes
l = [man, broo, que, bro, sta, ewr]
borough_names = ['Manhattan','Brooklyn','Queens','Bronx','Staten Island','EWR']

#iterate to find the mean and the standard deviation for 'price_mile' and 'p_mile_time'
for i,j in zip(l,borough_names):
    print('\nThe mean of price per mile of %s is : %.2f' % (j,i['price_mile'].mean()))
    print('\nThe mean of price per mile per minute of  %s is : %.2f' % (j,i['p_mile_time'].mean()))
    print('\nThe standard deviation of price per mile of %s is: %.2f' % (j,i['price_mile'].std()))
    print('\nThe standard deviation of price per mile per minute of  %s is: %.2f' % (j,i['p_mile_time'].std()))
    
#Recall the 'l' list of the boroughs without 'Staten Island' and 'EWR' because for them there are too low observations
l = [man, broo, que, bro]
c = list(combinations(l,2))

#Run the t-test
for i in c:
    print(i[0].iloc[0,0],i[1].iloc[0,0],stats.ttest_ind(i[0]['price_mile'],i[1]['price_mile'], equal_var = False), sep='\n')
    print(i[0].iloc[0,0],i[1].iloc[0,0],stats.ttest_ind(i[0]['p_mile_time'],i[1]['p_mile_time'], equal_var = False), sep='\n')    
    


The mean of price per mile of Manhattan is : 6.72

The mean of price per mile per minute of  Manhattan is : 3.86

The standard deviation of price per mile of Manhattan is: 38.12

The standard deviation of price per mile per minute of  Manhattan is: 353.39

The mean of price per mile of Brooklyn is : 6.10

The mean of price per mile per minute of  Brooklyn is : 7.15

The standard deviation of price per mile of Brooklyn is: 32.55

The standard deviation of price per mile per minute of  Brooklyn is: 311.02

The mean of price per mile of Queens is : 6.86

The mean of price per mile per minute of  Queens is : 17.92

The standard deviation of price per mile of Queens is: 101.30

The standard deviation of price per mile per minute of  Queens is: 999.09

The mean of price per mile of Bronx is : 6.04

The mean of price per mile per minute of  Bronx is : 3.41

The standard deviation of price per mile of Bronx is: 8.07

The standard deviation of price per mile per minute of  Bronx is: 33.50

The

The analisys of the __means__ of the two different price tells us that,as could be predict, __the traffic has influence__ on the price of the run:

     Manhattan price per mile: 6.72
               price per mile per minute: 3.86

That we can see,instead, is that in boroughs with not too much traffic as Manhattan like Brooklyn, the price for mile is __smaller__ than the one weighted:

     Brooklyn price per mile: 6.10
              price per mile per minute: 7.15
          
     Queens price per mile: 6.86
              price per mile per minute: 17.92

The effect of the weigthed price is that create a __great variance_ in the data 

From the __T-test__ taking from example the comparison between Manhattan and Brooklyn:
                   
     Manhattan-Brooklyn's price per mile p-value: 0.06
                          price per mile per minute p-value: 0.29
                          
We can see that they could __seem to be different__, but when we weight the price we are not so sure about their difference.

Instead if we look at Manhattan-Queens and Brooklyn-Queens:

     Manhattan-Queens's price per mile p-value: 0.76
                        price per mile per minute p-value: 0.001
                        
     Brooklyn-Queens's  price per mile p-value: 0.17
                        price per mile per minute p-value: 0.04               
                  
What we deduce is that when we took off the variable of the time we can obviusly say that they are different and that on average __they have different price.__
      
        