In [None]:
import pandas as pd
import numpy as np
from scipy import stats
from statsmodels.stats.weightstats import zconfint
import seaborn as sns
import matplotlib.pyplot as plt

# Read the trip-data sample CSV into the DataFrame used by every step below.
df = pd.read_csv(filepath_or_buffer='yellow_tripdata_sample.csv')

# Part B: Inferential Statistics  /  1. Confidence Intervals
# Both section banners are emitted in one call; sep="\n" reproduces the
# newline that a separate print() would have appended after the first banner.
print("Part B: Inferential Statistics\n", "1. Confidence Intervals\n", sep="\n")
# The 95% confidence interval is computed by hand from the z-distribution
# (rather than by calling statsmodels' zconfint), which assumes a large sample.
# For smaller samples, a t-distribution would be more appropriate.
def get_confidence_interval(data, alpha=0.05):
    """
    Computes a two-sided z-based confidence interval for the mean of a data series.

    Missing values (NaN) are removed before anything is computed, so the mean
    and the standard error are always based on the same observations.

    Parameters
    ----------
    data : array-like
        One-dimensional numeric observations (list, NumPy array or pandas
        Series).
    alpha : float, default 0.05
        Significance level; the interval has confidence level ``1 - alpha``
        (the default yields a 95% interval).

    Returns
    -------
    tuple
        ``(lower, upper)`` bounds of the interval. Both bounds are NaN when
        fewer than two non-missing observations are available, because the
        sample standard error is undefined in that case.
    """
    # np.mean on a pandas Series skips NaN while stats.sem propagates it;
    # dropping NaN explicitly keeps the two statistics consistent.
    values = np.asarray(data, dtype=float)
    values = values[~np.isnan(values)]
    if values.size < 2:
        return (np.nan, np.nan)

    mean = np.mean(values)
    std_err = stats.sem(values)  # sample standard error (ddof=1)
    z_critical = stats.norm.ppf(1 - alpha / 2)  # two-sided critical value
    margin_of_error = z_critical * std_err
    return (mean - margin_of_error, mean + margin_of_error)

# Compute one interval per column; the generator is unpacked in column order
# so each result lands in its own named variable.
trip_distance_ci, fare_amount_ci, tip_amount_ci = (
    get_confidence_interval(df[column])
    for column in ('trip_distance', 'fare_amount', 'tip_amount')
)

print(
    f"95% Confidence Interval for Mean Trip Distance: {trip_distance_ci}",
    f"95% Confidence Interval for Mean Fare Amount: {fare_amount_ci}",
    f"95% Confidence Interval for Mean Tip Amount: {tip_amount_ci}",
    sep="\n",
)

# Divider line between report sections.
print("\n" + "-"*50 + "\n")

# 2. Hypothesis Testing
print("2. Hypothesis Testing\n")

# Task 1: One-sample t-test (two-sided)
# H0: The average tip amount is equal to $2
# H1: The average tip amount is different from $2
mu_0 = 2
# Missing tips are dropped first: ttest_1samp propagates NaN by default, and a
# NaN p-value would silently fall into the "fail to reject" branch below.
t_stat_tip, p_value_tip = stats.ttest_1samp(df['tip_amount'].dropna(), mu_0)
print("One-sample t-test for Tip Amount vs. $2:")
print(f"  T-statistic: {t_stat_tip:.4f}")
print(f"  P-value: {p_value_tip:.4e}")
# Decision at the 5% significance level.
if p_value_tip < 0.05:
    print("  Conclusion: We reject the null hypothesis. The average tip amount is significantly different from $2.")
else:
    print("  Conclusion: We fail to reject the null hypothesis. The average tip amount is not significantly different from $2.")

print("\n" + "-"*50 + "\n")

# Task 2: Two-sample t-test
# Compare average fare_amount between two Payment_type groups (e.g., credit card vs cash).
# The two most frequent payment types are the groups being compared.
top_payment_types = df['payment_type'].value_counts().nlargest(2).index
if len(top_payment_types) < 2:
    print("Not enough unique payment types to perform a two-sample t-test.")
else:
    group1_name = top_payment_types[0]
    group2_name = top_payment_types[1]
    # Missing fares are dropped per group: ttest_ind propagates NaN by default,
    # and a NaN p-value would silently fall into the "fail to reject" branch.
    group1 = df.loc[df['payment_type'] == group1_name, 'fare_amount'].dropna()
    group2 = df.loc[df['payment_type'] == group2_name, 'fare_amount'].dropna()

    # Welch's t-test: does not assume the two groups share a variance.
    t_stat_fare, p_value_fare = stats.ttest_ind(group1, group2, equal_var=False)

    print(f"Two-sample t-test for Fare Amount between Payment Types '{group1_name}' and '{group2_name}':")
    print(f"  T-statistic: {t_stat_fare:.4f}")
    print(f"  P-value: {p_value_fare:.4e}")
    # Decision at the 5% significance level.
    if p_value_fare < 0.05:
        print(f"  Conclusion: We reject the null hypothesis. The average fare amount is significantly different between '{group1_name}' and '{group2_name}'.")
    else:
        print(f"  Conclusion: We fail to reject the null hypothesis. The average fare amount is not significantly different between '{group1_name}' and '{group2_name}'.")

print("\n" + "-"*50 + "\n")

# Task 3: Chi-square Test of Independence
# Test if Payment_type and RateCodeID are independent.
# The rate-code column is spelled both 'RateCodeID' and 'RatecodeID' in this
# analysis, so it is located case-insensitively instead of by one exact name.
rate_code_col = next(
    (col for col in df.columns if str(col).lower() == 'ratecodeid'),
    None,
)
if rate_code_col is not None and 'payment_type' in df.columns:
    # Contingency table of observed counts: payment type (rows) x rate code (columns)
    contingency_table = pd.crosstab(df['payment_type'], df[rate_code_col])
    chi2, p_chi2, dof, expected = stats.chi2_contingency(contingency_table)

    print("Chi-square Test of Independence for Payment_type and RateCodeID:")
    print(f"  Chi-square statistic: {chi2:.4f}")
    print(f"  P-value: {p_chi2:.4e}")
    # Decision at the 5% significance level. Failing to reject means no
    # association was detected; it is not proof of independence.
    if p_chi2 < 0.05:
        print("  Conclusion: We reject the null hypothesis. There is a significant association between Payment_type and RateCodeID.")
    else:
        print("  Conclusion: We fail to reject the null hypothesis. Payment_type and RateCodeID are independent.")
else:
    print("Chi-square test cannot be performed as 'RateCodeID' or 'payment_type' columns are not available.")

print("\n" + "-"*50 + "\n")

# 3. Correlation Analysis
print("3. Correlation Analysis\n")

# Pairwise Pearson coefficients for the two relationships of interest.
pearson_corr_dist_fare = df['trip_distance'].corr(df['fare_amount'], method='pearson')
pearson_corr_fare_tip = df['fare_amount'].corr(df['tip_amount'], method='pearson')

print(
    "Pearson Correlation:",
    f"  Trip_distance vs. Fare_amount: {pearson_corr_dist_fare:.4f}",
    f"  Fare_amount vs. Tip_amount: {pearson_corr_fare_tip:.4f}",
    sep="\n",
)

# Correlation matrix over the distance and amount columns, rendered as an
# annotated heatmap and written to a PNG file.
corr_matrix = df.loc[:, ['trip_distance', 'fare_amount', 'tip_amount', 'total_amount']].corr()

plt.figure(figsize=(8, 6))
sns.heatmap(data=corr_matrix, fmt=".2f", cmap='coolwarm', annot=True)
plt.title('Correlation Matrix Heatmap')
plt.tight_layout()
plt.savefig('correlation_matrix_heatmap.png')
print("\nCorrelation matrix heatmap saved as 'correlation_matrix_heatmap.png'")

# Divider line closing the section.
print("\n" + "-"*50 + "\n")

In [1]:
import pandas as pd
import numpy as np
from scipy import stats
from statsmodels.stats.weightstats import zconfint
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Read the trip-data sample CSV into the DataFrame used by the cells below.
df = pd.read_csv(filepath_or_buffer='yellow_tripdata_sample.csv')

In [5]:
# If the study were repeated many times, about 95% of the intervals built this way would contain the true mean.

print("1. Confidence Intervals\n")
def get_confidence_interval(data, alpha=0.05):
    """
    Computes a two-sided z-based confidence interval for the mean of a data series.

    Missing values (NaN) are removed before anything is computed, so the mean
    and the standard error are always based on the same observations.

    Parameters
    ----------
    data : array-like
        One-dimensional numeric observations (list, NumPy array or pandas
        Series).
    alpha : float, default 0.05
        Significance level; the interval has confidence level ``1 - alpha``
        (the default yields a 95% interval).

    Returns
    -------
    tuple
        ``(lower, upper)`` bounds of the interval. Both bounds are NaN when
        fewer than two non-missing observations are available, because the
        sample standard error is undefined in that case.
    """
    # np.mean on a pandas Series skips NaN while stats.sem propagates it;
    # dropping NaN explicitly keeps the two statistics consistent.
    values = np.asarray(data, dtype=float)
    values = values[~np.isnan(values)]
    if values.size < 2:
        return (np.nan, np.nan)

    mean = np.mean(values)
    std_err = stats.sem(values)  # sample standard error (ddof=1)
    z_critical = stats.norm.ppf(1 - alpha / 2)  # two-sided critical value
    margin_of_error = z_critical * std_err
    return (mean - margin_of_error, mean + margin_of_error)

# Compute one interval per column; the generator is unpacked in column order
# so each result lands in its own named variable.
trip_distance_ci, fare_amount_ci, tip_amount_ci = (
    get_confidence_interval(df[column])
    for column in ('trip_distance', 'fare_amount', 'tip_amount')
)

print(
    f"95% Confidence Interval for Mean Trip Distance: {trip_distance_ci}",
    f"95% Confidence Interval for Mean Fare Amount: {fare_amount_ci}",
    f"95% Confidence Interval for Mean Tip Amount: {tip_amount_ci}",
    sep="\n",
)



1. Confidence Intervals

95% Confidence Interval for Mean Trip Distance: (2.696283735685443, 3.0846609879326476)
95% Confidence Interval for Mean Fare Amount: (17.35176054077729, 19.205425388870953)
95% Confidence Interval for Mean Tip Amount: (3.22245775279109, 3.743833704495343)


In [6]:
print("One-sample t-test\n")
# The t-statistic measures the difference between the sample mean and the
# null-hypothesis mean in units of standard error; a t-statistic of 0 would
# mean there is no difference at all.
# The decision to reject or fail to reject the null hypothesis is based on the
# p-value, compared against the common significance threshold of 0.05 (5%).
mu_0 = 2
# Missing tips are dropped first: ttest_1samp propagates NaN by default, and a
# NaN p-value would silently fall into the "fail to reject" branch below.
t_stat_tip, p_value_tip = stats.ttest_1samp(df['tip_amount'].dropna(), mu_0)
print("One-sample t-test for Tip Amount vs. $2:")
print(f"  T-statistic: {t_stat_tip:.4f}")
print(f"  P-value: {p_value_tip:.4e}")
if p_value_tip < 0.05:
    print("  Conclusion: We reject the null hypothesis. The average tip amount is significantly different from $2.")
else:
    print("  Conclusion: We fail to reject the null hypothesis. The average tip amount is not significantly different from $2.")


One-sample t-test

One-sample t-test for Tip Amount vs. $2:
  T-statistic: 11.1509
  P-value: 2.7284e-27
  Conclusion: We reject the null hypothesis. The average tip amount is significantly different from $2.


In [7]:
print("Two-sample t-test\n")
# Compare average fare_amount between two Payment_type groups (e.g., credit card vs cash).
# The two most frequent payment types are the groups being compared.
top_payment_types = df['payment_type'].value_counts().nlargest(2).index
if len(top_payment_types) < 2:
    print("Not enough unique payment types to perform a two-sample t-test.")
else:
    group1_name = top_payment_types[0]
    group2_name = top_payment_types[1]
    # Missing fares are dropped per group: ttest_ind propagates NaN by default,
    # and a NaN p-value would silently fall into the "fail to reject" branch.
    group1 = df.loc[df['payment_type'] == group1_name, 'fare_amount'].dropna()
    group2 = df.loc[df['payment_type'] == group2_name, 'fare_amount'].dropna()

    # Welch's t-test: does not assume the two groups share a variance.
    t_stat_fare, p_value_fare = stats.ttest_ind(group1, group2, equal_var=False)

    print(f"Two-sample t-test for Fare Amount between Payment Types '{group1_name}' and '{group2_name}':")
    print(f"  T-statistic: {t_stat_fare:.4f}")
    print(f"  P-value: {p_value_fare:.4e}")
    # Decision at the 5% significance level.
    if p_value_fare < 0.05:
        print(f"  Conclusion: We reject the null hypothesis. The average fare amount is significantly different between '{group1_name}' and '{group2_name}'.")
    else:
        print(f"  Conclusion: We fail to reject the null hypothesis. The average fare amount is not significantly different between '{group1_name}' and '{group2_name}'.")


Two-sample t-test

Two-sample t-test for Fare Amount between Payment Types '1' and '2':
  T-statistic: 0.2472
  P-value: 8.0495e-01
  Conclusion: We fail to reject the null hypothesis. The average fare amount is not significantly different between '1' and '2'.


In [9]:
print("Chi-square Test of Independence\n")
# Task 3: Chi-square Test of Independence
# Test if Payment_type and RateCodeID are independent.
# The rate-code column is spelled both 'RateCodeID' and 'RatecodeID' in this
# analysis, so it is located case-insensitively instead of by one exact name.
rate_code_col = next(
    (col for col in df.columns if str(col).lower() == 'ratecodeid'),
    None,
)
if rate_code_col is not None and 'payment_type' in df.columns:
    # Contingency table of observed counts: payment type (rows) x rate code (columns)
    contingency_table = pd.crosstab(df['payment_type'], df[rate_code_col])
    chi2, p_chi2, dof, expected = stats.chi2_contingency(contingency_table)

    print("Chi-square Test of Independence for Payment_type and RatecodeID:")
    print(f"  Chi-square statistic: {chi2:.4f}")
    print(f"  P-value: {p_chi2:.4e}")
    # Decision at the 5% significance level. Failing to reject means no
    # association was detected; it is not proof of independence.
    if p_chi2 < 0.05:
        print("  Conclusion: We reject the null hypothesis. There is a significant association between Payment_type and RateCodeID.")
    else:
        print("  Conclusion: We fail to reject the null hypothesis. Payment_type and RateCodeID are independent.")
else:
    print("Chi-square test cannot be performed as 'RateCodeID' or 'payment_type' columns are not available.")


Chi-square Test of Independence

Chi-square Test of Independence for Payment_type and RatecodeID:
  Chi-square statistic: 6.8535
  P-value: 5.5252e-01
  Conclusion: We fail to reject the null hypothesis. Payment_type and RateCodeID are independent.
