# In [None]:  (Jupyter cell prompt left over from the notebook export)
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
# Show every column and allow wide lines when DataFrames are printed.
pd.set_option("display.max_columns", None)
pd.set_option("display.width", 500)
# Load the four raw tables of the cab dataset.
cab_data = pd.read_csv('Cab_Data.csv')
customer_id = pd.read_csv('Customer_ID.csv')
transaction_id = pd.read_csv('Transaction_ID.csv')
city = pd.read_csv('City.csv')

# Join the tables into one wide frame. No join keys are given, so each merge
# uses pandas' defaults (inner join on the column names both frames share).
df1 = customer_id.merge(transaction_id)
df2 = city.merge(cab_data)
df = df1.merge(df2).reset_index()
df.to_csv("big_data.csv")

# Population and Users hold counts but are read as strings containing
# thousands separators, so the commas are stripped before casting to int.
# The helper "index" column created by reset_index() is no longer needed.
df.drop(columns="index", inplace=True)
df['Population'] = df['Population'].str.replace(',', '').astype(int)
df['Users'] = df['Users'].str.replace(',', '').astype(int)
df.head()

# Per company: sum and max of the "Date of Travel" column.
cab_user_in_cer_time = df.groupby("Company").agg({"Date of Travel": ['sum', 'max']})

# Mean distance per income level (author's note: no relation between income
# and KM travelled was observed).
income_vs_Km = df.groupby("Income (USD/Month)")[["KM Travelled"]].mean()
print(income_vs_Km.reset_index())

# Mean price per income level (author's note: no relation between income and
# the price charged was observed either).
income_vs_Price = df.groupby("Income (USD/Month)")[["Price Charged"]].mean()
print(income_vs_Price.reset_index())

# Columns that are not used in the rest of the analysis.
drop_list = ["Income (USD/Month)","Transaction ID","Payment_Mode", "Population", "Users"]
df.drop(columns=drop_list, inplace=True)

def num_count_plt(dataframe):
    """Draw one seaborn count plot per numeric column of ``dataframe``.

    The previous check compared each dtype with the list ``[int, float]``,
    which is never equal, so no plot was ever produced. It also plotted from
    the module-level ``df`` instead of the frame passed in.

    Args:
        dataframe: pandas DataFrame whose numeric (integer/float) columns are
            plotted. Boolean and non-numeric columns are skipped.

    Returns:
        None. Each figure is shown with ``plt.show()``.
    """
    # select_dtypes("number") keeps integer and float columns of any width.
    for col in dataframe.select_dtypes(include="number").columns:
        sns.countplot(x=col, data=dataframe)
        plt.title(f"{col} Distribution")
        plt.xlabel(str(col))
        plt.show()

# Run num_count_plt on the merged frame.
num_count_plt(df)

# Maximum "Date of Travel" value per company (the result is not stored).
df.groupby("Company").agg({"Date of Travel": "max"})

def check_df(dataframe, head=10):
    """Print a quick structural overview of ``dataframe``.

    Sections printed, in order: shape, column dtypes, first ``head`` rows,
    last ``head`` rows, missing-value count per column, and transposed
    ``describe`` output at the 0/5/50/95/99/100 percentiles. Every section
    used to be titled "SHAPE"; each one now carries its own label.

    Args:
        dataframe: pandas DataFrame to inspect.
        head: number of rows shown in the head and tail sections.

    Returns:
        None. Everything is written to stdout.
    """

    def _section(title, content):
        # Same banner layout as before; only the label varies.
        print(f"############ {title} ####################################")
        print(content)

    _section("SHAPE", dataframe.shape)
    _section("TYPES", dataframe.dtypes)
    _section("HEAD", dataframe.head(head))
    _section("TAIL", dataframe.tail(head))
    _section("NA", dataframe.isnull().sum())
    _section("QUANTILES", dataframe.describe([0, 0.05, 0.50, 0.95, 0.99, 1]).T)

# Print the overview for the merged frame (default of 10 head/tail rows).
check_df(df)

####################### CATEGORICAL VARIABLE ANALYSIS ###################################
def cat_variable(dataframe):
    """Print which columns of ``dataframe`` should be treated as categorical.

    Four lists are printed:
      * columns with a categorical-like dtype (category, object/string, bool);
      * numeric columns with fewer than 10 distinct values
        ("numerical but categorical");
      * non-numeric columns with more than 20 distinct values
        ("categorical but cardinal");
      * the final list: the first two combined, minus the cardinal ones.

    Fixes over the previous version: the dtype lookup used the module-level
    ``df`` instead of the argument, and the cardinal test compared strings
    such as "int64" against "int"/"float", so it also matched every numeric
    column. Dtypes are now classified through ``pandas.api.types``.

    Args:
        dataframe: pandas DataFrame to classify.

    Returns:
        None. The lists are written to stdout.
    """
    types = pd.api.types

    def _is_number(col):
        # Booleans count as categorical here, not numeric.
        dtype = dataframe[col].dtype
        return types.is_numeric_dtype(dtype) and not types.is_bool_dtype(dtype)

    def _is_categorical(col):
        dtype = dataframe[col].dtype
        return (
            isinstance(dtype, pd.CategoricalDtype)
            or types.is_bool_dtype(dtype)
            or types.is_object_dtype(dtype)
            or types.is_string_dtype(dtype)
        )

    cat_cols = [col for col in dataframe.columns if _is_categorical(col)]

    print("Categorical Variables", cat_cols)

    numerical_but_categorical_variable = [
        col for col in dataframe.columns
        if dataframe[col].nunique() < 10 and _is_number(col)
    ]

    print("Those variables numerical but categorical variables", numerical_but_categorical_variable)

    categorical_but_cardianal_variable = [
        col for col in dataframe.columns
        if dataframe[col].nunique() > 20 and not _is_number(col)
    ]

    print("Those variables categorical variables but cardinal variables", categorical_but_cardianal_variable)

    cat_cols = cat_cols + numerical_but_categorical_variable

    cat_cols = [col for col in cat_cols if col not in categorical_but_cardianal_variable]

    print("All categorical variables are:", cat_cols)

# Print the categorical-column breakdown for the merged frame.
cat_variable(df)
def cat_summary(dataframe, variable, plot=False):
    """Print value counts and percentage shares for one column.

    Args:
        dataframe: pandas DataFrame holding the column.
        variable: name of the column to summarise.
        plot: when true, also show a seaborn count plot of the column.

    Returns:
        None. The summary table is written to stdout.
    """
    counts = dataframe[variable].value_counts()
    # Share of all rows, in percent.
    shares = 100 * counts / len(dataframe)
    print(pd.DataFrame({variable: counts, "Ratio": shares}))

    if not plot:
        return

    sns.countplot(x=dataframe[variable], data=dataframe)
    plt.show(block=True)

# Summarise and plot each of these columns in turn.
for col in ('Age', 'KM Travelled', 'Price Charged', 'Cost of Trip'):
    cat_summary(df, col, plot=True)

def categoric_count_plt(dataframe):
    """Draw one seaborn count plot per categorical column of ``dataframe``.

    A column is treated as categorical when its dtype is bool, object/string
    or pandas ``category``. The previous check compared each dtype with the
    list ``[bool, object]``, which is never equal, so nothing was plotted.
    It also read from the module-level ``df`` and drew a box plot, which has
    no numeric axis for a single categorical column; a count plot matches the
    function's name.

    Args:
        dataframe: pandas DataFrame whose categorical columns are plotted.

    Returns:
        None. Each figure is shown with ``plt.show()``.
    """
    types = pd.api.types
    for col in dataframe.columns:
        dtype = dataframe[col].dtype
        is_categorical = (
            isinstance(dtype, pd.CategoricalDtype)
            or types.is_bool_dtype(dtype)
            or types.is_object_dtype(dtype)
            or types.is_string_dtype(dtype)
        )
        if not is_categorical:
            continue
        sns.countplot(x=col, data=dataframe)
        plt.title(f"{col} Distribution")
        plt.xlabel(str(col))
        plt.show()

# Plot the categorical columns. This line used to repeat num_count_plt(df),
# leaving the categoric_count_plt function defined just above never called.
categoric_count_plt(df)


# Profit Analysis
import statsmodels.api as sm
def calculate_profit(dataframe):
    """Return total profit per customer.

    Profit is the sum of "Price Charged" minus the sum of "Cost of Trip",
    both taken per "Customer ID".

    Args:
        dataframe: pandas DataFrame with "Customer ID", "Price Charged" and
            "Cost of Trip" columns.

    Returns:
        pandas Series indexed by "Customer ID" holding each customer's profit.
    """
    per_customer = dataframe.groupby('Customer ID')

    revenue = per_customer['Price Charged'].sum()  # total charged
    cost = per_customer['Cost of Trip'].sum()      # total trip cost

    return revenue - cost
# Per-customer profit and its grand total.
profit = calculate_profit(df)
profit.sum()
# `profit` is indexed by Customer ID, so assigning it to a column would align
# customer IDs against the frame's row labels and store wrong or missing
# values. Store the profit of each ride instead, so that later per-date sums
# of "Profit" are meaningful.
df["Profit"] = df["Price Charged"] - df["Cost of Trip"]
def forecast_profit_and_rides(df, company_type, forecast_periods=30):
    """Forecast daily profit and ride counts for one cab company.

    The rows of ``company_type`` are grouped by "Date of Travel"; the summed
    "Profit" and the number of rides per date are each fitted with an
    ARIMA(1, 1, 1) model and forecast ``forecast_periods`` steps ahead.

    The previous version computed the company filter but then grouped the
    unfiltered frame, so every company received the same forecast.

    Args:
        df: pandas DataFrame with "Company", "Date of Travel" and "Profit"
            columns.
        company_type: value of the "Company" column to forecast for.
        forecast_periods: number of future steps to forecast (default 30).

    Returns:
        Tuple ``(profit_forecast, rides_forecast)`` as returned by the fitted
        models' ``forecast`` method.

    Raises:
        ValueError: if no row of ``df`` belongs to ``company_type``.
    """
    # Keep only the rides of the requested company.
    df_cab = df[df['Company'] == company_type]
    if df_cab.empty:
        raise ValueError(f"No rides found for company {company_type!r}")

    # Per date: total profit and number of rides.
    by_date = df_cab.groupby('Date of Travel')
    profit_series = by_date['Profit'].sum()
    rides_series = by_date.size()

    df_forecast = pd.DataFrame({'Profit': profit_series, 'Rides': rides_series})

    # Fit one model per series.
    profit_results = sm.tsa.ARIMA(df_forecast['Profit'], order=(1, 1, 1)).fit()
    rides_results = sm.tsa.ARIMA(df_forecast['Rides'], order=(1, 1, 1)).fit()

    profit_forecast = profit_results.forecast(steps=forecast_periods)
    rides_forecast = rides_results.forecast(steps=forecast_periods)

    return profit_forecast, rides_forecast

# Produce the same report for each company: print both forecasts, then draw
# them on one figure.
for cab_type in ('Pink Cab', 'Yellow Cab'):
    profit_forecast, rides_forecast = forecast_profit_and_rides(df, cab_type)

    print("Profit Forecast for", cab_type, ":")
    print(profit_forecast)

    print("Rides Forecast for", cab_type, ":")
    print(rides_forecast)

    # Visualization of the forecasts via matplotlib
    plt.figure(figsize=(12, 6))
    plt.plot(profit_forecast, label='Profit Forecast')
    plt.plot(rides_forecast, label='Rides Forecast')
    plt.legend()
    plt.xlabel('Date')
    plt.ylabel('Forecast')
    plt.title(f'Profit and Rides Forecast for {cab_type}')
    plt.show()




########################################################################################################################
"""
Hypothesis 1: Gender and Price Relationship: You can examine how gender (Gender) affects the price.
We can compare how much, on average, male and female customers pay. 
Our hypothesis could be as follows: "Do male customers pay, on average, more money than female customers?"
"""
# No data is loaded here; the function below works on whatever DataFrame it is given.
# Calculate average prices by gender
def avg_prices(dataframe):
    """Print the mean "Price Charged" per gender and which gender pays more.

    Args:
        dataframe: pandas DataFrame with "Gender" and "Price Charged" columns;
            "Gender" must contain both "Male" and "Female".

    Returns:
        None. The per-gender means and the verdict are written to stdout.
    """
    mean_by_gender = dataframe.groupby("Gender")["Price Charged"].mean()
    print(mean_by_gender)
    # TODO: also compare the average price by city.

    # Compare the two group means.
    male_mean = mean_by_gender["Male"]
    female_mean = mean_by_gender["Female"]

    print(
        "Male customers pay, on average, more than female customers."
        if male_mean > female_mean
        else "Male customers pay on average less than female customers."
        if male_mean < female_mean
        else "Male and female customers pay the same average amount."
    )

# Run the gender/price comparison on the merged frame.
avg_prices(df)
"""
Hypothesis result: Male customers pay on average more than female customers.
"""

########################################################################################################################
"""
Hypothesis 2: Age and Journey Duration Relationship: You can investigate how age (Age) affects the travel duration.
We can compare the average travel durations of young and elderly customers.
Our hypothesis could be as follows: "Do young customers, on average, make longer journeys than elderly customers?"
"""
def avg_durac(dataframe):
    """Print the mean "KM Travelled" per age and compare young with elderly.

    "Young" means ages below 30 and "elderly" ages of 60 and above. Each
    group's figure is the plain mean of its per-age means.

    Args:
        dataframe: pandas DataFrame with "Age" and "KM Travelled" columns.

    Returns:
        None. The per-age means and the verdict are written to stdout.
    """
    km_by_age = dataframe.groupby("Age")["KM Travelled"].mean()
    print(km_by_age)

    ages = km_by_age.index
    young_mean = km_by_age[ages < 30].mean()
    old_mean = km_by_age[ages >= 60].mean()

    print(
        "Young customers, on average, take longer trips compared to elderly customers."
        if young_mean > old_mean
        else "Young customers on average take shorter trips compared to elderly customers."
        if young_mean < old_mean
        else "Young and elderly customers take trips of the same average duration."
    )
# Run the age/distance comparison on the merged frame.
avg_durac(df)
"""
Hypothesis Result: Young customers, on average, take longer trips compared to elderly customers.

"""
########################################################################################################################
"""
Hypothesis 3: KM Travel and Total Fare Relationship: 
We can examine how the distance traveled in kilometers (KM Travelled) affects the total fare.
We can confirm or refute whether longer-distance journeys are priced higher.
"""

import matplotlib.pyplot as plt


# Correlation between distance and fare (Series.corr default method).
correlation = df["KM Travelled"].corr(df["Price Charged"])

# Scatter plot of fare against distance, with the coefficient in the title.
plt.scatter(df["KM Travelled"], df["Price Charged"])
plt.title(f"Correlation: {correlation:.2f}")
plt.xlabel("KM Travelled")
plt.ylabel("Price Charged")
plt.show()

# Report the direction of the relationship from the sign of the coefficient.
print(
    "There is a positive correlation between KM Travelled and Price Charged, "
    "indicating that longer trips tend to have higher fares."
    if correlation > 0
    else "There is a negative correlation between KM Travelled and Price Charged, "
    "indicating that longer trips tend to have lower fares."
    if correlation < 0
    else "There is no significant correlation between KM Travelled and Price Charged."
)
"""
Hypothesis Result:There is a positive correlation between 
KM Travelled and Price Charged, indicating that longer trips tend to have higher fares.

"""
########################################################################################################################
"""
Hypothesis 4: Age and Total Fare Relationship: You can investigate how age (Age) influences the total fare paid. 
You can compare the average amount of money paid by young and elderly customers.
"""
# Mean "Price Charged" for every age value.
average_fares = df.groupby("Age")["Price Charged"].mean()
print(average_fares)

# Young = ages below 30, elderly = ages 60 and above; each group's figure is
# the plain mean of its per-age means.
average_fare_young = average_fares[average_fares.index < 30].mean()
average_fare_old = average_fares[average_fares.index >= 60].mean()

print(
    "Young customers, on average, pay more in total fares compared to elderly customers."
    if average_fare_young > average_fare_old
    else "Young customers, on average, pay less in total fares compared to elderly customers."
    if average_fare_young < average_fare_old
    else "Young and elderly customers pay the same average total fare."
)

"""
Hypothesis Result: Young customers on average 
pay more in total fares compared to elderly customers.

"""
########################################################################################################################
"""
Hypothesis 5: Gender and Journey Duration Relationship:
We can examine how gender (Gender) affects the travel duration. 
Also, we can compare the average travel durations of male and female customers.
"""
# Mean "KM Travelled" per gender.
average_durations = df.groupby("Gender")["KM Travelled"].mean()
print(average_durations)

# Compare the two group means.
average_duration_male = average_durations["Male"]
average_duration_female = average_durations["Female"]

print(
    "Male customers, on average, have longer travel durations compared to female customers."
    if average_duration_male > average_duration_female
    else "Male customers on average have shorter travel durations compared to female customers."
    if average_duration_male < average_duration_female
    else "Male and female customers have the same average travel duration."
)

"""
Hypothesis Result: Male customers on average have shorter travel durations 
compared to female customers.
"""

########################################################################################################################

