In [None]:
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
import seaborn as sns
import xgboost as xgb
from datasist.structdata import detect_outliers
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, BaggingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import RobustScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

In [None]:
sns.set(rc={"figure.figsize": [9,9]}, font_scale=1.5)

In [None]:
# Location of the raw ride data. Set the TAXI_FARE_DATA environment variable to
# point at the CSV on another machine; the original local path is kept as the
# fallback so existing runs behave exactly as before.
DATA_PATH = os.environ.get(
    "TAXI_FARE_DATA",
    r"C:\Users\omara\OneDrive\Desktop\career\internship\cellula\week 4\final_internship_data.csv",
)
df = pd.read_csv(DATA_PATH)

In [None]:
df.head()

In [None]:
df.columns

In [None]:
df.info()

In [None]:
df.isna().sum()

In [None]:
df.describe()

In [None]:
df.duplicated().sum()

In [None]:
df.sample(3)

In [None]:
df.dropna(axis=0,inplace=True)

In [None]:
df.isna().sum()

In [None]:
df.columns

In [None]:
df.drop(["key","Driver Name","User Name","User ID"] , axis=1,inplace=True)


In [None]:
df.head()

In [None]:
def haversine (lon_1, lon_2, lat_1, lat_2, radius_km=6371):
    """Great-circle distance between two points, via the haversine formula.

    Note the argument order: both longitudes come first, then both latitudes.

    Parameters
    ----------
    lon_1, lon_2 : float or array-like
        Longitudes of the first and second point, in degrees.
    lat_1, lat_2 : float or array-like
        Latitudes of the first and second point, in degrees.
    radius_km : float, default 6371
        Sphere radius; the result is expressed in the same unit.

    Returns
    -------
    float or array-like
        Distance along the sphere's surface, element-wise for array inputs.
    """
    lon_1, lon_2, lat_1, lat_2 = map(np.radians, [lon_1, lon_2, lat_1, lat_2])
    diff_lon = lon_2 - lon_1
    diff_lat = lat_2 - lat_1

    hav = (np.sin(diff_lat / 2.0) ** 2
           + np.cos(lat_1) * np.cos(lat_2) * np.sin(diff_lon / 2.0) ** 2)
    # Round-off can push the term marginally outside [0, 1] for (near-)antipodal
    # points, which would make sqrt/arcsin return NaN; clamp it first.
    hav = np.clip(hav, 0.0, 1.0)

    return 2 * radius_km * np.arcsin(np.sqrt(hav))

In [None]:
# Trip length derived from the raw pickup/dropoff coordinates.
# Keyword arguments spell out haversine's (lon, lon, lat, lat) parameter order.
df["my distance"] = haversine(
    lon_1=df["pickup_longitude"],
    lon_2=df["dropoff_longitude"],
    lat_1=df["pickup_latitude"],
    lat_2=df["dropoff_latitude"],
)

In [None]:
df.head()

In [None]:
df.columns

In [None]:
df.drop(["dropoff_latitude","dropoff_longitude","pickup_latitude","pickup_longitude"] , axis=1,inplace=True)

In [None]:
df.head()

In [None]:
# Parse the pickup timestamp. errors='coerce' turns every value that does not
# match the given format into NaT instead of raising.
# NOTE(review): if the raw strings carry anything beyond "%Y-%m-%d %H:%M:%S"
# (e.g. a timezone suffix) they would all become NaT silently — confirm with the
# isna() check a few cells below.
df["pickup_datetime"]=pd.to_datetime(df["pickup_datetime"],format="%Y-%m-%d %H:%M:%S",errors='coerce')

In [None]:
df.info()

In [None]:
df=df.convert_dtypes()

In [None]:
df.info()

In [None]:
df.isna().sum()

In [None]:
df["min"]=df["pickup_datetime"].dt.minute

In [None]:
df.head()

In [None]:
df.drop(["pickup_datetime"],axis=1,inplace=True)

In [None]:
df.head()

In [None]:
px.pie(df,names=df["Car Condition"],title= "Car condition distribution").update_traces(textinfo="percent")

In [None]:
px.histogram(df, x="Car Condition", y="fare_amount", histfunc="avg", title="Average Fare Amount by Car Condition")

In [None]:
px.histogram(df, x="Car Condition", y=df["distance"], histfunc="avg", title="Average distance by Car Condition")

In [None]:
px.histogram(df, x="Car Condition", y=df["my distance"], histfunc="avg", title="Average distance by Car Condition")

In [None]:
df["Car Condition"].unique()

In [None]:
# One fare_amount box plot per car-condition level, side by side on a shared y-axis.
conditions = ['Very Good', 'Excellent', 'Bad', 'Good']
fig, axes = plt.subplots(1, 4, figsize=(18, 5), sharey=True)

for panel, (ax, condition) in enumerate(zip(axes, conditions)):
    rides_in_condition = df[df['Car Condition'] == condition]
    sns.boxplot(data=rides_in_condition, y='fare_amount', ax=ax, color='skyblue')
    ax.set_title(f"Car Condition: {condition}")
    ax.set_xlabel("")
    # Only the left-most panel carries the y-axis label.
    if panel == 0:
        ax.set_ylabel("Fare Amount ($)")
    else:
        ax.set_ylabel("")

plt.tight_layout()
plt.suptitle("Fare Amount Distribution by Car Condition", fontsize=16, y=1.05)
plt.show()


In [None]:
px.histogram(df, x="Car Condition", y=df["passenger_count"], histfunc="avg", title="Average passenger count by Car Condition")

In [None]:
df["Weather"].unique()

In [None]:
px.pie(df,names=df["Weather"],title= "Weather condition distribution").update_traces(textinfo="percent")

In [None]:
plt.figure(figsize=(12,6))
sns.countplot(data=df, x="Car Condition", hue="Weather")
plt.title("Weather Distribution by Car Condition")
plt.ylabel("Number of Rides")
plt.xticks(rotation=30)
plt.show()

In [None]:
plt.figure(figsize=(14,6))
sns.boxplot(data=df, x="Car Condition", y="fare_amount", hue="Weather")
plt.title("Fare Amount Distribution by Car Condition and Weather")
plt.ylabel("Fare Amount")
plt.xticks(rotation=15)
plt.legend(title="Weather", bbox_to_anchor=(1.05, 1), loc="upper left")
plt.show()

In [None]:
df["Traffic Condition"].unique()

In [None]:
px.pie(df,names=df["Traffic Condition"],title= "Traffic condition distribution").update_traces(textinfo="percent")

In [None]:
plt.figure(figsize=(12,6))
sns.countplot(data=df, x="Car Condition", hue="Traffic Condition")
plt.title("Traffic Condition Distribution by Car Condition")
plt.ylabel("Number of Rides")
plt.xticks(rotation=30)
plt.show()

In [None]:
plt.figure(figsize=(14,6))
sns.boxplot(data=df, x="Car Condition", y="fare_amount", hue=df["Traffic Condition"])
plt.title("Fare Amount Distribution by Car Condition and Traffic Condition")
plt.ylabel("Fare Amount")
plt.xticks(rotation=15)
plt.legend(title="Traffic Condition", bbox_to_anchor=(1.05, 1), loc="upper left")
plt.show()

In [None]:
df.head()

In [None]:
px.box(df, x="Weather", y="fare_amount", color="Car Condition",title="Fare Amount Distribution by Weather and Car Condition")


In [None]:
px.box(df, x=df["Traffic Condition"], y="fare_amount", color="Car Condition",title="Fare Amount Distribution by Traffic Condition and Car Condition")


In [None]:
df.head()

In [None]:
sns.scatterplot(df,x=df["passenger_count"],y=df["fare_amount"])
plt.title("Passenger Count By Fare Amount")

In [None]:
sns.scatterplot(df,x=df["distance"],y=df["fare_amount"])
plt.title("distance By Fare Amount")

In [None]:
sns.scatterplot(df,x=df["my distance"],y=df["fare_amount"])
plt.title("my distance By Fare Amount")

In [None]:
df[df["fare_amount"]<=0].count()

In [None]:
df.drop(df[df["fare_amount"]<=0].index,axis=0,inplace=True)

In [None]:
df[df["distance"]<=0]

In [None]:
df.info()

In [None]:
df.drop(df[df["distance"]<=0].index,axis=0,inplace=True)

In [None]:
df.info()

In [None]:
px.scatter(df,x=df["distance"],y=df["fare_amount"],title="Fare Amount By Distance ")

In [None]:
sns.scatterplot(df,x=df["distance"],y=df["fare_amount"])
plt.title("Fare Amount By Distance ")

In [None]:
sns.scatterplot(df,x=df["my distance"],y=df["fare_amount"])
plt.title("Fare Amount By Distance ")

In [None]:
df[df["my distance"]<=0].count()

In [None]:
plt.figure(figsize=(12, 8))
sns.heatmap(df.corr(numeric_only=True), annot=True, fmt=".1f", cmap="coolwarm")
plt.title("Correlation Heatmap")
plt.show()


In [None]:
px.box(df,x=df["distance"])

In [None]:
px.box(df,x=df["my distance"])

In [None]:
sns.boxplot(df,x=df["fare_amount"])

In [None]:
df[(df["fare_amount"]>60) & (df["distance"]<1) ]

In [None]:
df.describe()

In [None]:
df[df["fare_amount"]==df["fare_amount"].min()]

In [None]:
sns.histplot(df,x=df["fare_amount"])

In [None]:
df[df["fare_amount"]==df["fare_amount"].min()]

In [None]:
df.drop(df[df["fare_amount"]==df["fare_amount"].min()].index, axis=0,inplace=True)

In [None]:
df[df["fare_amount"]==df["fare_amount"].min()]

In [None]:
df[df["distance"]==df["distance"].max()]

In [None]:
# NOTE(review): this is the same statement as an earlier cell. Because the
# previous minimum-fare rows are already gone, it removes the rows at the *next*
# lowest fare value. The cell just before this one inspects the maximum distance —
# confirm whether that was the intended target here.
df.drop(df[df["fare_amount"]==df["fare_amount"].min()].index, axis=0,inplace=True)

In [None]:
df[df["fare_amount"]==df["fare_amount"].min()]

In [None]:
df.groupby("year")["fare_amount"].mean()


In [None]:
sns.barplot(df,x=df["year"],y=df["fare_amount"])
plt.title("Year by Average Fare Amount")

In [None]:
df.columns

In [None]:
df.drop(["my distance"] , axis=1,inplace=True)

In [None]:
df.sample(3)

In [None]:
df[df["distance"]>50]

In [None]:
df[df["distance"] > 50]["fare_amount"].describe()

In [None]:
df.drop(df[df["distance"] > 50].index,axis=0,inplace=True)

In [None]:
df.describe()

In [None]:
df[df["distance"]<0.1]

In [None]:
df[df["distance"] <0.1]["fare_amount"].describe()

In [None]:
df.drop(df[df["distance"]<0.1].index,axis=0,inplace=True)

In [None]:
df[["distance"]].describe()

In [None]:
df.info()

In [None]:
df[df["fare_amount"]>100]["distance"].describe()

In [None]:
df[(df["fare_amount"]>100 ) & ( df["distance"]<10)]

In [None]:
df.drop(df[(df["fare_amount"]>100 ) & ( df["distance"]<10)].index,axis=0,inplace=True)

In [None]:
df[df["fare_amount"]>100]["distance"].describe()

In [None]:
df[(df["fare_amount"]<10 ) & ( df["distance"]>20)]

In [None]:
df[(df["fare_amount"]<10 ) & ( df["distance"]>20)].describe()

In [None]:
df[df["fare_amount"]<10]["distance"].describe()

In [None]:
df.drop(df[(df["fare_amount"]<10 ) & ( df["distance"]>20)].index,axis=0,inplace=True)

In [None]:
df[df["fare_amount"]<10]["distance"].describe()

In [None]:
sns.scatterplot(df,x=df["distance"],y=df["fare_amount"])
plt.title("Fare Amount By Distance ")

In [None]:
df.head()

In [None]:
df.drop(["nyc_dist","jfk_dist","lga_dist","ewr_dist","bearing","min"],axis=1,inplace=True)

In [None]:
df.head()

In [None]:
df["fare_per_km"]=df["fare_amount"]/df["distance"]

In [None]:
df.head()

In [None]:
df.groupby("year")[["fare_per_km"]].mean()

In [None]:
df[df['fare_per_km']>20]

In [None]:
df.drop(df[df['fare_per_km']>20].index,axis=0,inplace=True)

In [None]:
df["fare_per_km"].describe()

In [None]:
df[df['fare_per_km']<1]

In [None]:
df.drop(df[df['fare_per_km']<1].index,axis=0,inplace=True)

In [None]:
df["fare_per_km"].describe()

In [None]:
df[df['fare_per_km']>15]

In [None]:
df.drop(df[df['fare_per_km']>15].index,axis=0,inplace=True)

In [None]:
df["fare_per_km"].describe()

In [None]:
df.head()

In [None]:
sns.scatterplot(df,x=df["distance"],y=df["fare_per_km"])

In [None]:
df["sol_dist"].describe()

In [None]:
df[df["sol_dist"]>50]

In [None]:
df.drop(df[df["sol_dist"]>50].index,axis=0,inplace=True)

In [None]:
df["sol_dist"].describe()

In [None]:
# Row count per value of the hour column.
# hue repeats x (legend hidden) only so the palette is applied per bar.
sns.countplot(data=df, x="hour", hue="hour", palette="viridis", legend=False)
# Title fixed: each row is a ride, matching "Number of Rides" on earlier plots.
plt.title("Number of Rides by Hour")
plt.xlabel("Hour of the Day")
plt.ylabel("Count")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
sns.barplot(data=df, x="hour", y="fare_amount", estimator="mean", errorbar=None,palette="viridis",hue="hour",legend=False)

plt.title("Average Fare Amount by Hour")
plt.xlabel("Hour of the Day")
plt.ylabel("Average Fare Amount")
plt.xticks(rotation=45)
plt.show()

In [None]:
# Row count per value of the day column.
# hue repeats x (legend hidden) only so the palette is applied per bar.
sns.countplot(data=df, x="day", hue="day", palette="viridis", legend=False)
# Title fixed: each row is a ride, matching "Number of Rides" on earlier plots.
plt.title("Number of Rides by Day")
plt.xlabel("Days")
plt.ylabel("Count")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
sns.barplot(data=df, x="day", y="fare_amount", estimator="mean", errorbar=None,palette="viridis",hue="day",legend=False)

plt.title("Average Fare Amount by day")
plt.xlabel("Days")
plt.ylabel("Average Fare Amount")
plt.xticks(rotation=45)
plt.show()

In [None]:
# Row count per value of the month column.
# hue repeats x (legend hidden) only so the palette is applied per bar.
sns.countplot(data=df, x="month", hue="month", palette="viridis", legend=False)
# Title fixed: each row is a ride, matching "Number of Rides" on earlier plots.
plt.title("Number of Rides by Month")
plt.xlabel("Month")
plt.ylabel("Count")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
sns.barplot(data=df, x="month", y="fare_amount", estimator="mean", errorbar=None,palette="viridis",hue="month",legend=False)

plt.title("Average Fare Amount by month")
plt.xlabel("month")
plt.ylabel("Average Fare Amount")
plt.xticks(rotation=45)
plt.show()

In [None]:
# Row count per value of the year column.
# hue repeats x (legend hidden) only so the palette is applied per bar.
sns.countplot(data=df, x="year", hue="year", palette="viridis", legend=False)
# Title fixed: each row is a ride, matching "Number of Rides" on earlier plots.
plt.title("Number of Rides by Year")
plt.xlabel("Year")  # stray trailing space removed from the label
plt.ylabel("Count")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
sns.barplot(data=df, x="year", y="fare_amount", estimator="mean", errorbar=None,palette="viridis",hue="year",legend=False)

plt.title("Average Fare Amount by year")
plt.xlabel("year")
plt.ylabel("Average Fare Amount")
plt.xticks(rotation=45)
plt.show()

In [None]:
df.head()

In [None]:
df["Car Condition"].unique()

In [None]:
carmap={
    "Bad":0,
    "Good":1,
    "Very Good":2,
    "Excellent":3
}
df["Car Condition"]=df["Car Condition"].map(carmap)

In [None]:
df.columns=df.columns.str.strip().str.lower().str.replace(" ","_")

In [None]:
df.columns

In [None]:
df = pd.get_dummies(df, columns=["weather", "traffic_condition"], drop_first=True)

In [None]:
df.head()

In [None]:
# Feature matrix: every column except the target and fare_per_km. The latter was
# computed as fare_amount / distance, so keeping it would leak the target.
# drop() is given the column labels directly rather than a sub-DataFrame, which
# only worked because iterating a DataFrame happens to yield its column names.
x = df.drop(columns=["fare_amount", "fare_per_km"])

# Regression target.
y = df["fare_amount"]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)


In [None]:
scaler=RobustScaler()

In [None]:
# The split results are derived from `x`; take explicit copies so the column
# assignments below write to independent frames and cannot be flagged as
# chained assignment (SettingWithCopyWarning).
X_train = X_train.copy()
X_test = X_test.copy()

scaled_columns = ["sol_dist", "distance"]

# Fit the scaler on the training rows only, then apply the same fitted
# transformation to the test rows so no test statistics leak into training.
X_train[scaled_columns] = scaler.fit_transform(X_train[scaled_columns])
X_test[scaled_columns] = scaler.transform(X_test[scaled_columns])

In [None]:
RANDOM_STATE = 42  # same seed that is passed to train_test_split

# Candidate regressors. Every estimator with a stochastic component is seeded so
# the comparison prints the same numbers on each Restart & Run All.
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(random_state=RANDOM_STATE),
    "Gradient Boosting": GradientBoostingRegressor(random_state=RANDOM_STATE),
    "AdaBoost": AdaBoostRegressor(random_state=RANDOM_STATE),
    "Bagging": BaggingRegressor(random_state=RANDOM_STATE),
    "XGBoost": xgb.XGBRegressor(random_state=RANDOM_STATE),
    "KNN": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(random_state=RANDOM_STATE),
}


def _regression_scores(y_true, y_pred):
    """Return (R2, RMSE, MSE, MAE) for one set of predictions."""
    mse_value = mean_squared_error(y_true, y_pred)
    return (
        r2_score(y_true, y_pred),
        np.sqrt(mse_value),
        mse_value,
        mean_absolute_error(y_true, y_pred),
    )


for name, model in models.items():
    model.fit(X_train, y_train)

    print(f"Model: {name}")
    # "Test " carries a trailing space so both report lines stay column-aligned.
    for split_label, split_x, split_y in (("Train", X_train, y_train), ("Test ", X_test, y_test)):
        score_r2, score_rmse, score_mse, score_mae = _regression_scores(split_y, model.predict(split_x))
        print(f"{split_label} - R2: {score_r2:.4f} | RMSE: {score_rmse:.4f} | MSE: {score_mse:.4f} | MAE: {score_mae:.4f}")
    print("-" * 50)

In [None]:
# Hyper-parameter candidates that the randomized search samples from.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# Both the estimator and the search are seeded: without random_state the 10
# sampled configurations (and therefore the "best" parameters) change per run.
xgb_model = XGBRegressor(random_state=42)
search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_grid,
    n_iter=10,
    scoring='r2',
    cv=3,
    verbose=1,
    n_jobs=-1,
    random_state=42,
)

search.fit(X_train, y_train)
print("Best parameters:", search.best_params_)
print("Best CV score (R2):", search.best_score_)


In [None]:
# The search was created with the default refit=True, so best_estimator_ has
# already been refitted on the full training set with the winning parameters.
# Fitting it again only repeats that work, so the model is used as returned.
best_model = search.best_estimator_

# Held-out evaluation of the tuned model.
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f" Mean Squared Error (MSE): {mse:.4f}")
print(f" Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f" Mean Absolute Error (MAE): {mae:.4f}")
print(f" R² Score: {r2:.4f}")


In [None]:
# Persist the tuned model and the fitted scaler; app.py loads both files.
# Context managers make sure each file is flushed and closed, which the bare
# open(...) calls never did.
with open("model.pkl", "wb") as model_file:
    pickle.dump(best_model, model_file)

with open("scaler.pkl", "wb") as scaler_file:
    pickle.dump(scaler, scaler_file)


In [None]:
%%writefile app.py
import streamlit as st
import pandas as pd
import pickle

st.title("Taxi Fare Prediction")
st.write("This app predicts the taxi fare amount based on various trip and environment details.")
st.image("https://oceansidetaxi.ca/wp-content/uploads/uber-estimate.png", width=700)

# --- User Inputs ---
car_condition = st.selectbox("Car Condition", ["Bad", "Good", "Very Good", "Excellent"])
car_map = {"Bad": 0, "Good": 1, "Very Good": 2, "Excellent": 3}
car_condition_val = car_map[car_condition]

passenger_count = st.number_input("Number of Passengers", min_value=0, max_value=6, value=1)

hour = st.slider("Hour of the Day", 0, 23, 12)
day = st.slider("Day of Month", 1, 31, 15)
month = st.selectbox("Month", list(range(1, 13)))
weekday_name = st.selectbox("Day of the Week", [
    "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"
])
weekday_map = {
    "Monday": 0,
    "Tuesday": 1,
    "Wednesday": 2,
    "Thursday": 3,
    "Friday": 4,
    "Saturday": 5,
    "Sunday": 6
}
weekday = weekday_map[weekday_name]
year = st.selectbox("Year", [2009,2010,2011,2012,2013,2014,2015])

sol_dist = st.number_input("Distance from statue of liberty (km)", min_value=0.0, max_value=50.0, step=0.1)
distance = st.number_input("Trip Distance (km)", min_value=0.0, max_value=50.0, step=0.1)

# Weather (cloudy is dropped in one-hot, so not shown)
weather = st.selectbox("Weather", ["Cloudy", "Rainy", "Stormy", "Sunny", "Windy"])
weather_features = ['weather_rainy', 'weather_stormy', 'weather_sunny', 'weather_windy']
weather_encoding = [1 if weather.lower() in w else 0 for w in weather_features]

# Traffic (Congested Traffic is dropped)
traffic = st.selectbox("Traffic Condition", ["Congested Traffic", "Dense Traffic", "Flow Traffic"])
traffic_features = ['traffic_condition_Dense Traffic', 'traffic_condition_Flow Traffic']
traffic_encoding = [1 if traffic in t else 0 for t in traffic_features]

# --- Prepare Input ---
input_data = [[
    car_condition_val, passenger_count, hour, day, month, weekday,
    year, sol_dist, distance
] + weather_encoding + traffic_encoding]

columns = [
    'car_condition', 'passenger_count', 'hour', 'day', 'month', 'weekday',
    'year', 'sol_dist', 'distance',
    'weather_rainy', 'weather_stormy', 'weather_sunny', 'weather_windy',
    'traffic_condition_Dense Traffic', 'traffic_condition_Flow Traffic'
]

input_df = pd.DataFrame(input_data, columns=columns)

# --- Load Scaler and Model ---
scaler = pickle.load(open("scaler.pkl", "rb"))
model = pickle.load(open("model.pkl", "rb"))

# --- Scale sol_dist and distance only ---
input_df[["sol_dist", "distance"]] = scaler.transform(input_df[["sol_dist", "distance"]])

# --- Predict ---
if st.button("Predict Fare"):
    predicted_fare = model.predict(input_df)[0]
    st.write("**Predicted Fare Amount:**")
    st.success(f"${predicted_fare:.2f}")
    st.write("---")
    st.write("**Input Data Used for Prediction:**")
    st.dataframe(input_df)


In [None]:
!streamlit run app.py