In [None]:
import pandas as pd
from pathlib import Path

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Read the competition CSVs; the training read is capped at the first 10M rows.
train_df = pd.read_csv(
    '/kaggle/input/new-york-city-taxi-fare-prediction/train.csv',
    nrows=10_000_000,
)
test_df = pd.read_csv('/kaggle/input/new-york-city-taxi-fare-prediction/test.csv')


In [None]:
# Preview the first rows of the raw training frame.
train_df.head()


In [None]:
# Preview the first rows of the raw test frame.
test_df.head()


In [None]:
# Show the dtype pandas inferred for each training column.
train_df.dtypes


In [None]:
# Summary statistics of the numeric training columns before any cleaning.
train_df.describe()


In [None]:
# Keep only rows with a strictly positive fare (rows whose fare is NaN fail the
# comparison and are dropped as well).  The explicit .copy() makes the result
# an independent frame, so the in-place column assignments performed by later
# cells do not raise pandas' SettingWithCopyWarning.
num_rows = len(train_df)
train_df = train_df[train_df['fare_amount'] > 0].copy()
print(f'Drop {num_rows - len(train_df)} rows')


In [None]:
def change_outliers_by_range(df, column_name, min_range, max_range):
    """Overwrite out-of-range values of one column with the in-range mean.

    The frame is modified in place. A value counts as in range when it lies
    in the closed interval ``[min_range, max_range]``; missing values are
    never in range, so they are replaced as well.

    Args:
        df: DataFrame to clean (mutated in place).
        column_name: Name of the numeric column to check.
        min_range: Smallest accepted value (inclusive).
        max_range: Largest accepted value (inclusive).

    Returns:
        int: Number of rows whose value was replaced. This is 0 when every
        value is already in range, and also when the column has no in-range
        value at all (there is then no mean to substitute, so the column is
        left untouched instead of being filled with NaN).
    """
    column = df[column_name]
    in_range = column.between(min_range, max_range)
    out_of_range = ~in_range
    changed_rows = int(out_of_range.sum())

    # Nothing to replace, or nothing valid to compute a replacement from
    # (covers the empty frame, where the mean would be NaN).
    if changed_rows == 0 or not in_range.any():
        return 0

    replacement = column[in_range].mean()
    # Round for every integer dtype (int32, uint8, ... not only int64) so the
    # assignment below does not upcast the column to float.
    if pd.api.types.is_integer_dtype(column.dtype):
        replacement = int(round(replacement))

    df.loc[out_of_range, column_name] = replacement
    return changed_rows
    

def change_outliers(df):
    """Clean the coordinate and passenger-count columns of ``df`` in place.

    Each column is passed to ``change_outliers_by_range`` with its accepted
    interval, and the number of replaced rows is printed per column.
    """
    # (column, lowest accepted value, highest accepted value, report suffix)
    range_rules = (
        ('pickup_latitude', 40.5, 41.0, "rows by pickup lat"),
        ('dropoff_latitude', 40.5, 41.0, "rows by dropoff lat"),
        ('pickup_longitude', -74.3, -73.6, "rows by pickup long"),
        ('dropoff_longitude', -74.3, -73.6, "rows by dropoff long"),
        ('passenger_count', 1, 8, "rows by passenger cnt"),
    )
    for column, lowest, highest, suffix in range_rules:
        replaced = change_outliers_by_range(df, column, lowest, highest)
        print("Change", replaced, suffix)
    

# Clean both frames in place, printing a labelled report for each one.
for heading, frame in (
    ("Training data outliers: ", train_df),
    ("\nTest data outliers: ", test_df),
):
    print(heading)
    change_outliers(frame)


In [None]:
# Summary statistics again, now that out-of-range values have been replaced.
train_df.describe()


In [None]:
# Number of missing values remaining in each training column.
train_df.isnull().sum()


In [None]:
# Data preprocessing
def preprocess_data(df):
    """Add airport, distance and calendar feature columns to ``df`` in place.

    Reads ``pickup_latitude``, ``pickup_longitude``, ``dropoff_latitude``,
    ``dropoff_longitude`` and ``pickup_datetime``. The ``pickup_datetime``
    column is overwritten with its parsed datetime form, and these columns
    are added: ``near_airport``, ``manhattan_distance``, ``pickup_year``,
    ``pickup_month``, ``pickup_hour``, ``pickup_day``, ``is_weekend`` and
    ``is_holiday``. Nothing is returned.
    """
    # Reference points for the two pickup bounding boxes.
    # NOTE(review): the first point is presumably JFK - confirm.
    airport_lat, airport_lon = 40.644600, -73.779700
    la_guardia_lat, la_guardia_lon = 40.7733, -73.8718

    pickup_lat = df["pickup_latitude"]
    pickup_lon = df["pickup_longitude"]

    in_first_box = (
        pickup_lat.between(airport_lat - 0.005, airport_lat + 0.005)
        & pickup_lon.between(airport_lon - 0.005, airport_lon + 0.005)
    )
    # The LaGuardia latitude window is asymmetric: 0.003 below, 0.002 above.
    in_la_guardia_box = (
        pickup_lat.between(la_guardia_lat - 0.003, la_guardia_lat + 0.002)
        & pickup_lon.between(la_guardia_lon - 0.005, la_guardia_lon + 0.005)
    )
    df['near_airport'] = (in_first_box | in_la_guardia_box).astype(int)

    # L1 distance between pickup and dropoff, measured in coordinate degrees.
    df['manhattan_distance'] = (
        (pickup_lon - df['dropoff_longitude']).abs()
        + (pickup_lat - df['dropoff_latitude']).abs()
    )

    pickup_time = pd.to_datetime(df['pickup_datetime'])
    df['pickup_datetime'] = pickup_time
    df['pickup_year'] = pickup_time.dt.year
    df['pickup_month'] = pickup_time.dt.month
    df['pickup_hour'] = pickup_time.dt.hour
    df['pickup_day'] = pickup_time.dt.dayofweek  # Monday=0 ... Sunday=6

    # Saturday (5) and Sunday (6).
    df['is_weekend'] = df['pickup_day'].between(5, 6).astype(int)

    # Fixed-date holidays: Dec 25, Dec 26, Dec 31, Jan 1 and Jul 4.
    month = df['pickup_month']
    day_of_month = pickup_time.dt.day
    holiday_mask = (month == 12) & day_of_month.isin([25, 26, 31])
    holiday_mask |= (month == 1) & (day_of_month == 1)
    holiday_mask |= (month == 7) & (day_of_month == 4)
    df['is_holiday'] = holiday_mask.astype(int)
    

# Derive the same feature columns on both frames (each is modified in place).
for frame in (train_df, test_df):
    preprocess_data(frame)


In [None]:
# Preview the training frame with the derived feature columns added.
train_df.head()


In [None]:
# near_airport is a 0/1 flag, so its sum is the number of flagged training rows.
train_df["near_airport"].sum()


In [None]:
# is_holiday is a 0/1 flag, so its sum is the number of holiday training rows.
train_df["is_holiday"].sum()


In [None]:
# is_weekend is a 0/1 flag, so its sum is the number of weekend training rows.
train_df["is_weekend"].sum()


In [None]:
# Mean fare for each pickup hour, drawn as a bar chart.
grouped = train_df.groupby('pickup_hour')['fare_amount'].mean().reset_index()

fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(grouped['pickup_hour'], grouped['fare_amount'])
ax.set_title('Average Fare Amount by Hour of Day')
ax.set_xlabel('Hour of Day')
ax.set_ylabel('Average Fare Amount')
ax.set_xticks(grouped['pickup_hour'])
plt.show()


In [None]:
# Mean fare for pickups outside (0) and inside (1) the airport boxes.
grouped = train_df.groupby('near_airport')['fare_amount'].mean().reset_index()

fig, ax = plt.subplots(figsize=(2, 5))
ax.bar(grouped['near_airport'], grouped['fare_amount'])
ax.set_title('Average Fare Amount by Near Airport')
ax.set_xlabel('Is Near of Airport')
ax.set_ylabel('Average Fare Amount')
ax.set_xticks(grouped['near_airport'])
plt.show()


In [None]:
# Mean fare for each pickup year.
grouped = train_df.groupby('pickup_year')['fare_amount'].mean().reset_index()

fig, ax = plt.subplots(figsize=(6, 6))
ax.bar(grouped['pickup_year'], grouped['fare_amount'])
ax.set_title('Average Fare Amount by Year')
ax.set_xlabel('Year')
ax.set_ylabel('Average Fare Amount')
ax.set_xticks(grouped['pickup_year'])
plt.show()


In [None]:
# Mean fare for each pickup month.
grouped = train_df.groupby('pickup_month')['fare_amount'].mean().reset_index()

fig, ax = plt.subplots(figsize=(6, 6))
ax.bar(grouped['pickup_month'], grouped['fare_amount'])
ax.set_title('Average Fare Amount by Month')
ax.set_xlabel('Month')
ax.set_ylabel('Average Fare Amount')
ax.set_xticks(grouped['pickup_month'])
plt.show()


In [None]:
# Mean fare for each passenger count.
grouped = train_df.groupby('passenger_count')['fare_amount'].mean().reset_index()

fig, ax = plt.subplots(figsize=(6, 6))
ax.bar(grouped['passenger_count'], grouped['fare_amount'])
ax.set_title('Average Fare Amount by Passenger Count')
ax.set_xlabel('Passenger Count')
ax.set_ylabel('Average Fare Amount')
ax.set_xticks(grouped['passenger_count'])
plt.show()


In [None]:
# Mean fare for each day of the week (pickup_day: Monday=0 ... Sunday=6).
grouped = train_df.groupby('pickup_day')['fare_amount'].mean().reset_index()

fig, ax = plt.subplots(figsize=(5, 4))
ax.bar(grouped['pickup_day'], grouped['fare_amount'])
ax.set_title('Average Fare Amount by Day')
ax.set_xlabel('Day')
ax.set_ylabel('Average Fare Amount')
ax.set_xticks(grouped['pickup_day'])
plt.show()


In [None]:
# Mean fare on ordinary days (0) versus the flagged holiday dates (1).
grouped = train_df.groupby('is_holiday')['fare_amount'].mean().reset_index()

fig, ax = plt.subplots(figsize=(2, 4))
ax.bar(grouped['is_holiday'], grouped['fare_amount'])
ax.set_title('Average Fare Amount by Holiday')
ax.set_xlabel('Is Holiday')
ax.set_ylabel('Average Fare Amount')
ax.set_xticks(grouped['is_holiday'])
plt.show()


In [None]:
# Model inputs, in the column order used for every feature matrix below.
# NOTE(review): pickup_day, is_weekend and is_holiday are derived earlier but
# are not listed here - confirm that leaving them out is intentional.
features = [
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
    'near_airport',
    'manhattan_distance',
    'pickup_year',
    'pickup_month',
    'pickup_hour',
    'passenger_count',
]
X = train_df[features].to_numpy()
y = train_df['fare_amount'].to_numpy()


In [None]:
# Test-set feature matrix, using the same columns in the same order as X.
X_test = test_df[features].to_numpy()


In [None]:
# Feature scaling
# scaler = StandardScaler()
# X = scaler.fit_transform(X)
# X_test = scaler.transform(X_test)


In [None]:
# Hold out 20% of the labelled rows for validation; the fixed random_state
# makes the split repeatable across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=69,
)


In [None]:
# Print the shape of the full arrays, then of the train and validation splits.
for array in (X, y, X_train, y_train, X_val, y_val):
    print(array.shape)


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Fit a linear-regression baseline on the training split.
lr_model = LinearRegression()
lr_model.fit(X=X_train, y=y_train)


In [None]:
# RMSE on the held-out validation split.
validation_predictions_lr = lr_model.predict(X_val)
validation_rmse_lr = np.sqrt(
    mean_squared_error(y_true=y_val, y_pred=validation_predictions_lr)
)
print("Validation RMSE (Linear Regression):", validation_rmse_lr)

# RMSE over every labelled row (X, y), i.e. the training AND validation rows.
train_predictions_lr = lr_model.predict(X)
train_rmse_lr = np.sqrt(
    mean_squared_error(y_true=y, y_pred=train_predictions_lr)
)
print("Training RMSE (Linear Regression):", train_rmse_lr)


In [None]:
# Score the test set with the linear model.
test_predictions_lr = lr_model.predict(X_test)

# Pair every test key with its predicted fare and write the submission file.
submission_df_lr = pd.DataFrame(
    {
        'key': test_df['key'],
        'fare_amount': test_predictions_lr,
    },
    columns=['key', 'fare_amount'],
)
submission_df_lr.to_csv('lr_submission.csv', index=False)


In [None]:
import xgboost as xgb

# Booster hyper-parameters.
xgb_params = dict(
    objective='reg:squarederror',  # regression with squared-error loss
    eval_metric='rmse',            # root mean squared error
    max_depth=20,                  # maximum depth of each tree
    subsample=0.85,                # fraction of training rows sampled per tree
    colsample_bytree=0.8,          # fraction of columns sampled per tree
    eta=0.04,                      # learning rate
    min_child_weight=3,            # minimum sum of instance weight in a child
    gamma=0.1,                     # minimum loss reduction needed to split further
    seed=42,                       # random seed for reproducibility
    tree_method='hist',            # histogram-based tree construction
    nthread=-1,                    # use all available CPU cores
)

# Wrap the arrays in DMatrix objects; only the training matrix carries labels,
# the other three are used for prediction only.
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dval = xgb.DMatrix(data=X_val)
dtest = xgb.DMatrix(data=X_test)
dallTrain = xgb.DMatrix(data=X)

# Boost for a fixed 80 rounds on the training split.
xgb_model = xgb.train(params=xgb_params, dtrain=dtrain, num_boost_round=80)


In [None]:
# RMSE on the held-out validation split.
validation_predictions_xgb = xgb_model.predict(dval)
validation_rmse_xgb = np.sqrt(
    mean_squared_error(y_true=y_val, y_pred=validation_predictions_xgb)
)
print("Validation RMSE (XGBoost):", validation_rmse_xgb)

# RMSE over every labelled row (dallTrain wraps X), i.e. the training AND
# validation rows; no training happens in this cell.
train_predictions_xgb = xgb_model.predict(dallTrain)
train_rmse_xgb = np.sqrt(
    mean_squared_error(y_true=y, y_pred=train_predictions_xgb)
)
print("Train RMSE (XGBoost):", train_rmse_xgb)


In [None]:
# # Define the model
# model = Sequential([
#     Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
#     Dropout(0.2),
#     Dense(64, activation='relu'),
#     Dropout(0.2),
#     Dense(1, activation='linear')
# ])

# # Compile the model
# model.compile(optimizer='adam', loss='mean_squared_error')

# # Train the model
# model.fit(X_train, y_train, epochs=10, batch_size=128, validation_data=(X_val, y_val))


In [None]:
# # Evaluate the model
# val_predictions = model.predict(X_val)
# val_rmse = np.sqrt(mean_squared_error(y_val, val_predictions))
# print("Validation RMSE: (Neural Network)", val_rmse)

# # Evaluate the model
# train_predictions = model.predict(X)
# train_rmse = np.sqrt(mean_squared_error(y, train_predictions))
# print("Train RMSE: (Neural Network)", train_rmse)


In [None]:
# Score the test set with the boosted model.
test_predictions_xgb = xgb_model.predict(dtest)

# Pair every test key with its predicted fare and write the submission file.
submission_df_xgb = pd.DataFrame(
    {
        'key': test_df['key'],
        'fare_amount': test_predictions_xgb,
    },
    columns=['key', 'fare_amount'],
)
submission_df_xgb.to_csv('submission.csv', index=False)
