In [None]:
import pandas as pd
from pathlib import Path

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # ploting library with python
from sklearn.linear_model import LinearRegression # Library for linear regression model
from sklearn.model_selection import train_test_split

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the '/kaggle/input' tree and print the full path of every file found,
# one path per line, in the order os.walk yields them.
input_file_paths = (
    os.path.join(folder, entry)
    for folder, _subfolders, entries in os.walk('/kaggle/input')
    for entry in entries
)
for input_file_path in input_file_paths:
    print(input_file_path)


In [None]:
# Read the training data into a pandas DataFrame. At most 10,000,000 rows are
# loaded, and `pickup_datetime` is parsed as a datetime column while reading.
train_csv_path = '../input/new-york-city-taxi-fare-prediction/train.csv'
train_data_set = pd.read_csv(
    train_csv_path,
    parse_dates=["pickup_datetime"],
    nrows=10_000_000,
)

# Display the first 5 rows of the loaded data.
train_data_set.head(5)


In [None]:
# Print the dtype of every column, then show the summary statistics as the
# cell's displayed output.
column_dtypes = train_data_set.dtypes
print(column_dtypes)
train_data_set.describe()


In [None]:
# Keep only the rows whose fare is at least 0.1, report how many rows were
# dropped, and show the summary statistics of what remains.
old_len = len(train_data_set)
has_valid_fare = train_data_set['fare_amount'] >= 0.1
train_data_set = train_data_set.loc[has_valid_fare]
new_len = len(train_data_set)
print(f"Removed {(old_len-new_len)} entities from the dataset")
train_data_set.describe()


In [None]:
# Drop every row that has a missing value in any column and report how many
# rows were removed.
old_len = len(train_data_set)
row_is_complete = train_data_set.notna().all(axis=1)
train_data_set = train_data_set[row_is_complete]
new_len = len(train_data_set)
print(f"Removed {(old_len-new_len)} entities from the dataset")


In [None]:
# Histogram of the fare amounts (100 bins). Labels are set on the Axes object
# returned by `hist`; the trailing semicolon suppresses the text repr.
fare_axes = train_data_set['fare_amount'].hist(bins=100, figsize=(14, 3))
fare_axes.set_xlabel('fare $USD')
fare_axes.set_title('Histogram');


In [None]:
# Helper that selects all rows lying within the given bounds.
def select_within_boundingbox(df, box):
    """Return a boolean mask of rows whose pickup AND dropoff lie inside ``box``.

    Parameters
    ----------
    df : DataFrame with columns ``pickup_longitude``, ``pickup_latitude``,
        ``dropoff_longitude`` and ``dropoff_latitude``.
    box : indexable read as ``(min_longitude, max_longitude, min_latitude,
        max_latitude)``; only positions 0-3 are used.

    Returns
    -------
    Boolean Series aligned with ``df``; both ends of each range are inclusive.
    """
    min_lon, max_lon = box[0], box[1]
    min_lat, max_lat = box[2], box[3]

    # `between` is inclusive on both ends by default, matching >= / <=.
    pickup_inside = (
        df.pickup_longitude.between(min_lon, max_lon)
        & df.pickup_latitude.between(min_lat, max_lat)
    )
    dropoff_inside = (
        df.dropoff_longitude.between(min_lon, max_lon)
        & df.dropoff_latitude.between(min_lat, max_lat)
    )
    return pickup_inside & dropoff_inside

# Bounds of New York, in the order select_within_boundingbox reads them:
# (min longitude, max longitude, min latitude, max latitude).
new_york_box = (-74.763379, -72.856164, 40.502009, 41.915509)

# Discard trips whose pickup or dropoff falls outside the box and report how
# many rows were removed.
old_len = len(train_data_set)
inside_new_york = select_within_boundingbox(train_data_set, new_york_box)
train_data_set = train_data_set.loc[inside_new_york]
new_len = len(train_data_set)
print(f"Removed {(old_len-new_len)} entities from the dataset")


In [None]:
def distance_on_the_sphere(lat1, lon1, lat2, lon2):
    """Haversine great-circle distance between two points, in kilometres.

    All four arguments are in degrees. They are only passed through numpy
    ufuncs and arithmetic, so scalars, numpy arrays and pandas Series can be
    mixed and the result follows the usual broadcasting/alignment rules.

    Returns ``earth_radius * c`` where ``c`` is the central angle in radians
    and ``earth_radius`` is 6371 km.
    """
    earth_radius = 6371  # Earth's radius in kilometres

    # The trigonometric functions need the angles in radians.
    phi1, phi2 = np.radians(lat1), np.radians(lat2)
    sin_half_dphi = np.sin(np.radians(lat2 - lat1) / 2.0)
    sin_half_dlambda = np.sin(np.radians(lon2 - lon1) / 2.0)

    # Haversine term: sin^2(dphi/2) + cos(phi1) * cos(phi2) * sin^2(dlambda/2)
    a = (sin_half_dphi * sin_half_dphi
         + np.cos(phi1) * np.cos(phi2) * sin_half_dlambda * sin_half_dlambda)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return earth_radius * c

# Add the `distance` feature to our data: the distance_on_the_sphere result
# between each trip's pickup point and its dropoff point.
train_data_set['distance'] = distance_on_the_sphere(
    lat1=train_data_set['pickup_latitude'],
    lon1=train_data_set['pickup_longitude'],
    lat2=train_data_set['dropoff_latitude'],
    lon2=train_data_set['dropoff_longitude'],
)

train_data_set.head(5)


In [None]:
# Derive calendar features from the pickup timestamp.
# The to_datetime call makes sure the column has a datetime dtype before the
# `.dt` accessor is used.
train_data_set['pickup_datetime'] = pd.to_datetime(train_data_set['pickup_datetime'])
pickup_time = train_data_set['pickup_datetime'].dt
train_data_set['hour'] = pickup_time.hour
train_data_set['year'] = pickup_time.year
train_data_set['day_of_week'] = pickup_time.dayofweek

# Rush-hour flag: 1 when the pickup hour is in 7-10 or 16-19 (both ranges
# inclusive), otherwise 0. Built as a vectorized mask rather than a per-row
# Python lambda, which is much slower on a frame with millions of rows.
pickup_hour = train_data_set['hour']
in_rush_hour = pickup_hour.between(7, 10) | pickup_hour.between(16, 19)
train_data_set['is_rush_hour'] = in_rush_hour.astype('int64')

train_data_set.head(5)


In [None]:
# Downtown reference point, stored as (longitude, latitude).
nyc_down_town = (-74.0063889, 40.7141667)

# Distance from the downtown point to each trip's pickup location.
downtown_longitude, downtown_latitude = nyc_down_town
train_data_set['distance_to_downtown'] = distance_on_the_sphere(
    downtown_latitude,
    downtown_longitude,
    train_data_set['pickup_latitude'],
    train_data_set['pickup_longitude'],
)

train_data_set.head(5)


In [None]:
# Rows used for training: at least one passenger AND a pickup less than 15 km
# from downtown (distance_to_downtown comes from distance_on_the_sphere, which
# returns kilometres).
# Each comparison must be parenthesised: `&` binds tighter than `!=` and `<`,
# so `a != 0 & (b)` is parsed as `a != (0 & b)` and the distance condition
# would be silently ignored.
idx = (train_data_set.passenger_count != 0) & (train_data_set.distance_to_downtown < 15)

features = ['hour', 'year', 'distance', 'passenger_count']
target = 'fare_amount'

# A single .loc selection of rows and columns avoids chained indexing.
X = train_data_set.loc[idx, features].values
y = train_data_set.loc[idx, target].values

# Hold out 25% of the rows; the fixed seed makes the split (and therefore the
# fitted coefficients) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

linear_model = LinearRegression()
linear_model.fit(X_train, y_train)


In [None]:
# Load the test set and add the derived columns (distance, distance to
# downtown, pickup hour and year) using the same helper as the training data.
# `assign` evaluates its keyword arguments in order, so `hour` and `year` see
# the already-converted `pickup_datetime` column.
test_data_set = pd.read_csv('../input/new-york-city-taxi-fare-prediction/test.csv')
test_data_set = test_data_set.assign(
    distance=lambda frame: distance_on_the_sphere(
        frame['pickup_latitude'], frame['pickup_longitude'],
        frame['dropoff_latitude'], frame['dropoff_longitude'],
    ),
    distance_to_downtown=lambda frame: distance_on_the_sphere(
        nyc_down_town[1], nyc_down_town[0],
        frame.pickup_latitude, frame.pickup_longitude,
    ),
    pickup_datetime=lambda frame: pd.to_datetime(frame['pickup_datetime']),
    hour=lambda frame: frame['pickup_datetime'].dt.hour,
    year=lambda frame: frame['pickup_datetime'].dt.year,
)


In [None]:
# NOTE(review): `filename` is assigned but not read in this cell; the
# predictions are written to 'submission.csv' below.
filename = './output/baseline_linear'

# Test-set feature matrix, with columns in the order given by `features`.
XTEST = test_data_set[features].values
y_pred_final = linear_model.predict(XTEST)

# Two-column frame (key, fare_amount) written without the index column.
submission = test_data_set[['key']].assign(fare_amount=y_pred_final)
submission.to_csv('submission.csv', index = False)
