In [1]:
import numpy as np
import pandas as pd
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import OneHotEncoder
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
# Function to calculate distance between two lat-lng points
def haversine_distance(lat1, lon1, lat2, lon2):
    # Radius of the Earth in kilometers
    R = 6371.0
    # Convert latitude and longitude from degrees to radians
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    # Difference in coordinates
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    # Haversine formula
    a = np.sin(dlat / 2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    # Distance in kilometers
    distance = R * c
    return distance

In [3]:
def get_season(month):
    if 3 <= month <= 5:
        return 'Spring'
    elif 6 <= month <= 8:
        return 'Summer'
    elif 9 <= month <= 11:
        return 'Fall'
    else:
        return 'Winter'

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
filepath = '/content/drive/My Drive/merged_rides_with_elevation.csv'

In [6]:
df = pd.read_csv(filepath)

In [7]:
df.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,Start_Altitude,End_Altitude,end_altitude,end_Altitude,Start_Altutude,State_Altitude,Elevation_Change
0,C2F7DD78E82EC875,electric_bike,1/13/2022 11:59,1/13/2022 12:02,Glenwood Ave & Touhy Ave,525,Clark St & Touhy Ave,RP-007,42.0128,-87.665906,42.01256,-87.674367,casual,184.1,603.6,,,,,419.5
1,A6CF8980A652D272,electric_bike,1/10/2022 8:41,1/10/2022 8:46,Glenwood Ave & Touhy Ave,525,Clark St & Touhy Ave,RP-007,42.012763,-87.665967,42.01256,-87.674367,casual,184.6,603.6,,,,,419.0
2,BD0F91DFF741C66D,classic_bike,1/25/2022 4:53,1/25/2022 4:58,Sheffield Ave & Fullerton Ave,TA1306000016,Greenview Ave & Fullerton Ave,TA1307000001,41.925602,-87.653708,41.92533,-87.6658,member,181.7,592.5,,,,,410.8
3,CBB80ED419105406,classic_bike,1/4/2022 0:18,1/4/2022 0:33,Clark St & Bryn Mawr Ave,KA1504000151,Paulina St & Montrose Ave,TA1309000021,41.983593,-87.669154,41.961507,-87.671387,casual,182.9,598.1,,,,,415.2
4,DDC963BFDDA51EEA,classic_bike,1/20/2022 1:31,1/20/2022 1:37,Michigan Ave & Jackson Blvd,TA1309000002,State St & Randolph St,TA1305000029,41.87785,-87.62408,41.884621,-87.627834,member,182.5,594.7,,,,,412.2


In [8]:
df['Start_Altitude'] = df[['Start_Altitude', 'Start_Altutude', 'State_Altitude']].bfill(axis=1).iloc[:, 0]

In [9]:
df.drop(columns=['State_Altitude', 'Start_Altutude'], inplace=True)


In [10]:
df['End_Altitude'] = df[['End_Altitude', 'end_altitude', 'end_Altitude']].bfill(axis=1).iloc[:, 0]

In [11]:
df.drop(columns=['end_Altitude', 'end_altitude'], inplace=True)

In [12]:
df['Start_Altitude'].fillna(method='ffill', inplace=True)
df['End_Altitude'].fillna(df['End_Altitude'].mean(), inplace=True)

In [13]:
from sklearn.impute import SimpleImputer
df['Elevation_Change'] = df['End_Altitude'] - df['Start_Altitude']

In [14]:
imputer = SimpleImputer(strategy='mean')
df['Elevation_Change'] = imputer.fit_transform(df[['Elevation_Change']])

In [15]:
df = df.drop(columns=['Start_Altitude', 'End_Altitude'])

In [16]:
df['Distance'] = df.apply(lambda x: haversine_distance(x['start_lat'], x['start_lng'], x['end_lat'], x['end_lng']), axis=1)

In [17]:
df = df.drop(columns=['start_lat', 'start_lng', 'end_lat', 'end_lng'])

In [18]:
df['started_at'] = pd.to_datetime(df['started_at'])
df['ended_at'] = pd.to_datetime(df['ended_at'])

In [19]:
df['season'] = df['started_at'].dt.month.map(get_season)

In [20]:
def get_day_of_week(timestamp):
    return timestamp.strftime("%A")

In [21]:
# Apply the function to each row in the DataFrame
df['day_of_week'] = df['started_at'].apply(get_day_of_week)

In [22]:
df['trip_duration'] = (df['ended_at'] - df['started_at']).dt.total_seconds() / 60

In [23]:
weatherData = '/content/drive/My Drive/3667056.csv'

In [24]:
tempData = pd.read_csv(weatherData)

In [25]:
tempData['started_at'] = pd.to_datetime(tempData['DATE'])
tempData['started_at'] = tempData['started_at'].dt.date

In [26]:
df['started_at'] = df['started_at'].dt.date

In [27]:
df = pd.merge(df, tempData[['started_at', 'TMAX', 'TMIN']], on='started_at', how='left')

In [None]:
# ride_types = dfs['rideable_type'].unique()

In [None]:
# print(ride_types)

['electric_bike' 'classic_bike' 'docked_bike']


In [28]:
df['rideable_type'] = df['rideable_type'].replace({
    'electric_bike': 'electric',
    'docked_bike': 'classic',
    'classic_bike': 'classic'
})

In [29]:
df.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,member_casual,Elevation_Change,Distance,season,day_of_week,trip_duration,TMAX,TMIN
0,C2F7DD78E82EC875,electric,2022-01-13,2022-01-13 12:02:00,Glenwood Ave & Touhy Ave,525,Clark St & Touhy Ave,RP-007,casual,419.5,0.699547,Winter,Thursday,3.0,41,29
1,A6CF8980A652D272,electric,2022-01-10,2022-01-10 08:46:00,Glenwood Ave & Touhy Ave,525,Clark St & Touhy Ave,RP-007,casual,419.0,0.694322,Winter,Monday,5.0,20,8
2,BD0F91DFF741C66D,classic,2022-01-25,2022-01-25 04:58:00,Sheffield Ave & Fullerton Ave,TA1306000016,Greenview Ave & Fullerton Ave,TA1307000001,member,410.8,1.000832,Winter,Tuesday,5.0,15,2
3,CBB80ED419105406,classic,2022-01-04,2022-01-04 00:33:00,Clark St & Bryn Mawr Ave,KA1504000151,Paulina St & Montrose Ave,TA1309000021,casual,415.2,2.462779,Winter,Tuesday,15.0,36,24
4,DDC963BFDDA51EEA,classic,2022-01-20,2022-01-20 01:37:00,Michigan Ave & Jackson Blvd,TA1309000002,State St & Randolph St,TA1305000029,member,412.2,0.814537,Winter,Thursday,6.0,22,9


In [31]:
df.fillna(df.mean(numeric_only=True), inplace=True)  # Only fill numeric columns with their means
df.fillna('Unknown', inplace=True)

In [30]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

In [32]:
target_variable = 'rideable_type'
y = df[target_variable]
X = df.drop(target_variable, axis=1)

In [34]:
label_encoders = {}
for column in X.columns:
    if X[column].dtype == 'object':  # This identifies categorical columns
        num_unique_values = len(X[column].unique())
        if num_unique_values < 10:  # Limit set for one-hot encoding
            # Apply one-hot encoding to columns with fewer unique values
            dummies = pd.get_dummies(X[column], prefix=column)
            X = pd.concat([X, dummies], axis=1)
        else:
            # Apply label encoding to columns with many unique values to save memory
            label_encoders[column] = LabelEncoder()
            X[column] = label_encoders[column].fit_transform(X[column].astype(str))
        X.drop(column, axis=1, inplace=True)  # Drop original column after encoding

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [36]:
y_train_encoded = (y_train == 'electric').astype(int)  # Encode 'electric' as 1, 'classic' as 0
y_test_encoded = (y_test == 'electric').astype(int)

In [37]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [38]:
model = LinearRegression()


In [47]:
X_train.drop(columns=['ended_at'], inplace=True)

KeyError: "['ended_at'] not found in axis"

In [44]:
print(X_train.dtypes)
print(y_train_encoded)

Elevation_Change         float64
Distance                 float64
trip_duration            float64
TMAX                       int64
TMIN                       int64
member_casual_Unknown       bool
member_casual_casual        bool
member_casual_member        bool
season_Fall                 bool
season_Spring               bool
season_Summer               bool
season_Winter               bool
day_of_week_Friday          bool
day_of_week_Monday          bool
day_of_week_Saturday        bool
day_of_week_Sunday          bool
day_of_week_Thursday        bool
day_of_week_Tuesday         bool
day_of_week_Wednesday       bool
dtype: object
2220921     1
6189410     0
587405      1
10376738    0
3510934     0
           ..
2234489     1
4304572     1
10081351    1
6550634     1
6423388     1
Name: rideable_type, Length: 7971288, dtype: int64


In [48]:
X_test.drop(columns=['ended_at'], inplace=True)

In [45]:
# Fit the model on the training data with encoded labels
model.fit(X_train, y_train_encoded)

In [49]:
y_pred = model.predict(X_test)

In [50]:
mse = mean_squared_error(y_test_encoded, y_pred)
r2 = r2_score(y_test_encoded, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R² Score: {r2}")

Mean Squared Error: 0.24815011076878607
R² Score: 0.00678783227747759


In [51]:
coefficients = pd.DataFrame(model.coef_, X_train.columns, columns=['Coefficient'])
print(coefficients)

                        Coefficient
Elevation_Change      -2.161199e-04
Distance              -2.003583e-05
trip_duration         -9.359789e-05
TMAX                  -1.588950e-03
TMIN                  -2.694059e-04
member_casual_Unknown  1.040834e-17
member_casual_casual   2.735762e-02
member_casual_member  -2.735762e-02
season_Fall            1.525531e-02
season_Spring          1.849525e-03
season_Summer          8.342118e-03
season_Winter         -2.544695e-02
day_of_week_Friday     1.708638e-02
day_of_week_Monday    -5.980562e-03
day_of_week_Saturday  -2.203042e-02
day_of_week_Sunday    -2.604875e-02
day_of_week_Thursday   1.682715e-02
day_of_week_Tuesday    5.087729e-03
day_of_week_Wednesday  1.505847e-02


In [52]:
from sklearn.metrics import recall_score, f1_score, precision_score, classification_report

In [53]:
y_pred = model.predict(X_test)  # This should be predictions on the test set
y_pred_encoded = (y_pred > 0.5).astype(int)

In [54]:
recall = recall_score(y_test_encoded, y_pred_encoded)  # pos_label defaults to 1 which is suitable here
f1 = f1_score(y_test_encoded, y_pred_encoded)

In [55]:
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")

# For a comprehensive report including precision and support per class
report = classification_report(y_test_encoded, y_pred_encoded)
print(report)

Recall: 0.6440387883634909
F1-Score: 0.5850402971993045
              precision    recall  f1-score   support

           0       0.53      0.41      0.46   1665742
           1       0.54      0.64      0.59   1750525

    accuracy                           0.53   3416267
   macro avg       0.53      0.53      0.52   3416267
weighted avg       0.53      0.53      0.53   3416267



In [56]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

In [57]:
gb_regressor = GradientBoostingRegressor(n_estimators=100, max_depth=3, random_state=42)  # Hyperparameters can be adjusted
gb_regressor.fit(X_train, y_train_encoded)

In [59]:
y_pred = gb_regressor.predict(X_test)

In [60]:
feature_importances = gb_regressor.feature_importances_


In [63]:
feature_names = X_train.columns

In [65]:
importances_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances}).sort_values(by='Importance', ascending=False)
print(importances_df)

                  Feature  Importance
1                Distance    0.498346
2           trip_duration    0.416795
0        Elevation_Change    0.050170
6    member_casual_casual    0.019550
7    member_casual_member    0.010294
4                    TMIN    0.002744
3                    TMAX    0.000775
8             season_Fall    0.000307
10          season_Summer    0.000297
9           season_Spring    0.000266
12     day_of_week_Friday    0.000125
15     day_of_week_Sunday    0.000122
11          season_Winter    0.000104
14   day_of_week_Saturday    0.000092
13     day_of_week_Monday    0.000013
5   member_casual_Unknown    0.000000
16   day_of_week_Thursday    0.000000
17    day_of_week_Tuesday    0.000000
18  day_of_week_Wednesday    0.000000


In [67]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [69]:
# Assuming gb_regressor is your trained gradient boosting regression model
predictions = gb_regressor.predict(X_test)  # X_test is your test data

# Calculate evaluation metrics
mae = mean_absolute_error(y_test_encoded, y_pred)
mse = mean_squared_error(y_test_encoded, y_pred)
rmse = mean_squared_error(y_test_encoded, y_pred, squared=False)  # RMSE
r2 = r2_score(y_test_encoded, y_pred)

print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
print("R-squared:", r2)

Mean Absolute Error: 0.41125352854674235
Mean Squared Error: 0.19575479960088926
Root Mean Squared Error: 0.44244186013632264
R-squared: 0.21649823870180052
