In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

df = pd.read_csv('merged_rides__with_elevation.csv')

df.fillna(df.mean(numeric_only=True), inplace=True)  # Only fill numeric columns with their means
df.fillna('Unknown', inplace=True)

target_variable = 'rideable_type'
y = df[target_variable]
X = df.drop(target_variable, axis=1)

# Apply encoding to the features
label_encoders = {}
for column in X.columns:
    if X[column].dtype == 'object':  # This identifies categorical columns
        num_unique_values = len(X[column].unique())
        if num_unique_values < 10:  # Limit set for one-hot encoding
            # Apply one-hot encoding to columns with fewer unique values
            dummies = pd.get_dummies(X[column], prefix=column)
            X = pd.concat([X, dummies], axis=1)
        else:
            # Apply label encoding to columns with many unique values to save memory
            label_encoders[column] = LabelEncoder()
            X[column] = label_encoders[column].fit_transform(X[column].astype(str))
        X.drop(column, axis=1, inplace=True)  # Drop original column after encoding

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

y_train_encoded = (y_train == 'electric').astype(int)  # Encode 'electric' as 1, 'classic' as 0
y_test_encoded = (y_test == 'electric').astype(int)


from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Assuming X_train, X_test, y_train, and y_test have already been defined and split appropriately

# Encode 'electric' as 1, 'classic' as 0 in both training and testing labels
y_train_encoded = (y_train == 'electric').astype(int)
y_test_encoded = (y_test == 'electric').astype(int)

# Initialize the LinearRegression model
model = LinearRegression()

# Fit the model on the training data with encoded labels
model.fit(X_train, y_train_encoded)

# Predict on the test set
y_pred = model.predict(X_test)

# Calculating R-squared and Mean Squared Error based on predictions and the true test labels
mse = mean_squared_error(y_test_encoded, y_pred)
r2 = r2_score(y_test_encoded, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R² Score: {r2}")

# Displaying coefficients
coefficients = pd.DataFrame(model.coef_, X_train.columns, columns=['Coefficient'])
print(coefficients)

from sklearn.metrics import recall_score, f1_score, precision_score, classification_report

# Predict on the test set (assuming `model` has already been trained on `X_train, y_train_encoded`)
y_pred = model.predict(X_test)  # This should be predictions on the test set
y_pred_encoded = (y_pred > 0.5).astype(int)  # Assuming you're using a threshold of 0.5 if needed

# Calculate recall, F1 score using the encoded test labels and predictions
recall = recall_score(y_test_encoded, y_pred_encoded)  # pos_label defaults to 1 which is suitable here
f1 = f1_score(y_test_encoded, y_pred_encoded)

# Print recall and F1-score
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")

# For a comprehensive report including precision and support per class
report = classification_report(y_test_encoded, y_pred_encoded)
print(report)

Mean Squared Error: 0.0
R² Score: 1.0
                       Coefficient
start_lat                      0.0
start_lng                      0.0
end_lat                        0.0
end_lng                        0.0
Start_Altitude                 0.0
End_Altitude                   0.0
end_altitude                   0.0
end_Altitude                   0.0
Start_Altutude                 0.0
State_Altitude                 0.0
Elevation_Change               0.0
member_casual_Unknown          0.0
member_casual_casual           0.0
member_casual_member           0.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Recall: 0.0
F1-Score: 0.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   3416267

    accuracy                           1.00   3416267
   macro avg       1.00      1.00      1.00   3416267
weighted avg       1.00      1.00      1.00   3416267



In [2]:
del df

In [3]:
from sklearn.ensemble import RandomForestClassifier

# Assuming RandomForestClassifier is fitted as `rf`
rf = RandomForestClassifier(n_estimators=10, max_depth=10, random_state=42)  # Limiting depth

rf.fit(X_train, y_train)

# Feature importancesz
importances = rf.feature_importances_
feature_names = X_train.columns

# Create a DataFrame to view the features and their importance scores
importances_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances}).sort_values(by='Importance', ascending=False)
print(importances_df)

                  Feature  Importance
1               start_lng    0.322430
3                 end_lng    0.226781
0               start_lat    0.166575
2                 end_lat    0.091401
4          Start_Altitude    0.082592
5            End_Altitude    0.034461
12   member_casual_casual    0.030882
13   member_casual_member    0.024493
10       Elevation_Change    0.013673
7            end_Altitude    0.005454
9          State_Altitude    0.000680
8          Start_Altutude    0.000519
6            end_altitude    0.000058
11  member_casual_Unknown    0.000000


In [4]:
# Assuming `rf` is your trained RandomForestClassifier
y_pred = rf.predict(X_test)

from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
# Calculate metrics for each class
precision = precision_score(y_test, y_pred, average=None)  # returns an array for each class
recall = recall_score(y_test, y_pred, average=None)
f1 = f1_score(y_test, y_pred, average=None)

print("Precision by class:", precision)
print("Recall by class:", recall)
print("F1-Score by class:", f1)
# Print a classification report
report = classification_report(y_test, y_pred)
print(report)

Precision by class: [0.63066374 0.69230769 0.7236172 ]
Recall by class: [7.51173112e-01 1.17504211e-04 6.29748790e-01]
F1-Score by class: [6.85663641e-01 2.34968540e-04 6.73427673e-01]
               precision    recall  f1-score   support

 classic_bike       0.63      0.75      0.69   1589149
  docked_bike       0.69      0.00      0.00     76593
electric_bike       0.72      0.63      0.67   1750525

     accuracy                           0.67   3416267
    macro avg       0.68      0.46      0.45   3416267
 weighted avg       0.68      0.67      0.66   3416267

