In [2]:
import pandas as pd
# Path of the input CSV under the Kaggle input directory.
DATA_PATH = '/kaggle/input/divvy-2022-23/Output (1).csv'
df = pd.read_csv(filepath_or_buffer=DATA_PATH)

In [3]:
# Two-pass imputation on the full frame:
#   1. numeric columns receive their own column mean;
#   2. every cell that is still missing afterwards receives the literal 'Unknown'.
# NOTE(review): both passes run before the train/test split, so the means include
# rows that later land in the test set, and pass 2 also applies to the target column.
numeric_means = df.mean(numeric_only=True)
df.fillna(numeric_means, inplace=True)
df.fillna(value='Unknown', inplace=True)

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

# Separate the target from the features.
target_variable = 'rideable_type'
y = df[target_variable]
X = df.drop(columns=target_variable)

# Object columns with fewer distinct values than this are one-hot encoded;
# all other object columns are label-encoded to keep the frame narrow.
ONE_HOT_MAX_UNIQUE = 10

label_encoders = {}   # column name -> fitted LabelEncoder (kept for inverse transforms)
one_hot_columns = []  # low-cardinality columns, expanded in a single pass below

categorical_columns = [col for col in X.columns if X[col].dtype == 'object']
for column in categorical_columns:
    if X[column].nunique(dropna=False) < ONE_HOT_MAX_UNIQUE:
        one_hot_columns.append(column)
    else:
        # The encoded integers overwrite the column in place, so the column must
        # stay in X. Dropping it here would discard the feature that was just built.
        label_encoders[column] = LabelEncoder()
        X[column] = label_encoders[column].fit_transform(X[column].astype(str))

# Expand all one-hot columns at once. get_dummies removes the original columns,
# names the indicators '<column>_<value>' and appends them after the remaining
# columns, so only the one-hot originals are dropped.
if one_hot_columns:
    X = pd.get_dummies(X, columns=one_hot_columns)

# 40% of the rows go to training and 60% to testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.6, random_state=42)

In [5]:
# Remove the `df` name now that X, y and the train/test splits have been derived from it.
del df

In [7]:
# Binary target: 1 where the label is exactly 'electric', 0 for every other label.
y_train_encoded = y_train.eq('electric').astype(int)
y_test_encoded = y_test.eq('electric').astype(int)

In [12]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Least-squares baseline fitted to a 0/1 target.
# Relies on X_train, X_test, y_train and y_test produced by the earlier split.
# NOTE(review): this is a regression model applied to a binary label, so the
# predictions are unbounded scores rather than class labels or probabilities.

# Binary target: 1 where the label is exactly 'electric', 0 for every other label.
y_test_encoded = y_test.eq('electric').astype(int)
y_train_encoded = y_train.eq('electric').astype(int)

# Construct and fit in one step; fit() returns the estimator itself.
model = LinearRegression().fit(X_train, y_train_encoded)

# Continuous scores for the held-out rows.
y_pred = model.predict(X_test)

# Regression-style quality measures against the 0/1 test labels.
r2 = r2_score(y_test_encoded, y_pred)
mse = mean_squared_error(y_test_encoded, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R² Score: {r2}")

# One fitted coefficient per feature column, indexed by column name.
coefficients = pd.DataFrame({'Coefficient': model.coef_}, index=X_train.columns)
print(coefficients)


Mean Squared Error: 0.24815011076878607
R² Score: 0.00678783227747759
                        Coefficient
Elevation_Change      -2.161199e-04
Distance              -2.003583e-05
trip_duration         -9.359789e-05
TMAX                  -1.588950e-03
TMIN                  -2.694059e-04
member_casual_Unknown -5.204170e-18
member_casual_casual   2.735762e-02
member_casual_member  -2.735762e-02
season_Fall            1.525531e-02
season_Spring          1.849525e-03
season_Summer          8.342118e-03
season_Winter         -2.544695e-02
day_of_week_Friday     1.708638e-02
day_of_week_Monday    -5.980562e-03
day_of_week_Saturday  -2.203042e-02
day_of_week_Sunday    -2.604875e-02
day_of_week_Thursday   1.682715e-02
day_of_week_Tuesday    5.087729e-03
day_of_week_Wednesday  1.505847e-02


In [14]:
from sklearn.metrics import recall_score, f1_score, precision_score, classification_report

# Convert the regression scores into hard 0/1 labels and score them as a classifier.
# `model` is the estimator fitted on (X_train, y_train_encoded) in an earlier cell.
DECISION_THRESHOLD = 0.5  # scores strictly above this value are labelled 1

y_pred = model.predict(X_test)
y_pred_encoded = (y_pred > DECISION_THRESHOLD).astype(int)

# Both scorers are called with their default positive label.
f1 = f1_score(y_test_encoded, y_pred_encoded)
recall = recall_score(y_test_encoded, y_pred_encoded)

print(f"Recall: {recall}")
print(f"F1-Score: {f1}")

# Per-class precision, recall, F1 and support in one table.
report = classification_report(y_test_encoded, y_pred_encoded)
print(report)


Recall: 0.6440387883634909
F1-Score: 0.5850402971993045
              precision    recall  f1-score   support

           0       0.53      0.41      0.46   1665742
           1       0.54      0.64      0.59   1750525

    accuracy                           0.53   3416267
   macro avg       0.53      0.53      0.52   3416267
weighted avg       0.53      0.53      0.53   3416267



In [8]:
from sklearn.ensemble import RandomForestClassifier

# Train a small random forest (10 trees, depth capped at 10, fixed seed) on the
# un-encoded labels in y_train.
rf = RandomForestClassifier(n_estimators=10, max_depth=10, random_state=42)
rf.fit(X_train, y_train)

# Feature importances from the fitted forest, in training-column order.
importances = rf.feature_importances_
feature_names = X_train.columns

# Pair each feature with its score, then list the most important first.
importances_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
importances_df = importances_df.sort_values(by='Importance', ascending=False)
print(importances_df)

                  Feature  Importance
1                Distance    0.423380
2           trip_duration    0.401072
0        Elevation_Change    0.089895
6    member_casual_casual    0.023871
7    member_casual_member    0.016545
3                    TMAX    0.015507
4                    TMIN    0.013596
14   day_of_week_Saturday    0.002674
10          season_Summer    0.002455
9           season_Spring    0.002379
15     day_of_week_Sunday    0.002007
8             season_Fall    0.001818
11          season_Winter    0.001780
12     day_of_week_Friday    0.000965
16   day_of_week_Thursday    0.000755
18  day_of_week_Wednesday    0.000626
13     day_of_week_Monday    0.000449
17    day_of_week_Tuesday    0.000225
5   member_casual_Unknown    0.000000


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report

# Score the fitted forest `rf` on the held-out rows.
y_pred = rf.predict(X_test)

# average=None returns one value per class instead of a single aggregate.
precision = precision_score(y_true=y_test, y_pred=y_pred, average=None)
recall = recall_score(y_true=y_test, y_pred=y_pred, average=None)
f1 = f1_score(y_true=y_test, y_pred=y_pred, average=None)

print("Precision by class:", precision)
print("Recall by class:", recall)
print("F1-Score by class:", f1)

# Full per-class table.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)
