In [None]:
import numpy as np
import pandas as pd

from sklearn import set_config
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

In [None]:
!wget https://archive.ics.uci.edu/static/public/275/bike+sharing+dataset.zip
!unzip bike+sharing+dataset.zip

In [None]:
# Load hour.csv (expected in the working directory after the download cell).
df = pd.read_csv('hour.csv')

In [None]:
# Display the raw frame for a quick visual check.
df

In [None]:
# Build the modelling frame without in-place mutation so the cell can be
# re-run: the original in-place drops raised KeyError on a second execution.
# 'dteday' used to be parsed to datetime and then dropped straight away; the
# parse had no effect on the result, so the column is simply dropped.
# NOTE(review): 'casual' and 'registered' are presumably removed because they
# add up to the 'cnt' target -- confirm against the dataset README.
DROPPED_COLUMNS = ['instant', 'casual', 'registered', 'atemp', 'dteday']
CATEGORICAL_COLUMNS = ['season', 'holiday', 'weekday', 'weathersit',
                       'workingday', 'mnth', 'yr', 'hr']

# Hours 6..18 (inclusive) are labelled 'day', everything else 'night'.
# The int cast keeps this working when 'hr' is already a category (re-run).
is_daytime = df['hr'].astype(int).between(6, 18)

df = (
    df.assign(day_night=np.where(is_daytime, 'day', 'night'))
      .drop(columns=DROPPED_COLUMNS, errors='ignore')
      .astype({column: 'category' for column in CATEGORICAL_COLUMNS})
)

# Predictors and target ('cnt').
X = df.drop(columns=['cnt'])
y = df['cnt']

In [None]:
# --- Numerical branch: mean imputation followed by min-max scaling ---------
numerical_features = ['temp', 'hum', 'windspeed']
numerical_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler()),
]
numerical_pipeline = Pipeline(steps=numerical_steps)

# NOTE(review): the pipeline is fitted on the full frame, i.e. before the
# train/test split performed later -- confirm this leakage is acceptable.
scaled_numeric_values = numerical_pipeline.fit_transform(X[numerical_features])
X[numerical_features] = scaled_numeric_values

# --- Categorical branch: mode imputation followed by one-hot encoding ------
categorical_features = ['season', 'weathersit', 'day_night']
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(sparse_output=False, drop='first')),
]
categorical_pipeline = Pipeline(steps=categorical_steps)

# Dense array of encoded columns; wrapped into a DataFrame in the next cell.
X_encoded = categorical_pipeline.fit_transform(X[categorical_features])

In [None]:

# Give the one-hot array readable column names and the same index as X so
# the concat below lines rows up by label.
encoded_columns = categorical_pipeline.named_steps['onehot'].get_feature_names_out(
    categorical_features
)
X_encoded = pd.DataFrame(X_encoded, columns=encoded_columns, index=X.index)

# Swap the raw categorical columns for their encoded counterparts.
# This previously read `X1.drop(...)`, but X1 is only created much later in
# the notebook (NameError on a fresh kernel) and does not carry the scaled
# numerical columns computed above.
X = pd.concat([X.drop(columns=categorical_features), X_encoded], axis=1)


In [None]:
# Preview the first rows of the assembled feature matrix.
X.head()

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible. `train_test_split` is imported in the notebook's import cell.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:

# Random forest baseline: 100 trees, seeded for reproducible results.
forest_settings = {'n_estimators': 100, 'random_state': 42}
model = RandomForestRegressor(**forest_settings)
model.fit(X_train, y_train)

In [None]:
# Predict the target for the held-out rows.
y_pred = model.predict(X_test)

In [None]:
# Score the held-out predictions and report both metrics, one per line.
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
print(f'Mean Squared Error: {mse}', f'R-squared: {r2}', sep='\n')

In [None]:

# The numerical and categorical pipelines act on different column subsets, so
# they must run side by side in a ColumnTransformer. Chaining them as
# consecutive Pipeline steps would apply each of them to every column in turn.
# remainder='passthrough' keeps the columns neither branch touches, matching
# the manual preprocessing done in the cells above.
preprocessor = ColumnTransformer(
    transformers=[
        ('num_preprocess', numerical_pipeline, numerical_features),
        ('cat_preprocess', categorical_pipeline, categorical_features),
    ],
    remainder='passthrough',
)

final_pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('model', RandomForestRegressor(n_estimators=100, random_state=42)),
])

# Render estimators as an HTML diagram, then display the pipeline.
set_config(display='diagram')
final_pipeline

In [None]:
# Reload hour.csv so the second experiment starts from the unmodified data.
df = pd.read_csv('hour.csv')


In [None]:
# Build the modelling frame without in-place mutation so the cell can be
# re-run: the original in-place drops raised KeyError on a second execution.
# 'dteday' used to be parsed to datetime and then dropped straight away; the
# parse had no effect on the result, so the column is simply dropped.
DROPPED_COLUMNS = ['instant', 'casual', 'registered', 'atemp', 'dteday']
CATEGORICAL_COLUMNS = ['season', 'holiday', 'weekday', 'weathersit',
                       'workingday', 'mnth', 'yr', 'hr']

# Hours 6..18 (inclusive) are labelled 'day', everything else 'night'.
# The int cast keeps this working when 'hr' is already a category (re-run).
is_daytime = df['hr'].astype(int).between(6, 18)

df = (
    df.assign(day_night=np.where(is_daytime, 'day', 'night'))
      .drop(columns=DROPPED_COLUMNS, errors='ignore')
      .astype({column: 'category' for column in CATEGORICAL_COLUMNS})
)

# Two independent copies of the predictors, plus the 'cnt' target.
X = df.drop(columns=['cnt'])
X1 = df.drop(columns=['cnt'])

y = df['cnt']

In [None]:
# Interaction terms: humidity multiplied by temperature and by wind speed.
df = df.assign(
    temp_hum=df['temp'] * df['hum'],
    wind_hum=df['windspeed'] * df['hum'],
)


In [None]:
# Split the frame (now including the interaction terms) into predictors and
# target, then list the predictor columns.
target_column = 'cnt'
x, y = df.drop(columns=[target_column]), df[target_column]
x.columns


In [None]:
# Numerical branch, now covering the two interaction terms as well:
# mean imputation followed by min-max scaling.
numerical_features = ['temp', 'hum', 'windspeed', 'temp_hum', 'wind_hum']
numerical_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler()),
]
numerical_pipeline = Pipeline(steps=numerical_steps)

# NOTE(review): fitted on the full frame, before the train/test split below.
scaled_numeric_values = numerical_pipeline.fit_transform(x[numerical_features])
x[numerical_features] = scaled_numeric_values


In [None]:
# NOTE(review): disabled one-hot variant of the categorical step. The cell
# further down that builds `categorical_pipeline` with TargetEncoder is the
# one that actually runs; consider deleting this cell.
# # Categorical features
# categorical_features = ['season', 'weathersit', 'day_night']
# categorical_pipeline = Pipeline([
# ('imputer', SimpleImputer(strategy='most_frequent')),
# ('onehot', OneHotEncoder(sparse_output=False, drop='first'))
# ])
# # Transforming above
# # print(X[categorical_features])
# X_encoded = categorical_pipeline.fit_transform(X1[categorical_features], y)

In [None]:
# Preview the predictors after numerical scaling.
x.head()

In [None]:
# Summarise column dtypes and non-null counts of the predictors.
x.info()

In [None]:
# Display the target series.
y

In [None]:
# Show the target's dtype followed by its first rows, one item per line.
print(y.dtype, y.head(), sep='\n')


In [None]:
# Cast the target to float (its previous dtype is printed in the cell above).
y = y.astype(float)


In [None]:
# Replace missing target values with the mean. Reassigning instead of using
# inplace=True keeps the cell free of hidden in-place mutation.
y = y.fillna(y.mean())  # or any other appropriate strategy


In [None]:

!pip install category_encoders

In [None]:
from category_encoders import TargetEncoder

# Categorical branch: mode imputation followed by target encoding.
categorical_features = ['season', 'weathersit', 'day_night']
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('target', TargetEncoder()),
]
categorical_pipeline = Pipeline(steps=categorical_steps)

# NOTE(review): the encoder is fitted with the target on all rows, before the
# train/test split below -- confirm this target leakage is acceptable.
categorical_input = x[categorical_features]
X_encoded = categorical_pipeline.fit_transform(categorical_input, y)

In [None]:

# Label the encoded columns with the names reported by the encoder step.
target_step = categorical_pipeline.named_steps['target']
encoded_columns = target_step.get_feature_names_out(categorical_features)
X_encoded = pd.DataFrame(X_encoded, columns=encoded_columns)

# Replace the raw categorical columns with their encoded counterparts.
remaining_predictors = x.drop(columns=categorical_features)
X = pd.concat([remaining_predictors, X_encoded], axis=1)


In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible. `train_test_split` is imported in the notebook's import cell.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

**Logistic regression from scratch**

In [None]:

class LogisticRegression:
    """Binary logistic regression trained with batch gradient descent.

    Parameters
    ----------
    learning_rate : float, default=0.01
        Step size applied to every gradient update.
    num_iterations : int, default=1000
        Number of full-batch gradient steps performed by ``fit``.

    Attributes
    ----------
    weights : numpy.ndarray or None
        One coefficient per feature; ``None`` until ``fit`` is called.
    bias : float or None
        Intercept term; ``None`` until ``fit`` is called.
    """

    def __init__(self, learning_rate=0.01, num_iterations=1000):
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations
        self.weights = None
        self.bias = None

    def sigmoid(self, z):
        """Return the logistic function of ``z`` element-wise.

        Only ``exp(-|z|)`` is evaluated, which lies in (0, 1], so very large
        positive or negative inputs cannot overflow.
        """
        z = np.asarray(z, dtype=float)
        decay = np.exp(-np.abs(z))
        # z >= 0: 1 / (1 + e^-z);  z < 0: e^z / (1 + e^z) -- the same value.
        return np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))

    def fit(self, X, y):
        """Learn ``weights`` and ``bias`` from the training data.

        Parameters
        ----------
        X : array-like of shape (num_samples, num_features)
            Feature matrix; coerced to a float array, so numeric pandas
            objects are accepted.
        y : array-like of shape (num_samples,)
            Targets; the sigmoid output is compared against these values, so
            they are expected to be 0/1 labels.

        Raises
        ------
        ValueError
            If ``X`` is not a non-empty 2-D array or ``y`` has a different
            number of rows than ``X``.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float).ravel()
        if X.ndim != 2 or X.shape[0] == 0:
            raise ValueError("X must be a non-empty 2-D array")
        num_samples, num_features = X.shape
        if y.shape[0] != num_samples:
            raise ValueError("X and y must contain the same number of samples")

        self.weights = np.zeros(num_features)
        self.bias = 0.0

        for _ in range(self.num_iterations):
            y_predicted = self.sigmoid(X @ self.weights + self.bias)
            error = y_predicted - y

            # Gradients of the mean log-loss w.r.t. weights and bias.
            dw = (X.T @ error) / num_samples
            db = error.sum() / num_samples

            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

    def predict(self, X):
        """Return 0/1 class labels for ``X``.

        A sample is labelled 1 when its predicted probability is strictly
        greater than 0.5, otherwise 0.
        """
        X = np.asarray(X, dtype=float)
        probabilities = self.sigmoid(X @ self.weights + self.bias)
        return (probabilities > 0.5).astype(int)


In [None]:
# Instantiate the from-scratch classifier with its default hyperparameters.
model = LogisticRegression()


In [None]:
import mlflow

# Start an MLflow run
with mlflow.start_run():
    # Fit the model
    model.fit(X_train, y_train)

    # Predict and evaluate
    y_pred_scratch = model.predict(X_test)
    accuracy = np.mean(y_pred_scratch == y_test)

    # Print and log metrics
    print(f"Accuracy: {accuracy * 100}%")
    mse = mean_squared_error(y_test, y_pred_scratch)
    r2 = r2_score(y_test, y_pred_scratch)
    print(f'Mean Squared Error: {mse}')
    print(f'R-squared: {r2}')

    # Log metrics to MLflow
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("mse", mse)
    mlflow.log_metric("r2", r2)


In [None]:
!pip install mlflow


In [None]:
import mlflow
import mlflow.sklearn

In [None]:
# Import before use: with the original order, `LogisticRegression` still
# referred to the from-scratch class, whose constructor rejects `solver` and
# `max_iter` (TypeError). From here on the name refers to scikit-learn's
# estimator and shadows the from-scratch class.
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='liblinear', max_iter=100)

In [None]:
# scikit-learn only accepts feature names when every column name is a string.
# The encoded columns may carry non-string names (the final cell applies the
# same cast), so normalise them before fitting.
X_train.columns = X_train.columns.astype(str)
X_test.columns = X_test.columns.astype(str)

model.fit(X_train, y_train)

# Predict and evaluate with the model fitted above. The metrics were
# previously computed from `y_pred_scratch`, i.e. the from-scratch model's
# stale predictions, so they never reflected this estimator.
y_pred = model.predict(X_test)
accuracy = np.mean(y_pred == y_test)
print(f"Accuracy: {accuracy * 100}%")
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

# Log metrics to MLflow (outside a `with` block this opens an implicit run,
# which stays active until mlflow.end_run() is called).
mlflow.log_metric("accuracy", accuracy)
mlflow.log_metric("mse", mse)
mlflow.log_metric("r2", r2)

**Using the scikit-learn package**

In [None]:
# Close the currently active MLflow run (if any) before a new one is started.
mlflow.end_run()

In [None]:

import mlflow
import mlflow.sklearn


# Make sure every feature name is a string before handing the frames to the
# estimator.
X_train.columns = X_train.columns.astype(str)
X_test.columns = X_test.columns.astype(str)

# Track the whole experiment in one MLflow run; the context manager closes
# the run when the block exits.
with mlflow.start_run():

    # Hyperparameters the model was configured with
    mlflow.log_param("solver", "liblinear")
    mlflow.log_param("max_iter", 100)

    model.fit(X_train, y_train)

    # Predictions for the held-out rows
    y_pred = model.predict(X_test)

    # Evaluation metrics
    accuracy = np.mean(y_pred == y_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Log each metric under its name
    run_metrics = {"accuracy": accuracy, "mse": mse, "r2": r2}
    for metric_name, metric_value in run_metrics.items():
        mlflow.log_metric(metric_name, metric_value)

    # Persist the fitted estimator as a run artifact
    mlflow.sklearn.log_model(model, "logistic_regression_model")

    # Report the results
    print(f"Accuracy: {accuracy * 100}%")
    print(f'Mean Squared Error: {mse}')
    print(f'R-squared: {r2}')
