In [None]:
# Separate the target column from the predictors, then fit the preprocessing
# pipeline on the predictors and keep the transformed matrix.
y = data['early_spring']
X = data.drop(columns=['early_spring'])
data_transformed = pipeline.fit_transform(X)

# Rebuild column labels for the transformed matrix from the fitted
# 'preprocessing' step.
# NOTE(review): assumes transformers_[0] is the numerical branch and
# transformers_[1] is the categorical branch containing an 'onehot' step -- confirm
# against the pipeline definition.
preprocessor = pipeline.named_steps['preprocessing']
numerical_feature_names = preprocessor.transformers_[0][2]
_, categorical_columns = get_columns_by_type(X)
categorical_transformer = preprocessor.transformers_[1][1]
onehot_encoder = categorical_transformer.named_steps['onehot']
categorical_feature_names = onehot_encoder.get_feature_names_out(
    input_features=categorical_columns
)

# Numerical names first, then the one-hot names, matching the order used above.
feature_names = [*numerical_feature_names, *categorical_feature_names]
data_transformed_df = pd.DataFrame(data=data_transformed, columns=feature_names)
print(X.shape, y.shape)

In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

# Fit the preprocessing pipeline on the predictors, then hold out 20% of the rows.
# NOTE(review): the pipeline is fitted on all rows before the split, so its fitted
# statistics also see the test rows -- confirm this is acceptable.
X_transformed = pipeline.fit_transform(X)
y = y.reset_index(drop=True)
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.2, random_state=42)

# Fill any remaining missing values with column means learned from the training split only.
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

# Seed the tree so the reported error and the importances are reproducible across runs.
model = DecisionTreeRegressor(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
importances = model.feature_importances_

# Label each importance with its column name (feature_names is built in the
# transform cell) when the names line up with the matrix the tree was trained on.
# If they do not (e.g. the imputer removed an all-missing column), fall back to
# positional indices of the trained matrix so the table can still be built.
if len(feature_names) == len(importances):
    feature_labels = list(feature_names)
else:
    feature_labels = list(range(len(importances)))

feature_importances = pd.DataFrame({
    'Feature': feature_labels,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

# Features whose importance falls below this cut-off are reported as unhelpful.
threshold = 0.01
low_importance_features = feature_importances[feature_importances['Importance'] < threshold]
print("Features not helping the model (importance below threshold):")
print(low_importance_features)

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Same experiment as the mean-imputation run, but filling missing values with the
# most frequent value of each column instead.
X_transformed = pipeline.fit_transform(X)
y = y.reset_index(drop=True)
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.2, random_state=42)

# Report how many missing values remain after preprocessing, before imputing them.
print(f"NaN in X_train: {pd.isna(X_train).sum()}")
print(f"NaN in y_train: {pd.isna(y_train).sum()}")

# The imputer is fitted on the training split only and then applied to the test split.
imputer = SimpleImputer(strategy='most_frequent')
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

# Seed the tree so the printed error is reproducible and comparable between runs.
model = DecisionTreeRegressor(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")


# Regression:

In [None]:
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, BaggingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
# from xgboost import XGBRegressor
# from lightgbm import LGBMRegressor

# Candidate regressors to compare on the same train/test split. Estimators that
# use randomness are seeded so the printed errors are reproducible across runs.
models = {
    'DecisionTree': DecisionTreeRegressor(random_state=42),
    'RandomForest': RandomForestRegressor(random_state=42),
    'LinearRegression': LinearRegression(),
    'SupportVector': SVR(),
    'GradientBoosting': GradientBoostingRegressor(random_state=42),
    # 'XGBoost': XGBRegressor(),
    # 'LightGBM': LGBMRegressor(),
    'ElasticNet': ElasticNet(),
    'KNeighbors': KNeighborsRegressor(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'AdaBoost': AdaBoostRegressor(random_state=42),
    'Bagging': BaggingRegressor(random_state=42)
}

# Give the features and the target the same 0..n-1 index, then trim both to the
# shorter length so train_test_split receives equally sized inputs.
# NOTE(review): trimming hides any row-count mismatch between the two rather than
# explaining it -- confirm the rows still correspond one-to-one.
data_transformed = pd.DataFrame(data_transformed).reset_index(drop=True)
y = y.reset_index(drop=True)
min_length = min(len(data_transformed), len(y))
data_transformed = data_transformed.iloc[:min_length]
y = y.iloc[:min_length]

X_train, X_test, y_train, y_test = train_test_split(data_transformed, y, test_size=0.2, random_state=42)

# Fit every candidate and report its test-set mean squared error.
# Precision, recall and F1 are classification metrics: scikit-learn raises a
# ValueError when they receive continuous regression predictions, and their
# results were never reported, so only the regression metric is computed here.
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print(f'{name} Model - Mean Squared Error: {mse}')
