**MODEL PIPELINE**

In [2]:
import pandas as pd # type: ignore
import numpy as np # type: ignore

*Import the IDS raw dataset*

In [3]:
# The raw IDS export is read from the notebook's working directory.
# (Earlier revisions pointed at machine-specific absolute paths to
# IDS_22_21_20_v1.csv; those are intentionally no longer used.)

# Read the CSV into a DataFrame
IDS_raw = pd.read_csv(filepath_or_buffer='IDS_22_21_20_v2.csv')

# Preview the first five rows

IDS_raw.head(n=5)

Unnamed: 0,pre_village,hhs,total_population,beans_kgs_hh_seed,ground_nuts_kgs_hh_seed,maize_kgs_hh_seed,rice_kgs_hh_seed,onions_bags_hh_seed,soya_bean_kgs_hh_seed,field_peas_kgs_hh_seed,...,Sorghum_total_yield,Maize_total_yield,Millet_total_yield,Onions_total_yield,Sweet_potatoes_total_yield,Ground_Nuts_total_yield,Food_banana_total_yield,Coffee_total_yield,cohort,Agriculture Value (USD)
0,Bikungu,103.0,543.0,2.056338,2.056338,0.0,0.0,0.0,2.056338,0.0,...,,,,,,44.25,82.5,48.0,2022.0,780.359556
1,Bisiika_A,75.0,361.0,3.028037,3.028037,3.028037,0.0,0.0,0.0,0.0,...,,,,,10.777778,,96.818182,66.363636,2022.0,784.39718
2,Bisiika_B,70.0,339.0,3.030303,3.030303,3.030303,0.0,0.0,0.0,0.0,...,,,,,10.0,,79.0,70.5,2022.0,729.391892
3,Bubalala,55.0,259.0,2.571429,2.571429,2.571429,0.0,0.0,2.571429,0.0,...,,456.923077,144.0,,6.5,,3.0,,2022.0,528.932432
4,Budongo,90.0,424.0,2.52381,2.52381,2.52381,0.0,0.0,2.52381,0.0,...,,425.75,192.5,,4.0,,12.75,45.0,2022.0,408.060197


**DROP COLUMNS**

In [4]:
import pandas as pd

# Work on a deep copy so the raw frame stays available unchanged.
IDS_1 = IDS_raw.copy(deep=True)

def drop_columns(df, columns_to_remove):
    """
    Return a new DataFrame without the listed columns.

    Labels in ``columns_to_remove`` that do not exist in ``df`` are skipped
    silently instead of raising ``KeyError``.

    Parameters:
    df (pd.DataFrame): Frame to trim; it is not modified.
    columns_to_remove (list): Column labels to discard.

    Returns:
    pd.DataFrame: Frame holding only the columns that were not removed.
    """
    # axis=1 targets column labels; errors='ignore' tolerates absent names.
    trimmed = df.drop(labels=columns_to_remove, axis=1, errors='ignore')
    return trimmed

# Column labels excluded from the modelling frame. Building the list from a
# set literal collapses any label that was typed twice.
columns_to_remove = list({
    'hhs',
    "secateurs_tools",
    "gloves_tools",
    "Onions_total_yield",
    "hot_pepper_tins_organic_pesticides",
    "derris_tin_organic_pesticides",
    "onions_bags_organic_pesticides",
    "turplins_tools",
    "cohort",
    'pre_village',
    'rice_kgs_hh_seed',
    'field_peas_kgs_hh_seed',
    'neem_kg_organic_pesticides',
    'tephrosia_kgs_organic_pesticides',
    'forked_spades_tools',
    'jerrican_tools',
    'sisal_rope_liquid_manure',
    'sweet_potatoes_bags_hh_seed',
    'millet_kgs_hh_seed',
    'onions_bags_hh_seed',
    "irish_potatoes_bags_hh_seed",
    "filter_liquid_manure",
    "mortar_&_pestle_liquid_manure",
    "Irish_potatoes_total_yield",
    "Sorghum_total_yield",
    "Millet_total_yield",
    "Ground_Nuts_total_yield",
    'hhh_age',
    'total_population',
})

# Remove those columns from the working copy
IDS_2 = drop_columns(df=IDS_1, columns_to_remove=columns_to_remove)

# Preview the trimmed frame
IDS_2.head()


Unnamed: 0,beans_kgs_hh_seed,ground_nuts_kgs_hh_seed,maize_kgs_hh_seed,soya_bean_kgs_hh_seed,garlic_kgs_organic_pesticides,ginger_kgs_organic_pesticides,plastic_tanks_120_ltrs_liquid_manure,sacks_liquid_manure,hoes_tools,spades_tools,...,GPS-Altitude,Land_size_agriculture,Time_to_collect_Water_for_Household_use_Minutes,Beans_total_yield,Cassava_total_yield,Maize_total_yield,Sweet_potatoes_total_yield,Food_banana_total_yield,Coffee_total_yield,Agriculture Value (USD)
0,2.056338,2.056338,0.0,2.056338,0.244131,0.244131,0.037559,0.075117,0.033803,0.022535,...,566.658333,0.545833,67.5,83.111111,,,,82.5,48.0,780.359556
1,3.028037,3.028037,3.028037,0.0,0.35514,0.35514,0.037383,0.037383,0.056075,0.037383,...,1778.975,1.408333,23.333333,75.333333,,,10.777778,96.818182,66.363636,784.39718
2,3.030303,3.030303,3.030303,0.0,0.363636,0.363636,0.040404,0.040404,0.060606,0.040404,...,1710.15,0.922917,40.0,85.7,,,10.0,79.0,70.5,729.391892
3,2.571429,2.571429,2.571429,2.571429,0.363636,0.363636,0.103896,0.207792,0.051948,0.051948,...,1024.804167,1.322222,62.166667,33.0,,456.923077,6.5,3.0,,528.932432
4,2.52381,2.52381,2.52381,2.52381,0.365079,0.365079,0.063492,0.126984,0.031746,0.031746,...,1035.525,1.282895,40.291667,24.5,37.0,425.75,4.0,12.75,45.0,408.060197


**Convert the dataframe to numeric**

In [5]:
# Coerce each column to a numeric dtype; entries that cannot be parsed become NaN.
IDS_2 = IDS_2.apply(lambda column: pd.to_numeric(column, errors='coerce'))
IDS_2.head()

Unnamed: 0,beans_kgs_hh_seed,ground_nuts_kgs_hh_seed,maize_kgs_hh_seed,soya_bean_kgs_hh_seed,garlic_kgs_organic_pesticides,ginger_kgs_organic_pesticides,plastic_tanks_120_ltrs_liquid_manure,sacks_liquid_manure,hoes_tools,spades_tools,...,GPS-Altitude,Land_size_agriculture,Time_to_collect_Water_for_Household_use_Minutes,Beans_total_yield,Cassava_total_yield,Maize_total_yield,Sweet_potatoes_total_yield,Food_banana_total_yield,Coffee_total_yield,Agriculture Value (USD)
0,2.056338,2.056338,0.0,2.056338,0.244131,0.244131,0.037559,0.075117,0.033803,0.022535,...,566.658333,0.545833,67.5,83.111111,,,,82.5,48.0,780.359556
1,3.028037,3.028037,3.028037,0.0,0.35514,0.35514,0.037383,0.037383,0.056075,0.037383,...,1778.975,1.408333,23.333333,75.333333,,,10.777778,96.818182,66.363636,784.39718
2,3.030303,3.030303,3.030303,0.0,0.363636,0.363636,0.040404,0.040404,0.060606,0.040404,...,1710.15,0.922917,40.0,85.7,,,10.0,79.0,70.5,729.391892
3,2.571429,2.571429,2.571429,2.571429,0.363636,0.363636,0.103896,0.207792,0.051948,0.051948,...,1024.804167,1.322222,62.166667,33.0,,456.923077,6.5,3.0,,528.932432
4,2.52381,2.52381,2.52381,2.52381,0.365079,0.365079,0.063492,0.126984,0.031746,0.031746,...,1035.525,1.282895,40.291667,24.5,37.0,425.75,4.0,12.75,45.0,408.060197


**ROWS WITH MISSING DATA**

*Drop rows with missing data*

In [6]:
# Snapshot the numeric frame before filtering rows.
IDS_3 = IDS_2.copy(deep=True)

def drop_rows_with_missing_data(df, missing_threshold):
    """
    Drops rows with more than the specified fraction of missing data.

    A row is kept when (missing cells in the row / number of columns) is less
    than or equal to ``missing_threshold`` and dropped otherwise.

    Parameters:
    df (pd.DataFrame): The input DataFrame; it is not modified.
    missing_threshold (float): Largest tolerated fraction (0-1) of missing
        values per row. 0 keeps only complete rows, 1 keeps every row.

    Returns:
    pd.DataFrame: New DataFrame containing only the rows that were kept.

    Raises:
    ValueError: If ``missing_threshold`` is not within [0, 1].
    """
    if not 0 <= missing_threshold <= 1:
        raise ValueError(
            f"missing_threshold must be between 0 and 1, got {missing_threshold!r}"
        )

    n_cols = df.shape[1]
    if n_cols == 0:
        # No columns means no cells can be missing; nothing to filter on.
        return df.copy()

    # The earlier version passed ncols * missing_threshold to dropna(thresh=...).
    # `thresh` is the minimum count of NON-missing values, so a threshold of 0.2
    # kept rows that were up to 80% empty. Measure the missing share directly.
    missing_fraction = df.isna().sum(axis=1) / n_cols
    return df.loc[missing_fraction <= missing_threshold]

# Filter rows using a 0.2 missing-data threshold, then preview the result.
IDS_4 = drop_rows_with_missing_data(IDS_3, missing_threshold=0.2)
IDS_4.head(n=5)

Unnamed: 0,beans_kgs_hh_seed,ground_nuts_kgs_hh_seed,maize_kgs_hh_seed,soya_bean_kgs_hh_seed,garlic_kgs_organic_pesticides,ginger_kgs_organic_pesticides,plastic_tanks_120_ltrs_liquid_manure,sacks_liquid_manure,hoes_tools,spades_tools,...,GPS-Altitude,Land_size_agriculture,Time_to_collect_Water_for_Household_use_Minutes,Beans_total_yield,Cassava_total_yield,Maize_total_yield,Sweet_potatoes_total_yield,Food_banana_total_yield,Coffee_total_yield,Agriculture Value (USD)
0,2.056338,2.056338,0.0,2.056338,0.244131,0.244131,0.037559,0.075117,0.033803,0.022535,...,566.658333,0.545833,67.5,83.111111,,,,82.5,48.0,780.359556
1,3.028037,3.028037,3.028037,0.0,0.35514,0.35514,0.037383,0.037383,0.056075,0.037383,...,1778.975,1.408333,23.333333,75.333333,,,10.777778,96.818182,66.363636,784.39718
2,3.030303,3.030303,3.030303,0.0,0.363636,0.363636,0.040404,0.040404,0.060606,0.040404,...,1710.15,0.922917,40.0,85.7,,,10.0,79.0,70.5,729.391892
3,2.571429,2.571429,2.571429,2.571429,0.363636,0.363636,0.103896,0.207792,0.051948,0.051948,...,1024.804167,1.322222,62.166667,33.0,,456.923077,6.5,3.0,,528.932432
4,2.52381,2.52381,2.52381,2.52381,0.365079,0.365079,0.063492,0.126984,0.031746,0.031746,...,1035.525,1.282895,40.291667,24.5,37.0,425.75,4.0,12.75,45.0,408.060197


*Impute missing values in the target variable*

In [7]:
from sklearn.impute import KNNImputer

# Impute on a copy so IDS_4 keeps its original gaps.
IDS_5 = IDS_4.copy(deep=True)

# Single-column frame holding only the target
agriculture_value_column = IDS_5.loc[:, ['Agriculture Value (USD)']]

# NOTE(review): the imputer is given the target column alone, so there are no
# other features to measure neighbour distance on. This presumably falls back
# to the column mean rather than a true 4-nearest-neighbour estimate -- confirm
# against the KNNImputer documentation. The fill also happens before the
# train/test split.
knn_imputer = KNNImputer(n_neighbors=4)

# Learn from the column and fill its gaps
agriculture_value_imputed = knn_imputer.fit(agriculture_value_column).transform(agriculture_value_column)

# Write the filled values back over the target column
IDS_5['Agriculture Value (USD)'] = agriculture_value_imputed

# Preview to verify the change
IDS_5.head()

Unnamed: 0,beans_kgs_hh_seed,ground_nuts_kgs_hh_seed,maize_kgs_hh_seed,soya_bean_kgs_hh_seed,garlic_kgs_organic_pesticides,ginger_kgs_organic_pesticides,plastic_tanks_120_ltrs_liquid_manure,sacks_liquid_manure,hoes_tools,spades_tools,...,GPS-Altitude,Land_size_agriculture,Time_to_collect_Water_for_Household_use_Minutes,Beans_total_yield,Cassava_total_yield,Maize_total_yield,Sweet_potatoes_total_yield,Food_banana_total_yield,Coffee_total_yield,Agriculture Value (USD)
0,2.056338,2.056338,0.0,2.056338,0.244131,0.244131,0.037559,0.075117,0.033803,0.022535,...,566.658333,0.545833,67.5,83.111111,,,,82.5,48.0,780.359556
1,3.028037,3.028037,3.028037,0.0,0.35514,0.35514,0.037383,0.037383,0.056075,0.037383,...,1778.975,1.408333,23.333333,75.333333,,,10.777778,96.818182,66.363636,784.39718
2,3.030303,3.030303,3.030303,0.0,0.363636,0.363636,0.040404,0.040404,0.060606,0.040404,...,1710.15,0.922917,40.0,85.7,,,10.0,79.0,70.5,729.391892
3,2.571429,2.571429,2.571429,2.571429,0.363636,0.363636,0.103896,0.207792,0.051948,0.051948,...,1024.804167,1.322222,62.166667,33.0,,456.923077,6.5,3.0,,528.932432
4,2.52381,2.52381,2.52381,2.52381,0.365079,0.365079,0.063492,0.126984,0.031746,0.031746,...,1035.525,1.282895,40.291667,24.5,37.0,425.75,4.0,12.75,45.0,408.060197


*Check for null values*

In [8]:
# How many target entries are still null after imputation
print("Null values in 'Agriculture Value (USD)':")
print(IDS_5['Agriculture Value (USD)'].isna().sum())

# How many target entries are exactly zero
print("\nZero values in 'Agriculture Value (USD)':")
print(IDS_5['Agriculture Value (USD)'].eq(0).sum())

Null values in 'Agriculture Value (USD)':
0

Zero values in 'Agriculture Value (USD)':
0


In [9]:
# Total missing cells across every column of IDS_5
missing_values_count = IDS_5.isna().sum(axis=0).sum()

# Report the total
print(f"Total number of missing values in IDS_5: {missing_values_count}")

Total number of missing values in IDS_5: 938


*Replace missing values with zeros*

In [10]:
# Zero-fill every remaining NaN. fillna returns a new frame, so IDS_5 is
# left exactly as it was.
IDS_6 = IDS_5.fillna(value=0)

# Preview to verify the change
IDS_6.head()

Unnamed: 0,beans_kgs_hh_seed,ground_nuts_kgs_hh_seed,maize_kgs_hh_seed,soya_bean_kgs_hh_seed,garlic_kgs_organic_pesticides,ginger_kgs_organic_pesticides,plastic_tanks_120_ltrs_liquid_manure,sacks_liquid_manure,hoes_tools,spades_tools,...,GPS-Altitude,Land_size_agriculture,Time_to_collect_Water_for_Household_use_Minutes,Beans_total_yield,Cassava_total_yield,Maize_total_yield,Sweet_potatoes_total_yield,Food_banana_total_yield,Coffee_total_yield,Agriculture Value (USD)
0,2.056338,2.056338,0.0,2.056338,0.244131,0.244131,0.037559,0.075117,0.033803,0.022535,...,566.658333,0.545833,67.5,83.111111,0.0,0.0,0.0,82.5,48.0,780.359556
1,3.028037,3.028037,3.028037,0.0,0.35514,0.35514,0.037383,0.037383,0.056075,0.037383,...,1778.975,1.408333,23.333333,75.333333,0.0,0.0,10.777778,96.818182,66.363636,784.39718
2,3.030303,3.030303,3.030303,0.0,0.363636,0.363636,0.040404,0.040404,0.060606,0.040404,...,1710.15,0.922917,40.0,85.7,0.0,0.0,10.0,79.0,70.5,729.391892
3,2.571429,2.571429,2.571429,2.571429,0.363636,0.363636,0.103896,0.207792,0.051948,0.051948,...,1024.804167,1.322222,62.166667,33.0,0.0,456.923077,6.5,3.0,0.0,528.932432
4,2.52381,2.52381,2.52381,2.52381,0.365079,0.365079,0.063492,0.126984,0.031746,0.031746,...,1035.525,1.282895,40.291667,24.5,37.0,425.75,4.0,12.75,45.0,408.060197


*Confirm Missingness*

In [11]:
# Total missing cells across every column of IDS_6 (expected to be zero after the fill)
missing_values_count = IDS_6.isna().sum(axis=0).sum()

# Report the total
print(f"Total number of missing values in IDS_6: {missing_values_count}")

Total number of missing values in IDS_6: 0


**Remove Outliers**

In [12]:
import numpy as np
import pandas as pd

# Function to remove outliers for specific numerical columns using percentile-based filtering
def remove_outliers_percentile(dataframe, columns, lower_pct=0.01, upper_pct=0.04):
    """
    Trim the lowest- and highest-ranked rows for each listed numeric column.

    For every column in turn the current rows are sorted ascending by that
    column; the first ``round(n * lower_pct)`` and the last
    ``round(n * upper_pct)`` rows are discarded, where ``n`` is the number of
    rows still present. Trimming is cumulative: each column is processed on
    whatever survived the previous one.

    Parameters:
    dataframe (pd.DataFrame): Input frame; it is not modified.
    columns (list): Column names to trim on. Names that are not numeric
        columns of ``dataframe`` are skipped with a printed warning.
    lower_pct (float): Fraction (0-1) of rows cut from the low end. Default 0.01.
    upper_pct (float): Fraction (0-1) of rows cut from the high end. Default 0.04.

    Returns:
    pd.DataFrame: New frame with the trimmed rows removed. Rows come back
        sorted by the last processed column; index labels are preserved.

    Raises:
    ValueError: If ``lower_pct`` or ``upper_pct`` is outside [0, 1].
    """
    if not (0 <= lower_pct <= 1 and 0 <= upper_pct <= 1):
        raise ValueError("lower_pct and upper_pct must both be between 0 and 1")

    # Dtypes do not change while trimming, so resolve the numeric columns once.
    numeric_columns = dataframe.select_dtypes(include=['number']).columns
    trimmed = dataframe.copy()

    for column in columns:
        if column not in numeric_columns:
            print(f"Warning: Column '{column}' is not numeric and was skipped.")
            continue

        ordered = trimmed.sort_values(by=column)  # ascending rank order
        n_rows = len(ordered)
        low = round(n_rows * lower_pct)
        upp = n_rows - round(n_rows * upper_pct)

        # Positional slicing keeps ranks low+1 .. upp, exactly what the former
        # 'index'/'outl' helper columns selected, without overwriting and then
        # dropping real columns that happen to carry those names.
        trimmed = ordered.iloc[low:upp].copy()

    return trimmed

# Columns whose extreme rows are trimmed
columns_to_clean = ['beans_kgs_hh_seed','Coffee_total_yield']

# Trim the zero-filled frame
IDS_7 = remove_outliers_percentile(dataframe=IDS_6, columns=columns_to_clean)

# Report row/column counts before and after trimming
print(f"Original shape: {IDS_6.shape}")
print(f"Cleaned shape: {IDS_7.shape}")

# Optional: preview or persist the result, e.g.
# IDS_7.head()
# IDS_7.to_csv("cleaned_IDS_4.csv", index=False)



Original shape: (464, 27)
Cleaned shape: (418, 27)


In [None]:
# from sklearn.preprocessing import MinMaxScaler

# # Copy the dataset before scaling
# IDS_8 = IDS_7.copy()

# # Identify numerical columns (features + target)
# IDS_numeric = IDS_7.select_dtypes(include=[np.number]).columns.tolist()

# # Separate the target variable
# target_column = 'Agriculture Value (USD)'

# # # Create scalers
# scaler_features = MinMaxScaler()
# scaler_target = MinMaxScaler()

# # # Scale only feature columns (excluding the target variable)
# feature_columns = [col for col in IDS_numeric if col != target_column]
# IDS_8[feature_columns] = scaler_features.fit_transform(IDS_8[feature_columns])

# # Scale only the target variable
# IDS_8[[target_column]] = scaler_target.fit_transform(IDS_8[[target_column]])

# print("Scaling completed!")


Scaling completed!


**MODEL PIPELINE**

*Import necessary libraries*

In [45]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
import seaborn as sns
import matplotlib.pyplot as plt 
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import TransformedTargetRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

*DATASET CREATION*

In [38]:
# Modelling frame: a copy of the cleaned data

IDS_8 = IDS_7.copy(deep=True)

# Target (Y) is the agriculture value; every other column is a feature (X)
Y = IDS_8['Agriculture Value (USD)']  # Dependent variable
X = IDS_8.drop(labels=['Agriculture Value (USD)'], axis=1)  # Independent variables

# Hold out 20% of the rows for testing; the seed keeps the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [57]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.dummy import DummyRegressor
import matplotlib.pyplot as plt

# Baseline: a regressor that always predicts the mean of the training target.
# Its test-set R² is the floor any real model has to beat.
dummy = DummyRegressor(strategy="mean")
dummy.fit(X=X_train, y=Y_train)
dummy_pred = dummy.predict(X=X_test)
dummy_r2 = r2_score(y_true=Y_test, y_pred=dummy_pred)

print(f"\nBaseline Model R² (Predicting Mean): {dummy_r2}")


Baseline Model R² (Predicting Mean): -0.019191111103443292


In [None]:
# # Train a Random Forest model
# rf_model_2 = RandomForestRegressor(n_estimators=100, random_state=42)
# rf_model_2.fit(X_train, Y_train)

# from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# import numpy as np

# # Predict on the test set
# Y_pred = rf_model_2.predict(X_test)

# # Compute performance metrics
# mae = mean_absolute_error(Y_test, Y_pred)
# mse = mean_squared_error(Y_test, Y_pred)
# rmse = np.sqrt(mse)
# r2 = r2_score(Y_test, Y_pred)

# # Print results
# print(" Model Performance Metrics ")
# print(f" Mean Absolute Error (MAE): {mae:.2f}")
# print(f" Mean Squared Error (MSE): {mse:.2f}")
# print(f" Root Mean Squared Error (RMSE): {rmse:.2f}")
# print(f" R² Score: {r2:.4f}")

**PREPROCESSING PIPELINE**

In [65]:
# Step 1: Custom Transformer for Target Scaling

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MinMaxScaler
import joblib  # For saving/loading the model

class TargetScaler(BaseEstimator, TransformerMixin):
    """
    Min-max scaler for a one-dimensional target, with an inverse mapping.

    Wraps a ``MinMaxScaler`` (stored on ``self.scaler``) and takes care of
    reshaping: inputs are converted to a single-column 2-D array for the
    wrapped scaler, and results are flattened back to 1-D.

    Inputs may be NumPy arrays, pandas Series or plain sequences; they are
    coerced with ``np.asarray`` first, because objects such as a pandas
    Series do not provide ``.reshape``.
    """

    def __init__(self):
        self.scaler = MinMaxScaler()

    @staticmethod
    def _as_column(y):
        """Return ``y`` as an (n, 1) array suitable for the wrapped scaler."""
        return np.asarray(y).reshape(-1, 1)

    def fit(self, y, *_):
        """Fit the wrapped scaler on ``y``; extra positional arguments are ignored."""
        self.scaler.fit(self._as_column(y))
        return self

    def transform(self, y):
        """Scale ``y`` with the fitted scaler and return a flat 1-D array."""
        return self.scaler.transform(self._as_column(y)).ravel()

    def inverse_transform(self, y):
        """Map scaled values back to the original units as a flat 1-D array."""
        return self.scaler.inverse_transform(self._as_column(y)).ravel()

In [None]:
# # Define the preprocessor pipeline
# def build_preprocessor():
#     # Automatically detect all numeric columns
#     numeric_features = X.select_dtypes(include=['number']).columns.tolist()
#     # Define transformers for each step of preprocessing
#     numeric_transformer = Pipeline(steps=[
#         ('scaler', MinMaxScaler())  # Scaling for X
#     ])
    
#    # Apply transformations to all numeric columns automatically
#     preprocessor = ColumnTransformer(
#         transformers=[
#             ('num', numeric_transformer, numeric_features)  # Apply to all numeric columns
#         ]
#     )
    
#     return preprocessor

In [68]:
def build_preprocessor(X):
    """
    Build an unfitted ColumnTransformer that min-max scales the numeric columns of ``X``.

    Parameters:
    X (pd.DataFrame): Frame used only to discover which columns are numeric;
        nothing is fitted here.

    Returns:
    ColumnTransformer: Transformer with one entry, ``'num'``, applying a
        ``MinMaxScaler`` pipeline to the numeric column names found in ``X``.
    """
    # The column names are fixed at build time from the frame that was passed in.
    numeric_features = X.select_dtypes(include=['number']).columns.tolist()

    # Single-step pipeline so further numeric steps can be added later
    scaling_steps = [('scaler', MinMaxScaler())]
    column_rules = [('num', Pipeline(steps=scaling_steps), numeric_features)]

    return ColumnTransformer(transformers=column_rules)

**SGD REGRESSOR**

In [None]:
# Import necessary libraries
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.compose import TransformedTargetRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

#preprocessor

def build_preprocessor(X):
    """
    Create the feature preprocessor: min-max scaling of every numeric column in ``X``.

    NOTE(review): this repeats the ``build_preprocessor`` defined in an earlier
    cell of this notebook; whichever cell ran last is the one in effect.

    Parameters:
    X (pd.DataFrame): Frame inspected for numeric columns; it is not fitted on.

    Returns:
    ColumnTransformer: Unfitted transformer whose ``'num'`` entry scales the
        numeric columns with ``MinMaxScaler``.
    """
    numeric_frame = X.select_dtypes(include=['number'])
    numeric_features = numeric_frame.columns.tolist()

    # Scaling for X, wrapped in a one-step pipeline
    numeric_transformer = Pipeline(steps=[('scaler', MinMaxScaler())])

    return ColumnTransformer(
        transformers=[('num', numeric_transformer, numeric_features)],
    )


# Define the pipeline with preprocessor and model
def sgd_build_pipeline(X):
    """
    Assemble the unfitted SGD regression model with feature and target scaling.

    Parameters:
    X (pd.DataFrame): Feature frame handed to ``build_preprocessor`` to
        determine which columns get scaled.

    Returns:
    TransformedTargetRegressor: Wraps a two-step pipeline
        (``'preprocessor'`` then ``'regressor'``) and scales the target with
        ``MinMaxScaler``. Inner SGD parameters are therefore addressed as
        ``regressor__regressor__<name>`` in a parameter grid.
    """
    # L2-penalised SGD with a fixed seed
    sgd = SGDRegressor(penalty='l2', max_iter=10000, tol=1e-4, random_state=42)

    # Features are scaled first, then passed to the regressor
    feature_pipeline = Pipeline(steps=[
        ('preprocessor', build_preprocessor(X)),
        ('regressor', sgd),
    ])

    # The target is min-max scaled for fitting and mapped back on prediction
    return TransformedTargetRegressor(
        regressor=feature_pipeline,
        transformer=MinMaxScaler(),
    )

# Unfitted model, with the scaled columns taken from the training features
pipeline = sgd_build_pipeline(X_train)

# Search space for the SGD regressor. The double "regressor__" prefix reaches
# through the target wrapper and then the inner pipeline step.
# NOTE(review): eta0 is presumably unused by the 'optimal' schedule, so those
# grid points would be repeated fits -- confirm against the SGDRegressor docs.
param_grid_sgd = dict(
    regressor__regressor__alpha=[1e-4, 3e-4, 1e-3],  # Regularization strength
    regressor__regressor__eta0=[0.01, 0.03, 0.1],  # Initial learning rate
    regressor__regressor__learning_rate=['constant', 'optimal', 'invscaling'],  # Schedule
)

# 5-fold cross-validated search scored on R², using every available core
grid_search_sgd = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid_sgd,
    cv=5,
    scoring='r2',
    verbose=1,
    n_jobs=-1,
)

# Run the search on the training split only
grid_search_sgd.fit(X_train, Y_train)

# Winning parameter combination
print(f'Best Parameters: {grid_search_sgd.best_params_}')

# Predict the held-out split with the refitted best estimator
best_model = grid_search_sgd.best_estimator_
predictions = best_model.predict(X_test)

# Test-set metrics, in the target's original units
mse = mean_squared_error(y_true=Y_test, y_pred=predictions)
mae = mean_absolute_error(y_true=Y_test, y_pred=predictions)
r2 = r2_score(y_true=Y_test, y_pred=predictions)
rmse = np.sqrt(mse)

# Report the evaluation results
print(f'MSE: {mse:.3f}')
print(f'MAE: {mae:.3f}')
print(f'R^2: {r2:.3f}')
print(f'RMSE: {rmse:.3f}')

Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best Parameters: {'regressor__regressor__alpha': 0.001, 'regressor__regressor__eta0': 0.03, 'regressor__regressor__learning_rate': 'invscaling'}
MSE: 19334.231
MAE: 110.483
R^2: 0.259
RMSE: 139.048


In [None]:
# # Define the pipeline with preprocessor and model
# def sgd_build_pipeline():
#     # Define the model (SGD Regressor in this case)
#     model = SGDRegressor(penalty='l2', max_iter=10000000, random_state=42)
    
#     # Create the full pipeline
#     pipeline = Pipeline(steps=[
#         ('preprocessor', build_preprocessor()),  # Apply preprocessing
#         ('target_scaler', TargetScaler()),  # Scale Y automatically
#         ('regressor', model)  # Train the model
#     ])

#     # Wrap the model pipeline with a Target Transformer
#     transformed_model = TransformedTargetRegressor(regressor=pipeline, transformer=TargetScaler())
    
#     return transformed_model

# # Build the pipeline
# pipeline = sgd_build_pipeline()

# # Set up a parametric grid for the SGD regressor
# param_grid_sgd = {
#     'model__alpha': [1e-4, 3e-4, 1e-3],  # Regularization parameter
#     'model__eta0': [0.01, 0.03, 0.1],   # Learning rate
#     'model__learning_rate': ['constant', 'optimal', 'invscaling'],  # Learning rate schedule
# }

# # param_grid_sgd = {'regressor__max_iter': [100, 1000, 10000], 'regressor__alpha': [0.01, 0.1, 1]}


# # Initialize GridSearchCV with pipeline and parameter grid
# grid_search_sgd = GridSearchCV(pipeline, param_grid_sgd, cv=5, scoring='r2', verbose=1)


# # Fit the grid search
# grid_search_sgd.fit(X_train, Y_train)

# # Print the best parameters from the grid search
# print(f'Best Parameters: {grid_search_sgd.best_params_}')

# # Make predictions with the best model
# best_model = grid_search_sgd.best_estimator_
# predictions = best_model.predict(X_test)

# # Evaluate the model
# mse = mean_squared_error(Y_test, predictions)
# mae = mean_absolute_error(Y_test, predictions)
# r2 = r2_score(Y_test, predictions)
# rmse = np.sqrt(mse)

# # Print evaluation results
# print(f'MSE: {mse:.3f}')
# print(f'MAE: {mae:.3f}')
# print(f'R^2: {r2:.3f}')
# print(f'RMSE: {rmse:.3f}')

Fitting 5 folds for each of 27 candidates, totalling 135 fits


ValueError: Invalid parameter 'model' for estimator TransformedTargetRegressor(regressor=Pipeline(steps=[('preprocessor',
                                                      ColumnTransformer(transformers=[('num',
                                                                                       Pipeline(steps=[('scaler',
                                                                                                        MinMaxScaler())]),
                                                                                       ['beans_kgs_hh_seed',
                                                                                        'ground_nuts_kgs_hh_seed',
                                                                                        'maize_kgs_hh_seed',
                                                                                        'soya_bean_kgs_hh_seed',
                                                                                        'garlic_kgs_organic_pesticides',
                                                                                        'ginger_kgs_organic_pesticides',
                                                                                        'plastic_tanks_120_ltrs_liquid_manure',
                                                                                        'sac...
                                                                                        'GPS-Altitude',
                                                                                        'Land_size_agriculture',
                                                                                        'Time_to_collect_Water_for_Household_use_Minutes',
                                                                                        'Beans_total_yield',
                                                                                        'Cassava_total_yield',
                                                                                        'Maize_total_yield',
                                                                                        'Sweet_potatoes_total_yield',
                                                                                        'Food_banana_total_yield',
                                                                                        'Coffee_total_yield'])])),
                                                     ('target_scaler',
                                                      TargetScaler()),
                                                     ('regressor',
                                                      SGDRegressor(max_iter=10000000,
                                                                   random_state=42))]),
                           transformer=TargetScaler()). Valid parameters are: ['check_inverse', 'func', 'inverse_func', 'regressor', 'transformer'].

In [None]:
# from sklearn.neighbors import KNeighborsRegressor
# model_ = KNeighborsRegressor(n_neighbors=3)
# model_.fit(X=X_train, y=Y_train)
# preds_ = model_.predict(X=X_test)
# mse_ = mean_squared_error(y_true=Y_test, y_pred=preds_)
# print(f'{mse_:.3f}')

25242.587


In [None]:
# # Fit the grid search
# grid_search_sgd.fit(X_train, Y_train)

# # Print the best parameters from the grid search
# print(f'Best Parameters: {grid_search_sgd.best_params_}')

# # Make predictions with the best model
# best_model = grid_search_sgd.best_estimator_
# predictions = best_model.predict(X_test)

# # Evaluate the model
# mse = mean_squared_error(Y_test, predictions)
# mae = mean_absolute_error(Y_test, predictions)
# r2 = r2_score(Y_test, predictions)
# rmse = np.sqrt(mse)

# # Print evaluation results
# print(f'MSE: {mse:.3f}')
# print(f'MAE: {mae:.3f}')
# print(f'R^2: {r2:.3f}')
# print(f'RMSE: {rmse:.3f}')

Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best Parameters: {'model__alpha': 0.001, 'model__eta0': 0.03, 'model__learning_rate': 'invscaling'}
MSE: 0.021
MAE: 0.116
R^2: 0.232
RMSE: 0.145


In [108]:
# class OurModel:
#     def __init__(self, estimators, processes:bool, train:bool):
#         self.estimators = estimators
#         self.processes = processes
#         self.train = train

#     def preprocessor(self, X):
#         if self.train:
#             print("imputing missing values")
#             print("removing outliers")
#         return X
    
#     def train(self, X, y):
#         processed_data = self.preprocessor(X)

#         pipeline = Pipeline(steps=[
#             ('scaler', StandardScaler()),
#             ('model', self.estimator)
#         ])

#         # setup grid search
#         param_grid = {
#             'model__knn_alpha': [1e-4, 3e-4, 1e-3],
#             'model__knn_eta0': [0.01, 0.03, 0.1],
#             'model__learning_rate': ['constant', 'optimal', 'invscaling'],
#             'model_estimator': [knn]
#         }

#         grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='r2', verbose=1)

#         grid_search.fit(X, y)

#         print("training done")
#         return grid_search

#     def predict(self, X):
#         print("making predictions") # return estimator.predict(X)

# training phase
# model = OurModel(estimator=model, processes=True, train=True)
# model.preprocessor(X_train)
# model.train(X_train, Y_train)
# model.predict(X_test)

# # save model
# joblib.dump(model, 'model.pkl')

# # infrence phase
# model.predict(X_test)