In [61]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVR

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [62]:
# Load the crop-yield dataset from the CSV stored next to the notebook.
crop_df = pd.read_csv(filepath_or_buffer='./csv/crop_yield2.csv')

In [63]:
# Preview the first five rows to sanity-check the parsed columns.
crop_df.head(n=5)

Unnamed: 0,Region,Soil_Type,Crop,Rainfall_mm,Temperature_Celsius,Fertilizer_Used,Irrigation_Used,Weather_Condition,Days_to_Harvest,Yield_tons_per_hectare
0,West,Sandy,Cotton,897.077239,27.676966,False,True,Cloudy,122,6.555816
1,South,Clay,Rice,992.673282,18.026142,True,True,Rainy,140,8.527341
2,North,Loam,Barley,147.998025,29.794042,False,False,Sunny,106,1.127443
3,North,Sandy,Soybean,986.866331,16.64419,False,True,Rainy,146,6.517573
4,South,Silt,Wheat,730.379174,31.620687,True,True,Cloudy,110,7.248251


In [64]:
# Print column names, non-null counts and dtypes to check for missing values
# and to see which columns are categorical (object) versus numeric/bool.
crop_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3563 entries, 0 to 3562
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Region                  3563 non-null   object 
 1   Soil_Type               3563 non-null   object 
 2   Crop                    3563 non-null   object 
 3   Rainfall_mm             3563 non-null   float64
 4   Temperature_Celsius     3563 non-null   float64
 5   Fertilizer_Used         3563 non-null   bool   
 6   Irrigation_Used         3563 non-null   bool   
 7   Weather_Condition       3563 non-null   object 
 8   Days_to_Harvest         3563 non-null   int64  
 9   Yield_tons_per_hectare  3563 non-null   float64
dtypes: bool(2), float64(3), int64(1), object(4)
memory usage: 229.8+ KB


In [65]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Predictors are every column except the yield; the yield itself is the target.
X = crop_df.drop(columns=['Yield_tons_per_hectare'])
y = crop_df['Yield_tons_per_hectare']

# Hold out 20% of the rows for testing *before* any preprocessing is fitted,
# so nothing learned from the test rows leaks into the transformers.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Text-valued columns that must be encoded before a model can use them.
categorical_columns = ['Region', 'Soil_Type', 'Crop', 'Weather_Condition']

# One-hot encode the categorical columns and forward everything else unchanged.
# - handle_unknown='ignore': a category that was not seen during fit is encoded
#   as all zeros instead of raising, which is useful for live data where
#   unexpected categories can appear.
# - remainder='passthrough': the remaining (numeric / boolean) columns are
#   passed through as they are.
# - force_int_remainder_cols=False: the remainder entry of the fitted
#   `transformers_` attribute keeps the column names instead of being
#   converted to integer positions (it does not affect column dtypes).
preprocessor = ColumnTransformer(
    [('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns)],
    remainder='passthrough',
    force_int_remainder_cols=False,
)


In [66]:
from sklearn.base import clone

# Pipeline: one-hot preprocessing followed by a linear support vector regressor
# trained with the squared epsilon-insensitive loss.
#
# clone(preprocessor) gives this pipeline its own unfitted copy of the
# transformer. Pipeline.fit fits its steps in place, so handing the shared
# `preprocessor` object to several pipelines would make all of them read and
# overwrite the same fitted state.
#
# random_state matches the seed used for the train/test split so reruns are
# reproducible; the seed only matters when liblinear's randomized dual solver
# is the one selected.
squared = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('svm', LinearSVR(
        loss='squared_epsilon_insensitive',
        C=1.0,
        max_iter=1000,
        random_state=42,
    )),
])

# Fit preprocessing + model on the training split only.
squared.fit(X_train, y_train)

# Predict the yield for the held-out rows.
y_pred = squared.predict(X_test)

# Score the predictions with the usual regression metrics.
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)
print("R2 Score:", r2)

Mean Squared Error: 0.245245499581833
Mean Absolute Error: 0.39677930053382926
R2 Score: 0.9088057614668873


In [None]:
from sklearn.base import clone
from sklearn.linear_model import LinearRegression

# Pipeline: one-hot preprocessing followed by ordinary least-squares regression.
# clone(preprocessor) gives this pipeline its own unfitted transformer instead
# of sharing (and refitting in place) the object used by the other pipelines.
linear_pipeline = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('regressor', LinearRegression())
])

# Fit preprocessing + model on the training split only.
linear_pipeline.fit(X_train, y_train)

# Predict the yield for the held-out rows.
y_pred_linear = linear_pipeline.predict(X_test)

# Score the predictions with the usual regression metrics.
mse_linear = mean_squared_error(y_test, y_pred_linear)
mae_linear = mean_absolute_error(y_test, y_pred_linear)
r2_linear = r2_score(y_test, y_pred_linear)

# Header line so the printed metrics are attributable to this model; the
# cell's saved output already starts with this line.
print("Linear Regression Model")
print("Mean Squared Error:", mse_linear)
print("Mean Absolute Error:", mae_linear)
print("R2 Score:", r2_linear)

Linear Regression Model
Mean Squared Error: 0.2454774305714271
Mean Absolute Error: 0.39501386757189016
R2 Score: 0.9087195182125796


In [68]:
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor

# Pipeline: one-hot preprocessing followed by a 100-tree random forest.
# clone(preprocessor) gives this pipeline its own unfitted transformer instead
# of sharing (and refitting in place) the object used by the other pipelines.
# random_state fixes the forest's randomness so the metrics are reproducible.
random_forest_pipeline = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

# Fit preprocessing + model on the training split only.
random_forest_pipeline.fit(X_train, y_train)

# Predict the yield for the held-out rows.
y_pred_rf = random_forest_pipeline.predict(X_test)

# Score the predictions with the usual regression metrics.
mse_rf = mean_squared_error(y_test, y_pred_rf)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print("Mean Squared Error:", mse_rf)
print("Mean Absolute Error:", mae_rf)
print("R2 Score:", r2_rf)

Mean Squared Error: 0.2792742158742787
Mean Absolute Error: 0.42372861191412103
R2 Score: 0.896152225007135


In [69]:
from sklearn.ensemble import VotingRegressor

# Base learners for the ensemble: the three pipelines built in earlier cells,
# each paired with a short identifying name.
estimators = [
    ('linear', linear_pipeline),
    ('svm', squared),
    ('rf', random_forest_pipeline),
]

# Build the voting ensemble over the base learners and fit it on the training
# split (fit returns the estimator itself, so the calls can be chained).
voting_regressor = VotingRegressor(estimators=estimators).fit(X_train, y_train)

# Ensemble predictions for the held-out rows.
y_pred_voting = voting_regressor.predict(X_test)

# Compute MSE, MAE and R2 in one pass over the metric functions.
mse_voting, mae_voting, r2_voting = (
    metric(y_test, y_pred_voting)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print("Mean Squared Error (Voting):", mse_voting)
print("Mean Absolute Error (Voting):", mae_voting)
print("R2 Score (Voting):", r2_voting)

Mean Squared Error (Voting): 0.24755571668463927
Mean Absolute Error (Voting): 0.39730813267326504
R2 Score (Voting): 0.9079467100677961
