In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (accuracy_score, classification_report, 
                            confusion_matrix, mean_squared_error, r2_score)
from sklearn.ensemble import (RandomForestClassifier, GradientBoostingClassifier, 
                             RandomForestRegressor, GradientBoostingRegressor)
from sklearn.svm import SVC, SVR
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_regression
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import joblib  # For saving models
import pickle  # Alternative for saving models
warnings.filterwarnings('ignore')

In [2]:
# Load the dataset
df = pd.read_csv('Dataset_for_Models.csv')

# Initial inspection
print(df.head())
print("\nColumns:", df.columns)
print("\nMissing values:\n", df.isnull().sum())

                  created_at  entry_id  Moisture  Temperature   EC   pH   N  \
0  2025-03-25T13:29:59+00:00       462      31.1         28.3  320  6.2  28   
1  2025-03-25T13:30:16+00:00       463      31.1         28.3  319  6.2  28   
2  2025-03-25T13:30:39+00:00       464      31.1         28.3  319  6.2  28   
3  2025-03-25T13:31:02+00:00       465      31.1         28.3  319  6.1  27   
4  2025-03-25T13:33:59+00:00       466      30.7         28.2  317  6.2  27   

     P    K Fertilizer  kg/ha  
0  111  103       Urea  126.0  
1  111  103       Urea  126.0  
2  111  103       Urea  126.0  
3  110  103       Urea  121.5  
4  110  102       Urea  121.5  

Columns: Index(['created_at', 'entry_id', 'Moisture', 'Temperature', 'EC', 'pH', 'N',
       'P', 'K', 'Fertilizer', 'kg/ha'],
      dtype='object')

Missing values:
 created_at     0
entry_id       0
Moisture       0
Temperature    0
EC             0
pH             0
N              0
P              0
K              0
Fertilizer  

In [3]:
# Drop unnecessary columns
df = df.drop(['created_at', 'entry_id'], axis=1)

# Define features and transformers
numeric_features = ['Moisture', 'Temperature', 'EC', 'pH', 'N', 'P', 'K']
categorical_features = ['Fertilizer']

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Encode target variable (Fertilizer)
le = LabelEncoder()
df['Fertilizer'] = le.fit_transform(df['Fertilizer'])

# Split data into features and targets
X = df.drop(['Fertilizer', 'kg/ha'], axis=1)
y_class = df['Fertilizer']
y_reg = df['kg/ha']

In [4]:
X_train, X_test, y_class_train, y_class_test, y_reg_train, y_reg_test = train_test_split(
    X, y_class, y_reg, test_size=0.2, random_state=42)

In [5]:
# For classification
selector_class = SelectKBest(score_func=f_classif, k='all')
X_train_class = selector_class.fit_transform(X_train, y_class_train)
X_test_class = selector_class.transform(X_test)

# For regression
selector_reg = SelectKBest(score_func=mutual_info_regression, k='all')
X_train_reg = selector_reg.fit_transform(X_train, y_reg_train)
X_test_reg = selector_reg.transform(X_test)

In [6]:
class_models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'K-Nearest Neighbors': KNeighborsClassifier()
}

# Hyperparameter tuning for best classifier
param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5]
}

grid_search_class = GridSearchCV(RandomForestClassifier(random_state=42),
                               param_grid_rf, cv=5, scoring='accuracy')
grid_search_class.fit(X_train_class, y_class_train)
best_class_model = grid_search_class.best_estimator_

# Save the best classification model
joblib.dump(best_class_model, 'best_classification_model.pkl')
# Alternative: pickle.dump(best_class_model, open('best_class_model.pkl', 'wb'))

# Also save the label encoder
joblib.dump(le, 'label_encoder.pkl')

# Evaluate all classification models
for name, model in class_models.items():
    model.fit(X_train_class, y_class_train)
    y_pred = model.predict(X_test_class)
    # Evaluation code remains the same...

In [7]:
reg_models = {
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'Support Vector': SVR()
}

# Hyperparameter tuning for best regressor
param_grid_rf_reg = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5]
}

grid_search_reg = GridSearchCV(RandomForestRegressor(random_state=42),
                             param_grid_rf_reg, cv=5, scoring='r2')
grid_search_reg.fit(X_train_reg, y_reg_train)
best_reg_model = grid_search_reg.best_estimator_

# Save the best regression model
joblib.dump(best_reg_model, 'best_regression_model.pkl')

# Evaluate all regression models
for name, model in reg_models.items():
    model.fit(X_train_reg, y_reg_train)
    y_pred = model.predict(X_test_reg)
    # Evaluation code remains the same...

In [8]:
def check_overfitting(model, X_train, y_train, X_test, y_test, task='classification'):
    if task == 'classification':
        train_score = model.score(X_train, y_train)
        test_score = model.score(X_test, y_test)
    else:
        train_score = model.score(X_train, y_train)
        test_score = model.score(X_test, y_test)
    
    print(f"\nOverfitting Check for {type(model).__name__}:")
    print(f"Train Score: {train_score:.4f}")
    print(f"Test Score: {test_score:.4f}")
    
    if abs(train_score - test_score) > 0.2:
        if train_score > test_score:
            print("Warning: Possible overfitting!")
        else:
            print("Warning: Possible underfitting!")
    else:
        print("No significant overfitting or underfitting detected.")

# Check for classification models
for name, model in class_models.items():
    check_overfitting(model, X_train_class, y_class_train, X_test_class, y_class_test)

# Check for regression models
for name, model in reg_models.items():
    check_overfitting(model, X_train_reg, y_reg_train, X_test_reg, y_reg_test, 'regression')


Overfitting Check for RandomForestClassifier:
Train Score: 1.0000
Test Score: 1.0000
No significant overfitting or underfitting detected.

Overfitting Check for GradientBoostingClassifier:
Train Score: 1.0000
Test Score: 1.0000
No significant overfitting or underfitting detected.

Overfitting Check for KNeighborsClassifier:
Train Score: 1.0000
Test Score: 1.0000
No significant overfitting or underfitting detected.

Overfitting Check for RandomForestRegressor:
Train Score: 1.0000
Test Score: 1.0000
No significant overfitting or underfitting detected.

Overfitting Check for GradientBoostingRegressor:
Train Score: 1.0000
Test Score: 1.0000
No significant overfitting or underfitting detected.

Overfitting Check for SVR:
Train Score: 0.1645
Test Score: 0.1662
No significant overfitting or underfitting detected.


In [9]:
# Create some new samples
new_samples = pd.DataFrame({
    'Moisture': [31.1, 32.1, 29.8],
    'Temperature': [28.3, 27.5, 28.3],
    'EC': [320, 325, 310],
    'pH': [6.2, 6.3, 6.0],
    'N': [28, 29, 26],
    'P': [111, 115, 108],
    'K': [103, 105, 100]
})

# Load saved models for demonstration
loaded_class_model = joblib.load('best_classification_model.pkl')
loaded_reg_model = joblib.load('best_regression_model.pkl')
loaded_le = joblib.load('label_encoder.pkl')

# Preprocess new samples
new_samples_class = selector_class.transform(new_samples)
new_samples_reg = selector_reg.transform(new_samples)

# Make predictions
print("\nPredictions for new samples using saved models:")
for i, sample in new_samples.iterrows():
    print(f"\nSample {i+1}:")
    print(sample)
    
    # Classification prediction
    fert_pred = loaded_class_model.predict(new_samples_class[i:i+1])
    print(f"Predicted Fertilizer: {loaded_le.inverse_transform(fert_pred)[0]}")
    
    # Regression prediction
    kg_pred = loaded_reg_model.predict(new_samples_reg[i:i+1])
    print(f"Predicted kg/ha: {kg_pred[0]:.2f}")


Predictions for new samples using saved models:

Sample 1:
Moisture        31.1
Temperature     28.3
EC             320.0
pH               6.2
N               28.0
P              111.0
K              103.0
Name: 0, dtype: float64
Predicted Fertilizer: Urea
Predicted kg/ha: 126.00

Sample 2:
Moisture        32.1
Temperature     27.5
EC             325.0
pH               6.3
N               29.0
P              115.0
K              105.0
Name: 1, dtype: float64
Predicted Fertilizer: Urea
Predicted kg/ha: 194.09

Sample 3:
Moisture        29.8
Temperature     28.3
EC             310.0
pH               6.0
N               26.0
P              108.0
K              100.0
Name: 2, dtype: float64
Predicted Fertilizer: Urea
Predicted kg/ha: 117.00


In [10]:
# For classification
print("\nFeature Importance for Classification:")
feature_imp_class = pd.Series(best_class_model.feature_importances_,
                             index=X.columns[selector_class.get_support()])
# Plotting code remains the same...

# For regression
print("\nFeature Importance for Regression:")
feature_imp_reg = pd.Series(best_reg_model.feature_importances_,
                           index=X.columns[selector_reg.get_support()])
# Plotting code remains the same...


Feature Importance for Classification:

Feature Importance for Regression:


In [11]:
# Load models
loaded_class_model = joblib.load('best_classification_model.pkl')
loaded_reg_model = joblib.load('best_regression_model.pkl')
loaded_le = joblib.load('label_encoder.pkl')

# Make predictions
fert_pred = loaded_class_model.predict(new_samples)
kg_pred = loaded_reg_model.predict(new_samples)
fert_name = loaded_le.inverse_transform(fert_pred)