In [4]:


import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Import category encoders (choose appropriate encoder based on your needs)
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

data_path="Medicine.csv"

# Load and preprocess data
def load_and_preprocess_data(data_path):
    train = pd.read_csv(data_path)
    categorical_cols = ["Disease", "Season",""]  # Adjust these based on your data

   

 # Split features and target
    X = train[categorical_cols]
    y = train["Quantity"]

    # Separate train/test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Choose and apply encoding (consider data sparsity and interpretability)
    if len(set(train[categorical_cols[0]])) > 10:  # Use OneHotEncoder if category count > 10
        encoder = OneHotEncoder(sparse=False, handle_unknown="ignore")
        X_train_encoded = encoder.fit_transform(X_train)
        X_test_encoded = encoder.transform(X_test)
    else:
        encoder = LabelEncoder()
        X_train_encoded = encoder.fit_transform(X_train)
        X_test_encoded = encoder.transform(X_test)

    return X_train_encoded, X_test_encoded, y_train, y_test, encoder

# Choose and train a model
def train_model(X_train, y_train):
    models = {
        "RandomForestRegressor": RandomForestRegressor(n_estimators=100, random_state=42),
        "LinearRegression": LinearRegression(),
        "SVM Regression": svm.SVR(kernel="linear")  # Add more models as needed
    }

    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_train)
        mse = mean_squared_error(y_train, y_pred)
        r2 = r2_score(y_train, y_pred)
        print(f"{name}: MSE={mse:.3f}, R-squared={r2:.3f}")

    # Select the best model based on your criteria (e.g., MSE, R-squared)
    best_model = models["RandomForestRegressor"]  # Replace with the chosen model

    return best_model, encoder  # Return both the model and encoder for prediction

# Make predictions on new data
def make_predictions(new_data, encoder, best_model):
    # Ensure compatibility with the model's expected input format
    if isinstance(encoder, OneHotEncoder):
        new_data_encoded = encoder.transform([new_data])
    else:
        new_data_encoded = encoder.transform([new_data])

    prediction = best_model.predict(new_data_encoded)[0]

    # Optionally convert prediction back to category name for interpretation
    if isinstance(encoder, LabelEncoder):
        predicted_category = encoder.inverse_transform([prediction])[0]
        print(f"Predicted Quantity: {prediction} ({predicted_category})")
    else:
        print(f"Predicted Quantity: {prediction}")

# Example usage
data_path = "medicine.csv"  # Replace with your data path
X_train_encoded, X_test_encoded, y_train, y_test, encoder = load_and_preprocess_data(data_path)
best_model, encoder = train_model(X_train_encoded, y_train)

# New data example (adjust categories/values as needed)
new_data = {"Disease": "Malaria", "Season": "Wet Season"}  # Use categorical names
make_predictions(new_data, encoder, best_model)

ValueError: y should be a 1d array, got an array of shape (29, 2) instead.