In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
import pickle

# Column roles. The feature frame below is assembled from these two lists, so
# the columns handed to the model and the columns each transformer targets
# are always the same set.
categorical_features = ['Animal Type', 'Steroid Class', 'Steroid Name']
numerical_features = ['Age', 'Weight']

# Read the raw table and separate the predictors from the regression target.
df = pd.read_csv('DATA_MILK.csv')
X = df[categorical_features + numerical_features]
y = df['Legal Limit']

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Per-column preprocessing: standardize the numeric columns and one-hot
# encode the categorical ones. handle_unknown='ignore' lets the encoder accept
# category values at predict time that were absent from the training split
# instead of raising.
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numerical_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
])

# Preprocessing and the linear model live in one estimator, so the fitted
# transformers are stored and reapplied together with the regressor.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])

# Fit on the training split only.
pipeline.fit(X_train, y_train)

# Report goodness of fit on the training split (R2) ...
train_score = pipeline.score(X_train, y_train)
print(f'R2 Score (Train): {train_score}')

# ... and the absolute error on the held-out split.
y_pred = pipeline.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error: {mae}')

# Persist the whole fitted pipeline (preprocessing + model) as one artifact.
with open('milk.pkl', 'wb') as file:
    pickle.dump(pipeline, file)

print("Pipeline saved as 'milk.pkl'.")

# Read the artifact back from disk so the prediction below exercises the
# saved copy rather than the in-memory object, confirming the round trip.
with open('milk.pkl', 'rb') as file:
    loaded_pipeline = pickle.load(file)

# A single hand-written record using the same column names the pipeline was
# trained on.
sample_input = pd.DataFrame({
    'Animal Type': ['Cow'],
    'Steroid Class': ['Estrogens'],
    'Steroid Name': ['17β-Estradiol'],
    'Age': [4],
    'Weight': [450],
})

# predict() returns one value per input row; there is exactly one row here.
predicted_legal_limit = loaded_pipeline.predict(sample_input)
print(f'Predicted Legal Limit: {predicted_legal_limit[0]}')
