# 🥛 Milk Quality Prediction - Jupyter Notebook
This notebook demonstrates milk quality prediction using **Logistic Regression** and **Random Forest** models.
It includes data preprocessing, visualization, and model evaluation.

In [None]:

import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Display plots inline
%matplotlib inline


## 📥 Load Dataset

In [None]:

# Location of the raw CSV, relative to the notebook's working directory.
DATA_PATH = os.path.join("data", "milknew.csv")

# Read the whole file into a DataFrame and preview the first five rows.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)
df.head(5)


## 📊 Dataset Info

In [None]:

# `df.info()` prints its report and returns None, so packing it into a tuple
# displayed a stray `None` and reduced the `describe()` table to plain text.
# Emit each summary separately; the trailing expression gets the rich display.
print(f"Shape (rows, columns): {df.shape}")
df.info()
df.describe()


## 🧹 Data Cleaning & Preprocessing

In [None]:

# Fix the column header that carries a trailing space. `rename` ignores keys
# that are not present, so no membership check is needed, and building a new
# frame avoids `inplace=True`.
df = df.rename(columns={'Fat ': 'Fat'})

# Encode the target as ordinal integers. The dtype guard keeps this cell
# idempotent: re-running it on an already-encoded column would otherwise map
# every integer to NaN, because the integers are not keys of the mapping.
GRADE_CODES = {'low': 0, 'medium': 1, 'high': 2}
if not pd.api.types.is_numeric_dtype(df['Grade']):
    df['Grade'] = df['Grade'].map(GRADE_CODES)

# Any label missing from GRADE_CODES becomes NaN; fail here with a clear
# message instead of deep inside model fitting.
assert df['Grade'].notna().all(), "Unmapped or missing values found in 'Grade'"

df['Grade'].unique(), df.isnull().sum()


## 📈 Data Visualization

In [None]:

# Pairwise correlations between all columns, drawn on an explicit Axes.
corr_matrix = df.corr()

fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(corr_matrix, cmap='RdYlGn', annot=True, ax=ax)
ax.set_title("Feature Correlation Heatmap")
plt.show()


In [None]:

# One 10-bin histogram per column; pandas creates the figure itself, so the
# overall title is attached to whichever figure is current afterwards.
df.hist(color='blue', figsize=(20, 20), bins=10)
plt.gcf().suptitle("Feature Distributions")
plt.show()


In [None]:

# Bar chart of how many rows fall into each value of the 'Grade' column.
fig, ax = plt.subplots()
sns.countplot(x='Grade', data=df, ax=ax)
ax.set_title("Distribution of Milk Quality Grades")
plt.show()


In [None]:

# Scatter of pH against temperature with a fitted regression line.
# "Temprature" is spelled the way the column appears in the data.
fig, ax = plt.subplots()
sns.regplot(x="Temprature", y="pH", data=df, ax=ax)
ax.set_title("Temperature vs pH")
plt.show()


In [None]:

# Plain scatter of the 'Colour' column against the 'Temprature' column.
fig, ax = plt.subplots()
sns.scatterplot(data=df, x="Temprature", y="Colour", ax=ax)
ax.set_title("Temperature vs Colour")
plt.show()


## 🛠️ Dataset Preparation

In [None]:

# Target is the 'Grade' column; features are every other column.
y = df['Grade']
X = df.drop('Grade', axis=1)

# Hold out 20% of the rows for testing, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Learn the min/max of each feature from the training rows only, then apply
# that same scaling to both splits.
scaler = MinMaxScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)


## 🤖 Logistic Regression

In [None]:

# Candidate values for C, evenly spaced from 0.1 in steps of 0.5 (below 10).
c_vec = np.arange(0.1, 10, 0.5)

# Select C with 5-fold cross-validation on the training split only. Choosing
# the value that maximises accuracy on the test split, and then reporting that
# maximum, would give an optimistically biased score because the test data
# would have taken part in model selection.
# NOTE: X_train is already min-max scaled, so the scaler is not re-fitted
# inside each fold.
log_search = GridSearchCV(
    LogisticRegression(max_iter=1000),
    param_grid={"C": c_vec},
    scoring="accuracy",
    cv=5,
)
log_search.fit(X_train, y_train)

# Mean validation accuracy per candidate, in the same order as c_vec.
acc_vec = log_search.cv_results_["mean_test_score"]
best_c = log_search.best_params_["C"]

# With the default refit=True, best_estimator_ has been retrained on the whole
# training split using the selected C. The test split is scored exactly once.
model = log_search.best_estimator_
y_pred = model.predict(X_test)
best_log_acc = accuracy_score(y_test, y_pred)

print(f"Logistic Regression selected C (5-fold CV): {best_c:.1f}")
print(f"Logistic Regression Best Accuracy: {best_log_acc:.2f}")

plt.plot(c_vec, acc_vec)
plt.xlabel("C (Regularization parameter)")
plt.ylabel("Mean cross-validated accuracy")
plt.title("Logistic Regression Accuracy vs C")
plt.show()


## 🌲 Random Forest

In [None]:

# Candidate tree depths: 1 through 19.
depth_vec = np.arange(1, 20, 1)

# Select max_depth with 5-fold cross-validation on the training split only.
# Picking the depth that scores best on the test split and reporting that
# score would overstate performance, since the test data would have been used
# to choose the model.
rf_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid={"max_depth": depth_vec},
    scoring="accuracy",
    cv=5,
)
rf_search.fit(X_train, y_train)

# Mean validation accuracy per candidate, in the same order as depth_vec.
acc_vec_RF = rf_search.cv_results_["mean_test_score"]
best_depth = int(rf_search.best_params_["max_depth"])

# With the default refit=True, best_estimator_ has been retrained on the whole
# training split using the selected depth. The test split is scored once.
clf = rf_search.best_estimator_
y_pred_RF = clf.predict(X_test)
best_rf_acc = accuracy_score(y_test, y_pred_RF)

print(f"Random Forest selected max_depth (5-fold CV): {best_depth}")
print(f"Random Forest Best Accuracy: {best_rf_acc:.2f}")

plt.plot(depth_vec, acc_vec_RF)
plt.xlabel("Max Depth")
plt.ylabel("Mean cross-validated accuracy")
plt.title("Random Forest Accuracy vs Tree Depth")
plt.show()


## ✅ Final Results

In [None]:

# Print both headline accuracies, each rounded to two decimal places.
summary_rows = (
    ("Best Logistic Regression Accuracy:", best_log_acc),
    ("Best Random Forest Accuracy:", best_rf_acc),
)
for label, score in summary_rows:
    print(label, round(score, 2))
