# Milk Data Classification

### 1. Loading the Dataset

In [104]:
# Imports
import pandas as pd
import numpy as np
import altair as alt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay


In [76]:
milk_data = pd.read_csv('milknew.csv')
milk_data.rename(columns = {"Temprature": "Temperature"}, inplace = True)
milk_data.shape


(1059, 8)

Based on the shape of the data, there are 1059 obersvations with 8 predictors

### 2. Data Preprocessing

In [77]:
milk_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1059 entries, 0 to 1058
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   pH           1059 non-null   float64
 1   Temperature  1059 non-null   int64  
 2   Taste        1059 non-null   int64  
 3   Odor         1059 non-null   int64  
 4   Fat          1059 non-null   int64  
 5   Turbidity    1059 non-null   int64  
 6   Colour       1059 non-null   int64  
 7   Grade        1059 non-null   object 
dtypes: float64(1), int64(6), object(1)
memory usage: 66.3+ KB


We can see all the 8 perdictors and their data types. The dataset contains:
- 1 float column (pH)
- 6 int columns (Temperature, Taste, Odor, Fat, Turbidity, Colour, Grade)
- 1 object column (Grade)

Since 'Grade' contains string data, we should convert that to an integer type

In [78]:
milk_data['Grade'] = milk_data['Grade'].map({'low':0,'medium':1,'high':2})
milk_data.head()

Unnamed: 0,pH,Temperature,Taste,Odor,Fat,Turbidity,Colour,Grade
0,6.6,35,1,0,1,0,254,2
1,6.6,36,0,1,0,1,253,2
2,8.5,70,1,1,1,1,246,0
3,9.5,34,1,1,0,1,255,0
4,6.6,37,0,0,0,0,255,1


Now we have mapped the grade as : 
- 'Low' = 0
- 'Medium' = 1
- 'High' = 2 

In [79]:
milk_data.isna().sum()

pH             0
Temperature    0
Taste          0
Odor           0
Fat            0
Turbidity      0
Colour         0
Grade          0
dtype: int64

Our data does not contain any NA values. So we do not need to handle that.

In [80]:
# Correlation
milk_data.drop('Grade', axis=1).corr()

Unnamed: 0,pH,Temperature,Taste,Odor,Fat,Turbidity,Colour
pH,1.0,0.244684,-0.064053,-0.081331,-0.093429,0.048384,-0.164565
Temperature,0.244684,1.0,-0.109792,-0.04887,0.024073,0.185106,-0.008511
Taste,-0.064053,-0.109792,1.0,0.017582,0.324149,0.055755,-0.082654
Odor,-0.081331,-0.04887,0.017582,1.0,0.314505,0.457935,-0.039361
Fat,-0.093429,0.024073,0.324149,0.314505,1.0,0.329264,0.114151
Turbidity,0.048384,0.185106,0.055755,0.457935,0.329264,1.0,0.136436
Colour,-0.164565,-0.008511,-0.082654,-0.039361,0.114151,0.136436,1.0


We can see the correlation between all the predictors in the table above.

### 3. Data Visualizations

In [81]:
chart_grade_count = alt.Chart(milk_data).mark_bar().encode(
    x=alt.X('count()'),
    y=alt.Y('Grade', sort='x')
).properties(
    height=200,
    width=600
)
chart_grade_count

### 4. Data Standardization and Splitting

We will start by splitting our dataset into X (predictors) and Y (response variable). 

In [82]:
X = milk_data.drop('Grade', axis=1)
y = milk_data['Grade']

- Labels in X (Predictors) : pH, Temprature, Taste, Odor, Fat, Turbidity, Colour 
- Labels in Y (Response Variable) : Grade

Now we will scale our data using StandardScaler :

In [83]:
# Scaling the quantitative features
quantitative_vars = X[["pH", "Temperature", "Colour"]]
sc = StandardScaler()
sc.fit(quantitative_vars)
xscaled=sc.transform(quantitative_vars)

# Creating a new DataFrame with the scaled features
X_scaled=pd.DataFrame(data=xscaled,columns=milk_data.columns[0:3])
X_scaled = pd.concat([X_scaled, X.iloc[:,2:6]], axis = 1)
X_scaled



Unnamed: 0,pH,Temperature,Taste,Taste.1,Odor,Fat,Turbidity
0,-0.021531,-0.914107,0.501600,1,0,1,0
1,-0.021531,-0.815035,0.269333,0,1,0,1
2,1.336564,2.553438,-1.356536,1,1,1,1
3,2.051351,-1.013180,0.733867,1,1,0,1
4,-0.021531,-0.715962,0.733867,0,0,0,0
...,...,...,...,...,...,...,...
1054,0.049947,0.076620,-1.124269,1,1,0,0
1055,0.049947,-0.616889,0.733867,1,0,1,0
1056,-2.594765,-0.418744,0.733867,1,1,1,1
1057,0.121426,-0.121525,-0.427468,1,0,1,0


Splitting the data into training and testing set:

In [84]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

### 5. Running a Random Forest Analysis on the data

In [85]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9952830188679245


### 6. Running a Multinomial Logistic Regression

In [126]:
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, , 10, 100],
    'multi_class': ['multinomial'],
    'solver': ['lbfgs', 'sag', 'newton-cg']
}

# Create the logistic regression model
mnr = LogisticRegression(max_iter = 1000)

# Create GridSearchCV with cross-validation
grid_search = GridSearchCV(mnr, param_grid, cv=5, scoring='accuracy')

# Fit the model to the training data
grid_search.fit(X_train, y_train)

# Print the best parameters found by GridSearchCV
print("Best Parameters:", grid_search.best_params_)

# Make predictions on the test set using the best model
best_model = grid_search.best_estimator_
predictions = best_model.predict(X_test)

# Evaluate the model performance (optional)
accuracy = np.mean(predictions == y_test)
print(f"Accuracy: {accuracy}")

Best Parameters: {'C': 1, 'multi_class': 'multinomial', 'solver': 'lbfgs'}
Accuracy: 0.839622641509434
