In [None]:
import pandas as pd  #data processing, CSV file I/O (e.g. pd.read_csv)
import numpy as np #Linear algebra

# running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
import os
# Walk the Kaggle input directory tree and print the full path of every file in it
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

import matplotlib.pyplot as plt #pyplot is a collection of command style functions that make matplotlib work like MATLAB
import seaborn as sns #It is advance data visualization library
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [None]:
sns.set_style('darkgrid') # Use seaborn's 'darkgrid' theme (shaded background with gridlines) for the plots below.

## Load Dataset

In [None]:
df=pd.read_csv('../input/milk-quality-prediction/milknew.csv') # df holds the dataset read from the CSV file; every later cell works on it

## Exploratory data analysis (EDA)

In [None]:
# Show the first 5 rows of the dataset
df.head()

In [None]:
# Show the last 5 rows of the dataset
df.tail()

In [None]:
df.shape # tuple of (number of rows, number of columns)

In [None]:
# df.info() prints a summary of the DataFrame: the index range, each column's label,
# non-null count and dtype, and the overall memory usage.

df.info() # prints the summary directly (it does not return it)

### Basic Statistic details about the data
Note: only numerical columns are displayed here unless the parameter include="all" is passed.
* count tells us the number of non-empty rows in a feature.
* mean tells us the mean value of that feature.
* std tells us the Standard Deviation Value of that feature.
* min tells us the minimum value of that feature.
* 25%, 50%, and 75% are the percentile/quartile of each features. This quartile information helps us to detect Outliers.
* max tells us the maximum value of that feature.

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numerical columns
df.describe()

## Data Cleaning

In [None]:
# Count the rows that are exact duplicates of an earlier row
df.duplicated().sum()

In [None]:
# drop_duplicates() returns a NEW DataFrame without the duplicated rows.
# The result is only displayed here and is not assigned back, so `df` itself
# is unchanged and all later cells still work on the full dataset.
df.drop_duplicates()

In [None]:
# Count the null (missing) values in each column
df.isnull().sum()

In [None]:
# Number of distinct values in each column
df.nunique()

### Value counts of Different Columns

In [None]:
# How many rows carry each distinct Grade value
df[['Grade']].value_counts()

In [None]:
# How many rows carry each distinct pH value
df[['pH']].value_counts()

In [None]:
# How many rows carry each distinct Taste value
df[['Taste']].value_counts()

In [None]:
# Same Grade frequency table as the first cell of this section (repeated call)
df[['Grade']].value_counts()

In [None]:
# How many rows carry each distinct Temprature value (the column name is spelled this way in the data)
df[['Temprature']].value_counts()

In [None]:
# Pairwise correlation between the numeric columns.
# numeric_only=True leaves non-numeric columns (for example a label column stored
# as text) out of the computation; without it pandas >= 2.0 raises an error when
# such a column cannot be converted to float.
df.corr(numeric_only=True)

## Data Visualization

In [None]:
# Heatmap of the correlation matrix of the numeric columns.
# numeric_only=True keeps non-numeric columns out of the matrix; pandas >= 2.0
# raises an error on them otherwise.
heatmap=sns.heatmap(
    df.corr(numeric_only=True),  # Compute the correlation matrix of the numeric columns
    annot=True,          # Display the correlation values on the heatmap
    cmap='coolwarm',     # Diverging blue-to-red colour map
    vmax=1.0,            # Set the maximum value of the color range to 1.0
    vmin=-1.0,           # Set the minimum value of the color range to -1.0
    linewidths=0.1,      # Set the width of the lines between cells in the heatmap
    annot_kws={"size": 8},  # Set the font size for the annotations on the heatmap
    square=True          # Force the heatmap cells to be square-shaped
)
heatmap.set_title('Correaltion Of Numeric Columns')  # Set the title for the heatmap
plt.xticks(rotation=45)  # Rotate the x-axis tick labels by 45 degrees
plt.show()

We first calculate the correlation matrix of the DataFrame using df.corr() and pass it to sns.heatmap() as the data. We specify the annot parameter as True to display the correlation values on the heatmap, and we set various other options such as the color map (cmap), the range of color values (vmax and vmin), the line width between cells (linewidths), the font size for the annotations (annot_kws), and square cells (square). Finally, we set the title for the heatmap using heatmap.set_title(), rotate the x-axis tick labels using plt.xticks(rotation=45), and show the plot using plt.show().

## Grade Distribution

In [None]:
# Number of rows per grade. value_counts() orders the grades by frequency
# (most common first), which is generally NOT the sorted order that np.unique()
# returns, so the labels are taken from this same Series to keep every wedge
# paired with its own grade.
grade_counts = df['Grade'].value_counts()

# Grade labels, in exactly the same order as the counts above
unique_grades = grade_counts.index.to_numpy()

# Set up the figure and axes
fig, ax = plt.subplots(figsize=(8, 6))

# Create the pie chart (one wedge per grade, labelled with its percentage share)
pie = ax.pie(grade_counts, autopct='%1.2f%%', labels=unique_grades)

# Set the title for the pie chart
ax.set_title('Distribution of Grades')

# Add a legend; pie[0] is the list of wedge patches returned by ax.pie()
ax.legend(pie[0], unique_grades, title='Grades', loc='center left', bbox_to_anchor=(1, 0.5))

# Equal aspect ratio ensures that pie is drawn as a circle
ax.axis('equal')

# Show the plot
plt.show()

In this code, we use plt.subplots() to set up the figure and axes, and then create the pie chart using ax.pie(). We set the autopct parameter to format the percentage values displayed on the chart. We also set a title for the pie chart using ax.set_title(), add a legend using ax.legend(), and ensure that the pie is drawn as a circle by setting the aspect ratio using ax.axis('equal').

In [None]:
# Create the pair plot of df, colouring the points by the 'Grade' column
pairplot = sns.pairplot(df, hue='Grade')

# Set a single title for the whole grid of plots
pairplot.fig.suptitle('Pair Plot of Numeric Columns with Grade Hue')

# Adjust the spacing between the sub-plots
plt.tight_layout()

# Show the plot
plt.show()

In this code, we use sns.pairplot() to create a pair plot of the numeric columns in the DataFrame df. The hue parameter is set to 'Grade', which colors the data points based on the different grades. We then set the title for the pair plot using pairplot.fig.suptitle(), adjust the layout of the plot using plt.tight_layout(), and finally show the plot using plt.show().

In [None]:
# One entry per panel, in subplot order: (column to plot, colour palette, panel title)
bar_panels = [
    ('pH', "Reds", 'pH vs. Grade'),
    ('Temprature', "Wistia", 'Temperature vs. Grade'),
    ('Taste', "summer", 'Taste vs. Grade'),
    ('Odor', "Blues", 'Odor vs. Grade'),
    ('Fat ', "RdPu", 'Fat vs. Grade'),
    ('Turbidity', "PuRd_r", 'Turbidity vs. Grade'),
    ('Colour', "Greys_r", 'Colour vs. Grade'),
]

plt.figure(figsize=(20, 15))

# Draw each bar plot in the next free slot of a 3x3 grid
for slot, (column, palette_name, panel_title) in enumerate(bar_panels, start=1):
    plt.subplot(3, 3, slot)
    sns.barplot(x='Grade', y=column, data=df, palette=palette_name)
    plt.title(panel_title)

plt.tight_layout()
plt.show()

In this code, we use plt.figure(figsize=(20, 15)) to set the size of the overall figure. Then, we create a 3x3 grid of subplots using plt.subplot(), and for each subplot, we use sns.barplot() to create a bar plot with the specified x and y columns from the DataFrame df. We set different color palettes for each subplot using the palette parameter. Finally, we set titles for each subplot using plt.title() and adjust the layout using plt.tight_layout().

### Odor v/s Grade using Boxplot

In [None]:
# Box plot with Odor on the x-axis and one box per Grade on the y-axis
plt.figure(figsize = (12, 6))
ax = sns.boxplot(y='Grade', x='Odor', data=df)
# Apply transparency and a black outline to every artist listed in ax.artists.
# NOTE(review): depending on the seaborn/matplotlib version the boxes may not be
# stored in ax.artists, in which case this call styles nothing - confirm.
plt.setp(ax.artists, alpha=.5, linewidth=2, edgecolor="k")
plt.xticks(rotation=45)  # rotate the x-axis tick labels
plt.title('Odor v/s Grade')

### Temprature v/s Grade using Lineplot

In [None]:
# Line plot with Temprature on the x-axis and Grade on the y-axis
sns.lineplot(data=df, x='Temprature', y='Grade', color='darkblue')

### Turbidity v/s Grade using Swarmplot

In [None]:
# Swarm plot with Turbidity on the x-axis and Grade on the y-axis
plt.figure(figsize=(10,6))
sns.swarmplot(data=df, x='Turbidity', y='Grade',color='blue')
plt.show()

### pH v/s Grade using Barplot

In [None]:
# Matplotlib bar chart: one bar per row, positioned at the row's pH value,
# with the row's Grade used as the bar height
plt.figure(figsize=(10, 6))
plt.bar(x=df['pH'], height=df['Grade'], color='violet')
plt.show()

### Colour v/s Grade by Taste using Violinplot

In [None]:
# Violin plot with Colour on the x-axis and Grade on the y-axis,
# drawn separately for each Taste value (hue)
plt.figure(figsize=(15,9))
sns.violinplot(data=df, x='Colour', y='Grade',hue=df['Taste'],palette='cool')
plt.show()

In [None]:
# One entry per panel: (slot in the 2x4 grid, column to plot, bar colour, panel title).
# The slots are listed in the order the panels are drawn.
hist_panels = [
    (1, 'pH', 'black', 'pH'),
    (2, 'Temprature', 'yellow', 'Temperature'),
    (3, 'Taste', 'violet', 'Taste'),
    (4, 'Odor', 'green', 'Odor'),
    (7, 'Fat ', 'pink', 'Fat'),
    (5, 'Turbidity', 'red', 'Turbidity'),
    (6, 'Colour', 'orange', 'Colour'),
]

plt.figure(figsize=(25,15))

# Histogram with a KDE curve for every feature
for slot, column, bar_colour, panel_title in hist_panels:
    plt.subplot(2, 4, slot)
    sns.histplot(df[column], kde=True, color=bar_colour).set_title(panel_title)

# Render the figure explicitly so the cell does not echo the repr of the last title object
plt.show()

In [None]:
# Pandas' built-in histograms: one panel per numeric column of df
df.hist(figsize=(20,20))
plt.show()

In [None]:
# One entry per panel: (slot in the 3x3 grid, column to plot, colour palette, panel title).
# The slots are listed in the order the panels are drawn.
violin_panels = [
    (1, 'pH', "Reds", 'pH vs. Grade'),
    (7, 'Temprature', "Wistia", 'Temperature vs. Grade'),
    (6, 'Taste', "summer", 'Taste vs. Grade'),
    (2, 'Odor', "Blues", 'Odor vs. Grade'),
    (3, 'Fat ', "RdPu", 'Fat vs. Grade'),
    (4, 'Turbidity', "PuRd_r", 'Turbidity vs. Grade'),
    (5, 'Colour', "Greys_r", 'Colour vs. Grade'),
]

plt.figure(figsize=(20, 15))

# Draw each violin plot in its slot of the grid
for slot, column, palette_name, panel_title in violin_panels:
    plt.subplot(3, 3, slot)
    sns.violinplot(x='Grade', y=column, data=df, palette=palette_name)
    plt.title(panel_title)

plt.tight_layout()
plt.show()

In this code, we use plt.figure(figsize=(20, 15)) to set the size of the overall figure. Then, we create a 3x3 grid of subplots using plt.subplot(), and for each subplot, we use sns.violinplot() to create a violin plot with the specified x and y columns from the DataFrame df. We set different color palettes for each subplot using the palette parameter. Finally, we set titles for each subplot using plt.title() and adjust the layout using plt.tight_layout().

## We perform the following operations
`Logistic Regression` 

`Decision Tree Classifier` 

`Random Forest Classifier`

`KNeighbors Classifier`

## Data Preparation

In [None]:
X=df.drop('Grade',axis=1) # independent variables (predictors): every column except Grade

In [None]:
y=df.Grade # dependent variable (target): the column the models predict

In [None]:
# Display the feature matrix
X

In [None]:
# Display the target column (wrapped in a nested list, so it is shown as a list repr)
[[y]]

## Model Building

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows as the test set; random_state=42 makes the split reproducible
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.30,random_state=42)

In [None]:
# First rows of the training features
X_train.head()

In [None]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()

In [None]:
# Fit the scaler on the training features only, then apply the same
# transformation to the test features.
# NOTE(review): X_train_std / X_test_std are not referenced by any later cell
# visible in this notebook; the model cells pass X_train / X_test directly.
X_train_std=scaler.fit_transform(X_train)
X_test_std=scaler.transform(X_test)

In [None]:
# Scaled training features (a NumPy array)
X_train_std

In [None]:
# Original, unscaled training features
X_train

In [None]:
# First rows of the training target
y_train.head()

In [None]:
# Column labels of the full dataset
df.columns

## Logistic Regression

- Logistic regression is a popular statistical model used for binary classification problems,
where the dependent variable (also called the target or outcome variable)
takes on two possible values, typically represented as 0 and 1.
It is named after the logistic function used in the model.


- The logistic regression model estimates the probability of the dependent variable
belonging to a certain class based on one or more independent variables
(also known as predictors or features). The model uses a logistic or sigmoid function 
to transform the linear combination of the predictors into a probability value between 0 and 1.

In [None]:
# Import logistic regression and build the classifier.
# The model is wrapped in a pipeline that standardises the features first:
# logistic regression is fitted by an iterative solver that behaves poorly on
# features with very different scales. Inside the pipeline the scaler is
# fitted on whatever data is passed to lr.fit() and re-applied automatically
# by lr.predict() / lr.score(), so the cells below can keep passing the raw
# X_train / X_test.
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

lr=make_pipeline(StandardScaler(), LogisticRegression())

In [None]:
lr.fit(X_train,y_train) # fit the model on the training data only

In [None]:
print('Training accuracy score',lr.score(X_train,y_train)) # accuracy on the training data
print("Testing accuracy Score",lr.score(X_test,y_test)) # accuracy on the held-out test data

## Confusion Matrix

- A `confusion matrix` is a table used to describe the performance of a classification model by displaying the counts of true positive, true negative, false positive, and false negative predictions. It is a useful tool for evaluating the accuracy of a classification algorithm and understanding the types of errors it makes.


- The confusion matrix allows you to calculate various performance metrics, including `accuracy`,` precision`, `recall (sensitivity)`, `specificity`, and `F1 score`. 

In [None]:
from sklearn import metrics

In [None]:
# Short alias for sklearn's confusion_matrix function; reused by the later model sections
confusion_matrix = metrics.confusion_matrix

In [None]:
# Predicted grade for every row of the test set (logistic regression)
y_pred=lr.predict(X_test)

In [None]:
# Confusion matrix of true test labels (first argument) against predictions (second argument)
cf_matrix=confusion_matrix(y_test,y_pred)

In [None]:
# Display the raw matrix
cf_matrix

In [None]:
# Draw the confusion matrix as a heatmap.
# fmt='d' prints the cell counts as plain integers; the default annotation
# format ('.2g') switches to scientific notation for counts of 100 or more.
sns.heatmap(cf_matrix,annot=True,fmt='d')

## Classification Report

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Text report of the per-class metrics for the predictions currently stored in y_pred
print(classification_report(y_test,y_pred))

## Prediction

In [None]:
# Print the predictions currently stored in y_pred
print(y_pred)

## Decision Tree Classifier

- Two split criteria are available for this classifier
- but we use only one of them here, i.e. criterion `entropy`

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Entropy-based tree limited to depth 3; random_state=42 makes the fit reproducible
dtc=DecisionTreeClassifier(criterion='entropy',max_depth=3,random_state=42)

In [None]:
# Train the tree on the training data
dtc.fit(X_train,y_train)

In [None]:
print("Training accuracy sciore",dtc.score(X_train,y_train)) # accuracy on the training data
print("testing Accuracy score",dtc.score(X_test,y_test)) # accuracy on the held-out test data

## Confusion Matrix

In [None]:
# y_pred is overwritten here: from this cell on it holds the decision tree's test predictions
y_pred=dtc.predict(X_test)
confusion_matrix(y_test,y_pred)

In [None]:
# Draw the confusion matrix as a heatmap.
# fmt='d' prints the cell counts as plain integers; the default annotation
# format ('.2g') switches to scientific notation for counts of 100 or more.
sns.heatmap(confusion_matrix(y_test,y_pred),annot=True,fmt='d')

## Classification Report

In [None]:
# Text report of the per-class metrics for the predictions currently stored in y_pred
print(classification_report(y_test,y_pred))

## Visualization Of Decision Tree

In [None]:
from sklearn import tree
plt.figure(figsize=(12,8))
# Plot the tree that was already fitted in the cells above instead of fitting
# it a second time, and label the nodes with the real feature and class names
# so the diagram can be read without looking up column positions.
tree.plot_tree(
    dtc,
    feature_names=list(X_train.columns),
    class_names=[str(label) for label in dtc.classes_],
    filled=True,  # shade each node by its majority class
)
plt.show()

## Random Forest
- `Random Forest` is a popular machine learning algorithm used for both regression and classification tasks. It is an ensemble learning method that combines multiple decision trees to make predictions. Each decision tree is built on a randomly sampled subset of the data and features, hence the name `random forest`.

In [None]:
from sklearn.ensemble import RandomForestClassifier
# A random forest samples rows and features at random while it is built, so
# without a fixed seed every run gives slightly different scores.
# random_state=42 (the same seed used for the train/test split and the
# decision tree) makes the results reproducible.
rfc=RandomForestClassifier(random_state=42)

In [None]:
# Train the forest on the training data
rfc.fit(X_train,y_train)

In [None]:
print("training accuracy score",rfc.score(X_train,y_train)) # accuracy on the training data
print("testing accuracy score",rfc.score(X_test,y_test)) # accuracy on the held-out test data

In [None]:
# y_pred is overwritten here: from this cell on it holds this forest's test predictions
y_pred=rfc.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score
# NOTE(review): the message says "100 decision trees" although rfc was created without
# an explicit n_estimators; this relies on the library default - confirm for the installed version.
print("model accuracy score with 100 decision trees:{0:0.4f}".format(accuracy_score(y_test,y_pred)))

### Random Forest with n_estimators = 100

In [None]:
# Forest with an explicit 100 trees. random_state=42 fixes the random sampling
# done while the forest is built, so the reported scores are reproducible.
rfc_100=RandomForestClassifier(n_estimators=100,random_state=42)

In [None]:
# Train the 100-tree forest on the training data
rfc_100.fit(X_train,y_train)

In [None]:
# Test-set predictions of the 100-tree forest. These must come from rfc_100
# itself; predicting with rfc would only re-evaluate the first forest.
rfc_100_pred=rfc_100.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score
# Accuracy of the predictions stored in rfc_100_pred
print("model accuracy score with 100 decision trees:{0:0.4f}".format(accuracy_score(y_test,rfc_100_pred)))

In [None]:
# Per-class metrics for the predictions stored in y_pred
print(classification_report(y_test,y_pred))

In [None]:
# Per-class metrics for the predictions stored in rfc_100_pred
print(classification_report(y_test,rfc_100_pred))

In [None]:
# Confusion matrix for the predictions stored in y_pred
confusion_matrix(y_test,y_pred)

## KNeighborsClassifier

- The k-Nearest Neighbors (k-NN) algorithm is a simple yet effective classification algorithm that predicts the class of a new sample based on its similarity to the k nearest neighbors in the training dataset. It is a non-parametric and lazy learning algorithm, meaning it doesn't make any assumptions about the underlying data distribution and doesn't explicitly build a model during the training phase.

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# k-NN classifies a sample by the distance to its nearest training samples, so
# a feature measured on a large scale would dominate that distance. The
# classifier is therefore wrapped in a pipeline that standardises the features
# first. The scaler is fitted on the data passed to knn.fit() and re-applied
# automatically by knn.predict() / knn.score(), so the cells below can keep
# passing the raw X_train / X_test.
knn=make_pipeline(StandardScaler(), KNeighborsClassifier())

In [None]:
# Train the k-NN model on the training data
knn.fit(X_train,y_train)

In [None]:
print("training Accuracy score",knn.score(X_train,y_train)) # accuracy on the training data
print("testinhg accuracy score",knn.score(X_test,y_test)) # accuracy on the held-out test data

In [None]:
# Test-set predictions of the k-NN model and their accuracy
knn_pred=knn.predict(X_test)
print("accuracy score",accuracy_score(y_test,knn_pred))

In [None]:
# Confusion matrix of true test labels against the k-NN predictions
confusion_matrix(y_test,knn_pred)

In [None]:
# Per-class metrics for the k-NN predictions
print(classification_report(y_test,knn_pred))

In [None]:
# Draw the k-NN confusion matrix as a heatmap.
# fmt='d' prints the cell counts as plain integers; the default annotation
# format ('.2g') switches to scientific notation for counts of 100 or more.
sns.heatmap(confusion_matrix(y_test,knn_pred),annot=True,fmt='d')

In [None]:
# Print the decision tree's predictions for the test set.
# NOTE(review): this cell sits in the k-NN section but uses dtc, not knn - confirm which model was intended.
print(dtc.predict(X_test))