<a href="https://colab.research.google.com/github/Anish-S-tech/my-ml-journey/blob/main/EDA_in_Machine_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Steps in Exploratory Data Analysis (EDA) in Machine Learning

In [None]:
# Step 1: Importing required libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings as wr
# NOTE(review): this ignores every warning category for the rest of the session,
# so deprecation notices from the libraries used in later cells will not be shown.
wr.filterwarnings('ignore')

In [None]:
# Step 2: Reading the dataset (using pandas)

# Location of the CSV file; kept in one named place so it is easy to change
DATA_PATH = '/content/WineQT.csv'

df = pd.read_csv(DATA_PATH)

# Preview the first rows of the loaded DataFrame
df.head()

In [None]:
# Step 3: Analyzing the data

# Dimensions of the DataFrame as a (rows, columns) tuple
df.shape


In [None]:
# Print a concise summary of the DataFrame (columns, non-null counts, dtypes)
df.info()

In [None]:
# Descriptive statistics, transposed so that each described column becomes a row
df.describe().T

In [None]:
# Column names of the DataFrame as a plain Python list
list(df.columns)

In [None]:
# Step 4: Checking missing values

# Per-column count of missing entries
df.isna().sum()

In [None]:
# Step 5: Checking for duplicate values

# Count rows that are exact repeats of an earlier row.
# (df.nunique() only reports the number of distinct values per column,
# which says nothing about duplicated rows.)
df.duplicated().sum()

In [None]:
# Step 6: Univariate Analysis (Single variable analysis)

# Bar plot showing how many wines fall into each quality rating

quality_counts = df['quality'].value_counts()

# Explicit figure/axes pair instead of the implicit pyplot state machine
fig, ax = plt.subplots(figsize=(8,6))
ax.bar(quality_counts.index, quality_counts.values, color='darkred')
ax.set_title('Bar plot of quality vs count')
ax.set_xlabel('Quality')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Histogram + kernel density plot - for understanding the distribution (shape and skewness) of each numerical feature

sns.set_style('darkgrid')

# 'number' selects every numeric dtype, not only int64 / float64
numerical_columns = df.select_dtypes(include='number').columns

# Lay the panels out on a 2-column grid with just enough rows to hold them all
# (ceiling division); the previous one-row-per-feature grid left half of the figure empty.
n_cols = 2
n_rows = max(1, -(-len(numerical_columns) // n_cols))
plt.figure(figsize=(14,n_rows*3))  # width = 14, height = 3 for each row of panels

for idx,feature in enumerate(numerical_columns,1): # idx starts at 1 because subplot positions are 1-based

  plt.subplot(n_rows,n_cols,idx) # plt.subplot(no.of_rows,no.of_cols,index)
  sns.histplot(df[feature],kde=True)
  plt.title(f'{feature} | Skewness: {round(df[feature].skew(),2)}')

plt.tight_layout()
plt.show()

In [None]:
# Swarm plot - for showing the outliers in the data

plt.figure(figsize=(10,8))

# seaborn deprecated passing `palette` without `hue`; assigning the x variable
# to hue and hiding the (redundant) legend is the replacement it recommends.
sns.swarmplot(x='quality',y='alcohol',data=df,hue='quality',palette='viridis',legend=False)

plt.title("Swarm plot for detecting outliers (quality vs alcohol)")
plt.xlabel("Quality")
plt.ylabel("Alcohol")
plt.show()

In [None]:
# Step 7: Bivariate analysis (Two variables are analyzed together to identify patterns)

# Pair plot - pairwise relationships between variables, with each variable's own distribution on the diagonal

sns.set_palette('Pastel1')

# pairplot is a figure-level function: it creates its own figure and returns a grid,
# so calling plt.figure() beforehand would only produce an extra empty figure.
pair_grid = sns.pairplot(df)

# plt.title() would label only the last subplot; suptitle labels the whole grid.
# y > 1 places the title just above the top row of panels.
pair_grid.figure.suptitle("Pair plot for the dataframe (For each variables)", y=1.02)
plt.show()

In [None]:
# Violin plot - for examining the relationship between two variables (Here alcohol and quality)

# The palette below is keyed by string labels, so quality is cast to str -
# but on a copy, so that df keeps its original quality column for the other cells.
violin_data = df.assign(quality=df['quality'].astype(str))

# Show the quality levels in ascending numeric order instead of order of appearance
quality_order = sorted(violin_data['quality'].unique(), key=int)

quality_palette = {
    '3': 'lightcoral', '4': 'lightblue', '5': 'lightgreen', '6':'gold', '7':'lightskyblue', '8':'lightpink'}

plt.figure(figsize=(10,8))

# seaborn deprecated `palette` without `hue`; use x as hue and hide the redundant legend
sns.violinplot(x='quality',y='alcohol',data=violin_data,hue='quality',order=quality_order,
               palette=quality_palette,legend=False,
               alpha=0.9)

plt.title("Violin plot between quality and alcohol")
plt.xlabel("Quality")
plt.ylabel("Alcohol")
plt.show()


In [None]:
# Box plot - for visualizing the data distribution relationship between two variables

plt.figure(figsize=(10,8))

# Sort the quality levels numerically; key=int works whether the column
# currently holds integers or their string form.
quality_levels = sorted(df['quality'].unique(), key=int)
sns.boxplot(x='quality',y='alcohol',data=df,order=quality_levels)

# Title and axis labels so the figure can be read on its own, like the other plots
plt.title("Box plot between quality and alcohol")
plt.xlabel("Quality")
plt.ylabel("Alcohol")
plt.show()

In [None]:
# Step 8: Multivariate analysis (For examining relationship between multiple variables)

# Correlation matrix - for identifying the relationship between each other features

plt.figure(figsize=(15,10))

# An earlier cell may have cast quality to str; convert it back explicitly so it is
# always part of the correlation matrix, then keep only numeric columns for corr().
corr_input = df.assign(quality=pd.to_numeric(df['quality'])).select_dtypes(include='number')
corr_matrix = corr_input.corr()

# Correlations lie in [-1, 1], so use a diverging colormap pinned to that range;
# a qualitative palette gives unrelated colours to neighbouring values.
sns.heatmap(corr_matrix,annot=True,fmt='.2f',cmap='coolwarm',vmin=-1,vmax=1,linewidths=2)

plt.title("Correlation heatmap")
plt.show()