In [1]:
# Univariate and Bivariate Analysis in Python

# Import necessary libraries
import pandas as pd         # For data manipulation
import numpy as np          # For numerical computations
import seaborn as sns       # For visualization
import matplotlib.pyplot as plt  # For plotting
import warnings
# Silence every warning category so cell outputs stay uncluttered
warnings.filterwarnings(action="ignore")

# Use seaborn's "whitegrid" look for all figures drawn after this point
sns.set_style(style="whitegrid")

# Read the CSV (relative path, resolved against the current working directory)
df = pd.read_csv(filepath_or_buffer="zomato_dataset.csv")

# Preview the top rows; as the last expression of the cell it is rendered as output
df.head()

ModuleNotFoundError: No module named 'pandas'

## Step 1: Data Overview

Before diving into analysis, let's inspect the dataset.

In [7]:
# Structural overview of the dataset.
# df.info() writes its report straight to stdout, so no print() wrapper is needed.
df.info()

# Summary statistics of numerical variables.
# A bare expression is only rendered when it is the LAST statement of a cell;
# this one sits mid-cell, so it is printed explicitly instead of being discarded.
print("\nSummary statistics of numerical columns:")
print(df.describe())

# Check for missing values (count of nulls per column)
print("\nMissing values in each column:")
print(df.isnull().sum())

# Display data types of each column
print("\nData types of columns:")
print(df.dtypes)

NameError: name 'df' is not defined

## Step 2: Univariate Analysis
1. Distribution of Numerical Variables
We analyze each numerical column's distribution using histograms and kernel density estimation (KDE) plots.

In [None]:
# Histogram grid: one panel per numeric column (30 bins, black bar outlines)
numeric_features = df.select_dtypes(include=[np.number])
numeric_features.hist(bins=30, figsize=(12, 8), edgecolor="black")
plt.suptitle("Distribution of Numerical Features", fontsize=16)
plt.show()

In [None]:
# KDE plots for numerical features: one filled density curve per numeric column,
# all overlaid on a single shared axis and told apart through the legend.
plt.figure(figsize=(12, 6))
for column in df.select_dtypes(include=[np.number]).columns:
    # `fill=True` is the current spelling of the deprecated `shade=True`;
    # seaborn announces `shade` will become an error, and the notebook's blanket
    # warning filter would otherwise hide that notice until the call breaks.
    sns.kdeplot(df[column], label=column, fill=True)
plt.title("Kernel Density Estimation (KDE) Plots of Numerical Features")
plt.legend()
plt.show()

2. Box Plots (Outlier Detection)
Box plots help identify outliers.

# Box plots drawn from the wide-form frame, used here to spot outliers
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df, ax=ax)
# Tilt the x tick labels so long column names do not overlap
plt.setp(ax.get_xticklabels(), rotation=45)
ax.set_title("Box Plot of Numerical Features (Outlier Detection)")
plt.show()

3. Violin Plots (Distribution + Outliers)
Violin plots combine KDE plots with box plots.

In [None]:
# Violin plots drawn from the wide-form frame (density shape plus box-plot summary)
violin_fig, violin_ax = plt.subplots(figsize=(12, 6))
sns.violinplot(data=df, ax=violin_ax)
# Tilt the x tick labels so long column names do not overlap
plt.setp(violin_ax.get_xticklabels(), rotation=45)
violin_ax.set_title("Violin Plot of Numerical Features")
plt.show()

4. Count Plots for Categorical Variables
For categorical variables, count plots help visualize category distribution.

In [None]:
# Count plots for categorical variables: one horizontal bar chart per
# object-dtype column, with bars ordered from most to least frequent value.
categorical_columns = df.select_dtypes(include=['object']).columns
for col in categorical_columns:
    # Each column gets its own figure. (The former extra plt.figure() call before
    # the loop only produced an empty, never-drawn-on figure and was removed.)
    plt.figure(figsize=(8, 4))
    sns.countplot(y=df[col], order=df[col].value_counts().index, palette="coolwarm")
    plt.title(f"Count Plot for {col}")
    plt.xlabel("Count")
    plt.ylabel(col)
    plt.show()

## Step 3: Bivariate Analysis
1. Correlation Matrix (Numerical Variables)
Heatmaps show the correlation between numerical features.

In [None]:
# Compute the correlation matrix on numeric columns only.
# Calling df.corr() on a frame that still contains text columns raises a
# ValueError on pandas >= 2.0 (non-numeric columns are no longer dropped
# silently), so the numeric subset is selected explicitly — the same selection
# the univariate cells of this notebook use.
numeric_df = df.select_dtypes(include=[np.number])

plt.figure(figsize=(10, 6))
# annot/fmt print each coefficient inside its cell with two decimals
sns.heatmap(numeric_df.corr(), annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Correlation Heatmap")
plt.show()

2. Pair Plots (Visualizing Relationships)
Pair plots help visualize relationships between multiple numerical variables.

In [None]:
# Grid of pairwise relationship plots built from the dataset
pair_grid = sns.pairplot(data=df)
plt.show()

3. Scatter Plots (Numerical vs Numerical)
Scatter plots show relationships between two numerical variables.

In [None]:
# Replace 'feature_x' and 'feature_y' with actual column names.
# They are set once here and reused for the data lookup, the title and both
# axis labels, so the plot can never end up labelled with a stale name.
x_col = "feature_x"
y_col = "feature_y"

# Fail early with an actionable message instead of a bare KeyError on the name
missing_cols = [name for name in (x_col, y_col) if name not in df.columns]
if missing_cols:
    raise KeyError(
        f"Column(s) {missing_cols} not found in df; set x_col / y_col to existing column names."
    )

plt.figure(figsize=(8, 6))
# alpha < 1 keeps overlapping points distinguishable
sns.scatterplot(x=df[x_col], y=df[y_col], alpha=0.7)
plt.title(f"Scatter Plot: {x_col} vs {y_col}")
plt.xlabel(x_col)
plt.ylabel(y_col)
plt.show()

4. Box Plots (Categorical vs Numerical)
Box plots help analyze the distribution of numerical values across different categories.

In [None]:
# Replace 'categorical_column' and 'numerical_column' with actual column names.
# They are set once here and reused for both the data lookup and the title.
cat_col = "categorical_column"
num_col = "numerical_column"

# Fail early with an actionable message instead of a bare KeyError on the name
missing_cols = [name for name in (cat_col, num_col) if name not in df.columns]
if missing_cols:
    raise KeyError(
        f"Column(s) {missing_cols} not found in df; set cat_col / num_col to existing column names."
    )

plt.figure(figsize=(10, 6))
sns.boxplot(x=df[cat_col], y=df[num_col], palette="Set2")
# Tilt category labels so they do not overlap
plt.xticks(rotation=45)
plt.title(f"Box Plot: {cat_col} vs {num_col}")
plt.show()

5. Violin Plots (Categorical vs Numerical)
Violin plots combine KDE with box plots for category-wise numerical analysis.

In [None]:
# Violin plot for categorical vs numerical feature.
# Replace 'categorical_column' and 'numerical_column' with actual column names;
# they are set once here and reused for both the data lookup and the title.
cat_col = "categorical_column"
num_col = "numerical_column"

# Fail early with an actionable message instead of a bare KeyError on the name
missing_cols = [name for name in (cat_col, num_col) if name not in df.columns]
if missing_cols:
    raise KeyError(
        f"Column(s) {missing_cols} not found in df; set cat_col / num_col to existing column names."
    )

plt.figure(figsize=(10, 6))
sns.violinplot(x=df[cat_col], y=df[num_col], palette="coolwarm")
# Tilt category labels so they do not overlap
plt.xticks(rotation=45)
plt.title(f"Violin Plot: {cat_col} vs {num_col}")
plt.show()

6. Bar Plots (Categorical vs Numerical Aggregation)
Bar plots help analyze the mean/median of numerical features for different categories.

In [None]:
# Bar plot of categorical variable vs numerical variable (mean).
# Replace 'categorical_column' and 'numerical_column' with actual column names;
# they are set once here and reused for both the data lookup and the title.
cat_col = "categorical_column"
num_col = "numerical_column"

# Fail early with an actionable message instead of a bare KeyError on the name
missing_cols = [name for name in (cat_col, num_col) if name not in df.columns]
if missing_cols:
    raise KeyError(
        f"Column(s) {missing_cols} not found in df; set cat_col / num_col to existing column names."
    )

plt.figure(figsize=(12, 6))
# estimator=np.mean -> bar height is the per-category mean.
# NOTE(review): `ci=None` is the legacy way to switch off error bars; recent
# seaborn releases prefer `errorbar=None` — switch once the installed seaborn
# version is confirmed to support it.
sns.barplot(x=df[cat_col], y=df[num_col], estimator=np.mean, ci=None, palette="Blues")
# Tilt category labels so they do not overlap
plt.xticks(rotation=45)
plt.title(f"Bar Plot: Mean of {num_col} by {cat_col}")
plt.show()

### **Summary of Findings**
- **Univariate Analysis:**
  - Histograms and KDE plots showed the distribution of numerical features.
  - Box plots and violin plots helped detect outliers.
  - Count plots revealed the frequency distribution of categorical variables.
  
- **Bivariate Analysis:**
  - Heatmaps identified highly correlated numerical features.
  - Scatter plots showed relationships between numerical variables.
  - Box and violin plots revealed category-wise variations in numerical features.
  - Bar plots summarized numerical variables based on categorical groupings.
  
- **Key Takeaways:**
  - Strong correlations between variables suggest potential feature engineering.
  - Outliers were detected in some numerical variables.
  - Certain categorical groups had significantly different numerical values.
  
- **Next Steps:**
  - Handle missing values if necessary.
  - Remove or transform outliers if they affect model performance.
  - Perform feature engineering based on insights.
