# Exploratory Data Analysis (EDA) Notebook

This notebook provides a structured approach for performing Exploratory Data Analysis (EDA) with Python. It covers data loading, cleaning, univariate analysis, bivariate and multivariate analysis, feature engineering, and assumption checking. 

In [None]:
# Step 1: Data Loading and Cleaning
import pandas as pd

# Read the raw data into a DataFrame (swap 'file.csv' for the path to your data)
df = pd.read_csv(filepath_or_buffer='file.csv')

# Leave the preview as the cell's last expression so the notebook renders it
df.head(n=5)

In [None]:
# Data Cleaning

# Count the missing entries in every column and print the tally
missing_per_column = df.isna().sum()
print(missing_per_column)

# Remove rows that are exact duplicates of an earlier row
df = df.drop_duplicates()

# Example imputation: replace the gaps in one column with that column's mean
# (the mean is computed after duplicates have been removed)
column_mean = df['column_name'].mean()
df['column_name'] = df['column_name'].fillna(column_mean)


In [None]:
# Step 2: Univariate Analysis
import matplotlib.pyplot as plt
import seaborn as sns

# Numerical data - histogram (20 bins) with a KDE curve overlaid
hist_fig, hist_ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['numeric_column'], bins=20, kde=True, ax=hist_ax)
plt.show()

# Categorical data - one bar per category showing its row count
count_fig, count_ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x='categorical_column', ax=count_ax)
plt.show()

In [None]:
# Step 3: Bivariate Analysis

# Scatter plot for two numerical variables
plt.figure(figsize=(10, 6))
sns.scatterplot(x='numeric_column1', y='numeric_column2', data=df)
plt.show()

# Correlation heatmap
# Correlate only numeric/boolean columns. The frame still holds
# 'categorical_column' at this point, and since pandas 2.0 `DataFrame.corr()`
# raises on non-numeric columns instead of silently skipping them.
# `select_dtypes` works on every pandas version.
numeric_df = df.select_dtypes(include=['number', 'bool'])
plt.figure(figsize=(12, 8))
# vmin/vmax pin the colour scale to the full [-1, 1] correlation range
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.show()

In [None]:
# Step 4: Multivariate Analysis

# Pairwise relationship plots, restricted to the listed numerical columns
pairplot_columns = ['numeric_column1', 'numeric_column2', 'numeric_column3']
sns.pairplot(df[pairplot_columns])
plt.show()

In [None]:
# Step 5: Feature Engineering and Transformation
import numpy as np

# Log transformation for skewed data: store log(1 + x) of the column as a new feature
skewed_values = df['numeric_column']
df['log_numeric_column'] = np.log1p(skewed_values)

# One-hot encoding for categorical variables; drop_first=True leaves out the
# indicator column for the first category level
columns_to_encode = ['categorical_column']
df = pd.get_dummies(df, columns=columns_to_encode, drop_first=True)


In [None]:
# Step 6: Checking Assumptions
from scipy.stats import shapiro, levene

# Normality test (Shapiro-Wilk)
# Drop missing values first: the cleaning step above only imputes
# 'column_name', so this column may still contain NaN, and a NaN in the
# sample would propagate into the test result.
stat, p = shapiro(df['numeric_column'].dropna())
print(f'Shapiro-Wilk test p-value: {p}')

# Homogeneity of variances (Levene's test)
# Each sample drops its own missing values independently; the two samples
# are passed as separate groups and do not need to have equal length.
stat, p = levene(df['numeric_column1'].dropna(), df['numeric_column2'].dropna())
print(f'Levene’s test p-value: {p}')

In [None]:
# Summary

print('Exploratory Data Analysis complete. Summarize findings and prepare for modeling.')

# Compute the descriptive statistics table and write it out as a CSV file
summary_stats = df.describe()
summary_stats.to_csv(path_or_buf='summary_statistics.csv')