In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the training CSV into the working DataFrame.
# Point file_path elsewhere if the file is not in the current working directory.
file_path = "Train.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Structural overview (column dtypes / non-null counts via info()),
# followed by a preview of the first rows (head() defaults to 5).
print("Dataset Info:")
df.info()
print("\nFirst 5 Rows:", df.head(), sep="\n")

In [None]:
# Number of null entries in each column
missing_per_column = df.isna().sum()
print("\nMissing Values:", missing_per_column, sep="\n")

In [None]:
# Descriptive statistics as produced by DataFrame.describe() with default arguments
print(f"\nSummary Statistics:\n{df.describe()}")

In [None]:
# Tally rows that exactly repeat an earlier row (first occurrences are not counted)
is_repeat = df.duplicated()
duplicates = is_repeat.sum()
print(f"\nNumber of duplicate rows: {duplicates}")

In [None]:
# Heatmap of the boolean null mask: one cell per (row, column) entry,
# so gaps in the data show up as contrasting bands.
null_mask = df.isnull()
plt.figure(figsize=(10, 6))
ax = sns.heatmap(null_mask, cmap='viridis', cbar=False)
ax.set_title("Missing Values Heatmap")
plt.show()

In [None]:
# Histogram (30 bins) with a KDE overlay for the Price column
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(data=df, x='Price', bins=30, kde=True, color='blue', ax=ax)
ax.set(title="Distribution of Price", xlabel="Price", ylabel="Count")
plt.show()

In [None]:
# Horizontal boxplot of Price; points beyond the whiskers are outlier candidates
plt.figure(figsize=(8, 5))
price_ax = sns.boxplot(data=df, x='Price')
price_ax.set_title("Boxplot of Price")
plt.show()

In [None]:
# Correlation heatmap
# Correlate numeric columns only. With pandas >= 2.0, DataFrame.corr() defaults
# to numeric_only=False and raises ValueError as soon as the frame contains
# string/object columns. Selecting the numeric columns first works on every
# pandas version.
numeric_corr = df.select_dtypes(include='number').corr()
plt.figure(figsize=(10, 6))
sns.heatmap(numeric_corr, annot=True, cmap='coolwarm', fmt='.2f')
plt.title("Feature Correlation Heatmap")
plt.show()

In [None]:
# Scatter-matrix of every numeric column against every other numeric column
numeric_features = df.select_dtypes(include='number')
sns.pairplot(numeric_features)
plt.show()

In [None]:
# When a 'Date' column is present, parse it to datetime and promote it to the index.
# Re-running the cell is a no-op because 'Date' is no longer a column afterwards.
has_date_column = 'Date' in df.columns
if has_date_column:
    df = df.assign(Date=pd.to_datetime(df['Date'])).set_index('Date')

In [None]:
 # Line plot of price over time
# Price (y) plotted against the frame's index (x) in red, with slanted tick labels
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(data=df, x=df.index, y='Price', color='red', ax=ax)
ax.set_title("Price Variation Over Time")
ax.set_xlabel("Date")
ax.set_ylabel("Price")
plt.xticks(rotation=45)
plt.show()

In [None]:
# Price range per month
# The Month/Year features are derived from the index, which is only a
# DatetimeIndex when the date-conversion step actually ran. On any other index
# type, .month / .year raise AttributeError, so skip the plot with a message.
if isinstance(df.index, pd.DatetimeIndex):
    df['Month'] = df.index.month
    df['Year'] = df.index.year
    plt.figure(figsize=(12, 6))
    sns.boxplot(x='Month', y='Price', data=df)
    plt.title("Monthly Price Variation")
    plt.xlabel("Month")
    plt.ylabel("Price")
    plt.show()
else:
    print("Skipping monthly price variation: the DataFrame index is not a DatetimeIndex.")

In [None]:
# Count plots for at most the first three object-dtype columns,
# with bars ordered from most to least frequent category.
categorical_cols = df.select_dtypes(include=['object']).columns
to_plot = categorical_cols[:3]  # a slice already caps at 3 and keeps shorter indexes whole
for col in to_plot:
    level_order = df[col].value_counts().index
    plt.figure(figsize=(10, 5))
    sns.countplot(data=df, x=col, order=level_order)
    plt.title(f"Distribution of {col}")
    plt.xticks(rotation=45)
    plt.show()