# EDA (Exploratory Data Analysis)

- EDA stands for Exploratory Data Analysis. It is an approach to analyzing and visualizing data sets to summarize their main characteristics, often with the help of statistical graphics and other data visualization methods. The primary goal of EDA is to uncover patterns, relationships, anomalies, and insights within the data, helping to guide further analysis or hypothesis testing.

- **CAR PRICE PREDICTION(Used cars)**


# Importing Libraries:

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 


# Read data 

In [None]:
# Let pandas show up to 200 rows and 200 columns before truncating output.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 200)

In [None]:
# Load the raw training CSV into a DataFrame. The bare name on the last line
# makes the notebook render the loaded table.
car_price_data = pd.read_csv(
    filepath_or_buffer='D:/DS NOTE/OASIS INFO BYTE/Car price prediction(used cars)/trian/raw/train.csv'
)
car_price_data

# Data preprocessing

In [None]:
# Print a concise summary of the DataFrame: per pandas' documentation this
# lists each column with its non-null count and dtype, plus memory usage.
car_price_data.info()

In [None]:
# Preview the first five rows.
car_price_data.head(n=5)

In [None]:
# Preview the last five rows.
car_price_data.tail(n=5)

In [None]:
# Count the missing entries in each column.
car_price_data.isna().sum()

In [None]:
# Count rows that repeat an earlier row (the first occurrence is not counted).
car_price_data.duplicated(keep='first').sum()

In [None]:
# Generate descriptive statistics for the DataFrame's columns; the result is
# displayed by the notebook because it is the cell's last expression.
car_price_data.describe()

# Outlier Detection 
- Detect outliers in the numeric columns. Outliers affect the distribution and the model, so we handle them
  by capping with the interquartile range (IQR) method.

In [None]:
# Draw one boxplot per non-object column so outliers are visible.
non_object_columns = [
    name for name in car_price_data.columns
    if car_price_data[name].dtype != 'object'
]
for column in non_object_columns:
    plt.figure(figsize=(10, 5))
    sns.boxplot(data=car_price_data, y=column)
    plt.title(f'Boxplot of {column}')
    plt.show()



# IQR (Interquartile Range) Method
 - Handling outliers

In [None]:
# Per-column IQR statistics are collected here by the capping loop.
Outlier_list = dict()

In [None]:
# Cap outliers column by column with the 1.5 * IQR rule and record the
# statistics used for each column in Outlier_list.
for column in ['Year', 'Selling_Price', 'Present_Price', 'Driven_kms', 'Owner']:
    q1 = car_price_data[column].quantile(0.25)
    q3 = car_price_data[column].quantile(0.75)
    IQR = q3 - q1
    lower_range = q1 - (1.5 * IQR)
    upper_range = q3 + (1.5 * IQR)

    # Record the statistics first so every column is reported, capped or not.
    Outlier_list[column] = {'q1': q1, 'q3': q3, 'IQR': IQR,
                            'lower_range': lower_range, 'upper_range': upper_range}

    # When IQR is 0 both bounds equal q1, so capping would overwrite every
    # value in the column with that single constant. Leave such columns as-is.
    if IQR == 0:
        continue

    # Capping: values below the lower bound become the lower bound, values
    # above the upper bound become the upper bound, the rest are kept.
    column_values = car_price_data[column]
    car_price_data[column] = np.where(
        column_values < lower_range, lower_range,
        np.where(column_values > upper_range, upper_range, column_values),
    )


In [None]:
# Print the recorded quartiles, IQR and capping bounds for each column.
for column, values in Outlier_list.items():
    report = (
        f"Column: {column}\n"
        f"q1:{values['q1']} \n"
        f"q3:{values['q3']} \n"
        f"IQR:{values['IQR']} \n"
        f"lower_range:{values['lower_range']} \n"
        f"upper_range:{values['upper_range']}\n"
        "-----------------------"
    )
    print(report)

    

# After handling outlier

In [None]:
# Redraw the boxplots of the non-object columns to inspect the capped data.
for column in car_price_data.columns:
    if car_price_data[column].dtype == 'object':
        continue  # only non-object columns are plotted
    plt.figure(figsize=(10, 5))
    sns.boxplot(data=car_price_data, y=column)
    plt.title(f'Boxplot of {column}')
    plt.show()


# Correlation between X and Y
- Check for any correlation between the features and the target, because data points should be correlated only among the features, not with the target.
- If they are correlated with both the features and the target, it will affect the model's predictions.

In [None]:

# Replace every object-dtype column with the integer codes of its categories.
cat_columns = car_price_data.select_dtypes(include=['object']).columns
for col in cat_columns:
    as_category = car_price_data[col].astype('category')
    car_price_data[col] = as_category.cat.codes

In [None]:
# Pairwise Pearson correlation between all columns of the DataFrame.
corr_matrix = car_price_data.corr(method='pearson')

In [None]:
# Render the correlation matrix as an annotated heatmap.
plt.figure(figsize=(10, 8))
heatmap_axes = sns.heatmap(
    corr_matrix,
    annot=True,
    fmt='.2f',
    cmap='Blues',
    linewidths=0.5,
    linecolor='green',
)
heatmap_axes.set_title('Correlation Matrix')
plt.show()


- The data points have a positive correlation


In [None]:
# Number of cars per value of the Year column.
plt.figure(figsize=(12, 6))
year_axes = sns.countplot(data=car_price_data, x='Year')
year_axes.set_title('Number of Cars for Each Year')
year_axes.set_xlabel('Year')
year_axes.set_ylabel('No.of.Count car')
plt.show()

In [None]:
# Histogram (10 bins) of Selling_Price with a KDE curve overlaid.
plt.figure(figsize=(12, 6))
price_axes = sns.histplot(car_price_data['Selling_Price'], kde=True, bins=10)
price_axes.set_title('Distribution of Selling Prices')
price_axes.set_xlabel('Selling Price')
price_axes.set_ylabel('Frequency')
plt.show()

In [None]:
# Turn the integer codes written by the label-encoding step back into readable
# fuel names. `.cat.codes` numbers categories from 0 in sorted order, so raw
# values CNG / Diesel / Petrol are encoded as 0 / 1 / 2.
# NOTE(review): assumes the raw column held exactly those three values; confirm
# against the CSV. Any code missing from this dict is mapped to NaN.
car_price_data['Fuel_Type'] = car_price_data['Fuel_Type'].map({0: 'CNG', 1: 'diesel', 2: 'petrol'})


In [None]:
# Compare the Selling_Price distribution across fuel types, one colour per type.
plt.figure(figsize=(12, 6))
fuel_axes = sns.boxplot(
    data=car_price_data,
    x='Fuel_Type',
    y='Selling_Price',
    hue='Fuel_Type',
)
fuel_axes.set_title('Boxplot of Selling Prices for Each Fuel Type')
fuel_axes.set_xlabel('Fuel Type')
fuel_axes.set_ylabel('Selling Price')
plt.show()


In [None]:
# Plot Selling_Price against Present_Price, one point per row.
plt.figure(figsize=(12, 6))
scatter_axes = sns.scatterplot(data=car_price_data, x='Present_Price', y='Selling_Price')
scatter_axes.set_title('Present Price vs Selling Price')
scatter_axes.set_xlabel('Present Price')
scatter_axes.set_ylabel('Selling Price')
plt.show()

In [None]:
# Turn the integer codes written by the label-encoding step back into readable
# names. `.cat.codes` numbers categories from 0 in sorted order, so raw values
# Automatic / Manual are encoded as 0 / 1.
# NOTE(review): assumes the raw column held exactly those two values; confirm
# against the CSV. Any code missing from this dict is mapped to NaN.
car_price_data['Transmission'] = car_price_data['Transmission'].map({0: 'Automatic', 1: 'Manual'})

In [None]:
# Number of cars per transmission type, one colour per type.
plt.figure(figsize=(8, 6))
transmission_axes = sns.countplot(
    data=car_price_data,
    x='Transmission',
    hue='Transmission',
)
transmission_axes.set_title('Number of Cars by Transmission Type')
transmission_axes.set_xlabel('Transmission Type')
transmission_axes.set_ylabel('Count')
plt.show()

In [None]:

# Visualization 6: bar chart of how many rows fall under each Fuel_Type value.
plt.figure(figsize=(8, 6))
fuel_type_counts = car_price_data['Fuel_Type'].value_counts()
fuel_type_counts.plot.bar()
plt.title('Fuel Type Distribution')
plt.xlabel('Fuel Type')
plt.ylabel('Count')
plt.show()


In [None]:
# Pairwise scatter/distribution grid for four numeric columns.
pairplot_columns = ['Year', 'Selling_Price', 'Present_Price', 'Driven_kms']
sns.pairplot(car_price_data[pairplot_columns])
# y=1.02 places the figure title slightly above the top of the grid.
plt.suptitle('Pair Plot of Numerical Variables', y=1.02)
plt.show()


In [None]:
# Number of cars per fuel type, split into one bar per transmission type.
plt.figure(figsize=(12, 6))
fuel_count_axes = sns.countplot(data=car_price_data, x='Fuel_Type', hue='Transmission')
fuel_count_axes.set_title('Count of Cars by Fuel Type with Transmission as Hue')
fuel_count_axes.set_xlabel('Fuel Type')
fuel_count_axes.set_ylabel('Count')
plt.show()

In [None]:
# Compare the Selling_Price distribution across Selling_type values.
# NOTE(review): if Selling_type was an object column it now holds the integer
# codes from the label-encoding step, so the x-axis shows codes, not names.
plt.figure(figsize=(12, 6))
selling_type_axes = sns.boxplot(data=car_price_data, x='Selling_type', y='Selling_Price')
selling_type_axes.set_title('Boxplot of Selling Prices for Each Selling Type')
selling_type_axes.set_xlabel('Selling Type')
selling_type_axes.set_ylabel('Selling Price')
plt.show()
