In [9]:
##import necessary libraries
import numpy as np
import pandas as pd
import glob
import os
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

#sklearn preprocessing
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import Lasso, Ridge, LinearRegression
from sklearn.metrics import mean_squared_error

# sklearn classification models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

#sklearn evaluation metrics and validation
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

In [10]:
# Hard-coded list of the CSV files to combine: nine city-named files plus
# Car_data.csv. The names are relative, so they resolve against the current
# working directory.
file_paths = [
    "Pune.csv",
    "Mumbai.csv",
    "Kolkata.csv",
    "Jaipur.csv",
    "Hyderabad.csv",
    "Gurgaon.csv",
    "Chennai.csv",
    "Car_data.csv",
    "Banglore.csv",
    "Ahmedabad.csv",
]

# Read every file and stack the resulting frames row-wise. ignore_index=True
# drops each file's own row labels and renumbers the combined frame from 0.
df = pd.concat(map(pd.read_csv, file_paths), ignore_index=True)

In [11]:
# Preview the first five rows to see the column layout and raw value formats.
# Kept as the cell's last expression (instead of print()) so Jupyter shows the
# frame through its rich display rather than as a plain-text dump.
df.head()

   Unnamed: 0                      Brand  Make_Year    Fuel  KMs_Driven  \
0           0  2019 Maruti Vitara Brezza       2018  Diesel  34,470 Kms   
1           1  2019 Maruti Vitara Brezza       2018  Diesel  27,133 Kms   
2           2        2017 Maruti Celerio       2017  Petrol  75,447 Kms   
3           3    2017 Maruti Swift Dzire       2017  Diesel  34,110 Kms   
4           4     2016 Hyundai Grand i10       2015  Petrol  27,123 Kms   

  Engine_Displacement No_Of_Owner Transmission        Mileage Max_Power  ...  \
0             1248 cc   1st Owner    Automatic      24.3 kmpl   88.5bhp  ...   
1             1248 cc   1st Owner    Automatic      24.3 kmpl   88.5bhp  ...   
2              998 cc   1st Owner         MH12  Comprehensive    998 CC  ...   
3             1248 cc   1st Owner       Manual      28.4 kmpl  74.02bhp  ...   
4             1197 cc   1st Owner       Manual      18.9 kmpl     82bhp  ...   

  Seats   Color Gear_Box Drive_Type Steering_Type Front_Brake_Type  

In [12]:
# Overview of the dataset: column names, non-null counts and dtypes.
# DataFrame.info() writes its report directly to stdout and returns None, so it
# is called bare; wrapping it in print() appends a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10394 entries, 0 to 10393
Data columns (total 21 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Unnamed: 0           10394 non-null  int64 
 1   Brand                10394 non-null  object
 2   Make_Year            10394 non-null  int64 
 3   Fuel                 10394 non-null  object
 4   KMs_Driven           10394 non-null  object
 5   Engine_Displacement  10394 non-null  object
 6   No_Of_Owner          10394 non-null  object
 7   Transmission         10394 non-null  object
 8   Mileage              10394 non-null  object
 9   Max_Power            10394 non-null  object
 10  Torque               10394 non-null  object
 11  Seats                10394 non-null  int64 
 12  Color                10394 non-null  object
 13  Gear_Box             10386 non-null  object
 14  Drive_Type           10208 non-null  object
 15  Steering_Type        10344 non-null  object
 16  Fron

In [13]:
# Summary statistics (count, mean, std, min, quartiles, max). With default
# arguments describe() covers only the numeric columns of this frame.
# Kept as the cell's last expression (instead of print()) so Jupyter shows the
# result through its rich display.
df.describe()

         Unnamed: 0     Make_Year         Seats
count  10394.000000  10394.000000  10394.000000
mean    1460.271984   2014.891861      5.188378
std     1563.455786      3.094559      0.653699
min        0.000000   1995.000000      2.000000
25%      262.000000   2013.000000      5.000000
50%      594.000000   2015.000000      5.000000
75%     2597.750000   2017.000000      5.000000
max     5196.000000   2021.000000      9.000000


Identify Missing Values