In [3]:
import pandas as pd

In [4]:
file_path = '/Users/spiderman/Documents/GitHub/Assignment2_PDS/raw_data/train.csv'
df = pd.read_csv(file_path)

In [5]:
print("Missing values in each column:\n", df.isnull().sum())

Missing values in each column:
 Unnamed: 0              0
Name                    0
Location                0
Year                    0
Kilometers_Driven       0
Fuel_Type               0
Transmission            0
Owner_Type              0
Mileage                 2
Engine                 36
Power                  36
Seats                  38
New_Price            5032
Price                   0
dtype: int64


In [6]:
# Remove non-numeric characters from columns with units and convert them to numeric
df['Mileage'] = df['Mileage'].str.replace(' kmpl', '').str.replace(' km/kg', '').astype(float)
df['Engine'] = df['Engine'].str.replace(' CC', '').str.replace(' cc', '').astype(float)
df['Power'] = df['Power'].str.replace(' bhp', '').astype(float)

In [7]:
df['Mileage'].fillna(df['Mileage'].median(), inplace=True)  # Median for Mileage
df['Engine'].fillna(df['Engine'].median(), inplace=True)    # Median for Engine
df['Power'].fillna(df['Power'].median(), inplace=True)      # Median for Power
df['Seats'].fillna(df['Seats'].mode()[0], inplace=True)     # Mode for Seats

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Mileage'].fillna(df['Mileage'].median(), inplace=True)  # Median for Mileage
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Engine'].fillna(df['Engine'].median(), inplace=True)    # Median for Engine
The behavior will change in pandas 3.0. This inplace method will never 

In [8]:
df.drop(columns=['New_Price'], inplace=True)

In [9]:
clean_data_path = '/Users/spiderman/Documents/GitHub/Assignment2_PDS/clean_data/train_clean.csv'
df.to_csv(clean_data_path, index=False)

In [10]:
print(f"Cleaned data saved to: {clean_data_path}")

Cleaned data saved to: /Users/spiderman/Documents/GitHub/Assignment2_PDS/clean_data/train_clean.csv


In [11]:
clean_data_path = '/Users/spiderman/Documents/GitHub/Assignment2_PDS/clean_data/train_clean.csv'
df = pd.read_csv(clean_data_path)

In [15]:
# Ensure 'Mileage', 'Engine', and 'Power' are strings before replacing units
df['Mileage'] = df['Mileage'].astype(str).str.replace(' kmpl', '').str.replace(' km/kg', '').astype(float)
df['Engine'] = df['Engine'].astype(str).str.replace(' CC', '').str.replace(' cc', '').astype(float)
df['Power'] = df['Power'].astype(str).str.replace(' bhp', '').astype(float)


In [16]:
if 'New_Price' in df.columns:
    df['New_Price'] = df['New_Price'].str.replace(' lakh', '').astype(float)

In [17]:
clean_data_path_no_units = '/Users/spiderman/Documents/GitHub/Assignment2_PDS/clean_data/train_clean_no_units.csv'
df.to_csv(clean_data_path_no_units, index=False)

In [18]:
print(f"Data with units removed saved to: {clean_data_path_no_units}")

Data with units removed saved to: /Users/spiderman/Documents/GitHub/Assignment2_PDS/clean_data/train_clean_no_units.csv


In [19]:
df = pd.get_dummies(df, columns=['Fuel_Type', 'Transmission'], drop_first=True)

In [20]:
print(df.head())

   Unnamed: 0                              Name    Location  Year  \
0           1  Hyundai Creta 1.6 CRDi SX Option        Pune  2015   
1           2                      Honda Jazz V     Chennai  2011   
2           3                 Maruti Ertiga VDI     Chennai  2012   
3           4   Audi A4 New 2.0 TDI Multitronic  Coimbatore  2013   
4           6            Nissan Micra Diesel XV      Jaipur  2013   

   Kilometers_Driven Owner_Type  Mileage  Engine   Power  Seats  Price  \
0              41000      First    19.67  1582.0  126.20    5.0  12.50   
1              46000      First    13.00  1199.0   88.70    5.0   4.50   
2              87000      First    20.77  1248.0   88.76    7.0   6.00   
3              40670     Second    15.20  1968.0  140.80    5.0  17.74   
4              86999      First    23.08  1461.0   63.10    5.0   3.50   

   Fuel_Type_Electric  Fuel_Type_Petrol  Transmission_Manual  
0               False             False                 True  
1             

In [21]:
from datetime import datetime

In [22]:
current_year = datetime.now().year

In [23]:
df['Car_Age'] = current_year - df['Year']

In [24]:
print(df.head())

   Unnamed: 0                              Name    Location  Year  \
0           1  Hyundai Creta 1.6 CRDi SX Option        Pune  2015   
1           2                      Honda Jazz V     Chennai  2011   
2           3                 Maruti Ertiga VDI     Chennai  2012   
3           4   Audi A4 New 2.0 TDI Multitronic  Coimbatore  2013   
4           6            Nissan Micra Diesel XV      Jaipur  2013   

   Kilometers_Driven Owner_Type  Mileage  Engine   Power  Seats  Price  \
0              41000      First    19.67  1582.0  126.20    5.0  12.50   
1              46000      First    13.00  1199.0   88.70    5.0   4.50   
2              87000      First    20.77  1248.0   88.76    7.0   6.00   
3              40670     Second    15.20  1968.0  140.80    5.0  17.74   
4              86999      First    23.08  1461.0   63.10    5.0   3.50   

   Fuel_Type_Electric  Fuel_Type_Petrol  Transmission_Manual  Car_Age  
0               False             False                 True        