#### Load data and calculate car age

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Location of the cleaned dataset; the relative path is resolved against
# the directory the notebook kernel is running in.
file_path = "../Data/cleaned_data.csv"

# Read the CSV into the working DataFrame used by the cells below.
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the leading rows (last expression, so it is rendered as a table).
df.head()

Unnamed: 0,Company Name,Car Name,Variant,Fuel Type,Tyre Condition,Make Year,Owner Type,Registration Number,Mileage,Price,Transmission Type,Body Color,Service Record,Insurance,Registration Certificate,Accessories
0,Maruti Suzuki,Cruze,EX,CNG,Needs Replacement,2018.0,Second,84-436-5584,52798.0,759107.0,Manual,Grey,Major Service at 50418 km,No Current Insurance,Not Available,"Music System, Sunroof, Alloy Wheels"
1,Kia,Seltos,RXE,Petrol,New,2020.0,Third,79-114-3166,43412.0,505071.0,Automatic,Maroon,Major Service at 131313 km,No Current Insurance,Available,Sunroof
2,Kia,Accord,RXE,Petrol,New,2022.0,Second,41-358-3344,95219.0,635322.0,Automatic (Tiptronic),Black,No Service Record,No Current Insurance,Available,Sunroof
3,Nissan,Seltos,Highline,Diesel,Used,2024.0,Third,92-708-1763,70370.0,483152.0,Automatic (Tiptronic),Maroon,Major Service at 98115 km,Valid Until [date],Available,"Music System, Alloy Wheels"
4,Chevrolet,Kwid,Highline,Petrol,Used,2018.0,Second,76-154-5485,85852.0,712961.0,Automatic (Tiptronic),Silver,Major Service at 135665 km,No Current Insurance,Not Available,"GPS, Music System"


In [2]:
from datetime import datetime

# Reference year for the age calculation, taken from the system clock.
# NOTE(review): because this is read at run time, 'Car Age' (and the saved
# feature file) will shift by one each calendar year — confirm that is intended.
current_year = datetime.now().year

# 'Car Age' = reference year minus the year the car was made.
# Series.rsub(x) evaluates x - series, i.e. current_year - Make Year.
df['Car Age'] = df['Make Year'].rsub(current_year)

# Show the source column next to the derived one as a quick sanity check.
age_preview = df.loc[:, ['Make Year', 'Car Age']].head()
print(age_preview)


   Make Year  Car Age
0     2018.0      6.0
1     2020.0      4.0
2     2022.0      2.0
3     2024.0      0.0
4     2018.0      6.0


In [3]:
# Create a new column 'Price per Mile': listing price divided by distance driven.
#
# A Mileage of exactly 0 would make the division produce +/-inf, and the
# StandardScaler applied later in this notebook raises ValueError on infinite
# values. Masking zero mileage to NaN keeps such rows as "missing" instead;
# rows with non-zero mileage are computed exactly as before.
#
# NOTE(review): the 'Service Record' values mention km, so Mileage may be in
# kilometres rather than miles — confirm. The column name is left unchanged so
# that later cells (and any downstream consumers) keep working.
nonzero_mileage = df['Mileage'].mask(df['Mileage'] == 0)
df['Price per Mile'] = df['Price'] / nonzero_mileage

# Display the first few rows to verify
print(df[['Price', 'Mileage', 'Price per Mile']].head())


      Price  Mileage  Price per Mile
0  759107.0  52798.0       14.377571
1  505071.0  43412.0       11.634364
2  635322.0  95219.0        6.672219
3  483152.0  70370.0        6.865880
4  712961.0  85852.0        8.304536


In [4]:
# Categorical columns that get expanded into indicator (dummy) columns.
# Other text columns (e.g. 'Tyre Condition', 'Service Record', 'Accessories')
# are not listed here and are carried through unchanged.
categorical_cols = [
    'Company Name',
    'Car Name',
    'Variant',
    'Fuel Type',
    'Transmission Type',
    'Body Color',
    'Owner Type',
]

# One-hot encode the listed columns into a new frame; drop_first=True removes
# the first level of each column so k categories become k-1 indicators.
df_encoded = pd.get_dummies(data=df, columns=categorical_cols, drop_first=True)

# Print the leading rows of the encoded frame as a check.
encoded_preview = df_encoded.head()
print(encoded_preview)


      Tyre Condition  Make Year Registration Number  Mileage     Price  \
0  Needs Replacement     2018.0         84-436-5584  52798.0  759107.0   
1                New     2020.0         79-114-3166  43412.0  505071.0   
2                New     2022.0         41-358-3344  95219.0  635322.0   
3               Used     2024.0         92-708-1763  70370.0  483152.0   
4               Used     2018.0         76-154-5485  85852.0  712961.0   

               Service Record             Insurance Registration Certificate  \
0   Major Service at 50418 km  No Current Insurance            Not Available   
1  Major Service at 131313 km  No Current Insurance                Available   
2           No Service Record  No Current Insurance                Available   
3   Major Service at 98115 km    Valid Until [date]                Available   
4  Major Service at 135665 km  No Current Insurance            Not Available   

                           Accessories  Car Age  ...  \
0  Music System, S

## Feature Scaling

In [5]:
from sklearn.preprocessing import StandardScaler

# Numeric columns to standardise.
# NOTE(review): 'Price' looks like a prediction target, and the scaler is fit
# on the whole dataset before any train/test split — confirm this is intended.
numerical_cols = ['Car Age', 'Mileage', 'Price', 'Price per Mile']

# Learn per-column statistics first, then apply them; for StandardScaler this
# two-step form gives the same result as a single fit_transform call.
scaler = StandardScaler()
scaler.fit(df_encoded[numerical_cols])
scaled_values = scaler.transform(df_encoded[numerical_cols])

# Overwrite the original numeric columns with their standardised values.
df_encoded[numerical_cols] = scaled_values

# Print the leading rows of the scaled columns as a check.
scaled_preview = df_encoded[numerical_cols].head()
print(scaled_preview)


    Car Age   Mileage     Price  Price per Mile
0  0.525856 -0.936010  0.653591        0.423144
1 -0.165332 -1.105025 -0.446084        0.174142
2 -0.856521 -0.172127  0.117749       -0.276273
3 -1.547709 -0.619588 -0.540968       -0.258694
4  0.525856 -0.340800  0.453834       -0.128107


In [7]:
# Destination for the engineered-feature dataset (relative to the kernel's
# working directory).
new_file_path = "../Data/feature_dataset.csv"

# Write the encoded and scaled frame to CSV; index=False omits the row index.
df_encoded.to_csv(path_or_buf=new_file_path, index=False)

# Confirm where the file was written.
print(f"New dataset saved at {new_file_path}")


New dataset saved at ../Data/feature_dataset.csv
