In [4]:
import pandas as pd
from datetime import datetime

# a) Look for the missing values in all the columns and either impute them (replace with mean,
# median, or mode) or drop them. Justify your action for this task.

# Load the data
df = pd.read_csv('train.csv')

# Check for missing values
missing_values = df.isnull().sum()
print("Missing values per column:\n", missing_values)

# If a column has more than 50% missing values, drop the column.
# Justification: when over half of the entries are absent, any imputed value
# would be mostly invented data, so removing the column is safer than filling it.
sparse_columns = missing_values[missing_values > 0.5 * len(df)].index
for column in sparse_columns:
    print(f"Dropped column {column} due to high percentage of missing values.")
df = df.drop(columns=sparse_columns)

# For the remaining columns, replace missing values in numerical columns with the median.
# Justification: the median is not dragged towards extreme values the way the mean is.
# 'number' selects every numeric dtype, not only int64/float64.
numerical_columns = df.select_dtypes(include='number').columns
df[numerical_columns] = df[numerical_columns].fillna(df[numerical_columns].median())

# Replace missing values in the remaining (non-numeric) columns with the mode,
# i.e. the most frequent existing value. Selecting by "not numeric" covers text
# columns whichever string dtype pandas inferred for them.
categorical_columns = df.select_dtypes(exclude='number').columns
for column in categorical_columns:
    modes = df[column].mode()
    # mode() is empty when the column holds no non-null value; nothing to fill with then.
    if not modes.empty:
        # Assign the result back instead of calling fillna(..., inplace=True) on
        # df[column]: the in-place call acts on an intermediate object and, with
        # pandas Copy-on-Write enabled, never reaches df.
        df[column] = df[column].fillna(modes.iloc[0])

# Check if there are still any missing values
missing_values = df.isnull().sum()
print("Missing values after imputation:\n", missing_values)

# b) Remove the units from some of the attributes and only keep the numerical values (for
# example remove kmpl from “Mileage”, CC from “Engine”, bhp from “Power”, and lakh from
# “New_price”).



# Remove units from the attributes and convert the columns to numeric.
#
# Each value is expected to look like "<number> <unit>". The anchored pattern
# captures the number and tolerates one trailing unit word made of letters or
# "/", so the number is kept whichever unit word follows it.
# NOTE(review): if a column mixes different units, their numbers end up side by
# side in one column — confirm that this is acceptable for the analysis.
#
# Anything that does not match (no number at all, thousands separators, ...)
# becomes NaN. Because this step runs after the missing-value imputation, those
# NaN values would otherwise stay in the data unnoticed, so they are filled with
# the column median here and the number of affected rows is reported.
unit_columns = ['Mileage', 'Engine', 'Power']
number_with_unit = r'^\s*([-+]?\d*\.?\d+)\s*[A-Za-z/]*\s*$'

for column in unit_columns:
    number_text = df[column].astype(str).str.extract(number_with_unit, expand=False)
    df[column] = pd.to_numeric(number_text, errors='coerce')

    unparsed_count = int(df[column].isnull().sum())
    if unparsed_count:
        df[column] = df[column].fillna(df[column].median())
        print(f"{column}: {unparsed_count} value(s) had no parsable number and were set to the median.")



# C) Change the categorical variables (“Fuel_Type” and “Transmission”) into numerical one hot
# encoded value.

# Convert categorical variables into one-hot encoded values.
# dtype=int makes the indicator columns hold 0/1; without it, recent pandas
# versions produce True/False columns, which are not the numerical encoding
# the task asks for.
df = pd.get_dummies(df, columns=['Fuel_Type', 'Transmission'], dtype=int)

#d) Create one more feature and add this column to the dataset (you can use mutate function in
#R for this). For example, you can calculate the current age of the car by subtracting “Year” value
#from the current year.

# Year taken from the local clock at the moment the cell runs
current_year = datetime.today().year

# New feature: age of each car, i.e. current_year minus the value in 'Year'
df['Car_Age'] = df['Year'].rsub(current_year)

# Show the first rows of the updated DataFrame
print(df.head())

Missing values per column:
 Unnamed: 0              0
Name                    0
Location                0
Year                    0
Kilometers_Driven       0
Fuel_Type               0
Transmission            0
Owner_Type              0
Mileage                 2
Engine                 36
Power                  36
Seats                  38
New_Price            5032
Price                   0
dtype: int64
Dropped column New_Price due to high percentage of missing values.
Missing values after imputation:
 Unnamed: 0           0
Name                 0
Location             0
Year                 0
Kilometers_Driven    0
Fuel_Type            0
Transmission         0
Owner_Type           0
Mileage              0
Engine               0
Power                0
Seats                0
Price                0
dtype: int64
   Unnamed: 0                              Name    Location  Year  \
0           1  Hyundai Creta 1.6 CRDi SX Option        Pune  2015   
1           2                      Honda Jaz