In [2]:
import pandas as pd
import re

# Load the CSV file
file_path = '//content/train.csv'
data = pd.read_csv(file_path)

# Function to extract numeric values from strings
def extract_numeric(series):
    return series.str.extract(r'(\d+\.?\d*)').astype(float)

# Apply the function to relevant columns to extract numeric values
data['Mileage'] = extract_numeric(data['Mileage'])
data['Engine'] = extract_numeric(data['Engine'])
data['Power'] = extract_numeric(data['Power'])

# Impute missing values
# For 'Mileage', 'Engine', and 'Power', use median due to potential outliers
data['Mileage'].fillna(data['Mileage'].median(), inplace=True)
data['Engine'].fillna(data['Engine'].median(), inplace=True)
data['Power'].fillna(data['Power'].median(), inplace=True)

# For 'Seats', use mode as it is likely a categorical value
data['Seats'].fillna(data['Seats'].mode()[0], inplace=True)

# Drop 'New_Price' column due to high proportion of missing values
data.drop(columns=['New_Price'], inplace=True)

# Verify if missing values have been handled
missing_values_after_cleaning = data.isnull().sum()
print(missing_values_after_cleaning)


Unnamed: 0           0
Name                 0
Location             0
Year                 0
Kilometers_Driven    0
Fuel_Type            0
Transmission         0
Owner_Type           0
Mileage              0
Engine               0
Power                0
Seats                0
Price                0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Mileage'].fillna(data['Mileage'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Engine'].fillna(data['Engine'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object 

In [10]:
import pandas as pd
import re

# Load the CSV file
file_path = '/content/train.csv'
data = pd.read_csv(file_path)

# Columns to clean
columns_to_clean = ['Mileage', 'Engine', 'Power', 'New_Price']

# Apply the extraction function to remove units and keep only numeric values
for column in columns_to_clean:
    data[column] = data[column].astype(str).str.extract(r'(\d+\.?\d*)').astype(float)

# Check the cleaned data
print(data[columns_to_clean].head())

# Perform one-hot encoding on 'Fuel_Type' and 'Transmission'
data = pd.get_dummies(data, columns=['Fuel_Type', 'Transmission'], drop_first=True)

# Check the data to confirm encoding
print(data.head())

from datetime import datetime

# Calculate the current year
current_year = datetime.now().year

# Create a new feature 'Car_Age' by subtracting 'Year' from the current year
data['Car_Age'] = current_year - data['Year']

# Display the first few rows to check the new column
print(data[['Year', 'Car_Age']].head())

result = (
    data[['Name', 'Location', 'Year', 'Kilometers_Driven', 'FuelType', 'Price']]
    .query("Year > 2010 & Price < 10.0")
    .rename(columns={'Kilometers_Driven': 'Kms_Driven', 'FuelType': 'Fuel'})
)



   Mileage  Engine   Power  New_Price
0    19.67  1582.0  126.20        NaN
1    13.00  1199.0   88.70       8.61
2    20.77  1248.0   88.76        NaN
3    15.20  1968.0  140.80        NaN
4    23.08  1461.0   63.10        NaN
   Unnamed: 0                              Name    Location  Year  \
0           1  Hyundai Creta 1.6 CRDi SX Option        Pune  2015   
1           2                      Honda Jazz V     Chennai  2011   
2           3                 Maruti Ertiga VDI     Chennai  2012   
3           4   Audi A4 New 2.0 TDI Multitronic  Coimbatore  2013   
4           6            Nissan Micra Diesel XV      Jaipur  2013   

   Kilometers_Driven Owner_Type  Mileage  Engine   Power  Seats  New_Price  \
0              41000      First    19.67  1582.0  126.20    5.0        NaN   
1              46000      First    13.00  1199.0   88.70    5.0       8.61   
2              87000      First    20.77  1248.0   88.76    7.0        NaN   
3              40670     Second    15.20  196

KeyError: "['FuelType'] not in index"