# Step 1: Load and Combine Raw Data

In [12]:
import pandas as pd
import numpy as np
import glob
import os
from sklearn.linear_model import LinearRegression

In [13]:
path = os.path.join('..', 'data', 'raw')
all_files = glob.glob(os.path.join(path, "*.csv"))

# Loop through all files, add a 'brand' column, and append to a list
li = []
for filename in all_files:
    df_temp = pd.read_csv(filename)
    brand = os.path.basename(filename).split('.')[0]
    df_temp['brand'] = brand
    li.append(df_temp)

# Concatenate all dataframes into one
df = pd.concat(li, axis=0, ignore_index=True)

print("Successfully loaded and combined all raw data files.")
df.info()


Successfully loaded and combined all raw data files.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118150 entries, 0 to 118149
Data columns (total 17 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   model         117995 non-null  object 
 1   year          117903 non-null  float64
 2   price         117995 non-null  object 
 3   transmission  117995 non-null  object 
 4   mileage       117077 non-null  object 
 5   fuelType      108540 non-null  object 
 6   tax           94327 non-null   float64
 7   mpg           99187 non-null   float64
 8   engineSize    108540 non-null  float64
 9   brand         118150 non-null  object 
 10  tax(£)        4860 non-null    float64
 11  fuel type     3517 non-null    object 
 12  engine size   9345 non-null    object 
 13  mileage2      9399 non-null    object 
 14  fuel type2    8537 non-null    object 
 15  engine size2  8537 non-null    object 
 16  reference     9455 non-null    object 


# Step 2: Clean and Standardize Data


In [None]:
#Standardize column names to snake_case
df.columns = df.columns.str.lower().str.replace(' ', '_').str.replace(r'[()£]', '', regex=True)

# Consolidate data from columns with similar names
df['engine_size'] = df['engine_size'].fillna(df['enginesize'])
df['fuel_type'] = df['fuel_type'].fillna(df['fueltype'])

# Consolidate the duplicated 'tax' columns
tax_columns = df.columns[df.columns == 'tax']
df['tax_final'] = df[tax_columns].apply(
    lambda x: x.bfill(axis=0).iloc[0] if x.notna().any() else None,
    axis=1
)

# Define and drop all redundant or unnecessary columns
columns_to_drop = ['enginesize', 'fueltype', 'tax', 'mileage2', 'fuel_type2', 'engine_size2', 'reference']
df = df.drop(columns=columns_to_drop, errors='ignore')

# Rename the consolidated 'tax_final' column back to 'tax'
df.rename(columns={'tax_final': 'tax'}, inplace=True)


Successfully standardized and consolidated columns.


# Step 3: Correct Data Types and Formats

In [15]:
# Clean the 'engine_size' column by extracting only numbers
df['engine_size'] = df['engine_size'].astype(str).str.extract(r'(\d+\.?\d*)').astype(float)

# Clean and convert core numeric columns, removing any non-numeric characters
cols_to_convert = ['price', 'tax', 'mileage']
for col in cols_to_convert:
    df[col] = df[col].astype(str).str.replace(r'[£,]', '', regex=True)
    df[col] = pd.to_numeric(df[col], errors='coerce')

print("Successfully corrected all data types.")
df[['price', 'tax', 'mileage', 'engine_size']].info()



Successfully corrected all data types.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118150 entries, 0 to 118149
Data columns (total 4 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   price        117995 non-null  float64
 1   tax          99187 non-null   float64
 2   mileage      117073 non-null  float64
 3   engine_size  117885 non-null  float64
dtypes: float64(4)
memory usage: 3.6 MB


# Step 4: Impute Missing Values

In [16]:
# Use a predictive model (Linear Regression) to impute missing 'mpg' values

# Separate the dataframe into a training set (where mpg is known) and a prediction set
train_df = df[df['mpg'].notna()].copy()
predict_df = df[df['mpg'].isna()].copy()

# Define the features for the model
features = ['year', 'price', 'mileage', 'engine_size', 'tax']

# Fill any remaining NaNs in the feature columns of both sets using the median from the training data
for col in features:
    median_val = train_df[col].median()
    train_df[col].fillna(median_val, inplace=True)
    if not predict_df.empty:
        predict_df[col].fillna(median_val, inplace=True)

# Train the model
model = LinearRegression()
X_train = train_df[features]
y_train = train_df['mpg']
model.fit(X_train, y_train)

# Predict missing values and fill them back into the original dataframe
if not predict_df.empty:
    X_predict = predict_df[features]
    predicted_mpg = model.predict(X_predict)
    df.loc[df['mpg'].isna(), 'mpg'] = predicted_mpg
    print("Successfully imputed 'mpg' using a predictive model.")
else:
    print("No 'mpg' values required predictive imputation.")


# Impute any other remaining missing values in the entire dataframe
for col in df.columns:
    if df[col].isnull().any():
        if df[col].dtype == 'object':
            df[col].fillna(df[col].mode()[0], inplace=True)
        else:
            df[col].fillna(df[col].median(), inplace=True)

print("\nFinal check for missing values (should be 0):", df.isnull().sum().sum())


Successfully imputed 'mpg' using a predictive model.

Final check for missing values (should be 0): 0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  predict_df[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting valu

# Step 5: Save Cleaned Data

In [17]:
# Save the fully cleaned dataframe to the processed folder
output_path = os.path.join('..', 'data', 'processed', 'cleaned_car_data.csv')
df.to_csv(output_path, index=False)

print(f"\nSuccessfully saved the cleaned data to: {output_path}")



Successfully saved the cleaned data to: ..\data\processed\cleaned_car_data.csv
