# Pandas: Data Cleaning

In [1]:
# pip install pandas

import pandas as pd


In [2]:
# Load the CSV file into a DataFrame
df = pd.read_csv('automobile_data.csv')

# Display basic information about the DataFrame.
# DataFrame.info() writes its summary straight to stdout and returns None,
# so it is called on its own: wrapping it in print() would append a stray
# "None" line after the summary.
print("DataFrame Info:")
df.info()


DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94 entries, 0 to 93
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Make              91 non-null     object 
 1   Model             91 non-null     object 
 2   Year              91 non-null     float64
 3   Mileage           91 non-null     float64
 4   Miles Per Gallon  91 non-null     float64
 5   engine size       91 non-null     object 
 6   Transmission      91 non-null     object 
 7   FuelType          91 non-null     object 
 8   Price             81 non-null     float64
dtypes: float64(4), object(5)
memory usage: 6.7+ KB
None


In [3]:
# Snapshot of the header exactly as it was read from the file
col_list = df.columns.to_list()
print("\nOrig Col List:\t\t", col_list)

# Normalise every header: lowercase first, then underscores instead of spaces
lowercased = df.columns.str.lower()
df.columns = lowercased.str.replace(' ', '_')
col_list = df.columns.to_list()
print("Converted Col List:\t", col_list)

# Swap two of the normalised names for shorter / more descriptive ones
column_renames = {
    'miles_per_gallon': 'mpg',
    'engine_size': 'displacement',
}
df.rename(columns=column_renames, inplace=True)
col_list = df.columns.to_list()
print("Updated Col List:\t", col_list)



Orig Col List:		 ['Make', 'Model', 'Year', 'Mileage', 'Miles Per Gallon', 'engine size', 'Transmission', 'FuelType', 'Price']
Converted Col List:	 ['make', 'model', 'year', 'mileage', 'miles_per_gallon', 'engine_size', 'transmission', 'fueltype', 'price']
Updated Col List:	 ['make', 'model', 'year', 'mileage', 'mpg', 'displacement', 'transmission', 'fueltype', 'price']


In [6]:
# Display DataFrame data.
# print() shows pandas' abbreviated text view: a long frame has its middle rows
# replaced by a ".." marker row and a wide frame is wrapped onto several blocks.
print(df) 
# Alternative that renders every row and column without abbreviation:
# print(df.to_string())


           make                model    year  mileage   mpg displacement  \
0       Ferrari              488 GTB  2021.0  12000.0  18.0      3.9L V8   
1       Ferrari              488 GTB  2021.0  12000.0  18.0      3.9L V8   
2   Lamborghini            Aventador  2020.0   9000.0  15.0     6.5L V12   
3   Lamborghini            Aventador  2020.0   9000.0  15.0     6.5L V12   
4       Porsche          911 Turbo S  2022.0   8000.0  20.0      3.8L H6   
..          ...                  ...     ...      ...   ...          ...   
89          BMW                   M1  2021.0   9000.0  17.0      3.5L I6   
90         Audi        Quattro Sport  2022.0   7000.0  20.0      2.1L I5   
91       Jaguar                XJ220  2021.0   6000.0  17.0      3.5L V6   
92        Tesla  Model 3 Performance  2021.0   3000.0  82.0     Electric   
93      McLaren                   F1  2022.0  10000.0  15.0     6.1L V12   

   transmission  fueltype       price  
0     Automatic    Petrol    320000.0  
1     A

In [7]:
# Drop rows with fueltype col = 'Electric' or 'Hybrid' from the DataFrame.
# The boolean mask is built once, then used to look up the index labels to drop.
print("Number of rows before dropping Electric and Hybrid autos:", len(df))
is_electric_or_hybrid = df['fueltype'].isin(['Electric', 'Hybrid'])
df.drop(df[is_electric_or_hybrid].index, inplace=True)
print("Number of rows after dropping Electric and Hybrid autos:", len(df))

# Drop mileage column from the DataFrame.
# errors='ignore' keeps this cell re-runnable: once the column is gone, a
# second execution would otherwise raise KeyError.
df.drop(columns=['mileage'], errors='ignore', inplace=True)

col_list = df.columns.tolist()
print("\nUpdated Col List:", col_list)


Number of rows before dropping Electric and Hybrid autos: 94
Number of rows after dropping Electric and Hybrid autos: 80

Updated Col List: ['make', 'model', 'year', 'mpg', 'displacement', 'transmission', 'fueltype', 'price']


In [8]:
# Locate rows in which every column is missing
blank_mask = df.isnull().all(axis=1)
empty_rows = df.loc[blank_mask]
print("Empty Rows in DataFrame:\n", empty_rows)

# Remove those fully blank rows, reporting the row count on both sides
print("\nNumber of rows before dropping empty rows:", df.shape[0])
df.dropna(how='all', inplace=True)

# Repeat the lookup: the selection should now come back empty
blank_mask = df.isnull().all(axis=1)
empty_rows = df.loc[blank_mask]
print("\nEmpty Rows in DataFrame:\n", empty_rows)
print("\nNumber of rows after dropping empty rows:", df.shape[0])


Empty Rows in DataFrame:
    make model  year  mpg displacement transmission fueltype  price
26  NaN   NaN   NaN  NaN          NaN          NaN      NaN    NaN
43  NaN   NaN   NaN  NaN          NaN          NaN      NaN    NaN
60  NaN   NaN   NaN  NaN          NaN          NaN      NaN    NaN

Number of rows before dropping empty rows: 80

Empty Rows in DataFrame:
 Empty DataFrame
Columns: [make, model, year, mpg, displacement, transmission, fueltype, price]
Index: []

Number of rows after dropping empty rows: 77


In [9]:
# Find any row with NaN values in the DataFrame
nan_rows = df[df.isnull().any(axis=1)]
print("\nRows with NaN values in DataFrame:\n", nan_rows)

# Drop rows with any NaN values from the DataFrame
print("\nNumber of rows before dropping rows with empty cells:", len(df))
df.dropna(how='any', inplace=True)

# Repeat the same check to confirm no incomplete rows are left
nan_rows = df[df.isnull().any(axis=1)]
print("\nRows with NaN values in DataFrame:\n", nan_rows)
# This count is taken after the drop, so the label says "after" (it used to
# repeat "before", making the two reported counts indistinguishable).
print("\nNumber of rows after dropping rows with empty cells:", len(df))



Rows with NaN values in DataFrame:
             make            model    year   mpg displacement transmission  \
7   Aston Martin             DB11  2022.0  20.0      4.0L V8    Automatic   
18       Porsche   Panamera Turbo  2022.0  20.0      4.0L V8    Automatic   
22      Maserati   Levante Trofeo  2022.0  16.0      3.8L V8    Automatic   
46  Aston Martin       Rapide AMR  2022.0  18.0     6.0L V12    Automatic   
52        Jaguar  XE SV Project 8  2022.0  18.0      5.0L V8    Automatic   
55       Ferrari        GTC4Lusso  2021.0  15.0     6.3L V12    Automatic   
86   Rolls-Royce     Silver Ghost  2022.0  10.0      7.4L I6       Manual   

   fueltype  price  
7    Petrol    NaN  
18   Petrol    NaN  
22   Petrol    NaN  
46   Petrol    NaN  
52   Petrol    NaN  
55   Petrol    NaN  
86   Petrol    NaN  

Number of rows before dropping rows with empty cells: 77

Rows with NaN values in DataFrame:
 Empty DataFrame
Columns: [make, model, year, mpg, displacement, transmission, fuelt

In [10]:
# Flag every row that repeats an earlier row across all columns
repeat_mask = df.duplicated(keep='first')
duplicates = df.loc[repeat_mask]
print("\nDuplicate Rows:\n", duplicates)

# Keep only the first occurrence of each row, reporting counts around the drop
print("\nNumber of rows before removing duplicates:", df.shape[0])
df.drop_duplicates(keep='first', inplace=True)
print("Number of rows after removing duplicates:", df.shape[0])



Duplicate Rows:
           make        model    year   mpg displacement transmission fueltype  \
1      Ferrari      488 GTB  2021.0  18.0      3.9L V8    Automatic   Petrol   
3  Lamborghini    Aventador  2020.0  15.0     6.5L V12    Automatic   Petrol   
5      Porsche  911 Turbo S  2022.0  20.0      3.8L H6    Automatic   Petrol   

      price  
1  320000.0  
3  400000.0  
5  210000.0  

Number of rows before removing duplicates: 70
Number of rows after removing duplicates: 67
