## Data Cleaning with Pandas

In [1]:
# import pandas
import pandas as pd

In [3]:
# load the dataset
df = pd.read_csv('laptopData.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Company,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price
0,0.0,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,71378.6832
1,1.0,Apple,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,47895.5232
2,2.0,HP,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,30636.0
3,3.0,Apple,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,135195.336
4,4.0,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,96095.808


In [5]:
# Understand your data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1303 entries, 0 to 1302
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        1273 non-null   float64
 1   Company           1273 non-null   object 
 2   TypeName          1273 non-null   object 
 3   Inches            1273 non-null   object 
 4   ScreenResolution  1273 non-null   object 
 5   Cpu               1273 non-null   object 
 6   Ram               1273 non-null   object 
 7   Memory            1273 non-null   object 
 8   Gpu               1273 non-null   object 
 9   OpSys             1273 non-null   object 
 10  Weight            1273 non-null   object 
 11  Price             1273 non-null   float64
dtypes: float64(2), object(10)
memory usage: 122.3+ KB


In [7]:
# Initial data inspection
print("Initial data shape: ", df.shape)
print("Number of rows: ", df.shape[0])
print("Number of columns: ", df.shape[1])

Initial data shape:  (1303, 12)
Number of rows:  1303
Number of columns:  12


In [8]:
# Return missing values
print("Missing values in each column BEFORE cleaning:")
print(df.isnull().sum())

Missing values in each column before cleaning:
Unnamed: 0          30
Company             30
TypeName            30
Inches              30
ScreenResolution    30
Cpu                 30
Ram                 30
Memory              30
Gpu                 30
OpSys               30
Weight              30
Price               30
dtype: int64


In [9]:
print("Number of duplicate rows BEFORE cleaning:")
print(df.duplicated().sum())

Number of duplicate rows before cleaning:
29


### Handle Missing Values

In [10]:
# replace anywhere we have question mark (?) in each column with NAN (Not a Number)
df.replace("?", pd.NA, inplace=True)

In [12]:
# Dropped any row with missing data
critical_cols = ["Company", "Cpu", "Ram", "Memory", "Gpu", "OpSys", "Weight", "Price"]

# Drop the rows with the missing data
df = df.dropna(subset=critical_cols)

In [13]:
# Drop duplicate values
df = df.drop_duplicates()

print("Cleaned data shape: ", df.shape)

Cleaned data shape:  (1271, 12)


In [14]:
df.head(10)

Unnamed: 0.1,Unnamed: 0,Company,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price
0,0.0,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,71378.6832
1,1.0,Apple,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,47895.5232
2,2.0,HP,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,30636.0
3,3.0,Apple,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,135195.336
4,4.0,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,96095.808
5,5.0,Acer,Notebook,15.6,1366x768,AMD A9-Series 9420 3GHz,4GB,500GB HDD,AMD Radeon R5,Windows 10,2.1kg,21312.0
6,6.0,Apple,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.2GHz,16GB,256GB Flash Storage,Intel Iris Pro Graphics,Mac OS X,2.04kg,114017.6016
7,7.0,Apple,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,256GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,61735.536
8,8.0,Asus,Ultrabook,14.0,Full HD 1920x1080,Intel Core i7 8550U 1.8GHz,16GB,512GB SSD,Nvidia GeForce MX150,Windows 10,1.3kg,79653.6
9,9.0,Acer,Ultrabook,14.0,IPS Panel Full HD 1920x1080,Intel Core i5 8250U 1.6GHz,8GB,256GB SSD,Intel UHD Graphics 620,Windows 10,1.6kg,41025.6


### Standardize our columns (convert string (objects) into numbers (floats or int))

In [55]:
df["Weight"] = df["Weight"].replace("kg", "", regex=False)
df["Weight"] = pd.to_numeric(df["Weight"], errors="coerce")

In [51]:
df["Price"] = pd.to_numeric(df["Price"], errors="coerce")

In [54]:
# Windows 10 --> windows_10
df["OpSys"] = df["OpSys"].str.lower().str.replace(" ", "_")

In [21]:
print("Current data shape: ", df.shape)

Current data shape:  (1271, 12)


In [22]:
df.head(10)

Unnamed: 0.1,Unnamed: 0,Company,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price
0,0.0,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macos,1.37,71378.6832
1,1.0,Apple,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macos,1.34,47895.5232
2,2.0,HP,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,no_os,1.86,30636.0
3,3.0,Apple,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macos,1.83,135195.336
4,4.0,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macos,1.37,96095.808
5,5.0,Acer,Notebook,15.6,1366x768,AMD A9-Series 9420 3GHz,4GB,500GB HDD,AMD Radeon R5,windows_10,2.1,21312.0
6,6.0,Apple,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.2GHz,16GB,256GB Flash Storage,Intel Iris Pro Graphics,mac_os_x,2.04,114017.6016
7,7.0,Apple,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,256GB Flash Storage,Intel HD Graphics 6000,macos,1.34,61735.536
8,8.0,Asus,Ultrabook,14.0,Full HD 1920x1080,Intel Core i7 8550U 1.8GHz,16GB,512GB SSD,Nvidia GeForce MX150,windows_10,1.3,79653.6
9,9.0,Acer,Ultrabook,14.0,IPS Panel Full HD 1920x1080,Intel Core i5 8250U 1.6GHz,8GB,256GB SSD,Intel UHD Graphics 620,windows_10,1.6,41025.6


In [25]:
## Standardize our Cpu
# extract cpu speed from the Cpu column
def extract_cpu(cpu_info):
    try:
        return float(cpu_info.split()[-1][:-3])  # 2.7GHz will now be 2.7
    except Exception as e:
        return None

# create a new column in our dataset for this new Cpu data
df["Cpu_Speed"] = df["Cpu"].apply(extract_cpu)

In [26]:
print("Current data shape: ", df.shape)

Current data shape:  (1271, 13)


In [28]:
# Standardize our memory column
def convert_memory(memory):
    try:
        if "GB" in memory:
            return int(memory.replace("GB", " ")) * 1024  # converting GB to MB
        elif "TB" in memory:
            return int(memory.replace("TB", " ")) * 1024 * 1024  # converting TB to MB
    except Exception as e:
        return None

# creating a new column in our dataset for this new memory MB data
df["Memory_MB"] = df["Memory"].apply(convert_memory)

### Replacing missing values (NaN) with the Average of that column

In [31]:
# Fill NA/NaN values using the specified method.
df["Weight"].fillna(df["Weight"].mean(), inplace=True)
df["Price"].fillna(df["Price"].mean(), inplace=True)
df["Cpu_Speed"].fillna(df["Cpu_Speed"].mean(), inplace=True)
df["Memory_MB"].fillna(df["Memory_MB"].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Weight"].fillna(df["Weight"].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Price"].fillna(df["Price"].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are 

### Final Testing (Cleaned Version of Our DataSet)

In [34]:
print("Final data shape: ", df.shape)
df.head()

Final data shape:  (1271, 14)


Unnamed: 0.1,Unnamed: 0,Company,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price,Cpu_Speed,Memory_MB
0,0.0,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macos,1.37,71378.6832,2.3,
1,1.0,Apple,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macos,1.34,47895.5232,1.8,
2,2.0,HP,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,no_os,1.86,30636.0,2.5,
3,3.0,Apple,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macos,1.83,135195.336,2.7,
4,4.0,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macos,1.37,96095.808,3.1,


In [35]:
print("Missing values in each column AFTER cleaning:")
print(df.isnull().sum())

Missing values in each column AFTER cleaning:
Unnamed: 0             0
Company                0
TypeName               0
Inches                 1
ScreenResolution       0
Cpu                    0
Ram                    0
Memory                 0
Gpu                    0
OpSys                  0
Weight                 0
Price                  0
Cpu_Speed              0
Memory_MB           1271
dtype: int64


In [40]:
# Drop the rows with the missing values under Indices column
df.dropna(subset=["Inches"], inplace=True)
df.isnull().sum()

Unnamed: 0             0
Company                0
TypeName               0
Inches                 0
ScreenResolution       0
Cpu                    0
Ram                    0
Memory                 0
Gpu                    0
OpSys                  0
Weight                 0
Price                  0
Cpu_Speed              0
Memory_MB           1270
dtype: int64

In [37]:
print("Number of duplicate rows AFTER cleaning:")
print(df.duplicated().sum())

Number of duplicate rows AFTER cleaning:
0


In [41]:
# print datatypes
print("All the data types:")
print(df.dtypes)

All the data types:
Unnamed: 0          float64
Company              object
TypeName             object
Inches               object
ScreenResolution     object
Cpu                  object
Ram                  object
Memory               object
Gpu                  object
OpSys                object
Weight              float64
Price               float64
Cpu_Speed           float64
Memory_MB           float64
dtype: object


In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1270 entries, 0 to 1302
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        1270 non-null   float64
 1   Company           1270 non-null   object 
 2   TypeName          1270 non-null   object 
 3   Inches            1270 non-null   object 
 4   ScreenResolution  1270 non-null   object 
 5   Cpu               1270 non-null   object 
 6   Ram               1270 non-null   object 
 7   Memory            1270 non-null   object 
 8   Gpu               1270 non-null   object 
 9   OpSys             1270 non-null   object 
 10  Weight            1270 non-null   float64
 11  Price             1270 non-null   float64
 12  Cpu_Speed         1270 non-null   float64
 13  Memory_MB         0 non-null      float64
dtypes: float64(5), object(9)
memory usage: 148.8+ KB
