# 1. Reading CSV Files with Encodings

In [1]:
import pandas as pd
import numpy as np

# Load the raw laptops data, decoding the file as Latin-1.
laptops = pd.read_csv("laptops.csv", encoding="Latin-1")

# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit an extra "None" line.
laptops.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1303 entries, 0 to 1302
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Manufacturer              1303 non-null   object
 1   Model Name                1303 non-null   object
 2   Category                  1303 non-null   object
 3   Screen Size               1303 non-null   object
 4   Screen                    1303 non-null   object
 5   CPU                       1303 non-null   object
 6   RAM                       1303 non-null   object
 7    Storage                  1303 non-null   object
 8   GPU                       1303 non-null   object
 9   Operating System          1303 non-null   object
 10  Operating System Version  1133 non-null   object
 11  Weight                    1303 non-null   object
 12  Price (Euros)             1303 non-null   object
dtypes: object(13)
memory usage: 132.5+ KB
None


# 2. Cleaning Column Names

In [2]:
# Remove leading/trailing whitespace from every column label
# (e.g. " Storage" carries a stray leading space in the raw file).
new_columns = [label.strip() for label in laptops.columns]
laptops.columns = new_columns

# 3. Cleaning Column Names Continued

In [3]:
import pandas as pd
# Re-read the file so the cleaning below starts again from the raw,
# unmodified column labels.
laptops = pd.read_csv("laptops.csv", encoding="Latin-1")

def clean(column):
    """Normalise a raw column label into a lowercase snake_case name.

    Steps, in order: strip surrounding whitespace, shorten
    "Operating System" to "os", drop parentheses, turn spaces into
    underscores, and lowercase the result.

    Args:
        column: The original column label (a string).

    Returns:
        The cleaned label.
    """
    # (old, new) pairs applied sequentially; order matters because the
    # "Operating System" substitution must happen before spaces are replaced.
    substitutions = (
        ("Operating System", "os"),
        ("(", ""),
        (")", ""),
        (" ", "_"),
    )

    cleaned = column.strip()
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)
    return cleaned.lower()

# Run every existing label through clean() and install the results as the
# frame's new column labels.
new_laptops_columns = [clean(label) for label in laptops.columns]
laptops.columns = new_laptops_columns

# 4. Converting String Columns to Numeric

In [4]:
# Collect the distinct values of the "ram" column for inspection before
# converting it to a numeric dtype.
unique_ram = laptops["ram"].unique()

# 5. Removing Non-Digit Characters 

In [5]:
# Strip the "GB" unit so only the digits remain. `regex=False` makes the
# literal-substring match explicit instead of relying on the default of
# `Series.str.replace`, which differs between pandas versions.
laptops["ram"] = laptops["ram"].str.replace("GB", "", regex=False)

# Re-check the distinct values after the unit has been removed.
unique_ram = laptops["ram"].unique()

# 6. Converting Columns to Numeric Dtypes

In [6]:
# Cast the "ram" column (already stripped of its "GB" suffix in the previous
# cell) to integers, then capture the per-column dtypes.
laptops = laptops.astype({"ram": int})
dtypes = laptops.dtypes

# 7. Renaming Columns

In [7]:
# Record the unit in the column name now that the values are plain integers.
# The renamed frame is reassigned rather than mutated with `inplace=True`,
# which keeps the step chainable and avoids in-place mutation.
laptops = laptops.rename(columns={"ram": "ram_gb"})

# Summary statistics for the numeric RAM column.
ram_gb_desc = laptops["ram_gb"].describe()

# 8. Extracting Values from Strings

In [8]:
# The manufacturer is taken to be the first whitespace-separated token of the
# "gpu" and "cpu" description strings.
gpu_tokens = laptops["gpu"].str.split()
laptops["gpu_manufacturer"] = gpu_tokens.str.get(0)

cpu_tokens = laptops["cpu"].str.split()
laptops["cpu_manufacturer"] = cpu_tokens.str.get(0)

# Frequency of each CPU manufacturer.
cpu_manufacturer_counts = laptops["cpu_manufacturer"].value_counts()

# 9. Correcting Bad Values

In [9]:
# Every known OS label maps to itself, except the variant spelling "Mac OS",
# which is folded into "macOS".
mapping_dict = {
    name: name
    for name in (
        "Android",
        "Chrome OS",
        "Linux",
        "Mac OS",
        "No OS",
        "Windows",
        "macOS",
    )
}
mapping_dict["Mac OS"] = "macOS"

# Series.map yields NaN for any value that is not a key of mapping_dict.
laptops["os"] = laptops["os"].map(mapping_dict)

# 10. Dropping Missing Values

In [10]:
# Drop every row that contains at least one missing value.
laptops_no_null_rows = laptops.dropna(axis="index")

# Drop every column that contains at least one missing value.
laptops_no_null_cols = laptops.dropna(axis="columns")

# 11. Filling Missing Values

In [11]:
# Which operating systems the rows with a missing os_version belong to,
# before any filling.
value_counts_before = laptops.loc[laptops["os_version"].isnull(), "os"].value_counts()

# OS name -> os_version value assigned to every row with that OS.
version_by_os = {
    "macOS": "X",
    "No OS": "Version Unknown",
}
for os_name, version in version_by_os.items():
    laptops.loc[laptops["os"] == os_name, "os_version"] = version

# Same breakdown after filling, to see what is still missing.
value_counts_after = laptops.loc[laptops["os_version"].isnull(), "os"].value_counts()

# 12. Challenge: Clean a String Column 

In [12]:
# Inspect the raw weight strings to see which unit suffixes occur.
unique_weights = laptops["weight"].unique()

print(unique_weights)

# Remove the unit suffix. "kgs" is stripped before "kg" so that no stray "s"
# is left behind. `regex=False` makes both replacements explicit literal
# matches, independent of the pandas version's default.
laptops["weight"] = (
    laptops["weight"]
    .str.replace("kgs", "", regex=False)
    .str.replace("kg", "", regex=False)
)

# Move the unit into the column name; reassign instead of using
# `inplace=True` to avoid in-place mutation of the frame.
laptops = laptops.rename(columns={"weight": "weight_kg"})

laptops["weight_kg"] = laptops["weight_kg"].astype(float)

# Persist the cleaned data without the positional index column.
laptops.to_csv("laptops_cleaned.csv", index=False)

['1.37kg' '1.34kg' '1.86kg' '1.83kg' '2.1kg' '2.04kg' '1.3kg' '1.6kg'
 '2.2kg' '0.92kg' '1.22kg' '0.98kg' '2.5kg' '1.62kg' '1.91kg' '2.3kg'
 '1.35kg' '1.88kg' '1.89kg' '1.65kg' '2.71kg' '1.2kg' '1.44kg' '2.8kg'
 '2kg' '2.65kg' '2.77kg' '3.2kg' '0.69kg' '1.49kg' '2.4kg' '2.13kg'
 '2.43kg' '1.7kg' '1.4kg' '1.8kg' '1.9kg' '3kg' '1.252kg' '2.7kg' '2.02kg'
 '1.63kg' '1.96kg' '1.21kg' '2.45kg' '1.25kg' '1.5kg' '2.62kg' '1.38kg'
 '1.58kg' '1.85kg' '1.23kg' '1.26kg' '2.16kg' '2.36kg' '2.05kg' '1.32kg'
 '1.75kg' '0.97kg' '2.9kg' '2.56kg' '1.48kg' '1.74kg' '1.1kg' '1.56kg'
 '2.03kg' '1.05kg' '4.4kg' '1.90kg' '1.29kg' '2.0kg' '1.95kg' '2.06kg'
 '1.12kg' '1.42kg' '3.49kg' '3.35kg' '2.23kg' '4.42kg' '2.69kg' '2.37kg'
 '4.7kg' '3.6kg' '2.08kg' '4.3kg' '1.68kg' '1.41kg' '4.14kg' '2.18kg'
 '2.24kg' '2.67kg' '2.14kg' '1.36kg' '2.25kg' '2.15kg' '2.19kg' '2.54kg'
 '3.42kg' '1.28kg' '2.33kg' '1.45kg' '2.79kg' '1.84kg' '2.6kg' '2.26kg'
 '3.25kg' '1.59kg' '1.13kg' '1.78kg' '1.10kg' '1.15kg' '1.27kg' '1.43kg

# 13. Next Steps

Our dataset is ready for some analysis, but there are still some data cleaning tasks left! Here are your next steps:

Convert the `price_euros` column to a numeric dtype.

Extract the screen resolution from the `screen` column.

Extract the processor speed from the `cpu` column.

Here are some questions you might like to answer in your own time by analyzing the cleaned data:

Are laptops made by Apple more expensive than those made by other manufacturers?

What is the best-value laptop with a screen size of 15" or more?

Which laptop has the most storage space?

The final mission in our course is a guided project, where we'll put everything together to clean and analyze a dataset using pandas!