## Data Cleaning Basics

In [1]:
# reading files with "Encoding"
import numpy as np 
import pandas as pd 
# type of encoding to read the file correctly use "Latin-1" 
# you can try one out of ["UTF-8",  "Latin-1 (also known as ISO-8859-1)" , " Windows-1251"]

laptops = pd.read_csv("laptops.csv" , encoding = "Latin-1") 
laptops.head()

Unnamed: 0,Manufacturer,Model Name,Category,Screen Size,Screen,CPU,RAM,Storage,GPU,Operating System,Operating System Version,Weight,Price (Euros)
0,Apple,MacBook Pro,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,,1.37kg,133969
1,Apple,Macbook Air,Ultrabook,"13.3""",1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,,1.34kg,89894
2,HP,250 G6,Notebook,"15.6""",Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,,1.86kg,57500
3,Apple,MacBook Pro,Ultrabook,"15.4""",IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,,1.83kg,253745
4,Apple,MacBook Pro,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,,1.37kg,180360


In [2]:
laptops.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1303 entries, 0 to 1302
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Manufacturer              1303 non-null   object
 1   Model Name                1303 non-null   object
 2   Category                  1303 non-null   object
 3   Screen Size               1303 non-null   object
 4   Screen                    1303 non-null   object
 5   CPU                       1303 non-null   object
 6   RAM                       1303 non-null   object
 7    Storage                  1303 non-null   object
 8   GPU                       1303 non-null   object
 9   Operating System          1303 non-null   object
 10  Operating System Version  1133 non-null   object
 11  Weight                    1303 non-null   object
 12  Price (Euros)             1303 non-null   object
dtypes: object(13)
memory usage: 132.5+ KB


- Next, let's use the DataFrame.columns attribute to remove whitespaces from the column names.

In [3]:
#data cleaning basics how to clean dataframe columns names "dataframe.columns"
#removing the spaces from start and end of the names
new_cols = []
for col in laptops.columns:
    col = col.strip()
    new_cols.append(col)

# modify the column names
laptops.columns = new_cols
laptops.columns

Index(['Manufacturer', 'Model Name', 'Category', 'Screen Size', 'Screen',
       'CPU', 'RAM', 'Storage', 'GPU', 'Operating System',
       'Operating System Version', 'Weight', 'Price (Euros)'],
      dtype='object')

In [4]:
# clean all the names it is easy to use a function to clean all the column names
def clean_col(c):
    c = c.strip()
    c = c.replace(")" , "")
    c = c.replace("(" , "")
    c = c.lower()
    c = c.replace(" " , "_")
    return c

new_cols = []
for col in laptops.columns:
    col = clean_col(col)
    new_cols.append(col)
    
laptops.columns = new_cols
print(laptops.columns)

Index(['manufacturer', 'model_name', 'category', 'screen_size', 'screen',
       'cpu', 'ram', 'storage', 'gpu', 'operating_system',
       'operating_system_version', 'weight', 'price_euros'],
      dtype='object')


In [5]:
# convert string columns to numeric datatype
#first identify the column using series.dtype and series.unique
laptops.iloc[:5 , 2:5]

Unnamed: 0,category,screen_size,screen
0,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600
1,Ultrabook,"13.3""",1440x900
2,Notebook,"15.6""",Full HD 1920x1080
3,Ultrabook,"15.4""",IPS Panel Retina Display 2880x1800
4,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600


- notice the screen_size column is string cause of (") after the number
- we need to remove the (") string then convert the column to float or integer

In [6]:
#exploring the screen size column
print(laptops["screen_size"].dtype)
print(laptops["screen_size"].unique())

object
['13.3"' '15.6"' '15.4"' '14.0"' '12.0"' '11.6"' '17.3"' '10.1"' '13.5"'
 '12.5"' '13.0"' '18.4"' '13.9"' '12.3"' '17.0"' '15.0"' '14.1"' '11.3"']


In [7]:
#removing the " string using pandas string vectorized methods
laptops["screen_size"] = laptops["screen_size"].str.replace('"' , "")
laptops["screen_size"].unique()

array(['13.3', '15.6', '15.4', '14.0', '12.0', '11.6', '17.3', '10.1',
       '13.5', '12.5', '13.0', '18.4', '13.9', '12.3', '17.0', '15.0',
       '14.1', '11.3'], dtype=object)

In [8]:
#convert the column to numeric datatype
laptops["screen_size"] = laptops["screen_size"].astype(float)
laptops["screen_size"].unique()

array([13.3, 15.6, 15.4, 14. , 12. , 11.6, 17.3, 10.1, 13.5, 12.5, 13. ,
       18.4, 13.9, 12.3, 17. , 15. , 14.1, 11.3])

In [9]:
laptops.rename({"screen_size" : "screen_size_inches"} , axis = 1 , inplace = True)
laptops.dtypes

manufacturer                 object
model_name                   object
category                     object
screen_size_inches          float64
screen                       object
cpu                          object
ram                          object
storage                      object
gpu                          object
operating_system             object
operating_system_version     object
weight                       object
price_euros                  object
dtype: object

In [10]:
#explore the "ram" column 
print(laptops["ram"].dtype)
print(laptops["ram"].unique())

object
['8GB' '16GB' '4GB' '2GB' '12GB' '6GB' '32GB' '24GB' '64GB']


In [11]:
# we should remove the "GB" string and then convert to numeric
laptops["ram"] = laptops["ram"].str.replace("GB" , "")
laptops["ram"].unique()

array(['8', '16', '4', '2', '12', '6', '32', '24', '64'], dtype=object)

In [12]:
#convert the column to int values
laptops["ram"] = laptops["ram"].astype(int)
laptops["ram"].dtype

dtype('int32')

In [13]:
#renamming the "ram" to "ram_gb"
laptops.rename({"ram" : "ram_gb"} , axis = 1, inplace = True)
laptops.dtypes

manufacturer                 object
model_name                   object
category                     object
screen_size_inches          float64
screen                       object
cpu                          object
ram_gb                        int32
storage                      object
gpu                          object
operating_system             object
operating_system_version     object
weight                       object
price_euros                  object
dtype: object

### extracting values from string

In [14]:
print(laptops["gpu"].head())

0    Intel Iris Plus Graphics 640
1          Intel HD Graphics 6000
2           Intel HD Graphics 620
3              AMD Radeon Pro 455
4    Intel Iris Plus Graphics 650
Name: gpu, dtype: object


- how to extrct the manufacturer intel/amd

In [15]:
# extracting the first word from the string
laptops["gpu"].str.split().head().str[0]

0    Intel
1    Intel
2    Intel
3      AMD
4    Intel
Name: gpu, dtype: object

In [16]:
# extract the model from the string 
"eslam hosam".split()

['eslam', 'hosam']

In [17]:
print(laptops["cpu"].head())

0          Intel Core i5 2.3GHz
1          Intel Core i5 1.8GHz
2    Intel Core i5 7200U 2.5GHz
3          Intel Core i7 2.7GHz
4          Intel Core i5 3.1GHz
Name: cpu, dtype: object


In [18]:
cpu_manufacturer = laptops["cpu"].str.split().str[0]

In [19]:
cpu_manuf_counts = cpu_manufacturer.value_counts()
cpu_manuf_counts

Intel      1240
AMD          62
Samsung       1
Name: cpu, dtype: int64

In [20]:
laptops["cpu"].str.split().str[0] + " " + laptops["cpu"].str.split().str[1] + " " + laptops["cpu"].str.split().str[2]

0            Intel Core i5
1            Intel Core i5
2            Intel Core i5
3            Intel Core i7
4            Intel Core i5
               ...        
1298         Intel Core i7
1299         Intel Core i7
1300    Intel Celeron Dual
1301         Intel Core i7
1302    Intel Celeron Dual
Name: cpu, Length: 1303, dtype: object

In [21]:
#correcting bad values 
laptops.operating_system.unique()

array(['macOS', 'No OS', 'Windows', 'Mac OS', 'Linux', 'Android',
       'Chrome OS'], dtype=object)

In [22]:
dct = {"macOS": "macos",
       "No Os" : "Android",
      "Windows" : "Windows",
      "Chrome OS" : "Chraome OS",
      "Linux" : "Linux",
      "Android" : "Android",
      "Mac OS" : "macos"}

laptops.operating_system = laptops.operating_system.map(dct)


In [23]:
laptops.operating_system.value_counts(dropna = False)

Windows       1125
NaN             66
Linux           62
Chraome OS      27
macos           21
Android          2
Name: operating_system, dtype: int64

**try this one to count the number of laptobs for each company**

In [24]:
# how many laptops have intel gpu and Amd and others

Intel_laptops  = []
Amd_laptops = []
others = []

for lst in laptops.gpu.str.split():
    if "Intel" in lst :
        Intel_laptops.append(lst)
        
    elif "AMD" in lst:
        Amd_laptops.append(lst)
    else:
        others.append(lst)
        
print("Intel has {} laptops ".format(len(Intel_laptops)))
print("AMD has {} laptops".format(len(Amd_laptops)))
print("others has {}".format(len(others)))

Intel has 722 laptops 
AMD has 180 laptops
others has 401


**compare the beauty of pandas to get the same results**

In [25]:
# for Intel company
laptops.gpu.str.split().str[0].isin (["Intel"]).value_counts()

True     722
False    581
Name: gpu, dtype: int64

In [26]:
# for AMD company
laptops.gpu.str.split().str[0].isin (["AMD"]).value_counts()

False    1123
True      180
Name: gpu, dtype: int64

In [27]:
# for Nvidia  company
laptops.gpu.str.split().str[0].isin (["Nvidia"]).value_counts()

False    903
True     400
Name: gpu, dtype: int64