In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the data
df = pd.read_csv("laptop_data.csv")

### Removing unwanted data like monitors, tablets

In [3]:
#I filtered with GHz cause, every laptop has it mentioned in CPU speed
df = df[df['CPU Speed'].str.contains('GHz')]

### Converting the Number of review to numeric column

In [4]:
# Lets make the review feature as numeric
df['No. Reviews'] = df['No. Reviews'].str.replace(',','').astype(int)

### Converting Display size column to numeric

In [5]:
#Removed irrelavant data and converted it to numeric
df['Display size'] = df[df['Display size'].str.contains('in')]['Display size'].str.replace('in','')
df = df.dropna()
df.drop(df[df['Display size'].str.contains('W')].index , inplace =True)
df['Display size'] = df['Display size'].astype(float)

# As there are mulitple values for CPU speed, its hard to create dummy columns. So I changed it to numeric
df['CPU Speed'] = df['CPU Speed'].str.replace('GHz','').astype(float)

In [6]:
#Dropped duplicate rows
df.drop(df[df['RAM'].str.contains('-')][1:].index, inplace = True)

### Cleaning the RAM column

In [7]:
#Cleaning the RAM feature
df.RAM = df.RAM.str.replace('GB','')
df.RAM = df.RAM.str.replace('TB','')

df.RAM = df.RAM.replace( '-' , np.NAN)
df.RAM = df.RAM.fillna(df['RAM'].mode()[0]).astype(int)

### Converting Ratings datatype to Float 

In [8]:
#Cleaning the ratings column
df.Rating = df['Rating'].str[:3].astype(float)

### Fixing the Storage type feature

In [9]:
#Fixing the storage types
df['Storage type'] = df['Storage type'].str.replace('Solid State Drive','SSD')
df['Storage type'] = df['Storage type'].apply(lambda x : 'SSD' if ('ssd' in x.lower()) else x )

#Fixed the EMMC
df['Storage type'] = df['Storage type'].apply(lambda x : 'EMMC' if ('emmc' in x.lower()) else x )
df['Storage type'] = df['Storage type'].apply(lambda x : 'EMMC' if ('Embedded MultiMediaCard' in x) else x )

#Fixed the HDD
df['Storage type'] = df['Storage type'].apply(lambda x : 'HDD' if ('hdd' in x.lower()) else x )

#Replacing the rest of the data with others
df['Storage type'] = df['Storage type'].replace('Mechanical Hard Drive','Others').replace('Hybrid Drive','Others')
df['Storage type'] = df['Storage type'].replace('-','Others').replace('Integrated','Others').replace('Hybrid','Others').replace('SDD','Others')
df['Storage type'] = df['Storage type'].replace('64gb','Others').replace('Flash','Others').replace('sshd','Others').replace('USB','Others')

### Creating a new column Brand from the title

In [10]:
#Extracting brand from the title
df['Brand'] = df['Title'].apply(lambda x : x.split(' ')[:][0])

#Dropped the duplicate values
df.drop(df[df['Brand'].str.contains("14.1")][1:].index, inplace = True)

#Again dropped the duplicate value, I think there were many same sponsered ads
df.drop_duplicates(subset='Title',keep="first", inplace = True)

df.Brand = df.Brand.str.replace('BLACK','Hp')

pd.options.display.max_colwidth = 250
df.RAM[998] = 8
df.Brand = df.Brand.apply(lambda x : x.lower() )

# Now lets only keep the important brands and replace the rest with other_brand
def brand_maker(x):
    if (x=="lenovo") or (x=="hp") or (x=="asus") or (x=="dell") or (x=="apple") or (x=="acer") :
        return x
    else:
        return 'other_brand'

df.Brand = df.Brand.apply(brand_maker)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.RAM[998] = 8


### Creating a new feature to represent if it has graphics card or not

In [11]:
#Lets make the title to lower, so we can extract more informartion with ease
#Lets check if it has any gaming laptops, which typically comes with a powerful graphics cards, which might be useful to extract
index_for_grp = df[df.Title.str.lower().str.contains('graphics|gaming|nvidia')].index

#If it has graphics card set it to 1, 0 is default
df['graphics'] = 0
df.loc[index_for_grp,'graphics'] = 1

### Extracting processor type from the title

In [12]:
i5_index = df[df.Title.str.lower().str.contains('i5')].index
i7_index = df[df.Title.str.lower().str.contains('i7')].index
i9_index = df[df.Title.str.lower().str.contains('i9')].index
i3_index = df[df.Title.str.lower().str.contains('i3')].index
amd_index = df[df.Title.str.lower().str.contains('amd')].index

In [13]:
df["Processor"] = 'unknown'

In [14]:
df.loc[i5_index,'Processor'] = 'i5'
df.loc[i7_index,'Processor'] = 'i7'
df.loc[i9_index,'Processor'] = 'i9'
df.loc[i3_index,'Processor'] = 'i3'
df.loc[amd_index,'Processor'] = 'amd'

#### There are 9 M1 chips and i left them as unknown, cause creating a new column for 9 values, does'nt seem important
#### Just looked there are some laptops with wrong RAM values, just have to manually update it. Its painful :\

In [15]:
df.loc[1774,'RAM'] = 32
df.loc[1774,'Brand'] = 'apple'

df.loc[2741,'RAM'] = 16

df.loc[296,"RAM"] = 16

In [16]:
df[df.RAM == 1].loc[:,'RAM'] = 8

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


### Reseting the index

In [17]:
df = df.reset_index(drop=True)

### Creating a new csv file with cleaned data, to further work

In [18]:
df.to_csv('cleaned_data.csv')