In [1]:
import pandas as pd

In [2]:
# Load the laptop price dataset from the CSV file into `df`
# and preview its first three rows.
df = pd.read_csv(filepath_or_buffer='laptop_price.csv')
df.head(n=3)

Unnamed: 0,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros
0,1,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,1339.69
1,2,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,898.94
2,3,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,575.0


## Find Duplicate Rows

### .duplicated() 

In [3]:
# Duplicates within a single column (Series): flag every laptop_ID that has
# already appeared in an earlier row, then show the flagged rows.
duplicates = df['laptop_ID'].duplicated(keep='first')
df.loc[duplicates]

Unnamed: 0,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros


In [4]:
# Duplicates across several columns: a row is flagged when its combination of
# Product, TypeName and Inches has already appeared in an earlier row.
more_duplicates = df.duplicated(subset=['Product', 'TypeName', 'Inches'], keep='first')
df.loc[more_duplicates].head(5)

Unnamed: 0,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros
4,5,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,1803.6
6,7,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.2GHz,16GB,256GB Flash Storage,Intel Iris Pro Graphics,Mac OS X,2.04kg,2139.97
7,8,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,256GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,1158.7
10,11,HP,250 G6,Notebook,15.6,1366x768,Intel Core i5 7200U 2.5GHz,4GB,500GB HDD,Intel HD Graphics 620,No OS,1.86kg,393.9
11,12,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i3 6006U 2GHz,4GB,500GB HDD,Intel HD Graphics 520,No OS,1.86kg,344.99


In [5]:
# keep='first' - every occurrence of a company except the first one is flagged
# as a duplicate, so ~duplicated_first selects the first row listed for each
# company in the current row order.
# Note: this is NOT the cheapest laptop per company, because the frame has not
# been sorted by price at this point.
duplicated_first = df.duplicated('Company', keep='first')
# Select rows and columns in a single .loc call instead of chained indexing.
df.loc[~duplicated_first, ['Company', 'Price_euros']]

Unnamed: 0,Company,Price_euros
0,Apple,1339.69
2,HP,575.0
5,Acer,400.0
8,Asus,1495.0
13,Dell,498.9
18,Lenovo,499.0
30,Chuwi,244.99
58,MSI,2449.0
70,Microsoft,1089.0
143,Toshiba,602.0


In [6]:
# Sanity check: among the rows kept by the keep='first' mask,
# count how many times each company occurs.
df.loc[~duplicated_first].value_counts(subset='Company')

Company
Acer         1
Lenovo       1
Vero         1
Toshiba      1
Samsung      1
Razer        1
Microsoft    1
Mediacom     1
MSI          1
LG           1
Apple        1
Huawei       1
HP           1
Google       1
Fujitsu      1
Dell         1
Chuwi        1
Asus         1
Xiaomi       1
Name: count, dtype: int64

In [7]:
# keep='last' - every occurrence of a company except the last one is flagged
# as a duplicate, so ~duplicated_last selects the last row listed for each
# company in the current row order.
# Note: this is NOT the most expensive laptop per company, because the frame
# has not been sorted by price at this point.
duplicated_last = df.duplicated('Company', keep='last')
# Select rows and columns in a single .loc call instead of chained indexing.
df.loc[~duplicated_last, ['Company', 'Price_euros']]

Unnamed: 0,Company,Price_euros
214,Huawei,1499.0
483,Chuwi,248.9
670,Microsoft,1867.85
718,Mediacom,265.0
762,Google,1559.0
877,Xiaomi,935.0
909,LG,1899.0
983,Fujitsu,649.0
1118,Toshiba,2226.0
1120,Vero,196.0


In [8]:
# keep=False - flag every row whose "Company" value occurs more than once;
# negating the mask leaves only companies that appear exactly one time.
duplicated_false = df.duplicated(subset='Company', keep=False)
df.loc[~duplicated_false]

Unnamed: 0,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros


In [9]:
# Number of rows per company: every company occurs at least twice,
# so no company is left when all repeated values are filtered out.
df.value_counts(subset=['Company'])

Company  
Dell         287
Lenovo       283
HP           265
Asus         148
Acer         100
MSI           53
Toshiba       48
Apple         21
Samsung        9
Razer          7
Mediacom       7
Microsoft      6
Vero           4
Xiaomi         4
Google         3
Fujitsu        3
Chuwi          3
LG             3
Huawei         2
Name: count, dtype: int64

## Drop Duplicate Elements 

### .drop_duplicates() 

In [10]:
# drop duplicates in a single column: keep only the first row listed for each
# company (keep='first' is the default, spelled out here for clarity),
# then show just the company and its price
df.drop_duplicates(subset=['Company'], keep='first')[['Company', 'Price_euros']]

Unnamed: 0,Company,Price_euros
0,Apple,1339.69
2,HP,575.0
5,Acer,400.0
8,Asus,1495.0
13,Dell,498.9
18,Lenovo,499.0
30,Chuwi,244.99
58,MSI,2449.0
70,Microsoft,1089.0
143,Toshiba,602.0


In [18]:
# Order the rows by company and then by ascending price, so that within each
# company the cheapest laptop comes first and the most expensive comes last.
# NOTE: this rebinds `df`; cells run after this one see the sorted frame.
df = df.sort_values(by=['Company', 'Price_euros'], ascending=True)
# With that ordering, keep='first' retains the cheapest laptop of each company.
df.drop_duplicates(subset=['Company'], keep='first').loc[:, ['Company', 'Price_euros']]

Unnamed: 0,Company,Price_euros
1215,Acer,174.0
1,Apple,898.94
20,Asus,191.9
30,Chuwi,244.99
340,Dell,274.9
983,Fujitsu,649.0
472,Google,1275.0
67,HP,229.0
170,Huawei,1349.0
909,LG,1899.0


In [13]:
# most expensive: keep='last'
# keep='last' only yields the most expensive laptop per company when the rows
# are ordered by ascending price within each company. Sort explicitly here
# instead of relying on `df` having been re-bound to its sorted version by a
# previously executed cell, so the result is correct in any execution order.
(
    df.sort_values(['Company', 'Price_euros'])
      .drop_duplicates(['Company'], keep='last')[['Company', 'Price_euros']]
)

Unnamed: 0,Company,Price_euros
1189,Acer,2599.0
17,Apple,2858.0
1066,Asus,3975.0
421,Chuwi,449.0
723,Dell,3659.4
623,Fujitsu,799.0
437,Google,2199.0
749,HP,4389.0
214,Huawei,1499.0
678,LG,2299.0
