[notebook](http://localhost:8888/notebooks/scripts/29.ipynb)

In [30]:
import pandas as pd
from IPython.display import display
# Limit pandas output to at most 20 displayed rows per object.
pd.options.display.max_rows = 20

In [31]:
# Load the pipe-delimited user table. The file carries no header row, so the
# column names are supplied here and user_id is used as the index.
user_cols = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
users = pd.read_csv(
    '../data/u.user.txt',
    sep='|',
    header=None,
    names=user_cols,
    index_col='user_id',
    # Read zip_code as text explicitly so that leading zeros (e.g. '05201')
    # never depend on pandas' dtype inference.
    dtype={'zip_code': str},
)
users.head()

Unnamed: 0_level_0,age,gender,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067
4,24,M,technician,43537
5,33,F,other,15213


In [32]:
# (rows, columns); user_id is the index, so it is not counted as a column
users.shape

(943, 4)

In [33]:
# Series.duplicated: one boolean per element, True when the same zip code
# has already appeared earlier in the Series.
users['zip_code'].duplicated().tail(10)

user_id
934     True
935    False
936    False
937     True
938    False
939    False
940     True
941    False
942    False
943    False
Name: zip_code, dtype: bool

In [34]:
# Summing the boolean mask counts the repeated zip codes.
users['zip_code'].duplicated().sum()

148

In [35]:
# DataFrame.duplicated: a row is flagged only when every column matches
# an earlier row.
users.duplicated().tail(5)

user_id
939    False
940    False
941    False
942    False
943    False
dtype: bool

In [36]:
# Number of fully duplicated rows (keep='first' is the default, spelled out here).
users.duplicated(keep='first').sum()

7

How the **`keep`** parameter of [**`duplicated`**](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.duplicated.html) decides which rows are flagged:

- **`keep='first'`** (default): Mark duplicates as True except for the first occurrence.
- **`keep='last'`**: Mark duplicates as True except for the last occurrence.
- **`keep=False`**: Mark all duplicates as True.

In [37]:
# keep='first': the first occurrence counts as the original; every later
# copy is flagged as a duplicate and shown here.
users[users.duplicated(keep='first')]

Unnamed: 0_level_0,age,gender,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
496,21,F,student,55414
572,51,M,educator,20003
621,17,M,student,60402
684,28,M,student,55414
733,44,F,other,60630
805,27,F,other,20009
890,32,M,student,97301


In [38]:
# keep='last': the last occurrence counts as the original; every earlier
# copy is flagged as a duplicate and shown here.
users[users.duplicated(keep='last')]

Unnamed: 0_level_0,age,gender,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
67,17,M,student,60402
85,51,M,educator,20003
198,21,F,student,55414
350,32,M,student,97301
428,28,M,student,55414
437,27,F,other,20009
460,44,F,other,60630


In [39]:
# keep=False: no occurrence is treated as the original, so every row that
# has an identical copy is flagged.
users[users.duplicated(keep=False)]

Unnamed: 0_level_0,age,gender,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
67,17,M,student,60402
85,51,M,educator,20003
198,21,F,student,55414
350,32,M,student,97301
428,28,M,student,55414
437,27,F,other,20009
460,44,F,other,60630
496,21,F,student,55414
572,51,M,educator,20003
621,17,M,student,60402


In [40]:
# drop_duplicates(keep='first'): keep the first copy of each repeated row and
# drop the later ones; .loc slices by user_id label, both ends inclusive.
users.drop_duplicates(keep='first').loc[60:70]

Unnamed: 0_level_0,age,gender,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
60,50,M,healthcare,6472
61,36,M,engineer,30040
62,27,F,administrator,97214
63,31,M,marketing,75240
64,32,M,educator,43202
65,51,F,educator,48118
66,23,M,student,80521
67,17,M,student,60402
68,19,M,student,22904
69,24,M,engineer,55337


In [41]:
# drop_duplicates(keep='last'): keep the last copy of each repeated row and
# drop the earlier ones; same user_id label slice as the keep='first' case.
users.drop_duplicates(keep='last').loc[60:70]

Unnamed: 0_level_0,age,gender,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
60,50,M,healthcare,6472
61,36,M,engineer,30040
62,27,F,administrator,97214
63,31,M,marketing,75240
64,32,M,educator,43202
65,51,F,educator,48118
66,23,M,student,80521
68,19,M,student,22904
69,24,M,engineer,55337
70,27,M,engineer,60067


In [42]:
# keep=False: drop every row that has an identical copy, retaining none of them
users.drop_duplicates(keep=False).shape

(929, 4)

In [43]:
# Compare rows on a subset of columns only: a row is flagged when its
# (age, zip_code) pair has already appeared. With keep='first' (the default)
# the first occurrence is kept and the later ones count as duplicates.
users.duplicated(subset=['age', 'zip_code'], keep='first').sum()

16

In [44]:
# Rows whose (age, zip_code) pair appears again later; only these two
# columns are compared, so the other columns may differ between matches.
users.loc[users.duplicated(subset=['age', 'zip_code'], keep='last')].sort_values(by='age')

Unnamed: 0_level_0,age,gender,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
67,17,M,student,60402
274,20,F,student,55414
198,21,F,student,55414
408,23,M,student,61755
31,24,M,artist,10003
734,25,F,other,63108
178,26,M,other,49512
498,26,M,writer,55408
437,27,F,other,20009
428,28,M,student,55414


In [45]:
# Rows whose (age, zip_code) pair already appeared earlier (keep='first').
users.loc[users.duplicated(subset=['age', 'zip_code'], keep='first')].sort_values(by='age')

Unnamed: 0_level_0,age,gender,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
621,17,M,student,60402
773,20,M,student,55414
496,21,F,student,55414
898,23,M,homemaker,61755
567,24,M,entertainment,10003
741,25,M,writer,63108
630,26,F,healthcare,55408
703,26,M,educator,49512
805,27,F,other,20009
684,28,M,student,55414


In [46]:
# How many rows repeat an age that has already been seen
print(users.duplicated(subset='age').sum())

# drop_duplicates on the age column: keep the first row for each distinct age
users.drop_duplicates(subset='age')

882


Unnamed: 0_level_0,age,gender,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067
5,33,F,other,15213
6,42,M,executive,98101
7,57,M,administrator,91344
8,36,M,administrator,05201
9,29,M,student,01002
11,39,F,other,30329
12,28,F,other,06405


In [47]:
# Count of rows repeating an earlier zip code, then the shape that remains
# once those rows are dropped.
print(users.duplicated(subset='zip_code').sum())
users.drop_duplicates(subset='zip_code').shape

148


(795, 4)