In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.DataFrame({
    "Sex": ['M', 'F', 'F', 'D', '?'],
    "Age": [29, 30, 24, 290, 25],
})

In [3]:
df

Unnamed: 0,Sex,Age
0,M,29
1,F,30
2,F,24
3,D,290
4,?,25


In [4]:
df.head()

Unnamed: 0,Sex,Age
0,M,29
1,F,30
2,F,24
3,D,290
4,?,25


## **Finding Unique Values**

In [5]:
df["Sex"].unique()

array(['M', 'F', 'D', '?'], dtype=object)

In [6]:
df["Sex"].value_counts()

F    2
D    1
?    1
M    1
Name: Sex, dtype: int64

In [7]:
#we need to replace F in D
df["Sex"].replace("D", "F")

0    M
1    F
2    F
3    F
4    ?
Name: Sex, dtype: object

In [8]:
df["Sex"].replace({"D":"F", "N":"M"})

0    M
1    F
2    F
3    F
4    ?
Name: Sex, dtype: object

In [9]:
df.replace({
    "Sex": {"D":"F", "N":"M"},
     "Age":{
         290:29
         }
     

})

Unnamed: 0,Sex,Age
0,M,29
1,F,30
2,F,24
3,F,29
4,?,25


In [10]:
df[df["Age"]>100]

Unnamed: 0,Sex,Age
3,D,290


In [11]:
df.replace({
    "Sex": {
        "D":"F",
        "N":"F"
    },
    "Age": {
        290:29
    }
    
})

Unnamed: 0,Sex,Age
0,M,29
1,F,30
2,F,24
3,F,29
4,?,25


In [12]:
df.head()

Unnamed: 0,Sex,Age
0,M,29
1,F,30
2,F,24
3,D,290
4,?,25


In [13]:
df.loc[df["Age"]>100, "Age"] 

3    290
Name: Age, dtype: int64

In [14]:
df.loc[df["Age"]>100, "Age"] = df.loc[df["Age"]>100, "Age"] / 100

In [15]:
df.head()

Unnamed: 0,Sex,Age
0,M,29.0
1,F,30.0
2,F,24.0
3,D,2.9
4,?,25.0


## **Duplicates**
Checking duplicate values is extremely simple. It'll behave differently between Series and DataFrames. Let's start with Series. As an example, let's say we're throwing a fancy party and we're inviting Ambassadors from Europe. But can only invite one ambassador per country. This is our original list, and as you can see, both the UK and Germany have duplicated ambassadors:

In [16]:
ambassadors = pd.Series([
                         'France',
    'United Kingdom',
    'United Kingdom',
    'Italy',
    'Germany',
    'Germany',
    'Germany',
                         
], index=[
          'Gérard Araud',
    'Kim Darroch',
    'Peter Westmacott',
    'Armando Varricchio',
    'Peter Wittig',
    'Peter Ammon',
    'Klaus Scharioth '
          
])

In [17]:
ambassadors

Gérard Araud                  France
Kim Darroch           United Kingdom
Peter Westmacott      United Kingdom
Armando Varricchio             Italy
Peter Wittig                 Germany
Peter Ammon                  Germany
Klaus Scharioth              Germany
dtype: object

In [18]:
#The two most important methods to deal with duplicates are duplicated (that will tell you which values are duplicates) 
#and drop_duplicates (which will just get rid of duplicates):
ambassadors.duplicated()

Gérard Araud          False
Kim Darroch           False
Peter Westmacott       True
Armando Varricchio    False
Peter Wittig          False
Peter Ammon            True
Klaus Scharioth        True
dtype: bool

In [19]:
ambassadors.duplicated(keep="last")

Gérard Araud          False
Kim Darroch            True
Peter Westmacott      False
Armando Varricchio    False
Peter Wittig           True
Peter Ammon            True
Klaus Scharioth       False
dtype: bool

In [21]:
ambassadors.duplicated(keep=
                       False)

Gérard Araud          False
Kim Darroch            True
Peter Westmacott       True
Armando Varricchio    False
Peter Wittig           True
Peter Ammon            True
Klaus Scharioth        True
dtype: bool

In [22]:
ambassadors.drop_duplicates()

Gérard Araud                  France
Kim Darroch           United Kingdom
Armando Varricchio             Italy
Peter Wittig                 Germany
dtype: object

In [23]:
ambassadors.drop_duplicates(keep="last")

Gérard Araud                  France
Peter Westmacott      United Kingdom
Armando Varricchio             Italy
Klaus Scharioth              Germany
dtype: object

In [24]:
ambassadors.drop_duplicates(keep=False)

Gérard Araud          France
Armando Varricchio     Italy
dtype: object

### **Duplicates in DataFrames**

In [25]:
players = pd.DataFrame({
    "Name": [
             'Kobe Bryant',
        'LeBron James',
        'Kobe Bryant',
        'Carmelo Anthony',
        'Kobe Bryant',
    ],
    "Pos": [
            'SG',
        'SF',
        'SG',
        'SF',
        'SF'
    ]
})

In [26]:
players

Unnamed: 0,Name,Pos
0,Kobe Bryant,SG
1,LeBron James,SF
2,Kobe Bryant,SG
3,Carmelo Anthony,SF
4,Kobe Bryant,SF


In [27]:
players.duplicated()

0    False
1    False
2     True
3    False
4    False
dtype: bool

In [28]:
players.duplicated(keep="last")

0     True
1    False
2    False
3    False
4    False
dtype: bool

In [29]:
players.duplicated(keep=False)

0     True
1    False
2     True
3    False
4    False
dtype: bool

In [36]:
players.duplicated(subset=["Name"])

0    False
1    False
2     True
3    False
4     True
dtype: bool

In [32]:
players.duplicated(subset="Name")

0    False
1    False
2     True
3    False
4     True
dtype: bool

In [33]:
players.duplicated(subset="Name", keep="last")

0     True
1    False
2     True
3    False
4    False
dtype: bool

In [34]:
players.drop_duplicates()

Unnamed: 0,Name,Pos
0,Kobe Bryant,SG
1,LeBron James,SF
3,Carmelo Anthony,SF
4,Kobe Bryant,SF


In [37]:
players.drop_duplicates(subset=["Name"], keep="last")

Unnamed: 0,Name,Pos
1,LeBron James,SF
3,Carmelo Anthony,SF
4,Kobe Bryant,SF


### **Text Handling**

In [38]:
df = pd.DataFrame({
    "Data": [
              '1987_M_US _1',
        '1990?_M_UK_1',
        '1992_F_US_2',
        '1970?_M_   IT_1',
        '1985_F_I  T_2'
    ]
})

In [39]:
df.head()

Unnamed: 0,Data
0,1987_M_US _1
1,1990?_M_UK_1
2,1992_F_US_2
3,1970?_M_ IT_1
4,1985_F_I T_2


In [42]:
df["Data"].str.split("_")

0       [1987, M, US , 1]
1       [1990?, M, UK, 1]
2        [1992, F, US, 2]
3    [1970?, M,    IT, 1]
4      [1985, F, I  T, 2]
Name: Data, dtype: object

In [43]:
df["Data"].str.split("_",expand=True)

Unnamed: 0,0,1,2,3
0,1987,M,US,1
1,1990?,M,UK,1
2,1992,F,US,2
3,1970?,M,IT,1
4,1985,F,I T,2


In [44]:
df = df["Data"].str.split("_", expand=True)

In [45]:
df

Unnamed: 0,0,1,2,3
0,1987,M,US,1
1,1990?,M,UK,1
2,1992,F,US,2
3,1970?,M,IT,1
4,1985,F,I T,2


In [46]:
df.columns=["Year", "Sex", "Country", "No. of Children"]

In [47]:
df

Unnamed: 0,Year,Sex,Country,No. of Children
0,1987,M,US,1
1,1990?,M,UK,1
2,1992,F,US,2
3,1970?,M,IT,1
4,1985,F,I T,2


In [48]:
df["Year"].str.contains("\?")

0    False
1     True
2    False
3     True
4    False
Name: Year, dtype: bool

In [52]:
#regular letter doesn't need ?
df["Country"].str.contains("U")

0     True
1     True
2     True
3    False
4    False
Name: Country, dtype: bool

In [54]:
#Removing blank spaces (like in 'US ' or 'I  T' can be achieved with strip (lstrip and rstrip also exist) or just replace:
df["Country"].str.strip()

0      US
1      UK
2      US
3      IT
4    I  T
Name: Country, dtype: object

In [55]:
df

Unnamed: 0,Year,Sex,Country,No. of Children
0,1987,M,US,1
1,1990?,M,UK,1
2,1992,F,US,2
3,1970?,M,IT,1
4,1985,F,I T,2


In [57]:
df["Country"].str.replace(" ", "")

0    US
1    UK
2    US
3    IT
4    IT
Name: Country, dtype: object

In [63]:
#As we said, replace and contains take regex patterns, which can make it easier to replace values in bulk:

df["Year"].str.replace(r'(?P<year>\d{4})\?', lambda m: m.group('year'))

0    1987
1    1990
2    1992
3    1970
4    1985
Name: Year, dtype: object

In [62]:

df['Year'].str.replace(r'(?P<year>\d{4})\?', lambda m: m.group('year'))

0    1987
1    1990
2    1992
3    1970
4    1985
Name: Year, dtype: object