# Cleaning Not-Null Values

In [2]:
# Libraries
import numpy as np
import pandas as pd

In [4]:
# Toy frame holding a few suspicious entries ('D' and '?' for Sex, 290 for Age)
# that the following cells inspect and repair.
df = pd.DataFrame(
    data=[
        ('M', 29),
        ('F', 30),
        ('F', 24),
        ('D', 290),
        ('?', 25),
    ],
    columns=['Sex', 'Age'],
)
df

Unnamed: 0,Sex,Age
0,M,29
1,F,30
2,F,24
3,D,290
4,?,25


## Finding Unique Values

In [7]:
# List the distinct values of the column to spot unexpected categories such as 'D' and '?'.
df['Sex'].unique()

array(['M', 'F', 'D', '?'], dtype=object)

In [9]:
# Returns a new Series with every 'D' swapped for 'F'; the result is not assigned,
# so `df` itself stays as it was.
df['Sex'].replace(to_replace='D', value='F')

0    M
1    F
2    F
3    F
4    ?
Name: Sex, dtype: object

In [13]:
# Several substitutions at once: the mapping goes from old value to new value.
df['Sex'].replace(to_replace={'D': 'F', 'M': 'N'})

0    N
1    F
2    F
3    F
4    ?
Name: Sex, dtype: object

In [15]:
# Column-scoped substitutions: outer keys name the column, inner dicts map old -> new.
# The 'N' -> 'M' entry matches nothing here because `df` holds no 'N' values
# (the earlier replace calls returned new Series and were never assigned back).
df.replace(to_replace={
    'Sex': {'D': 'F', 'N': 'M'},
    'Age': {290: 29},
})

Unnamed: 0,Sex,Age
0,M,29
1,F,30
2,F,24
3,F,29
4,?,25


In [17]:
# Select the rows whose Age is above 100.
df.loc[df['Age'] > 100, :]

Unnamed: 0,Sex,Age
3,D,290


In [19]:
# For rows with Age above 100, overwrite Age with a tenth of its value (290 -> 29).
# This mutates `df` in place.
df.loc[df['Age'].gt(100), 'Age'] = df.loc[df['Age'].gt(100), 'Age'].div(10)

In [21]:
# Re-display the frame: row 3 now has Age 29, while its Sex is still 'D' because the
# earlier replace calls were never assigned back.
df

Unnamed: 0,Sex,Age
0,M,29
1,F,30
2,F,24
3,D,29
4,?,25


## Duplicates

In [28]:
# Series mapping ambassador name (index) -> country (value); several countries
# repeat, which makes it a handy example for duplicate detection.
ambassadors = pd.Series({
    'Férard Araud': 'France',
    'Kim Dorroch': 'UK',
    'Peter Westmacott': 'UK',
    'Armando Varricchio': 'Italy',
    'Peter Wittig': 'Germany',
    'Peter Ammon': 'Germany',
    'Klaus Scharioth': 'Germany',
})

In [31]:
# Display the Series: ambassador names form the index, countries are the values.
ambassadors

Férard Araud           France
Kim Dorroch                UK
Peter Westmacott           UK
Armando Varricchio      Italy
Peter Wittig          Germany
Peter Ammon           Germany
Klaus Scharioth       Germany
dtype: object

In [35]:
# Flag every repeat of a value after its first occurrence (the default policy, spelled out).
ambassadors.duplicated(keep='first')

Férard Araud          False
Kim Dorroch           False
Peter Westmacott       True
Armando Varricchio    False
Peter Wittig          False
Peter Ammon            True
Klaus Scharioth        True
dtype: bool

In [37]:
# keep='last' treats the final occurrence as the original and flags the earlier ones.
ambassadors.duplicated(keep='last')

Férard Araud          False
Kim Dorroch            True
Peter Westmacott      False
Armando Varricchio    False
Peter Wittig           True
Peter Ammon            True
Klaus Scharioth       False
dtype: bool

In [39]:
# keep=False flags every member of a repeated group, first and last occurrences included.
ambassadors.duplicated(keep=False)

Férard Araud          False
Kim Dorroch            True
Peter Westmacott       True
Armando Varricchio    False
Peter Wittig           True
Peter Ammon            True
Klaus Scharioth        True
dtype: bool

In [41]:
# Keep only the first occurrence of each country (the default policy, spelled out).
ambassadors.drop_duplicates(keep='first')

Férard Araud           France
Kim Dorroch                UK
Armando Varricchio      Italy
Peter Wittig          Germany
dtype: object

In [43]:
# Keep only the last occurrence of each repeated country.
ambassadors.drop_duplicates(keep='last')

Férard Araud           France
Peter Westmacott           UK
Armando Varricchio      Italy
Klaus Scharioth       Germany
dtype: object

In [45]:
# keep=False drops every country that appears more than once, leaving only unique ones.
ambassadors.drop_duplicates(keep=False)

Férard Araud          France
Armando Varricchio     Italy
dtype: object

## Duplicates in DataFrames

In [64]:
# One row per (name, position) record; 'Kobe Bryant' appears three times,
# twice as 'SG' and once as 'SF'.
players = pd.DataFrame(
    data=[
        ('Kobe Bryant', 'SG'),
        ('LeBron James', 'SF'),
        ('Kobe Bryant', 'SG'),
        ('Carmelo Anthony', 'SF'),
        ('Kobe Bryant', 'SF'),
    ],
    columns=['Name', 'Pos'],
)

In [66]:
# Display the frame; 'Kobe Bryant' is listed three times, once with a different Pos.
players

Unnamed: 0,Name,Pos
0,Kobe Bryant,SG
1,LeBron James,SF
2,Kobe Bryant,SG
3,Carmelo Anthony,SF
4,Kobe Bryant,SF


In [68]:
# With no subset, a row is a duplicate only when every column matches an earlier row
# (row 2 repeats row 0; row 4 differs in Pos and is not flagged).
players.duplicated()

0    False
1    False
2     True
3    False
4    False
dtype: bool

In [70]:
# Compare rows on the Name column only, so any repeated name is flagged.
players.duplicated(subset='Name')

0    False
1    False
2     True
3    False
4     True
dtype: bool

In [72]:
# Same Name-only comparison, but the last occurrence is the one left unflagged.
players.duplicated(subset='Name', keep='last')

0     True
1    False
2     True
3    False
4    False
dtype: bool

In [74]:
# Drops only fully identical rows (row 2 repeats row 0); row 4 survives because its Pos differs.
players.drop_duplicates()

Unnamed: 0,Name,Pos
0,Kobe Bryant,SG
1,LeBron James,SF
3,Carmelo Anthony,SF
4,Kobe Bryant,SF


In [76]:
# One row per player name, keeping the first row seen for each name.
players.drop_duplicates(subset='Name')

Unnamed: 0,Name,Pos
0,Kobe Bryant,SG
1,LeBron James,SF
3,Carmelo Anthony,SF


In [78]:
# One row per player name, keeping the last row seen for each name.
players.drop_duplicates(subset='Name', keep='last')

Unnamed: 0,Name,Pos
1,LeBron James,SF
3,Carmelo Anthony,SF
4,Kobe Bryant,SF


## Text Handling

### Splitting Columns

In [118]:
# Single-column frame whose strings pack four '_'-separated fields, with some
# noise mixed in (trailing '?' on years, stray spaces in country codes).
# Note: this rebinds `df`, replacing the Sex/Age frame used earlier.
df = pd.DataFrame(
    data=[
        '1987_M_US_1',
        '1990?_M_UK_1',
        '1992_F_US_2',
        '1970?_M_  IT_1',
        '1985_F_I  T_2',
    ],
    columns=['Data'],
)

In [120]:
# Break each string on '_' — every element becomes a list of its pieces.
df['Data'].str.split(pat='_')

0       [1987, M, US, 1]
1      [1990?, M, UK, 1]
2       [1992, F, US, 2]
3    [1970?, M,   IT, 1]
4     [1985, F, I  T, 2]
Name: Data, dtype: object

In [122]:
# expand=True spreads the pieces over separate columns (labelled 0..3) instead of lists.
df['Data'].str.split(pat='_', expand=True)

Unnamed: 0,0,1,2,3
0,1987,M,US,1
1,1990?,M,UK,1
2,1992,F,US,2
3,1970?,M,IT,1
4,1985,F,I T,2


In [124]:
# Replace `df` with the expanded four-column frame.
# NOTE: this cell is not re-runnable on its own — once `df` is rebound it no longer
# has a 'Data' column, so a second execution raises KeyError until the frame is rebuilt.
df = df['Data'].str.split(pat='_', expand=True)

In [126]:
# Give the four split pieces meaningful labels in place of the default integer column names.
df.columns = ['Year', 'Sex', 'Country', 'No Children']

In [128]:
# Display the labelled frame; Year and Country still contain the raw noise ('?' and spaces).
df

Unnamed: 0,Year,Sex,Country,No Children
0,1987,M,US,1
1,1990?,M,UK,1
2,1992,F,US,2
3,1970?,M,IT,1
4,1985,F,I T,2


In [130]:
# Flag the years that carry a literal '?'. The pattern is a regex, so '?' must be
# escaped; a raw string passes the backslash through verbatim instead of relying on
# Python tolerating the invalid string escape '\?', which emits a warning at compile time.
df['Year'].str.contains(r'\?')

  df['Year'].str.contains('\?')


0    False
1     True
2    False
3     True
4    False
Name: Year, dtype: bool

In [132]:
# True for the country codes that include the letter 'U'.
df['Country'].str.contains(pat='U')

0     True
1     True
2     True
3    False
4    False
Name: Country, dtype: bool

In [134]:
# Remove every space from the country codes. The result is only displayed, not
# assigned, so `df` keeps the original values.
df['Country'].str.replace(pat=' ', repl='')

0    US
1    UK
2    US
3    IT
4    IT
Name: Country, dtype: object

In [136]:
# Where a 4-digit year is followed by '?', keep only the captured digits.
# Values without the '?' do not match the pattern and pass through unchanged.
# As above, the cleaned Series is displayed but not written back to `df`.
df['Year'].str.replace(
    pat=r'(?P<year>\d{4})\?',
    repl=lambda match: match['year'],
    regex=True,
)

0    1987
1    1990
2    1992
3    1970
4    1985
Name: Year, dtype: object

In [138]:
# Display `df` once more: it still shows '1990?', '1970?' and the spaced country codes,
# because the str.replace results above were never assigned back to the frame.
df

Unnamed: 0,Year,Sex,Country,No Children
0,1987,M,US,1
1,1990?,M,UK,1
2,1992,F,US,2
3,1970?,M,IT,1
4,1985,F,I T,2
