# Pandas

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Sample records stored column-wise: each key is a column label and each list
# holds that column's values in row order.
people = dict(
    first=["Corey", "Jane", "John", "Adam"],
    last=["Schafer", "Doe", "Doe", "Doe"],
    email=[
        "CoreyMSchafer@gmail.com",
        "JaneDoe@email.com",
        "JohnDoe@email.com",
        "AdamDoe@email.com",
    ],
)

In [3]:
# Build a DataFrame from the dict of lists: each key becomes a column label.
data = pd.DataFrame(people)

In [4]:
data

Unnamed: 0,first,last,email
0,Corey,Schafer,CoreyMSchafer@gmail.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com
3,Adam,Doe,AdamDoe@email.com


In [5]:
# Bracket access with a single column label returns that column as a Series.
data["email"]

0    CoreyMSchafer@gmail.com
1          JaneDoe@email.com
2          JohnDoe@email.com
3          AdamDoe@email.com
Name: email, dtype: object

In [6]:
# Attribute-style access to the same column; the output matches data["email"].
data.email

0    CoreyMSchafer@gmail.com
1          JaneDoe@email.com
2          JohnDoe@email.com
3          AdamDoe@email.com
Name: email, dtype: object

In [7]:
# A list of labels selects several columns and yields a DataFrame.
data[["last", "email"]]

Unnamed: 0,last,email
0,Schafer,CoreyMSchafer@gmail.com
1,Doe,JaneDoe@email.com
2,Doe,JohnDoe@email.com
3,Doe,AdamDoe@email.com


In [8]:
# Column labels of the frame, in order.
data.columns

Index(['first', 'last', 'email'], dtype='object')

In [9]:
# iloc is positional: rows at positions 0 and 1, column at position 2 (email).
data.iloc[[0, 1], 2]

0    CoreyMSchafer@gmail.com
1          JaneDoe@email.com
Name: email, dtype: object

In [10]:
# loc selects by label; the columns come back in the order requested.
data.loc[[0, 1], ["email", "last"]]

Unnamed: 0,email,last
0,CoreyMSchafer@gmail.com,Schafer
1,JaneDoe@email.com,Doe


In [11]:
# Use the email column as the row index. The result is reassigned rather than
# applied with inplace=True, so the step is an ordinary expression that can be
# chained and the frame is never mutated behind an earlier cell's output.
data = data.set_index("email")

In [12]:
# The row index: here it holds the email addresses and is named 'email'.
data.index

Index(['CoreyMSchafer@gmail.com', 'JaneDoe@email.com', 'JohnDoe@email.com',
       'AdamDoe@email.com'],
      dtype='object', name='email')

In [13]:
# With email as the index, loc takes (row label, column label) and returns one value.
data.loc["CoreyMSchafer@gmail.com", "last"]

'Schafer'

In [14]:
# iloc still addresses rows by position, whatever the index labels are.
data.iloc[0]

first      Corey
last     Schafer
Name: CoreyMSchafer@gmail.com, dtype: object

In [15]:
# Move the email index back into a regular column and restore the default
# integer index. Reassigned instead of inplace=True to avoid in-place mutation.
data = data.reset_index()

In [16]:
data

Unnamed: 0,email,first,last
0,CoreyMSchafer@gmail.com,Corey,Schafer
1,JaneDoe@email.com,Jane,Doe
2,JohnDoe@email.com,John,Doe
3,AdamDoe@email.com,Adam,Doe


In [17]:
# Boolean mask: True for rows whose first name is 'John' or whose last name is 'Schafer'.
filters = (
    (data['first'] == 'John')
    | (data['last'] == 'Schafer')
)

In [18]:
# The mask picks the rows; the label picks the email column for those rows.
data.loc[filters, 'email']

0    CoreyMSchafer@gmail.com
2          JohnDoe@email.com
Name: email, dtype: object

In [19]:
# ~ inverts the boolean mask, selecting the rows that did not match.
data.loc[~filters, 'email']

1    JaneDoe@email.com
3    AdamDoe@email.com
Name: email, dtype: object

In [20]:
# Inspect the current column labels and their order.
data.columns

Index(['email', 'first', 'last'], dtype='object')

In [21]:
# Assigning a list replaces every label positionally, so the list must name
# all columns in their current order (email, first, last).
data.columns = ['email', 'first_name', 'last_name']

In [22]:
data

Unnamed: 0,email,first_name,last_name
0,CoreyMSchafer@gmail.com,Corey,Schafer
1,JaneDoe@email.com,Jane,Doe
2,JohnDoe@email.com,John,Doe
3,AdamDoe@email.com,Adam,Doe


In [23]:
# Lower-case every column label (the labels are already lower-case here, so the
# displayed frame does not change).
data.columns = list(map(str.lower, data.columns))

In [24]:
data

Unnamed: 0,email,first_name,last_name
0,CoreyMSchafer@gmail.com,Corey,Schafer
1,JaneDoe@email.com,Jane,Doe
2,JohnDoe@email.com,John,Doe
3,AdamDoe@email.com,Adam,Doe


In [25]:
# Rename only the listed columns via an old -> new mapping; other labels are
# left alone. Reassigned instead of inplace=True to avoid in-place mutation.
data = data.rename(columns={'first_name': 'first', 'last_name': 'last'})

In [26]:
data

Unnamed: 0,email,first,last
0,CoreyMSchafer@gmail.com,Corey,Schafer
1,JaneDoe@email.com,Jane,Doe
2,JohnDoe@email.com,John,Doe
3,AdamDoe@email.com,Adam,Doe


In [27]:
# Overwrite the whole row labelled 2; values are given in column order (email, first, last).
data.loc[2] = ['JohnSmith@email.com', 'John', 'Smith']

In [28]:
data

Unnamed: 0,email,first,last
0,CoreyMSchafer@gmail.com,Corey,Schafer
1,JaneDoe@email.com,Jane,Doe
2,JohnSmith@email.com,John,Smith
3,AdamDoe@email.com,Adam,Doe


In [29]:
# Update only the listed columns of row 2; values line up with the listed labels.
data.loc[2, ['last', 'email']] = ['Doe', 'JohnDoe@email.com']

In [30]:
data

Unnamed: 0,email,first,last
0,CoreyMSchafer@gmail.com,Corey,Schafer
1,JaneDoe@email.com,Jane,Doe
2,JohnDoe@email.com,John,Doe
3,AdamDoe@email.com,Adam,Doe


In [31]:
# A single (row label, column label) pair sets one cell.
data.loc[2, 'last'] = 'Smith'

In [32]:
data

Unnamed: 0,email,first,last
0,CoreyMSchafer@gmail.com,Corey,Schafer
1,JaneDoe@email.com,Jane,Doe
2,JohnDoe@email.com,John,Smith
3,AdamDoe@email.com,Adam,Doe


In [33]:
# .at addresses exactly one cell by (row label, column label).
data.at[2, 'last'] = 'Doe'

In [34]:
data

Unnamed: 0,email,first,last
0,CoreyMSchafer@gmail.com,Corey,Schafer
1,JaneDoe@email.com,Jane,Doe
2,JohnDoe@email.com,John,Doe
3,AdamDoe@email.com,Adam,Doe


In [35]:
# Build a mask for the row(s) with this email address, then assign through
# .loc[mask, column] so the write goes to data itself.
filters = (data['email'] == 'JohnDoe@email.com')
data.loc[filters, 'last'] = 'Smith'

In [36]:
data

Unnamed: 0,email,first,last
0,CoreyMSchafer@gmail.com,Corey,Schafer
1,JaneDoe@email.com,Jane,Doe
2,JohnDoe@email.com,John,Smith
3,AdamDoe@email.com,Adam,Doe


In [37]:
# The .str accessor applies the string method to every value in the column.
data['email'] = data['email'].str.lower()

In [38]:
data

Unnamed: 0,email,first,last
0,coreymschafer@gmail.com,Corey,Schafer
1,janedoe@email.com,Jane,Doe
2,johndoe@email.com,John,Smith
3,adamdoe@email.com,Adam,Doe


In [39]:
# Series.apply calls the function once per element: here, the length of each address.
data['email'].apply(len)

0    23
1    17
2    17
3    17
Name: email, dtype: int64

In [40]:
def update_email(email):
    """Return ``email`` converted to upper case."""
    uppercased = email.upper()
    return uppercased

In [41]:
# Apply the custom function to each element and store the result back in the column.
data['email'] = data['email'].apply(update_email)

In [42]:
data

Unnamed: 0,email,first,last
0,COREYMSCHAFER@GMAIL.COM,Corey,Schafer
1,JANEDOE@EMAIL.COM,Jane,Doe
2,JOHNDOE@EMAIL.COM,John,Smith
3,ADAMDOE@EMAIL.COM,Adam,Doe


In [43]:
# Same pattern with an inline lambda; this lower-cases the addresses again.
data['email'] = data['email'].apply(lambda x: x.lower())

In [44]:
data

Unnamed: 0,email,first,last
0,coreymschafer@gmail.com,Corey,Schafer
1,janedoe@email.com,Jane,Doe
2,johndoe@email.com,John,Smith
3,adamdoe@email.com,Adam,Doe


In [45]:
# On a Series, apply works element by element: one length per address.
data['email'].apply(len)

0    23
1    17
2    17
3    17
Name: email, dtype: int64

In [46]:
# On a DataFrame with axis='columns', apply passes each ROW as a Series, so len
# is the number of columns in that row (3), not a string length.
data.apply(len, axis='columns')

0    3
1    3
2    3
3    3
dtype: int64

In [47]:
# len of a Series is its number of rows.
len(data['email'])

4

In [48]:
# With the default axis, apply passes each column as a Series; min of a string
# column is its alphabetically smallest value.
data.apply(pd.Series.min)

email    adamdoe@email.com
first                 Adam
last                   Doe
dtype: object

In [49]:
# Equivalent lambda form: x is each column Series in turn.
data.apply(lambda x: x.min())

email    adamdoe@email.com
first                 Adam
last                   Doe
dtype: object

In [50]:
# Element-wise length of every cell in the frame. DataFrame.applymap is
# deprecated in recent pandas (renamed to DataFrame.map), so map each column
# instead; this form gives the same result on old and new versions.
data.apply(lambda column: column.map(len))

Unnamed: 0,email,first,last
0,23,5,7
1,17,4,3
2,17,4,5
3,17,4,3


In [51]:
# Lower-case every cell in the frame (all columns hold strings at this point).
# DataFrame.applymap is deprecated in recent pandas (renamed to DataFrame.map),
# so map each column instead; this works on old and new versions alike.
data.apply(lambda column: column.map(str.lower))

Unnamed: 0,email,first,last
0,coreymschafer@gmail.com,corey,schafer
1,janedoe@email.com,jane,doe
2,johndoe@email.com,john,smith
3,adamdoe@email.com,adam,doe


In [52]:
# Series.map with a dict substitutes matching values; values that are not keys
# of the dict become NaN (see rows 2 and 3).
data['first'].map({'Corey': 'Chris', 'Jane': 'Mary'})

0    Chris
1     Mary
2      NaN
3      NaN
Name: first, dtype: object

In [53]:
# Series.replace substitutes only the listed values and leaves all others unchanged.
data['first'] = data['first'].replace({'Corey': 'Chris', 'Jane': 'Mary'})

In [54]:
data

Unnamed: 0,email,first,last
0,coreymschafer@gmail.com,Chris,Schafer
1,janedoe@email.com,Mary,Doe
2,johndoe@email.com,John,Smith
3,adamdoe@email.com,Adam,Doe


In [55]:
# + on string Series concatenates element-wise, row by row.
data["first"] + " " + data["last"]

0    Chris Schafer
1         Mary Doe
2       John Smith
3         Adam Doe
dtype: object

In [56]:
# Assigning to a label that does not exist yet adds it as a new column.
data["full_name"] = data["first"] + " " + data["last"]

In [57]:
data

Unnamed: 0,email,first,last,full_name
0,coreymschafer@gmail.com,Chris,Schafer,Chris Schafer
1,janedoe@email.com,Mary,Doe,Mary Doe
2,johndoe@email.com,John,Smith,John Smith
3,adamdoe@email.com,Adam,Doe,Adam Doe


In [58]:
# Remove the two name columns now that full_name holds the same information.
# Reassigned instead of inplace=True to avoid in-place mutation.
data = data.drop(columns=["first", "last"])

In [59]:
data

Unnamed: 0,email,full_name
0,coreymschafer@gmail.com,Chris Schafer
1,janedoe@email.com,Mary Doe
2,johndoe@email.com,John Smith
3,adamdoe@email.com,Adam Doe


In [60]:
# Split on the space; expand=True returns the pieces as separate columns
# (labelled 0 and 1) instead of one column of lists.
data["full_name"].str.split(" ", expand=True)

Unnamed: 0,0,1
0,Chris,Schafer
1,Mary,Doe
2,John,Smith
3,Adam,Doe


In [61]:
# Assign the two split columns to two new labels in one step.
data[["first", "last"]] = data["full_name"].str.split(" ", expand=True)

In [62]:
data

Unnamed: 0,email,full_name,first,last
0,coreymschafer@gmail.com,Chris Schafer,Chris,Schafer
1,janedoe@email.com,Mary Doe,Mary,Doe
2,johndoe@email.com,John Smith,John,Smith
3,adamdoe@email.com,Adam Doe,Adam,Doe


In [63]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is
# the supported way to add rows. Wrapping the dict in a list makes a one-row
# frame; columns it lacks are filled with NaN, and ignore_index=True renumbers
# the rows. The result is only displayed, so data itself keeps its four rows.
pd.concat([data, pd.DataFrame([{"first": "Tony"}])], ignore_index=True)


Unnamed: 0,email,full_name,first,last
0,coreymschafer@gmail.com,Chris Schafer,Chris,Schafer
1,janedoe@email.com,Mary Doe,Mary,Doe
2,johndoe@email.com,John Smith,John,Smith
3,adamdoe@email.com,Adam Doe,Adam,Doe
4,,,Tony,


In [64]:
# Display the rows ordered by last name (ascending is the default).
data.sort_values(by="last")

Unnamed: 0,email,full_name,first,last
1,janedoe@email.com,Mary Doe,Mary,Doe
3,adamdoe@email.com,Adam Doe,Adam,Doe
0,coreymschafer@gmail.com,Chris Schafer,Chris,Schafer
2,johndoe@email.com,John Smith,John,Smith


In [65]:
# Same sort in descending order.
data.sort_values(by="last", ascending=False)

Unnamed: 0,email,full_name,first,last
2,johndoe@email.com,John Smith,John,Smith
0,coreymschafer@gmail.com,Chris Schafer,Chris,Schafer
1,janedoe@email.com,Mary Doe,Mary,Doe
3,adamdoe@email.com,Adam Doe,Adam,Doe


In [66]:
# Sort by last name descending, breaking ties by first name ascending; the two
# lists pair up position by position. Reassigned instead of inplace=True to
# avoid in-place mutation.
data = data.sort_values(by=["last", "first"], ascending=[False, True])

In [67]:
data

Unnamed: 0,email,full_name,first,last
2,johndoe@email.com,John Smith,John,Smith
0,coreymschafer@gmail.com,Chris Schafer,Chris,Schafer
3,adamdoe@email.com,Adam Doe,Adam,Doe
1,janedoe@email.com,Mary Doe,Mary,Doe


In [68]:
# Display the rows ordered by index label, i.e. back in 0..3 order.
data.sort_index()

Unnamed: 0,email,full_name,first,last
0,coreymschafer@gmail.com,Chris Schafer,Chris,Schafer
1,janedoe@email.com,Mary Doe,Mary,Doe
2,johndoe@email.com,John Smith,John,Smith
3,adamdoe@email.com,Adam Doe,Adam,Doe


In [69]:
# Second sample set with several kinds of "missing" entries mixed in: np.nan,
# None, and the placeholder strings 'NA' and 'Missing'. Ages are stored as strings.
people = dict(
    first=["Corey", "Jane", "John", "Chris", np.nan, None, "NA"],
    last=["Schafer", "Doe", "Doe", "Schafer", np.nan, np.nan, "Missing"],
    email=[
        "CoreyMSchafer@gmail.com",
        "JaneDoe@email.com",
        "JohnDoe@email.com",
        None,
        np.nan,
        "Anonymous@email.com",
        "NA",
    ],
    age=["33", "55", "63", "36", None, None, "Missing"],
)

In [70]:
# Build the frame, then turn the placeholder strings "NA" and "Missing" into
# real missing values in a single pass. The call is chained and its result
# assigned, rather than mutating the frame twice with inplace=True.
data = pd.DataFrame(people).replace(["NA", "Missing"], np.nan)

In [71]:
data

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33.0
1,Jane,Doe,JaneDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,63.0
3,Chris,Schafer,,36.0
4,,,,
5,,,Anonymous@email.com,
6,,,,


In [72]:
# With no arguments, dropna removes every row that has at least one missing value.
data.dropna()

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63


In [73]:
# The same call with axis and how spelled out; the output matches plain dropna().
data.dropna(axis="index", how="any")

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63


In [74]:
# how="all" drops a row only when every value in it is missing (rows 4 and 6).
data.dropna(axis="index", how="all")

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33.0
1,Jane,Doe,JaneDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,63.0
3,Chris,Schafer,,36.0
5,,,Anonymous@email.com,


In [75]:
# axis="columns" drops columns instead of rows; every column here has a missing
# value, so no columns remain and only the index is shown.
data.dropna(axis="columns", how="any")

0
1
2
3
4
5
6


In [76]:
# subset limits the check to the listed columns: keep only rows that have an email.
data.dropna(axis="index", how="any", subset=["email"])

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33.0
1,Jane,Doe,JaneDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,63.0
5,,,Anonymous@email.com,


In [77]:
# With how="all" and a subset, a row is dropped only if both last and email are missing.
data.dropna(axis="index", how="all", subset=["last", "email"])

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33.0
1,Jane,Doe,JaneDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,63.0
3,Chris,Schafer,,36.0
5,,,Anonymous@email.com,


In [78]:
# Boolean frame of the same shape: True wherever a value is missing.
data.isna()

Unnamed: 0,first,last,email,age
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,True,False
4,True,True,True,True
5,True,True,False,True
6,True,True,True,True


In [79]:
# Display the frame with every missing value replaced by the given string.
data.fillna("Missing")

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,Missing,36
4,Missing,Missing,Missing,Missing
5,Missing,Missing,Anonymous@email.com,Missing
6,Missing,Missing,Missing,Missing


In [80]:
# A numeric fill value works too, but note that 0 also lands in the text columns.
data.fillna(0)

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,0,36
4,0,0,0,0
5,0,0,Anonymous@email.com,0
6,0,0,0,0


In [81]:
# Per-column dtypes: age was entered as strings, so it is object like the rest.
data.dtypes

first    object
last     object
email    object
age      object
dtype: object

In [82]:
# Convert the numeric strings to numbers. float is used because the column
# contains NaN, which a plain integer dtype cannot hold.
data["age"] = data["age"].astype(float)

In [83]:
# Confirm the conversion: age is now float64.
data.dtypes

first     object
last      object
email     object
age      float64
dtype: object

In [84]:
# mean skips the missing ages: (33 + 55 + 63 + 36) / 4 = 46.75.
data["age"].mean()

46.75