In [None]:
# Update one column for every row that matches a condition.
# `x` is a boolean mask (one True/False per row); `y` is a placeholder for the
# label of the column to overwrite.
x = (df['z'] == "xyz")
# `.at` addresses exactly one cell by scalar row/column labels, so it cannot
# take a boolean mask; `.loc` is the indexer that accepts one.
df.loc[x, y] = "xyz"

# Update every value of a column at once.
# This expression only builds a lower-cased copy of the column ...
df['x'].str.lower()
# ... so assign the result back to actually change the DataFrame.
df['x'] = df['x'].str.lower()

In [2]:
# Initialise the sample data: one list of values per column.
people = dict(
    first=["Bilal", "Alan", "Hui", "Sassy"],
    last=["Waleed", "Thompson", "Chen", "Chen"],
    email=[
        "bwaleed@gmail.com",
        "athompson@email.com",
        "hchen@email.com",
        "schen@email.com",
    ],
)

In [3]:
import pandas as pd
# Turn the column-oriented dict into a DataFrame, then display it.
df = pd.DataFrame(data=people)
df

Unnamed: 0,first,last,email
0,Bilal,Waleed,bwaleed@gmail.com
1,Alan,Thompson,athompson@email.com
2,Hui,Chen,hchen@email.com
3,Sassy,Chen,schen@email.com


In [None]:
# ============================================================================
# ============================= Update operation =============================
# ============================================================================

# Current column labels (not displayed: only a cell's last expression is shown).
df.columns

# Relabel every column at once by assigning a complete list of new names.
new_column_names = ['first_name', 'last_name', 'email']
df.columns = new_column_names

In [None]:
# Display the frame to check its column labels.
df

In [None]:
# Lower-case every column label using the vectorised string accessor on the Index.
df.columns = df.columns.str.lower()
df

In [None]:
# Rename selected columns only: keys are the current labels, values the new ones.
column_renames = {'first_name': 'first', 'last_name': 'last'}
df.rename(columns=column_renames, inplace=True)
df

In [None]:
# Build the lower-cased email column first, then write it back into the frame.
lowercased_emails = df['email'].str.lower()
df['email'] = lowercased_emails
df

In [None]:
# Four methods are commonly used to update values:
# apply
# applymap
# map
# replace

# apply - available on both DataFrame and Series.
# On a Series it calls the function once per value; here: the length of each email.
email_lengths = df['email'].apply(len)
email_lengths

In [None]:
# apply with a named function, used on a whole Series
def update_upper(email):
    """Return ``email`` converted to upper case."""
    uppercased = email.upper()
    return uppercased

# Pass the function object itself (no parentheses): apply is what calls it, once per value.
upper_emails = df['email'].apply(update_upper)
df['email'] = upper_emails

In [None]:
# Display the frame to inspect the email column.
df

In [None]:
# apply with an anonymous (lambda) function instead of a named one
df['email'] = df['email'].apply(lambda address: address.lower())

In [None]:
# Display the frame to inspect the email column.
df

In [None]:
# On a Series, apply calls the function once per value.
# On a DataFrame, it calls the function once per column (each column arrives as a Series),
# so len() gives the number of rows in each column.
print("Apply method for each val")
email_lengths = df['email'].apply(len)
print(email_lengths)
print("\n")
print("Apply method for the DF")
column_lengths = df.apply(len)
print(column_lengths)

In [None]:
# apply() also takes an `axis` argument.
# The default (axis=0 / 'index') hands each column to the function;
# axis=1 / 'columns' hands each row instead, so len() counts the fields per row.
df.apply(len, axis=1)

In [None]:
# Each column is handed to Series.min, giving one minimum per column.
df.apply(pd.Series.min)
# For string columns the minimum is the value that comes first alphabetically.

In [None]:
# The same per-column minimum, written with a lambda.
# `column` is a Series here, which is why the .min() method is available.
df.apply(lambda column: column.min())

In [None]:
# applymap - calls the function on every individual value of a DataFrame.
# It is a DataFrame-only method; a Series has no applymap.
# NOTE(review): newer pandas releases deprecate `applymap` in favour of `DataFrame.map` -
# confirm against the installed pandas version.
df.applymap(len)

In [None]:
# Example: lower-case every value in the DataFrame.
# str.lower is called on each cell, so this only works while every cell is a string.
df.applymap(str.lower)

In [None]:
# map - Series only: substitutes each value with the one it is mapped to.
new_info = dict(Bilal='Bilall', Alan='Darren')

first_names = df['first']
first_names.map(new_info)

In [None]:
# With map, first names that are not keys of the dict (e.g. 'Hui') become NaN,
# which is not what we want.
# replace only touches the listed values and leaves every other value as it is.

new_info = {'Bilal': 'Bilall', 'Alan': 'Darren'}

df['first'].replace(to_replace=new_info)

In [None]:
# Make the change permanent by writing the replaced column back into the frame.
replaced_first_names = df['first'].replace(to_replace=new_info)
df['first'] = replaced_first_names
df

In [None]:
# ============================================================================
# ========================== Add/ Delete operation ===========================
# ============================================================================

# Add a column: join first and last name with a single space in between.
df['full_name'] = df['first'].str.cat(df['last'], sep=' ')
# Note: a new column must be created with bracket notation. Dot notation
# (df.full_name = ...) would be treated as setting an attribute on the df object.
# A new column can also be derived by running apply on another column.
df

In [None]:
# For instance, the 'first' and 'last' columns are no longer needed.
# Without inplace=True, drop returns a preview copy and leaves df untouched;
# inplace=True is what makes the removal permanent here.
columns_to_drop = ['first', 'last']
df.drop(columns=columns_to_drop, inplace=True)

In [None]:
# Suppose we want the 'first' and 'last' columns back.
# Split the full name on a space; expand=True spreads the pieces over separate columns.
# (Splitting on whitespace is also what str.split does when no separator is given.)
df['full_name'].str.split(pat=' ', expand=True)

In [None]:
# Store the two split pieces as the 'first' and 'last' columns.
name_parts = df['full_name'].str.split(pat=' ', expand=True)
df[['first', 'last']] = name_parts
df

In [None]:
# Add a single row of data.
# `DataFrame.append` was deprecated in pandas 1.4 and removed in 2.0, so the
# record is wrapped in a one-row frame and joined with `pd.concat`, which works
# on old and new pandas versions alike.
# ignore_index=True renumbers the rows, so the new record needs no index value;
# columns missing from the record are filled with NaN.
# The result is a new frame; df itself is not modified.
new_row = pd.DataFrame([{'first': 'Tony'}])
pd.concat([df, new_row], ignore_index=True)

In [None]:
# A second, separate data set with some of the same columns.
theavengers = dict(
    first=['Tony', 'Steve'],
    last=['Stark', 'Rogers'],
    email=['IronMan@avengers.com', 'cap@avengers.com'],
)

df2 = pd.DataFrame(data=theavengers)
df2

In [None]:
# Preview the two frames stacked on top of each other.
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the replacement
# and also works on older versions.
# ignore_index=True renumbers the rows; without it both frames would keep their
# own labels and the index would contain duplicates.
pd.concat([df, df2], ignore_index=True)

In [None]:
# Make the combination permanent.
# `pd.concat` (the replacement for the removed `DataFrame.append`) has no
# inplace option, so the result is assigned back to df.
df = pd.concat([df, df2], ignore_index=True)
df

In [None]:
# Remove a row by its index label.
# This returns a copy without the row labelled 4; df itself is unchanged.
df.drop(4, axis='index')

In [None]:
# Remove rows selected by a condition: build the mask, look up the index labels
# of the matching rows, then drop those labels.
filt = df['last'] == 'Chen'
matching_labels = df.loc[filt].index
df.drop(index=matching_labels)

In [7]:
# ============================================================================
# ============================== Sort operation ==============================
# ============================================================================

# Sort by a single column: here the last name, in descending (Z to A) order.
df.sort_values('last', ascending=False)

Unnamed: 0,first,last,email
0,Bilal,Waleed,bwaleed@gmail.com
1,Alan,Thompson,athompson@email.com
2,Hui,Chen,hchen@email.com
3,Sassy,Chen,schen@email.com


In [8]:
# Sort by several columns by passing a list: ties in 'last' are broken by 'first'.
sort_keys = ['last', 'first']
df.sort_values(by=sort_keys, ascending=False)

Unnamed: 0,first,last,email
0,Bilal,Waleed,bwaleed@gmail.com
1,Alan,Thompson,athompson@email.com
3,Sassy,Chen,schen@email.com
2,Hui,Chen,hchen@email.com


In [13]:
# One direction per sort key: 'last' descending, 'first' ascending.
# inplace=True reorders df itself instead of returning a sorted copy.
sort_keys = ['last', 'first']
sort_directions = [False, True]
df.sort_values(by=sort_keys, ascending=sort_directions, inplace=True)
df

Unnamed: 0,first,last,email
0,Bilal,Waleed,bwaleed@gmail.com
1,Alan,Thompson,athompson@email.com
2,Hui,Chen,hchen@email.com
3,Sassy,Chen,schen@email.com


In [14]:
# Sort by the index labels instead of by column values.
# Returns a sorted copy; df itself keeps its current row order.
df.sort_index()

Unnamed: 0,first,last,email
0,Bilal,Waleed,bwaleed@gmail.com
1,Alan,Thompson,athompson@email.com
2,Hui,Chen,hchen@email.com
3,Sassy,Chen,schen@email.com


In [15]:
# A single column (Series) can be sorted on its own; ascending order is the default.
last_names = df['last']
last_names.sort_values(ascending=True)

2        Chen
3        Chen
1    Thompson
0      Waleed
Name: last, dtype: object

In [29]:
# ============================================================================
# =============================== Data Cleaning ==============================
# ============================================================================

# Sample data with several kinds of "missing" values: np.nan, None, and the
# placeholder strings 'NA' and 'Missing'.
import numpy as np

people2 = dict(
    first=['Corey', 'Jane', 'John', 'Chris', np.nan, None, 'NA'],
    last=['Schafer', 'Doe', 'Doe', 'Schafer', np.nan, np.nan, 'Missing'],
    email=[
        'CoreyMSchafer@gmail.com',
        'JaneDoe@email.com',
        'JohnDoe@email.com',
        None,
        np.nan,
        'Anonymous@email.com',
        'NA',
    ],
    age=['33', '55', '63', '36', None, None, 'Missing'],
)

df = pd.DataFrame(data=people2)
df

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,,36
4,,,,
5,,,Anonymous@email.com,
6,,Missing,,Missing


In [7]:
# solu 1:
# ---------
# Simply remove the incomplete rows.
# The defaults are spelled out: work on rows, and drop a row if ANY value is missing.

df.dropna(axis='index', how='any')

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
6,,Missing,,Missing


In [9]:
# A closer look at dropna()'s arguments.
# The defaults are axis='index' and how='any'; here how='all' is used instead,
# so a row is dropped only when every one of its values is missing.
df.dropna(axis=0, how='all')
# Note: axis can also be 'columns' to drop columns rather than rows.

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,,36
5,,,Anonymous@email.com,
6,,Missing,,Missing


In [17]:
# solu 2:
# -------
# Drop rows based on specific columns only.
# Say 'last' and 'email' are the columns that matter: with how='all' a row is
# dropped only when both of them are missing; the other columns are not checked.

required_columns = ['last', 'email']
df.dropna(axis='index', how='all', subset=required_columns)

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33.0
1,Jane,Doe,JaneDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,63.0
3,Chris,Schafer,,36.0
5,,,Anonymous@email.com,


In [41]:
# solu 3:
# -------
# Handle custom missing-value markers such as 'NA' and 'Missing'.
# The frame is rebuilt from the raw dict and both markers are turned into NaN
# in one pass, right when the frame is created.

df = pd.DataFrame(people2).replace(['NA', 'Missing'], np.nan)
df

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33.0
1,Jane,Doe,JaneDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,63.0
3,Chris,Schafer,,36.0
4,,,,
5,,,Anonymous@email.com,
6,,,,


In [36]:
# Another view of the same data: a same-shaped frame of booleans, True where the cell is missing.
df.isna()

Unnamed: 0,first,last,email,age
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,True,False
4,True,True,True,True
5,True,True,False,True
6,True,True,True,True


In [37]:
# solu 4:
# -------
# Fill missing numeric values with a placeholder such as 0.
# Only the numeric column ('age') is targeted: a blanket fillna(0) would also
# write 0 into the name and email columns.
# The filled copy is displayed, not stored (no inplace=True), so df keeps its
# NaNs and the mean computed further down still skips the missing ages.

df.fillna({'age': 0})

In [42]:
# solu 5:
# -------
# Cast the column to a numeric type so that statistics such as the mean can be computed.

# Note: a column without missing values could be cast to any numeric type.
# With missing values, either replace them first (e.g. with 0) or cast to float:
# NaN is itself a float, so it survives a float cast unchanged.
df['age'] = df['age'].astype('float64')

In [43]:
# Show the data type of every column.
df.dtypes

first     object
last      object
email     object
age      float64
dtype: object

In [44]:
# Average of the 'age' column.
df['age'].mean()

46.75