[Reference](https://pub.towardsai.net/pandas-cheat-sheet-functions-for-data-analysis-2cf4923266aa)

In [1]:
import pandas as pd
data = {'name': ['John', 'Sarah', 'David', 'Rachel'], 
        'age': [24, 29, 32, 27], 
        'city': ['New York', 'Los Angeles', 'Chicago', 'Boston']}
df = pd.DataFrame(data)

# Selecting specific columns of a DataFrame

In [2]:
df[['name', 'age']]

Unnamed: 0,name,age
0,John,24
1,Sarah,29
2,David,32
3,Rachel,27


# Filtering rows based on a condition

In [3]:
df[df['age'] > 25]

Unnamed: 0,name,age,city
1,Sarah,29,Los Angeles
2,David,32,Chicago
3,Rachel,27,Boston


# Sorting a DataFrame by a specific column

In [4]:
df.sort_values('age')

Unnamed: 0,name,age,city
0,John,24,New York
3,Rachel,27,Boston
1,Sarah,29,Los Angeles
2,David,32,Chicago


# Renaming columns in a DataFrame

In [5]:
df.rename(columns={'name': 'full_name', 'city': 'location'})

Unnamed: 0,full_name,age,location
0,John,24,New York
1,Sarah,29,Los Angeles
2,David,32,Chicago
3,Rachel,27,Boston


# Grouping a DataFrame by a column and aggregating another column

In [6]:
df.groupby('city')['age'].mean()

city
Boston         27.0
Chicago        32.0
Los Angeles    29.0
New York       24.0
Name: age, dtype: float64

# Merging two DataFrames based on a common column

In [7]:
data1 = {'name': ['John', 'Sarah', 'David', 'Rachel'], 
         'age': [24, 29, 32, 27], 
         'city': ['New York', 'Los Angeles', 'Chicago', 'Boston']}
df1 = pd.DataFrame(data1)

data2 = {'name': ['John', 'Sarah', 'David', 'Rachel'], 
         'salary': [50000, 60000, 70000, 80000]}
df2 = pd.DataFrame(data2)

merged_df = pd.merge(df1, df2, on='name')

# Adding a new column to a DataFrame

In [8]:
df['gender'] = ['M', 'F', 'M', 'F']

# Handling missing values in a DataFrame

In [9]:
df.dropna()  # drops rows with any missing values
df.fillna(0)  # fills missing values with 0
df.interpolate()  # fills missing values using interpolation

Unnamed: 0,name,age,city,gender
0,John,24,New York,M
1,Sarah,29,Los Angeles,F
2,David,32,Chicago,M
3,Rachel,27,Boston,F


# Saving a DataFrame to a CSV file

In [10]:
df.to_csv('my_dataframe.csv', index=False)

# Selecting rows and columns of a DataFrame using loc

In [11]:
df.loc[1:3, ['name', 'city']]

Unnamed: 0,name,city
1,Sarah,Los Angeles
2,David,Chicago
3,Rachel,Boston


# Pivot a DataFrame using pivot_table

In [12]:
df.pivot_table(index='city', columns='gender', values='age', aggfunc='mean')

gender,F,M
city,Unnamed: 1_level_1,Unnamed: 2_level_1
Boston,27.0,
Chicago,,32.0
Los Angeles,29.0,
New York,,24.0


# Apply a function to a column using apply

In [13]:
def add_one(x):
    return x + 1

df['age_plus_one'] = df['age'].apply(add_one)

# Merge two DataFrames using join

In [14]:
data1 = {'name': ['John', 'Sarah', 'David', 'Rachel'], 
         'age': [24, 29, 32, 27], 
         'city': ['New York', 'Los Angeles', 'Chicago', 'Boston']}
df1 = pd.DataFrame(data1)

data2 = {'salary': [50000, 60000, 70000, 80000]}
df2 = pd.DataFrame(data2, index=['John', 'Sarah', 'David', 'Rachel'])

joined_df = df1.join(df2, on='name')

# Count the occurrences of unique values in a column using value_counts

In [15]:
df['city'].value_counts()

New York       1
Los Angeles    1
Chicago        1
Boston         1
Name: city, dtype: int64

# Reshape a DataFrame using melt

In [16]:
wide_df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]})
wide_df

Unnamed: 0,A,B,C
0,1,4,7
1,2,5,8
2,3,6,9


In [17]:
melted_df = pd.melt(wide_df, id_vars=['A'], value_vars=['B', 'C'])
melted_df

Unnamed: 0,A,variable,value
0,1,B,4
1,2,B,5
2,3,B,6
3,1,C,7
4,2,C,8
5,3,C,9


# Merge two DataFrames using merge on multiple columns

In [18]:
data1 = {'name': ['John', 'Sarah', 'David', 'Rachel'], 
         'age': [24, 29, 32, 27], 
         'city': ['New York', 'Los Angeles', 'Chicago', 'Boston']}
df1 = pd.DataFrame(data1)

data2 = {'name': ['John', 'Sarah', 'David', 'Rachel'], 
         'city': ['New York', 'Los Angeles', 'Chicago', 'Boston'],
         'salary': [50000, 60000, 70000, 80000]}
df2 = pd.DataFrame(data2)

merged_df = pd.merge(df1, df2, on=['name', 'city'])

# Filter a DataFrame by multiple conditions using & (and) and | (or)

In [19]:
df[(df['age'] > 25) & (df['city'] == 'Chicago')]

Unnamed: 0,name,age,city,gender,age_plus_one
2,David,32,Chicago,M,33


# Reshape a DataFrame using stack and unstack

In [20]:
wide_df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]}, index=['a', 'b'])
stacked_df = wide_df.stack()
unstacked_df = stacked_df.unstack()

# Replace values in a column using replace

In [21]:
df['city'] = df['city'].replace({'New York': 'NY', 'Los Angeles': 'LA'})

# Calculate the cumulative sum of a column using cumsum

In [22]:
df['age_cumulative_sum'] = df['age'].cumsum()

# Filter a DataFrame based on the existence of a value in a column using isin

In [23]:
df[df['city'].isin(['New York', 'Chicago'])]

Unnamed: 0,name,age,city,gender,age_plus_one,age_cumulative_sum
2,David,32,Chicago,M,33,85


# Create a new column based on the values of multiple columns using apply and lambda

In [24]:
df['age_location'] = df.apply(lambda row: str(row['age']) + ', ' + row['city'], axis=1)

In [25]:
df['age_location']

0         24, NY
1         29, LA
2    32, Chicago
3     27, Boston
Name: age_location, dtype: object

# Merge two DataFrames using merge with different column names

In [26]:
data1 = {'name': ['John', 'Sarah', 'David', 'Rachel'], 
         'age': [24, 29, 32, 27], 
         'city': ['New York', 'Los Angeles', 'Chicago', 'Boston']}
df1 = pd.DataFrame(data1)

data2 = {'employee_name': ['John', 'Sarah', 'David', 'Rachel'], 
         'salary': [50000, 60000, 70000, 80000]}
df2 = pd.DataFrame(data2)

merged_df = pd.merge(df1, df2, left_on='name', right_on='employee_name')

# Apply a function to a DataFrame using applymap

In [27]:
def format_age(age):
    return str(age) + ' years'

df[['name', 'age']].applymap(format_age)

Unnamed: 0,name,age
0,John years,24 years
1,Sarah years,29 years
2,David years,32 years
3,Rachel years,27 years


In [28]:
df

Unnamed: 0,name,age,city,gender,age_plus_one,age_cumulative_sum,age_location
0,John,24,NY,M,25,24,"24, NY"
1,Sarah,29,LA,F,30,53,"29, LA"
2,David,32,Chicago,M,33,85,"32, Chicago"
3,Rachel,27,Boston,F,28,112,"27, Boston"


# Merge two DataFrames using merge with a non-default join type

In [31]:
data1 = {'name': ['John', 'Sarah', 'David', 'Rachel'], 
         'age': [24, 29, 32, 27], 
         'city': ['New York', 'Los Angeles', 'Chicago', 'Boston']}
df1 = pd.DataFrame(data1)

data2 = {'name': ['John', 'Sarah', 'David'], 
         'salary': [50000, 60000, 70000]}
df2 = pd.DataFrame(data2)

merged_df = pd.merge(df1, df2, on='name', how='left')

# Calculate summary statistics for a column using describe

In [30]:
df['age'].describe()

count     4.000000
mean     28.000000
std       3.366502
min      24.000000
25%      26.250000
50%      28.000000
75%      29.750000
max      32.000000
Name: age, dtype: float64

# Create a new column based on a condition using np.where

In [32]:
import numpy as np

df['age_category'] = np.where(df['age'] < 30, 'young', 'old')

# Reshape a DataFrame using crosstab

In [33]:
df['age_category'] = np.where(df['age'] < 30, 'young', 'old')
pd.crosstab(df['city'], df['age_category'])

age_category,old,young
city,Unnamed: 1_level_1,Unnamed: 2_level_1
Boston,0,1
Chicago,1,0
LA,0,1
NY,0,1


# Select a random sample of rows from a DataFrame using sample

In [34]:
df.sample(n=3)

Unnamed: 0,name,age,city,gender,age_plus_one,age_cumulative_sum,age_location,age_category
3,Rachel,27,Boston,F,28,112,"27, Boston",young
1,Sarah,29,LA,F,30,53,"29, LA",young
2,David,32,Chicago,M,33,85,"32, Chicago",old


# Calculate the correlation between columns using corr

In [35]:
df.corr()

Unnamed: 0,age,age_plus_one,age_cumulative_sum
age,1.0,1.0,0.479123
age_plus_one,1.0,1.0,0.479123
age_cumulative_sum,0.479123,0.479123,1.0


# Create a new column based on a rolling window calculation using rolling

In [36]:
df['rolling_mean_age'] = df['age'].rolling(window=2).mean()

# Pivot a DataFrame using pivot

In [37]:
df.pivot(index='name', columns='city', values='age')

city,Boston,Chicago,LA,NY
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
David,,32.0,,
John,,,,24.0
Rachel,27.0,,,
Sarah,,,29.0,


# Sort a DataFrame by one or more columns using sort_values

In [38]:
df.sort_values(by=['city', 'age'], ascending=[True, False])

Unnamed: 0,name,age,city,gender,age_plus_one,age_cumulative_sum,age_location,age_category,rolling_mean_age
3,Rachel,27,Boston,F,28,112,"27, Boston",young,29.5
2,David,32,Chicago,M,33,85,"32, Chicago",old,30.5
1,Sarah,29,LA,F,30,53,"29, LA",young,26.5
0,John,24,NY,M,25,24,"24, NY",young,


# Create a new column based on a group-wise calculation using groupby and transform

In [39]:
df['age_mean_by_city'] = df.groupby('city')['age'].transform('mean')

# Filter a DataFrame based on the existence of null values using isnull

In [40]:
df[df.isnull().any(axis=1)]

Unnamed: 0,name,age,city,gender,age_plus_one,age_cumulative_sum,age_location,age_category,rolling_mean_age,age_mean_by_city
0,John,24,NY,M,25,24,"24, NY",young,,24.0


# Create a new column based on a group-wise calculation using groupby and apply

In [42]:
# def age_diff(group):
#     return group['age'] - group['age'].mean()

# df['age_diff_from_mean_by_city'] = df.groupby('city').apply(age_diff)

# Create a new column based on the difference between consecutive rows using diff

In [43]:
df['age_diff'] = df['age'].diff()

# Create a new column based on a group-wise calculation using groupby and agg

In [44]:
df.groupby('city').agg({'age': 'mean', 'salary': 'sum'})

KeyError: ignored

# Filter a DataFrame based on the non-existence of null values using notnull

In [45]:
df[df.notnull().all(axis=1)]

Unnamed: 0,name,age,city,gender,age_plus_one,age_cumulative_sum,age_location,age_category,rolling_mean_age,age_mean_by_city,age_diff
1,Sarah,29,LA,F,30,53,"29, LA",young,26.5,29.0,5.0
2,David,32,Chicago,M,33,85,"32, Chicago",old,30.5,32.0,3.0
3,Rachel,27,Boston,F,28,112,"27, Boston",young,29.5,27.0,-5.0


# Create a new column based on a group-wise calculation using groupby and filter

In [46]:
def has_young_member(group):
    return (group['age'] < 30).any()

df.groupby('city').filter(has_young_member)

Unnamed: 0,name,age,city,gender,age_plus_one,age_cumulative_sum,age_location,age_category,rolling_mean_age,age_mean_by_city,age_diff
0,John,24,NY,M,25,24,"24, NY",young,,24.0,
1,Sarah,29,LA,F,30,53,"29, LA",young,26.5,29.0,5.0
3,Rachel,27,Boston,F,28,112,"27, Boston",young,29.5,27.0,-5.0


# Create a new column based on a group-wise calculation using groupby and pivot_table

In [47]:
pd.pivot_table(df, index='city', values='age', aggfunc=np.mean)

Unnamed: 0_level_0,age
city,Unnamed: 1_level_1
Boston,27
Chicago,32
LA,29
NY,24


# Filter a DataFrame based on a regular expression using str.contains

In [48]:
df[df['city'].str.contains('New|Los')]

Unnamed: 0,name,age,city,gender,age_plus_one,age_cumulative_sum,age_location,age_category,rolling_mean_age,age_mean_by_city,age_diff
