Pandas is a powerful data manipulation and analysis library for Python

Create DataFrame using dictionary:

In [4]:
import pandas as pd

data = {'Name': ['Alice', 'Bob', 'Charlie'],'Age': [25, 30, 35]}
df = pd.DataFrame(data)
print(df)

      Name  Age
0    Alice   25
1      Bob   30
2  Charlie   35


In [5]:
import pandas as pd
age_series = pd.Series([25, 30, 35], index=['Alice', 'Bob', 'Charlie'])
print(age_series)

Alice      25
Bob        30
Charlie    35
dtype: int64


 Reading CSV Files

In [10]:
import pandas as pd 
df = pd.read_csv(r"people_data.csv")
print(df)

  First Name Last Name     Sex                       Email Date of birth  \
0     Shelby   Terrell    Male        elijah57@example.net    1945-10-26   
1    Phillip   Summers  Female       bethany14@example.com    1910-03-24   
2   Kristine    Travis    Male       bthompson@example.com    1992-07-02   
3    Yesenia  Martinez    Male   kaitlinkaiser@example.com    2017-08-03   
4       Lori      Todd    Male  buchananmanuel@example.net    1938-12-01   

            Job Title  
0     Games developer  
1      Phytotherapist  
2           Homeopath  
3   Market researcher  
4  Veterinary surgeon  


In [11]:
import pandas as pd

data = {'Name': ['Alice', 'Bob', 'Charlie'],'Age': [25, 30, 35]}
df = pd.DataFrame(data)

# Select a single column
age_column = df['Age']
print(age_column)

0    25
1    30
2    35
Name: Age, dtype: int64


In [12]:
import pandas as pd

data = {'Name': ['Alice', 'Bob', 'Charlie'], 'Age': [25, 30, 35]}
df = pd.DataFrame(data)

# Using .loc[] to select rows by label
row_by_label = df.loc[1]  # Selects the row with index label 1 (Bob's data)

# Using .iloc[] to select rows by position
row_by_position = df.iloc[1]  # Selects the second row (Bob's data)

print("Row by label:\n", row_by_label)
print("Row by position:\n", row_by_position)

Row by label:
 Name    Bob
Age      30
Name: 1, dtype: object
Row by position:
 Name    Bob
Age      30
Name: 1, dtype: object


In [2]:
import pandas as pd

# Create a dictionary
data = {
    "Name": ["Alice", "Bob", "Charlie"],
    "Age": [25, 30, 35],
    "City": ["New York", "Los Angeles", "Chicago"]
}

# Create a DataFrame
df = pd.DataFrame(data)

# Display the DataFrame
print(df)


      Name  Age         City
0    Alice   25     New York
1      Bob   30  Los Angeles
2  Charlie   35      Chicago


In [13]:
import pandas as pd

data = {'Name': ['Alice', 'Bob', 'Charlie'],'Age': [25, 30, 35]}
df = pd.DataFrame(data)

# Filtering rows where Age is greater than 30
filtered_df = df[df['Age'] > 28]
print(filtered_df)

      Name  Age
1      Bob   30
2  Charlie   35


In [14]:
import pandas as pd

df = pd.DataFrame({'Name': ['Alice', 'Bob', 'Charlie'], 'Age': [25, 30, 35]})

# Adding a new column with a list of values
df['Salary'] = [50000, 60000, 70000]
print(df)

      Name  Age  Salary
0    Alice   25   50000
1      Bob   30   60000
2  Charlie   35   70000


In [15]:
import pandas as pd

df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]})

# Renaming columns
df.rename(columns={'A': 'X', 'B': 'Y', 'C': 'Z'}, inplace=True)
print(df)

   X  Y  Z
0  1  4  7
1  2  5  8
2  3  6  9


In [16]:
import pandas as pd

# Sample DataFrame
data = {'A': [10, 20, 30], 'B': [40, 50, 60]}
df = pd.DataFrame(data)

# Reindex rows
new_index = [0, 1, 2, 3]
df_reindexed = df.reindex(new_index)
print("Reindexed Rows:\n", df_reindexed)

Reindexed Rows:
       A     B
0  10.0  40.0
1  20.0  50.0
2  30.0  60.0
3   NaN   NaN


In [17]:
import pandas as pd
import numpy as np
df = pd.DataFrame({'Col1': [1, 2, np.nan],'Col2': [3, np.nan, np.nan]})

# Check for missing values
print(df.isnull())

# Output:
#     Col1   Col2
# 0  False  False
# 1  False   True
# 2   True   True


    Col1   Col2
0  False  False
1  False   True
2   True   True


In [18]:
import pandas as pd
import numpy as np
df = pd.DataFrame({'Col1': [1, 2, np.nan],'Col2': [3, np.nan, np.nan]})
df.fillna(0)
print(df)

   Col1  Col2
0   1.0   3.0
1   2.0   NaN
2   NaN   NaN


In [19]:
import pandas as pd
data = {'Name': ['Alice', 'Bob', None], 'Age': [25, None, 35]}
df = pd.DataFrame(data)

# Drop rows with missing values
df_dropped = df.dropna()
print(df_dropped)

    Name   Age
0  Alice  25.0


In [20]:
import pandas as pd
data = {'Category': ['A', 'B', 'A', 'B'], 'Value': [10, 20, 30, 40]}
df = pd.DataFrame(data)

# Group data by 'Category' and calculate the sum
grouped_sum = df.groupby('Category').sum()
print("Sum:\n", grouped_sum)

# Compute the mean for each group
grouped_mean = df.groupby('Category')['Value'].mean()
print("\nMean:\n", grouped_mean)

# Aggregate using multiple functions
grouped_agg = df.groupby('Category').agg(['sum', 'mean'])
print("\nAggregated:\n", grouped_agg)


Sum:
           Value
Category       
A            40
B            60

Mean:
 Category
A    20.0
B    30.0
Name: Value, dtype: float64

Aggregated:
          Value      
           sum  mean
Category            
A           40  20.0
B           60  30.0


In [26]:
import pandas as pd
data = {'Name': ['Alice', 'Charlie', 'Edward', 'Grace'],'Years_Experience': [2, 3, 4, 6],'Role': ['Manager', 'Analyst', 'Developer', 'HR']}
df = pd.DataFrame(data)
print(" intial  data ")
print(df)

# New DataFrame to concatenate
new_data = {'Name': ['John', 'Lily'],'Years_Experience': [5, 3],'Role': ['Designer', 'Developer']}
new_df = pd.DataFrame(new_data)
print("  new  data")
print(new_df)

# Concatenate the original and new DataFrames along rows (axis=0)
concatenated_df = pd.concat([df, new_df], axis=0, ignore_index=True)

print("Concatenated DataFrame:\n", concatenated_df)

 intial  data 
      Name  Years_Experience       Role
0    Alice                 2    Manager
1  Charlie                 3    Analyst
2   Edward                 4  Developer
3    Grace                 6         HR
  new  data
   Name  Years_Experience       Role
0  John                 5   Designer
1  Lily                 3  Developer
Concatenated DataFrame:
       Name  Years_Experience       Role
0    Alice                 2    Manager
1  Charlie                 3    Analyst
2   Edward                 4  Developer
3    Grace                 6         HR
4     John                 5   Designer
5     Lily                 3  Developer


Reshaping Data in Pandas

In [27]:
import pandas as pd
data = {'Date': ['2024-10-01', '2024-10-01', '2024-10-02', '2024-10-02'],'Category': ['A', 'B', 'A', 'B'],'Values': [10, 20, 15, 25]}
df = pd.DataFrame(data)

pivot_table = df.pivot_table(values='Values', index='Date', columns='Category', aggfunc='sum')
print(pivot_table)

Category     A   B
Date              
2024-10-01  10  20
2024-10-02  15  25


 Melting and Unmelting: Multiple columns are combined into a single key-value pair.

In [28]:
import pandas as pd
data = {'Date': ['2024-10-01', '2024-10-01', '2024-10-02', '2024-10-02'],'Category': [10, None, 15, None],'Category_B': [None, 20, None, 25]}
df = pd.DataFrame(data)

# Melt data from wide to long format
melt_df = df.melt(id_vars='Date', var_name='Category', value_name='Values')
print(melt_df)

         Date    Category  Values
0  2024-10-01    Category    10.0
1  2024-10-01    Category     NaN
2  2024-10-02    Category    15.0
3  2024-10-02    Category     NaN
4  2024-10-01  Category_B     NaN
5  2024-10-01  Category_B    20.0
6  2024-10-02  Category_B     NaN
7  2024-10-02  Category_B    25.0


 Stacking and Unstacking With Pandas:

In [29]:
import pandas as pd
data = {
    ('Sales', 'Q1'): [100, 150],
    ('Sales', 'Q2'): [200, 250],
    ('Expenses', 'Q1'): [50, 70],
    ('Expenses', 'Q2'): [80, 90]
}

index = ['Product_A', 'Product_B']
df = pd.DataFrame(data, index=index)
print(df)

stacked_df = df.stack() # Columns → Rows
print(stacked_df)

unstacked_df = stacked_df.unstack() # Rows → Columns
print(unstacked_df)

          Sales      Expenses    
             Q1   Q2       Q1  Q2
Product_A   100  200       50  80
Product_B   150  250       70  90
              Sales  Expenses
Product_A Q1    100        50
          Q2    200        80
Product_B Q1    150        70
          Q2    250        90
          Sales      Expenses    
             Q1   Q2       Q1  Q2
Product_A   100  200       50  80
Product_B   150  250       70  90


  stacked_df = df.stack() # Columns → Rows


Create Pandas Dataframe from 2D List using pd.DataFrame()

In [30]:
# import pandas as pd 
import pandas as pd  
    
# List1  
lst = [['Geek', 25], ['is', 30], 
       ['for', 26], ['Geeksforgeeks', 22]] 

# creating df object with columns specified    
df = pd.DataFrame(lst, columns =['Tag', 'number']) 
print(df )

             Tag  number
0           Geek      25
1             is      30
2            for      26
3  Geeksforgeeks      22


Create Pandas Dataframe from 2D List using

In [31]:
import pandas as pd

# Two-dimensional list
data = [['Geek1', 28, 'Analyst'],
        ['Geek2', 35, 'Manager'],
        ['Geek3', 29, 'Developer']]

# Column names
columns = ['Name', 'Age', 'Occupation']

# Creating DataFrame using pd.DataFrame.from_records()
df = pd.DataFrame.from_records(data, columns=columns)

# Displaying the DataFrame
print(df)

    Name  Age Occupation
0  Geek1   28    Analyst
1  Geek2   35    Manager
2  Geek3   29  Developer


In [32]:
import pandas as pd 
 
# initialize list of lists 
data = [['Geeks', 10], ['for', 15], ['geeks', 20]] 
 
# Create the pandas DataFrame 
df = pd.DataFrame(data, columns = ['Name', 'Age']) 
 
# print dataframe. 
print(df)

    Name  Age
0  Geeks   10
1    for   15
2  geeks   20


In [33]:
# Import pandas library 
import pandas as pd 
 
# initialize list of lists 
data = [['DS', 'Linked_list', 10], ['DS', 'Stack', 9], ['DS', 'Queue', 7],
        ['Algo', 'Greedy', 8], ['Algo', 'DP', 6], ['Algo', 'BackTrack', 5], ] 
 
# Create the pandas DataFrame 
df = pd.DataFrame(data, columns = ['Category', 'Name', 'Marks']) 
 
# print dataframe. 
print(df)

  Category         Name  Marks
0       DS  Linked_list     10
1       DS        Stack      9
2       DS        Queue      7
3     Algo       Greedy      8
4     Algo           DP      6
5     Algo    BackTrack      5


In [34]:
import pandas as pd
import numpy as np
 
# Creating a DataFrame with missing values from a list of lists
data = [['Geek1', 28, 'Engineer'],
        ['Geek2', None, 'Data Scientist'],
        ['Geek3', 32, None]]
 
columns = ['Name', 'Age', 'Occupation']
 
df = pd.DataFrame(data, columns=columns)
df = df.replace({None: np.nan})  # Replacing None with NaN for missing values
print(df)

    Name   Age      Occupation
0  Geek1  28.0        Engineer
1  Geek2   NaN  Data Scientist
2  Geek3  32.0             NaN


In [35]:
import pandas as pd
 
# Creating a DataFrame with different data types from a list of lists
data = [['Geek1', 28, 'Engineer'],
        ['Geek2', 25, 'Data Scientist'],
        ['Geek3', '32', 'Manager']]  # Age represented as a string
 
columns = ['Name', 'Age', 'Occupation']
 
df = pd.DataFrame(data, columns=columns)
df['Age'] = pd.to_numeric(df['Age'], errors='coerce')  # Convert 'Age' column to numeric, handling errors
print(df)

    Name  Age      Occupation
0  Geek1   28        Engineer
1  Geek2   25  Data Scientist
2  Geek3   32         Manager


In [36]:
import pandas as pd 
 
# initialize list of lists 
data = [[1, 5, 10], [2, 6, 9], [3, 7, 8]] 
 
# Create the pandas DataFrame 
df = pd.DataFrame(data)
 
# specifying column names
df.columns = ['Col_1', 'Col_2', 'Col_3']
 
# print dataframe. 
print(df, "\n")
 
# transpose of dataframe
df = df.transpose()
print("Transpose of above dataframe is-\n", df)

   Col_1  Col_2  Col_3
0      1      5     10
1      2      6      9
2      3      7      8 

Transpose of above dataframe is-
         0  1  2
Col_1   1  2  3
Col_2   5  6  7
Col_3  10  9  8


Creating a Pandas dataframe using list of tuples

In [37]:
# import pandas to use pandas DataFrame
import pandas as pd

# data in the form of list of tuples
data = [('Peter', 18, 7),
        ('Riff', 15, 6),
        ('John', 17, 8),
        ('Michel', 18, 7),
        ('Sheli', 17, 5) ]

# create DataFrame using data
df = pd.DataFrame(data, columns =['Name', 'Age', 'Score'])

print(df)

     Name  Age  Score
0   Peter   18      7
1    Riff   15      6
2    John   17      8
3  Michel   18      7
4   Sheli   17      5


Create a Pandas DataFrame from List of Dicts

In [38]:
import pandas as pd  
    
# Initialise data to lists.  
data = [{'Geeks': 'dataframe', 'For': 'using', 'geeks': 'list'}, 
        {'Geeks':10, 'For': 20, 'geeks': 30}]  
  
df = pd.DataFrame.from_records(data,index=['1', '2']) 
print(df)

       Geeks    For geeks
1  dataframe  using  list
2         10     20    30


In [39]:
# Creating a Series from a list
series = pd.Series([10, 20, 30, 40])
print(series)


0    10
1    20
2    30
3    40
dtype: int64


In [40]:
# Creating a DataFrame from a dictionary
data = {'Name': ['Alice', 'Bob', 'Charlie'], 'Age': [25, 30, 35]}
df = pd.DataFrame(data)
print(df)


      Name  Age
0    Alice   25
1      Bob   30
2  Charlie   35


In [41]:
# DataFrame from a list of lists
data = [['Alice', 25], ['Bob', 30], ['Charlie', 35]]
df = pd.DataFrame(data, columns=['Name', 'Age'])
print(df)


      Name  Age
0    Alice   25
1      Bob   30
2  Charlie   35


In [42]:
# DataFrame from a dictionary
data = {'Name': ['Alice', 'Bob'], 'Age': [25, 30]}
df = pd.DataFrame(data)
print(df)


    Name  Age
0  Alice   25
1    Bob   30


In [44]:
# Reading from a CSV file
df = pd.read_csv('people_data.csv')
print(df.head())


  First Name Last Name     Sex                       Email Date of birth  \
0     Shelby   Terrell    Male        elijah57@example.net    1945-10-26   
1    Phillip   Summers  Female       bethany14@example.com    1910-03-24   
2   Kristine    Travis    Male       bthompson@example.com    1992-07-02   
3    Yesenia  Martinez    Male   kaitlinkaiser@example.com    2017-08-03   
4       Lori      Todd    Male  buchananmanuel@example.net    1938-12-01   

            Job Title  
0     Games developer  
1      Phytotherapist  
2           Homeopath  
3   Market researcher  
4  Veterinary surgeon  


In [45]:
# Inspecting the DataFrame
print(df.head())
print(df.tail())
print(df.info())
print(df.describe())
print(df.shape)
print(df.columns)


  First Name Last Name     Sex                       Email Date of birth  \
0     Shelby   Terrell    Male        elijah57@example.net    1945-10-26   
1    Phillip   Summers  Female       bethany14@example.com    1910-03-24   
2   Kristine    Travis    Male       bthompson@example.com    1992-07-02   
3    Yesenia  Martinez    Male   kaitlinkaiser@example.com    2017-08-03   
4       Lori      Todd    Male  buchananmanuel@example.net    1938-12-01   

            Job Title  
0     Games developer  
1      Phytotherapist  
2           Homeopath  
3   Market researcher  
4  Veterinary surgeon  
  First Name Last Name     Sex                       Email Date of birth  \
0     Shelby   Terrell    Male        elijah57@example.net    1945-10-26   
1    Phillip   Summers  Female       bethany14@example.com    1910-03-24   
2   Kristine    Travis    Male       bthompson@example.com    1992-07-02   
3    Yesenia  Martinez    Male   kaitlinkaiser@example.com    2017-08-03   
4       Lori      T

In [47]:
# Check for missing data
print(df.isnull())

# Fill missing data with a default value
df = df.fillna(0)

# Drop rows with missing data
df = df.dropna()


   First Name  Last Name    Sex  Email  Date of birth  Job Title
0       False      False  False  False          False      False
1       False      False  False  False          False      False
2       False      False  False  False          False      False
3       False      False  False  False          False      False
4       False      False  False  False          False      False


In [48]:
# Check for duplicates
print(df.duplicated())

# Remove duplicates
df = df.drop_duplicates()


0    False
1    False
2    False
3    False
4    False
dtype: bool


In [51]:
# Rename a column
df = df.rename(columns={'Sex': 'Full Name'})


In [53]:
# Sort by Age
df = df.sort_values(by='Date of birth', ascending=False)

# Sort by index
df = df.sort_index()
