In [2]:
import pandas as pd
import numpy as np

In [3]:
# Sample frame for the missing-data examples: each column carries two gaps
# (np.nan in the numeric columns, None in the string column).
data_with_na = dict(
    A=[1, 2, np.nan, 4, 5, np.nan],
    B=[np.nan, 2, 3, 4, np.nan, 55],
    C=['x', 'y', None, 'w', 'z', None],
)
df_na = pd.DataFrame(data=data_with_na)

In [4]:
# Show the freshly built frame to see where the missing values sit.
df_na

Unnamed: 0,A,B,C
0,1.0,,x
1,2.0,2.0,y
2,,3.0,
3,4.0,4.0,w
4,5.0,,z
5,,55.0,


In [5]:
# Count the missing entries column by column.
df_na.isna().sum(axis=0)

A    2
B    2
C    2
dtype: int64

In [6]:
# Keep only the rows that have no missing value at all (returns a new frame).
df_na.dropna(axis=0, how='any')

Unnamed: 0,A,B,C
1,2.0,2.0,y
3,4.0,4.0,w


In [8]:
# Keep the rows that have at least two non-missing values.
df_na.dropna(axis=0, thresh=2)

Unnamed: 0,A,B,C
0,1.0,,x
1,2.0,2.0,y
3,4.0,4.0,w
4,5.0,,z


In [11]:
# Replace every missing cell with the same scalar. Note that the string
# column C receives the number 4 as well. The result is a new frame; df_na
# itself is not modified.
df_na.fillna(value=4)

Unnamed: 0,A,B,C
0,1.0,4.0,x
1,2.0,2.0,y
2,4.0,3.0,4
3,4.0,4.0,w
4,5.0,4.0,z
5,4.0,55.0,4


In [16]:
# Impute the gaps in column A with the mean of its observed values and store
# the result back into the frame.
df_na["A"] = df_na["A"].fillna(value=df_na["A"].mean())

In [17]:
# Column A no longer has gaps; columns B and C are untouched.
df_na

Unnamed: 0,A,B,C
0,1.0,,x
1,2.0,2.0,y
2,3.0,3.0,
3,4.0,4.0,w
4,5.0,,z
5,3.0,55.0,


In [18]:
# Every value is deliberately stored as a string so the cells below can
# demonstrate dtype conversion.
df_types = pd.DataFrame(
    dict(
        integers=['1', '2', '3', '4'],
        floats=['1.1', '2.2', '3.3', '4.4'],
        dates=['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04'],
        categories=['A', 'B', 'A', 'C'],
    )
)

In [20]:
# All four columns were built from Python strings, so none has a numeric or
# datetime dtype yet.
df_types.dtypes

integers      object
floats        object
dates         object
categories    object
dtype: object

In [21]:
# The values look numeric / date-like, but they are still strings.
df_types

Unnamed: 0,integers,floats,dates,categories
0,1,1.1,2023-01-01,A
1,2,2.2,2023-01-02,B
2,3,3.3,2023-01-03,A
3,4,4.4,2023-01-04,C


In [26]:
# Convert all three string columns to their proper dtypes in this one cell.
# The dtype listing in the next cell shows float64 and datetime64[ns], so the
# float and date conversions must already have happened by then; doing them
# here keeps the notebook reproducible on a top-to-bottom run. The later
# single-column conversion cells are harmless no-ops on already-converted data.
df_types["integers"] = df_types["integers"].astype("int")
df_types["floats"] = df_types["floats"].astype("float")
df_types["dates"] = pd.to_datetime(df_types["dates"])

In [31]:
# Re-inspect the dtypes after conversion; 'categories' was never converted.
df_types.dtypes

integers               int32
floats               float64
dates         datetime64[ns]
categories            object
dtype: object

In [28]:
# Parse the numeric strings in 'floats' into floating-point numbers.
df_types["floats"] = df_types["floats"].astype(float)

In [30]:
# Parse the ISO date strings into datetime values; unparseable input raises.
df_types["dates"] = pd.to_datetime(df_types["dates"], errors="raise")

In [32]:
# 'three' cannot be parsed as a number; errors='coerce' turns it into NaN
# instead of raising, which is why the new column comes out as floats.
df_mixed = pd.DataFrame(dict(col=['1', '2', 'three', '4']))
df_mixed['col_numeric'] = pd.to_numeric(
    df_mixed['col'],
    errors='coerce',
)
print(df_mixed)

     col  col_numeric
0      1          1.0
1      2          2.0
2  three          NaN
3      4          4.0


In [33]:
# Deliberately messy text: stray whitespace around 'Alice' and mixed casing.
df_str = pd.DataFrame(
    dict(
        names=['  Alice  ', 'BOB', 'charlie', 'DIANA'],
        emails=['alice@email.com', 'bob@COMPANY.COM', 'charlie@email.com', 'diana@company.com'],
    )
)

# Vectorized string methods via the .str accessor. Each call returns a new
# Series and leaves df_str untouched.
print(
    df_str['names'].str.lower(),   # lowercase
    df_str['names'].str.upper(),   # uppercase
    df_str['names'].str.title(),   # title case
    df_str['names'].str.strip(),   # trim leading/trailing whitespace
    df_str['names'].str.len(),     # length of each string
    sep='\n',
)

0      alice  
1          bob
2      charlie
3        diana
Name: names, dtype: object
0      ALICE  
1          BOB
2      CHARLIE
3        DIANA
Name: names, dtype: object
0      Alice  
1          Bob
2      Charlie
3        Diana
Name: names, dtype: object
0      Alice
1        BOB
2    charlie
3      DIANA
Name: names, dtype: object
0    9
1    3
2    7
3    5
Name: names, dtype: int64


In [34]:
# The original frame is unchanged: none of the .str results were assigned back.
df_str

Unnamed: 0,names,emails
0,Alice,alice@email.com
1,BOB,bob@COMPANY.COM
2,charlie,charlie@email.com
3,DIANA,diana@company.com


In [35]:
# Swap every space for an underscore; regex=False states explicitly that the
# pattern is a literal string, not a regular expression.
print(df_str['names'].str.replace(' ', '_', regex=False))

0    __Alice__
1          BOB
2      charlie
3        DIANA
Name: names, dtype: object


In [36]:
# Split each address at '@'; every element becomes a [local, domain] list.
print(df_str['emails'].str.split(pat='@'))

0      [alice, email.com]
1      [bob, COMPANY.COM]
2    [charlie, email.com]
3    [diana, company.com]
Name: emails, dtype: object


In [37]:
# Same split, but expand=True spreads the pieces over separate columns.
print(df_str['emails'].str.split(pat='@', expand=True))

         0            1
0    alice    email.com
1      bob  COMPANY.COM
2  charlie    email.com
3    diana  company.com


In [38]:
# Still the original data: the replace/split results above were only printed.
df_str

Unnamed: 0,names,emails
0,Alice,alice@email.com
1,BOB,bob@COMPANY.COM
2,charlie,charlie@email.com
3,DIANA,diana@company.com


In [39]:
# Sample frame with two fully repeated rows (index 2 repeats 1, index 4
# repeats 3) and several repeated values inside column C.
df_dup = pd.DataFrame(
    dict(
        A=[1, 2, 2, 3, 3, 3, 4, 5, 6, 7],
        B=['x', 'y', 'y', 'z', 'z', 'w', 'a', 'b', 'c', 'd'],
        C=[10, 20, 20, 30, 30, 40, 40, 50, 30, 20],
    )
)


In [40]:
# Inspect the frame before de-duplication.
df_dup

Unnamed: 0,A,B,C
0,1,x,10
1,2,y,20
2,2,y,20
3,3,z,30
4,3,z,30
5,3,w,40
6,4,a,40
7,5,b,50
8,6,c,30
9,7,d,20


In [42]:
# Number of rows that repeat an earlier row across all columns.
df_dup.duplicated(keep='first').sum()

2

In [47]:
# Number of values in column C that already appeared higher up in the column.
df_dup["C"].duplicated(keep="first").sum()

5

In [51]:
# Drop rows that fully repeat an earlier row, keeping the first occurrence.
# Rebinding the name instead of using inplace=True avoids mutating an object
# that earlier cells displayed; the original index labels are kept.
df_dup = df_dup.drop_duplicates()

In [52]:
# Rows 2 and 4 are gone; the remaining rows keep their original index labels.
df_dup

Unnamed: 0,A,B,C
0,1,x,10
1,2,y,20
3,3,z,30
5,3,w,40
6,4,a,40
7,5,b,50
8,6,c,30
9,7,d,20


In [53]:
# Small employee table used for the column-manipulation examples below.
df = pd.DataFrame(
    dict(
        Name=['Alice', 'Bob', 'Charlie', 'mohit', 'suresh'],
        Age=[25, 30, 35, 20, 28],
        Salary=[70000, 80000, 90000, 40000, 78000],
    )
)

In [54]:
# Starting point: three columns, five rows.
df

Unnamed: 0,Name,Age,Salary
0,Alice,25,70000
1,Bob,30,80000
2,Charlie,35,90000
3,mohit,20,40000
4,suresh,28,78000


In [55]:
# New column derived from an existing one with a vectorized string method.
df["Name_upper"] = df["Name"].str.upper()

In [56]:
# Name_upper has been appended as the last column.
df

Unnamed: 0,Name,Age,Salary,Name_upper
0,Alice,25,70000,ALICE
1,Bob,30,80000,BOB
2,Charlie,35,90000,CHARLIE
3,mohit,20,40000,MOHIT
4,suresh,28,78000,SURESH


In [57]:
# Calculated column: the bonus is 10% of the salary.
df['Bonus'] = 0.1 * df['Salary']

In [58]:
# Bonus is a float column because the salary was multiplied by 0.1.
df

Unnamed: 0,Name,Age,Salary,Name_upper,Bonus
0,Alice,25,70000,ALICE,7000.0
1,Bob,30,80000,BOB,8000.0
2,Charlie,35,90000,CHARLIE,9000.0
3,mohit,20,40000,MOHIT,4000.0
4,suresh,28,78000,SURESH,7800.0


In [59]:
# Assigning a scalar gives every row the same value.
df['Department'] = 'IT'

In [60]:
# Every row now carries Department == 'IT'.
df

Unnamed: 0,Name,Age,Salary,Name_upper,Bonus,Department
0,Alice,25,70000,ALICE,7000.0,IT
1,Bob,30,80000,BOB,8000.0,IT
2,Charlie,35,90000,CHARLIE,9000.0,IT
3,mohit,20,40000,MOHIT,4000.0,IT
4,suresh,28,78000,SURESH,7800.0,IT


In [61]:
# Build "Name (Age)" labels: Age is cast to string first, then joined to Name
# with ' (' as separator and closed with ')'.
df['Full_Info'] = df['Name'].str.cat(df['Age'].astype(str), sep=' (') + ')'


In [62]:
# Full_Info combines the Name and Age columns into one label per row.
df

Unnamed: 0,Name,Age,Salary,Name_upper,Bonus,Department,Full_Info
0,Alice,25,70000,ALICE,7000.0,IT,Alice (25)
1,Bob,30,80000,BOB,8000.0,IT,Bob (30)
2,Charlie,35,90000,CHARLIE,9000.0,IT,Charlie (35)
3,mohit,20,40000,MOHIT,4000.0,IT,mohit (20)
4,suresh,28,78000,SURESH,7800.0,IT,suresh (28)


In [63]:
# Add two columns in one assign call. Net_Salary is given as a callable so it
# can read the Tax column created earlier in the same call.
df = df.assign(
    Tax=0.3 * df['Salary'],
    Net_Salary=lambda frame: frame['Salary'] - frame['Tax'],
)

In [64]:
# Tax and Net_Salary have been appended at the end.
df

Unnamed: 0,Name,Age,Salary,Name_upper,Bonus,Department,Full_Info,Tax,Net_Salary
0,Alice,25,70000,ALICE,7000.0,IT,Alice (25),21000.0,49000.0
1,Bob,30,80000,BOB,8000.0,IT,Bob (30),24000.0,56000.0
2,Charlie,35,90000,CHARLIE,9000.0,IT,Charlie (35),27000.0,63000.0
3,mohit,20,40000,MOHIT,4000.0,IT,mohit (20),12000.0,28000.0
4,suresh,28,78000,SURESH,7800.0,IT,suresh (28),23400.0,54600.0


In [65]:
# Insert the ID column at position 1, i.e. directly after 'Name'.
# DataFrame.insert mutates df and raises ValueError when the label already
# exists, so the guard keeps this cell safe to re-run.
if 'Employee_ID' not in df.columns:
    df.insert(1, 'Employee_ID', ['E001', 'E002', 'E003', 'E004', 'E005'])

In [66]:
# Employee_ID now sits between Name and Age.
df

Unnamed: 0,Name,Employee_ID,Age,Salary,Name_upper,Bonus,Department,Full_Info,Tax,Net_Salary
0,Alice,E001,25,70000,ALICE,7000.0,IT,Alice (25),21000.0,49000.0
1,Bob,E002,30,80000,BOB,8000.0,IT,Bob (30),24000.0,56000.0
2,Charlie,E003,35,90000,CHARLIE,9000.0,IT,Charlie (35),27000.0,63000.0
3,mohit,E004,20,40000,MOHIT,4000.0,IT,mohit (20),12000.0,28000.0
4,suresh,E005,28,78000,SURESH,7800.0,IT,suresh (28),23400.0,54600.0


In [68]:
# Insert a second ID column at position 0, i.e. as the very first column.
# The guard keeps the cell re-runnable: DataFrame.insert raises ValueError if
# the label is already present.
if 'Employee_ID1' not in df.columns:
    df.insert(0, 'Employee_ID1', ['E001', 'E002', 'E003', 'E004', 'E005'])

In [69]:
# Employee_ID1 is now the first column; Employee_ID is still present.
df

Unnamed: 0,Employee_ID1,Name,Employee_ID,Age,Salary,Name_upper,Bonus,Department,Full_Info,Tax,Net_Salary
0,E001,Alice,E001,25,70000,ALICE,7000.0,IT,Alice (25),21000.0,49000.0
1,E002,Bob,E002,30,80000,BOB,8000.0,IT,Bob (30),24000.0,56000.0
2,E003,Charlie,E003,35,90000,CHARLIE,9000.0,IT,Charlie (35),27000.0,63000.0
3,E004,mohit,E004,20,40000,MOHIT,4000.0,IT,mohit (20),12000.0,28000.0
4,E005,suresh,E005,28,78000,SURESH,7800.0,IT,suresh (28),23400.0,54600.0


In [71]:
# Remove the redundant ID column. Rebinding instead of inplace=True avoids
# mutating the displayed object, and errors="ignore" keeps the cell safe to
# re-run once the column is already gone.
df = df.drop(columns="Employee_ID", errors="ignore")

In [72]:
# Take a two-column subset and give the columns more descriptive names;
# df itself keeps its original column names.
df_renamed = df.loc[:, ["Name", "Age"]].rename(
    columns=dict(Name='Employee_Name', Age='Employee_Age')
)

In [73]:
# Only the two selected columns appear, under their new names.
df_renamed

Unnamed: 0,Employee_Name,Employee_Age
0,Alice,25
1,Bob,30
2,Charlie,35
3,mohit,20
4,suresh,28


In [75]:
# Keep only the identifying and monetary columns, selected by label rather
# than by position. Labels document what is kept, survive column reordering,
# and make the cell safe to re-run (the positional list would go out of
# bounds on the already-narrowed frame).
df = df.loc[:, ['Employee_ID1', 'Name', 'Age', 'Salary', 'Bonus', 'Tax', 'Net_Salary']]

In [76]:
# Seven columns remain after the selection.
df

Unnamed: 0,Employee_ID1,Name,Age,Salary,Bonus,Tax,Net_Salary
0,E001,Alice,25,70000,7000.0,21000.0,49000.0
1,E002,Bob,30,80000,8000.0,24000.0,56000.0
2,E003,Charlie,35,90000,9000.0,27000.0,63000.0
3,E004,mohit,20,40000,4000.0,12000.0,28000.0
4,E005,suresh,28,78000,7800.0,23400.0,54600.0


In [77]:
# Current column labels, before they are replaced with generic names.
df.columns

Index(['Employee_ID1', 'Name', 'Age', 'Salary', 'Bonus', 'Tax', 'Net_Salary'], dtype='object')

In [78]:
# Replace all labels with positional names: col_0, col_1, ...
df.columns = [f'col_{position}' for position in range(df.shape[1])]

In [79]:
# Confirm the generic labels are in place.
df.columns

Index(['col_0', 'col_1', 'col_2', 'col_3', 'col_4', 'col_5', 'col_6'], dtype='object')

In [80]:
# Sample frame for the sorting and row-dropping examples.
df_sort = pd.DataFrame(
    dict(
        Name=['Alice', 'Bob', 'Charlie', 'Diana'],
        Age=[25, 35, 30, 28],
        Salary=[70000, 90000, 80000, 75000],
        Department=['HR', 'IT', 'Finance', 'HR'],
    )
)

In [81]:
# Unsorted starting point.
df_sort

Unnamed: 0,Name,Age,Salary,Department
0,Alice,25,70000,HR
1,Bob,35,90000,IT
2,Charlie,30,80000,Finance
3,Diana,28,75000,HR


In [82]:
# Youngest first; sort_values returns a new frame and keeps the original
# index labels.
print(df_sort.sort_values(by='Age', ascending=True))

      Name  Age  Salary Department
0    Alice   25   70000         HR
3    Diana   28   75000         HR
2  Charlie   30   80000    Finance
1      Bob   35   90000         IT


In [83]:
# Oldest first.
print(df_sort.sort_values(by='Age', ascending=False))

      Name  Age  Salary Department
1      Bob   35   90000         IT
2  Charlie   30   80000    Finance
3    Diana   28   75000         HR
0    Alice   25   70000         HR


In [84]:
# Drop the row labelled 0. The result is a new frame that is only displayed
# here, not assigned, so df_sort keeps all of its rows.
df_sort.drop(index=0)

Unnamed: 0,Name,Age,Salary,Department
1,Bob,35,90000,IT
2,Charlie,30,80000,Finance
3,Diana,28,75000,HR


In [85]:
# Row 0 is still present: the drop above returned a copy that was not stored.
df_sort

Unnamed: 0,Name,Age,Salary,Department
0,Alice,25,70000,HR
1,Bob,35,90000,IT
2,Charlie,30,80000,Finance
3,Diana,28,75000,HR
