In [1]:
#get data from CSV
import pandas as pd
df = pd.read_csv('data.csv')
df

Unnamed: 0,Name,Age,Score,City,Contact No.
0,Ravi,27.0,,Bangalore,
1,Riya,24.0,72.0,Bangalore,
2,Anita,,,Pune,
3,Akshat,22.0,79.0,Bangalore,
4,Priya,23.0,,Delhi,9.674956e+09
...,...,...,...,...,...
60,Ravi,27.0,,Bangalore,
61,Riya,24.0,72.0,Bangalore,
62,Anita,,,Pune,
63,Akshat,22.0,79.0,Bangalore,


Preprocessing using Pandas library

In [2]:
#detecting missing values
print(df.isnull().sum()) #Check how many values are missing in each column.

# Drop rows where 'Name' is missing
df.dropna(subset=['Name'], inplace=True)
df

Name            4
Age            27
Score          34
City            0
Contact No.    35
dtype: int64


Unnamed: 0,Name,Age,Score,City,Contact No.
0,Ravi,27.0,,Bangalore,
1,Riya,24.0,72.0,Bangalore,
2,Anita,,,Pune,
3,Akshat,22.0,79.0,Bangalore,
4,Priya,23.0,,Delhi,9.674956e+09
...,...,...,...,...,...
60,Ravi,27.0,,Bangalore,
61,Riya,24.0,72.0,Bangalore,
62,Anita,,,Pune,
63,Akshat,22.0,79.0,Bangalore,


In [3]:
#fill missing values
df['Age'].fillna(df['Age'].median(), inplace=True)
df['Score'].fillna(df['Score'].median(), inplace=True)
df["Contact No."].fillna("Not Provided", inplace=True)
# Check if there are still any missing values
print(df.isnull().sum())
df

Name           0
Age            0
Score          0
City           0
Contact No.    0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Score'].fillna(df['Score'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are se

Unnamed: 0,Name,Age,Score,City,Contact No.
0,Ravi,27.0,72.0,Bangalore,Not Provided
1,Riya,24.0,72.0,Bangalore,Not Provided
2,Anita,26.0,72.0,Pune,Not Provided
3,Akshat,22.0,79.0,Bangalore,Not Provided
4,Priya,23.0,72.0,Delhi,9674956245.0
...,...,...,...,...,...
60,Ravi,27.0,72.0,Bangalore,Not Provided
61,Riya,24.0,72.0,Bangalore,Not Provided
62,Anita,26.0,72.0,Pune,Not Provided
63,Akshat,22.0,79.0,Bangalore,Not Provided


In [17]:
#Removing Duplicates
print(df.duplicated().sum())  # Check how many duplicate rows exist
df.drop_duplicates(inplace=True)
df.head()


0


Unnamed: 0,Name,Age,Marks,City,Phone,Grades,Email
0,RaVi,27.0,72.0,BLR,Not Provided,C,ravi@msbc.com
1,Riya,24.0,72.0,BLR,Not Provided,C,riya@msbc.com
2,Anita,26.0,72.0,PN,Not Provided,C,anita@msbc.com
3,AkshaT,22.0,79.0,BLR,Not Provided,B,akshat@msbc.com
4,Priya,23.0,72.0,DL,9674956245.0,C,priya@msbc.com


In [5]:
print(df["City"].unique())
df["City"] = df["City"].astype(str).str.strip().str.title()
print(df["City"].unique())


['Bangalore' 'Pune' 'Delhi' 'Mumbai' 'Kolkata' 'Chennai' 'Hyderabad']
['Bangalore' 'Pune' 'Delhi' 'Mumbai' 'Kolkata' 'Chennai' 'Hyderabad']


In [7]:
# DATA TRANSFORMATION
df.rename(columns={"Contact No.": "Phone", "Score": "Marks"}, inplace=True)
city_map = {"Delhi": "DL", "Mumbai": "MH", "Chennai": "TN", "Bangalore": "BLR", "Hyderabad": "HYD", "Pune": "PN","Kolkata": "KL"}
df["City"] = df["City"].replace(city_map)
print(df)

      Name   Age  Marks City         Phone
0     Ravi  27.0   72.0  BLR  Not Provided
1     Riya  24.0   72.0  BLR  Not Provided
2    Anita  26.0   72.0   PN  Not Provided
3   Akshat  22.0   79.0  BLR  Not Provided
4    Priya  23.0   72.0   DL  9674956245.0
5    Vikas  26.0   72.0   MH  Not Provided
6    Karan  26.0   72.0   DL  9305858927.0
8    Vikas  28.0   72.0   KL  Not Provided
9     Riya  35.0   75.0   KL  Not Provided
10    Riya  24.0   66.0   MH  Not Provided
11   Sonal  26.0   72.0  BLR  9565773465.0
12   Priya  26.0   94.0  BLR  Not Provided
13  Akshat  26.0   58.0   TN  9680884201.0
14    Ravi  26.0   72.0  BLR  9315608720.0
16   Sonal  31.0   72.0   DL  9681640025.0
17   Vikas  26.0   72.0  HYD  Not Provided
18    Riya  26.0   72.0   DL  9324085759.0
19   Anita  30.0   91.0  HYD  9508496377.0
20   Anita  34.0   72.0   DL  9276879595.0
21   Sonal  35.0   59.0  BLR  Not Provided
22   Karan  31.0   67.0  BLR  Not Provided
23    Amit  26.0  100.0   TN  Not Provided
24    Ravi 

In [10]:
#Applying Functions
def grades(Marks):
    if Marks >= 85:
        return 'A'
    elif Marks >= 75:
        return 'B'
    elif Marks >= 65:
        return 'C'
    elif Marks >= 55:
        return 'D'
    else:
        return 'Fail'

df['Grades']=df["Marks"].apply(grades)

In [14]:
#Clean or modify text data using .str accessor
df["Name"] = df["Name"].str.title()               # Capitalize each word
df["City"] = df["City"].str.strip()               # Remove spaces
df["Name"] = df["Name"].str.replace("@", "a")     # Replace characters
df["Email"] = df["Name"].str.lower() + "@msbc.com"  # Create email column


In [15]:
df

Unnamed: 0,Name,Age,Marks,City,Phone,Grades,Email
0,RaVi,27.0,72.0,BLR,Not Provided,C,ravi@msbc.com
1,Riya,24.0,72.0,BLR,Not Provided,C,riya@msbc.com
2,Anita,26.0,72.0,PN,Not Provided,C,anita@msbc.com
3,AkshaT,22.0,79.0,BLR,Not Provided,B,akshat@msbc.com
4,Priya,23.0,72.0,DL,9674956245.0,C,priya@msbc.com
5,VikaS,26.0,72.0,MH,Not Provided,C,vikas@msbc.com
6,KaRaN,26.0,72.0,DL,9305858927.0,C,karan@msbc.com
8,VikaS,28.0,72.0,KL,Not Provided,C,vikas@msbc.com
9,Riya,35.0,75.0,KL,Not Provided,B,riya@msbc.com
10,Riya,24.0,66.0,MH,Not Provided,C,riya@msbc.com


In [16]:
pd.DataFrame.to_csv(df, 'data_cleaned.csv', index=False)