In [7]:
#Pandas is the most popular Python library for data analysis and manipulation. 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [8]:
# A Series is like a NumPy array, but it also has index labels.
# A DataFrame is like an Excel sheet or an SQL table.

In [9]:
# Load the raw records from the CSV file into a DataFrame.
df = pd.read_csv("messy_data.csv")

In [10]:
# Preview the first four rows to get a feel for the columns and values.
df.head(n=4)

Unnamed: 0,Student Name,Age,Course,Course Fee
0,John Doe,20.0,Computer Science,50000
1,Jane Smith,22.0,Data Science,60000
2,Alice,,Artificial Intelligence,65000
3,Bob Brown,23.0,Cyber Security,Five Fifty


In [11]:
# Summarise the DataFrame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19 entries, 0 to 18
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Student Name  19 non-null     object 
 1   Age           18 non-null     float64
 2   Course        19 non-null     object 
 3   Course Fee    18 non-null     object 
dtypes: float64(1), object(3)
memory usage: 740.0+ bytes


In [12]:
# Count the missing (NaN) entries in every column.
df.isna().sum()

Student Name    0
Age             1
Course          0
Course Fee      1
dtype: int64

In [13]:
# Remove exact duplicate rows. The result is reassigned instead of using
# inplace=True, which keeps the step explicit and chain-friendly.
# The surviving rows keep their original index labels, so a gap is left
# wherever a duplicate row was dropped.
df = df.drop_duplicates()

In [14]:
# Treat ages outside the accepted range as missing values.
# Series.between is inclusive on both ends and evaluates to False for NaN,
# so Series.where keeps the valid ages and turns every other entry into NaN.
# This is vectorised: no Python-level function is called per element.
MIN_AGE, MAX_AGE = 10, 30
df["Age"] = df["Age"].where(df["Age"].between(MIN_AGE, MAX_AGE))


In [15]:
# Access the Age column; entries outside the 10-30 range now show up as NaN.
df["Age"]

0     20.0
1     22.0
2      NaN
3     23.0
4      NaN
5     24.0
6     22.0
7     21.0
8     23.0
10     NaN
11    25.0
12    20.0
13    23.0
14    21.0
15    22.0
16    24.0
17    19.0
18    20.0
Name: Age, dtype: float64

In [16]:
# Boolean-mask filtering: keep only the rows whose Age is below 20.
# Rows with a missing Age compare as False and are therefore excluded.
is_under_20 = df["Age"] < 20
df[is_under_20]

Unnamed: 0,Student Name,Age,Course,Course Fee
17,Sophia Clark,19.0,Cloud Computing,58000


In [17]:
# Fill the missing ages with the column mean, then store the column as
# integers. astype(int) truncates, so a fractional mean loses its decimal
# part instead of being rounded to the nearest year.
age_mean = df["Age"].mean()
df["Age"] = df["Age"].fillna(age_mean).astype(int)


In [18]:
# Re-inspect the Age column after imputation and the cast to int.
df["Age"]

0     20
1     22
2     21
3     23
4     21
5     24
6     22
7     21
8     23
10    21
11    25
12    20
13    23
14    21
15    22
16    24
17    19
18    20
Name: Age, dtype: int64

In [19]:
# Mark the known non-numeric placeholders in "Course Fee" as missing.
# The replacement is restricted to that one column so that the same text
# appearing in any other column (for example a "?" in a name) is left as is.
non_numeric_values = ["?", "Forty-Eight", "Five Fifty"]
df["Course Fee"] = df["Course Fee"].replace(non_numeric_values, np.nan)
df["Course Fee"]

0     50000
1     60000
2     65000
3       NaN
4     52000
5     70000
6     58000
7     62000
8     48000
10    73000
11    55000
12      NaN
13    50000
14    65000
15    60000
16      NaN
17    58000
18      NaN
Name: Course Fee, dtype: object

In [20]:
# "Course Fee" may still contain strings such as "Five Fifty", "?", "N/A" or
# "Unknown". df["Course Fee"].mean() would fail on them, because a mean can
# only be calculated on numbers. pd.to_numeric(..., errors="coerce") solves
# this by converting every invalid value to NaN; the gaps are then filled
# with the column mean and the column is stored as integers.
fee_numeric = pd.to_numeric(df["Course Fee"], errors="coerce")
df["Course Fee"] = fee_numeric.fillna(fee_numeric.mean()).astype(int)

In [21]:
# Display the DataFrame after the Age and Course Fee columns were cleaned.
df

Unnamed: 0,Student Name,Age,Course,Course Fee
0,John Doe,20,Computer Science,50000
1,Jane Smith,22,Data Science,60000
2,Alice,21,Artificial Intelligence,65000
3,Bob Brown,23,Cyber Security,59000
4,Charlie White,21,Software Engineering,52000
5,David Black,24,Machine Learning,70000
6,Emma Wilson,22,Cloud Computing,58000
7,Frank Thomas,21,Blockchain Technology,62000
8,Grace Adams,23,Web Development,48000
10,Henry Miller,21,Big Data Analytics,73000


In [22]:
# Access a single row by its index label with .loc (here the label 0).
# The row comes back as a Series whose index is the column names.
df.loc[0]

Student Name            John Doe
Age                           20
Course          Computer Science
Course Fee                 50000
Name: 0, dtype: object

In [23]:
# .loc selects by labels as [row_labels, column_labels]:
# ":" takes every row and "Course Fee" names the column.
df.loc[:, "Course Fee"]

0     50000
1     60000
2     65000
3     59000
4     52000
5     70000
6     58000
7     62000
8     48000
10    73000
11    55000
12    59000
13    50000
14    65000
15    60000
16    59000
17    58000
18    59000
Name: Course Fee, dtype: int64

In [24]:
# .iloc selects by integer position, not by index label:
# row position 2 and column position 3 (the "Course Fee" column).
# Positions and labels are not interchangeable here, because the index has
# a gap after a duplicate row was dropped.
df.iloc[2,3]

np.int64(65000)

In [26]:
# The data is now cleaned; display the final DataFrame.
df

Unnamed: 0,Student Name,Age,Course,Course Fee
0,John Doe,20,Computer Science,50000
1,Jane Smith,22,Data Science,60000
2,Alice,21,Artificial Intelligence,65000
3,Bob Brown,23,Cyber Security,59000
4,Charlie White,21,Software Engineering,52000
5,David Black,24,Machine Learning,70000
6,Emma Wilson,22,Cloud Computing,58000
7,Frank Thomas,21,Blockchain Technology,62000
8,Grace Adams,23,Web Development,48000
10,Henry Miller,21,Big Data Analytics,73000
