In [1]:
import pandas as pd #
#create a simple, clean synthetic dataset
data = {
    "student_ID": [101,102,103,104,105],
    "name": ["Alice", "Bob", "Charlie", "David", "Eva"],
    "age": [20, 21, 19, 22, 20],
    "grade": ["A", "B", "A", "C", "B"],
    "Department": ["Physics", "Chemistry", "Mathematics", "Biology", "Computer Science"]
}
df = pd.DataFrame(data)
df


Unnamed: 0,student_ID,name,age,grade,Department
0,101,Alice,20,A,Physics
1,102,Bob,21,B,Chemistry
2,103,Charlie,19,A,Mathematics
3,104,David,22,C,Biology
4,105,Eva,20,B,Computer Science


In [52]:
#create a synthetic dataset, with Noise and missing values
data = {
    "student_ID": [101,102,103,104,None],
    "name": ["Alice", "Bob", "Charlie", "David", "Eva", None],
    "age": [20, 21, 19, None, 22, None, 20],
    "grade": ["A", "B", "A","A", "C", "B",1009, None],
    "Department": ["Physics", "Chemistry", "Mathematics", "Biology", 42, "Computer Science", None]
}
df = pd.DataFrame(data)
df

ValueError: All arrays must be of the same length

In [53]:
#create a synthetic dataset, with Noise and missing values
data = {
    "student_ID": [101,102,103,104,None,106, 108],
    "name": ["Alice", "Bob", "Charlie", "David", "Eva", None, "John"],
    "age": [20, 21, 19, None, 22, None, 20],
    "grade": ["A", "B", "A","A", "B",1009, None],
    "Department": ["Physics", "Chemistry", "Mathematics", "Biology", 42, "Computer Science", None]
}
df = pd.DataFrame(data)
df

Unnamed: 0,student_ID,name,age,grade,Department
0,101.0,Alice,20.0,A,Physics
1,102.0,Bob,21.0,B,Chemistry
2,103.0,Charlie,19.0,A,Mathematics
3,104.0,David,,A,Biology
4,,Eva,22.0,B,42
5,106.0,,,1009,Computer Science
6,108.0,John,20.0,,


In [54]:
import pandas as pd #
import numpy as np # random number generation


In [55]:
np. random. seed (42)
n = 10
np.random.randint(1, 100, 10 )
print (np. random. randint (1, 100, n))



[88 24  3 22 53  2 88 30 38  2]


In [56]:
#np. random. seed (42)
#n = 10
np.random.randint(1, 100, 10 )
print (np. random. randint (1, 100, n))

[59 42 92 60 80 15 62 62 47 62]


In [57]:
np.random.seed(41)
n = 1000
df = pd.DataFrame ({
    'student_ID': np.arange(101, 101+n),
    'age': np.random.randint(18, 30,size= n),
    'marks': np.random.randint(40, 100, size=n),
    'grade': np.random.choice(['A', 'B', 'C', 'D'], size=n),
    'Department': np.random.choice(["Physics", "Chemistry", "Mathematics", "Biology", "Computer Science"], size=n)

})
df=pd.DataFrame(data)
df.head()

Unnamed: 0,student_ID,name,age,grade,Department
0,101.0,Alice,20.0,A,Physics
1,102.0,Bob,21.0,B,Chemistry
2,103.0,Charlie,19.0,A,Mathematics
3,104.0,David,,A,Biology
4,,Eva,22.0,B,42


In [58]:
#introduce missing values in age and marks and department  columns
for col in ['age', 'marks', 'Department']:
    df.loc[df.sample(frac=0.05).index, col] = np.nan
df

Unnamed: 0,student_ID,name,age,grade,Department,marks
0,101.0,Alice,20.0,A,Physics,
1,102.0,Bob,21.0,B,Chemistry,
2,103.0,Charlie,19.0,A,Mathematics,
3,104.0,David,,A,Biology,
4,,Eva,22.0,B,42,
5,106.0,,,1009,Computer Science,
6,108.0,John,20.0,,,


In [59]:
#duplicate records/rows
pd.concat([df, df.sample(10)], ignore_index=True)

ValueError: Cannot take a larger sample than population when 'replace=False'

In [None]:
#introduce invalid ages
df.loc[df.sample(10).index,"age"] = np.random.choice([-5, 150, 240], size=10)

ValueError: Cannot take a larger sample than population when 'replace=False'

In [None]:
#introduce OUTLIERS 
df.loc[df.sample(10).index,"marks"] = np.random.choice([-5, 150, 240], size=10)

ValueError: Cannot take a larger sample than population when 'replace=False'

In [40]:
#inconsistent text formatting
df["Department"] = df["Department"].astype(str)
df.loc[df.sample(fract=0.1).index, "Department"] = df["Department"].str.lower()

TypeError: sample() got an unexpected keyword argument 'fract'

In [39]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   student_ID  6 non-null      float64
 1   name        6 non-null      object 
 2   age         5 non-null      float64
 3   grade       6 non-null      object 
 4   Department  6 non-null      object 
 5   marks       0 non-null      float64
dtypes: float64(3), object(3)
memory usage: 464.0+ bytes


In [41]:

df.isnull().sum()

student_ID    1
name          1
age           2
grade         1
Department    0
marks         7
dtype: int64

In [42]:
df.shape

(7, 6)

In [43]:
df.columns

Index(['student_ID', 'name', 'age', 'grade', 'Department', 'marks'], dtype='object')

In [44]:
df.sample(10)

ValueError: Cannot take a larger sample than population when 'replace=False'

In [45]:
df.isnull().mean()*100

student_ID     14.285714
name           14.285714
age            28.571429
grade          14.285714
Department      0.000000
marks         100.000000
dtype: float64

In [46]:
df[df.isnull().any(axis=1)]

Unnamed: 0,student_ID,name,age,grade,Department,marks
0,101.0,Alice,20.0,A,Physics,
1,102.0,Bob,21.0,B,Chemistry,
2,103.0,Charlie,19.0,A,Mathematics,
3,104.0,David,,A,Biology,
4,,Eva,22.0,B,42,
5,106.0,,,1009,Computer Science,
6,108.0,John,20.0,,,


In [47]:
df[df.isnull().any(axis=1)].count()

student_ID    6
name          6
age           5
grade         6
Department    7
marks         0
dtype: int64

In [48]:
df.duplicated().sum()

0