### Synthetic Data Creation

In [None]:
import pandas as pd #
import numpy as np  # Random() function used later

In [9]:
# Create a Simple, Clean Synthetic Dataset
data = {
    "Student_ID": [101, 102, 103, 104, 105],
    "Name": ["Alice", "Bob", "Charlie", "David", "Eva"],
    "Age": [20, 21, 19, 22, 20],
    "Marks": [85, 90, 78, 88, 92],
    "Department": ["AIML", "AIML", "AIML", "CSE", "CSE"]
}
df=pd.DataFrame(data)
df

Unnamed: 0,Student_ID,Name,Age,Marks,Department
0,101,Alice,20,85,AIML
1,102,Bob,21,90,AIML
2,103,Charlie,19,78,AIML
3,104,David,22,88,CSE
4,105,Eva,20,92,CSE


In [11]:
# Create a Synthetic Dataset with Noise/Which is not clean
data = {
    "Student_ID": [101, 102, 103, 104, None,104],
    "Name": ["Alice", "Bob", None, "David", "Eva","Sanjay"],
    "Age": [20, None, 19, None, 20,"25"],
    "Marks": [85, 90, 78, None, 92,1009],
    "Department": ["AIML", "AIML", "AIML", None, "CSE", "CSE"]
}
df=pd.DataFrame(data)
df

Unnamed: 0,Student_ID,Name,Age,Marks,Department
0,101.0,Alice,20.0,85.0,AIML
1,102.0,Bob,,90.0,AIML
2,103.0,,19.0,78.0,AIML
3,104.0,David,,,
4,,Eva,20.0,92.0,CSE
5,104.0,Sanjay,25.0,1009.0,CSE


In [14]:
np.random.seed(42)  # For reproducibility
n=10
np.random.randint(1, 101, size=n)  # Random integers between 1 and 100

array([52, 93, 15, 72, 61, 21, 83, 87, 75, 75])

In [15]:
np.random.seed(41)
n=1000
df=pd.DataFrame({
    "Student_ID": np.arange(101, 101+n),
    "Age": np.random.randint(18, 24,size=n),
    "Marks": np.random.randint(50, 100,size=n),
    "Department": np.random.choice(["AIML", "ECE", "ME", "CSE", "AR"], size=n)
})

In [18]:
df

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,18,66,CSE
1,102,21,75,CSE
2,103,22,56,ME
3,104,20,90,ME
4,105,18,82,ME
...,...,...,...,...
995,1096,21,88,AR
996,1097,21,80,ECE
997,1098,20,51,CSE
998,1099,21,53,AR


In [20]:
df.head() #show first 5 rows

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,18,66,CSE
1,102,21,75,CSE
2,103,22,56,ME
3,104,20,90,ME
4,105,18,82,ME


In [22]:
# Introduce Missing Values in Age, Marks, Department
for col in["Age", "Marks", "Department"]:
    df.loc[df.sample(frac=0.05).index, col] = np.nan  # Introduce 5% missing values
df  


Unnamed: 0,Student_ID,Age,Marks,Department
0,101,18.0,,CSE
1,102,21.0,75.0,CSE
2,103,22.0,56.0,ME
3,104,20.0,90.0,ME
4,105,18.0,82.0,ME
...,...,...,...,...
995,1096,21.0,88.0,AR
996,1097,,80.0,ECE
997,1098,,51.0,CSE
998,1099,21.0,53.0,AR


In [26]:
#Introdece Duplicate records/Rows
pd.concat([df, df.sample(20)], ignore_index=True) # Duplicate 20 random rows

#Introduce invalid ages
df.loc[df.sample(10).index, "Age"] = np.random.choice([-5, 240], size=10)  
df



Unnamed: 0,Student_ID,Age,Marks,Department
0,101,18.0,,CSE
1,102,21.0,75.0,CSE
2,103,22.0,56.0,ME
3,104,20.0,90.0,ME
4,105,18.0,82.0,ME
...,...,...,...,...
995,1096,21.0,88.0,AR
996,1097,,80.0,ECE
997,1098,,51.0,CSE
998,1099,21.0,53.0,AR


In [25]:
#Introduce Outliers
df.loc[df.sample(10).index, "Marks"] = np.random.choice([150, 300], size=10)
df

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,18.0,,CSE
1,102,21.0,75.0,CSE
2,103,22.0,56.0,ME
3,104,20.0,90.0,ME
4,105,18.0,82.0,ME
...,...,...,...,...
995,1096,21.0,88.0,AR
996,1097,,80.0,ECE
997,1098,,51.0,CSE
998,1099,21.0,53.0,AR


In [None]:
# Inconsistent Text Formatting
df["Department"] = df["Department"].astype(str)
df.loc[df.sample(frac=0.1).index, "Department"] = df["Department"].sample(10).str.lower()
df
df.head() #show first 5 rows
df.tail() #show last 5 rows

Unnamed: 0,Student_ID,Age,Marks,Department
995,1096,21.0,88.0,AR
996,1097,,80.0,ECE
997,1098,,51.0,CSE
998,1099,21.0,53.0,AR
999,1100,,90.0,AR


In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Student_ID  1000 non-null   int32  
 1   Age         907 non-null    float64
 2   Marks       904 non-null    float64
 3   Department  900 non-null    object 
dtypes: float64(2), int32(1), object(1)
memory usage: 27.5+ KB


In [29]:
#column wise null value
df.isnull().sum()

Student_ID      0
Age            93
Marks          96
Department    100
dtype: int64

In [31]:
df.shape

(1000, 4)

In [32]:
df.columns

Index(['Student_ID', 'Age', 'Marks', 'Department'], dtype='object')