In [1]:
import pandas as pd

In [8]:
# Create a Simple, Clean Synthetic Dataset
data = {
    "Student_ID": [101,102, 103, 104, 105],
    "Name": ["Alice", "Bob", "Charlie", "David", "Eva"],
    "Age": [20, 21, 19, 22, 20],
    "Marks": [85, 90, 78, 88, 92],
    "Department": ["AIML", "AIML", "AIML", "CSE", "CSE"]
}
df = pd.DataFrame(data)
df

Unnamed: 0,Student_ID,Name,Age,Marks,Department
0,101,Alice,20,85,AIML
1,102,Bob,21,90,AIML
2,103,Charlie,19,78,AIML
3,104,David,22,88,CSE
4,105,Eva,20,92,CSE


In [10]:
# Create a Simple, Clean Synthetic Dataset
data = {
    "Student_ID": [101,102, 103, None, 105],
    "Name": ["Alice", "Bob", "None", "David", "Eva"],
    "Age": [20, None, 19, 22, 20],
    "Marks": [85, None, 78, 88, 92],
    "Department": ["AIML", "AIML", "AIML", "CSE", "None"]
}
df = pd.DataFrame(data)
df

Unnamed: 0,Student_ID,Name,Age,Marks,Department
0,101.0,Alice,20.0,85.0,AIML
1,102.0,Bob,,,AIML
2,103.0,,19.0,78.0,AIML
3,,David,22.0,88.0,CSE
4,105.0,Eva,20.0,92.0,


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Student_ID  4 non-null      float64
 1   Name        5 non-null      object 
 2   Age         4 non-null      float64
 3   Marks       4 non-null      float64
 4   Department  5 non-null      object 
dtypes: float64(3), object(2)
memory usage: 332.0+ bytes


In [15]:
import numpy as np
np.random.seed(42)
#n=10
np.random.randint(1,100,10)

array([52, 93, 15, 72, 61, 21, 83, 87, 75, 75])

In [19]:
# Create a Simple, Clean Synthetic Dataset
np.random.seed(42)
n=1000

df=pd.DataFrame({
    "Student_ID": np.arange(101,101+n),
    "Age": np.random.randint(18,25,size=n),
    "Marks": np.random.randint(50,100,size=n),
    "Department": np.random.choice(["AIML", "CSE", "ECE", "MECH","AUTOMATION"],size=n)
})


In [20]:
df

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,24,91,AUTOMATION
1,102,21,66,AIML
2,103,22,54,AUTOMATION
3,104,24,78,AUTOMATION
4,105,20,53,ECE
...,...,...,...,...
995,1096,23,83,CSE
996,1097,18,78,CSE
997,1098,24,53,AIML
998,1099,21,66,AUTOMATION


In [21]:
df.head()

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,24,91,AUTOMATION
1,102,21,66,AIML
2,103,22,54,AUTOMATION
3,104,24,78,AUTOMATION
4,105,20,53,ECE


In [None]:
#Introduce Missing Values in Age, Marks, Department
for col in ["Age","Marks","Department"]:
    df.loc[df.sample(frac=0.5).index, col] = np.nan

## Duplicate records/rows
df = pd.concat([df, df.sample(20)], ignore_index=True)

#Introduce Invalid ages
df.loc[df.sample(10).index, "Age"] = np.random.choice([-5, -10, -15], size=10)

#Introduce outliers
df.loc[df.sample(10).index, "Marks"] = np.random.choice([0, 150, 200], size=10)

#Inconsistent Text Formatting
df["Department"] = df["Department"].astype(str)
# lowercase only the sampled rows (use 'frac' not 'fract')
dept_idx = df.sample(frac=0.1).index
df.loc[dept_idx, "Department"] = df.loc[dept_idx, "Department"].str.lower()




Unnamed: 0,Student_ID,Age,Marks,Department
0,101,,,automation
1,102,,66.0,
2,103,,200.0,
3,104,,,
4,105,,,


In [24]:
df.head()

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,,,automation
1,102,,66.0,
2,103,,200.0,
3,104,,,
4,105,,,


In [25]:
df.tail()

Unnamed: 0,Student_ID,Age,Marks,Department
1035,382,,56.0,
1036,913,,,
1037,421,,,
1038,874,21.0,,
1039,1055,-15.0,,


In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1040 entries, 0 to 1039
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Student_ID  1040 non-null   int32  
 1   Age         273 non-null    float64
 2   Marks       256 non-null    float64
 3   Department  1040 non-null   object 
dtypes: float64(2), int32(1), object(1)
memory usage: 28.6+ KB


In [27]:
df.isnull().sum()   

Student_ID      0
Age           767
Marks         784
Department      0
dtype: int64

In [28]:
df.shape

(1040, 4)

In [29]:
df.columns

Index(['Student_ID', 'Age', 'Marks', 'Department'], dtype='object')

In [30]:
df.sample(10)

Unnamed: 0,Student_ID,Age,Marks,Department
118,219,,,
794,895,,,
902,1003,19.0,,
375,476,20.0,70.0,
824,925,,69.0,
850,951,,52.0,ece
568,669,18.0,55.0,
708,809,21.0,79.0,
427,528,-10.0,89.0,ECE
469,570,,,


In [31]:
df.isnull().mean()*100

Student_ID     0.000000
Age           73.750000
Marks         75.384615
Department     0.000000
dtype: float64

In [33]:
df[df.isnull().any(axis=1)].count()

Student_ID    967
Age           200
Marks         183
Department    967
dtype: int64

In [35]:
df.duplicated().sum()    

25

In [36]:
df[df.duplicated(keep=False)]

Unnamed: 0,Student_ID,Age,Marks,Department
4,105,,,
153,254,,,
208,309,,,
281,382,,56.0,
294,395,,,
320,421,,,
365,466,22.0,,
401,502,,,
426,527,18.0,,
589,690,,89.0,


In [37]:
df.duplicated().sum()
df

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,,,automation
1,102,,66.0,
2,103,,200.0,
3,104,,,
4,105,,,
...,...,...,...,...
1035,382,,56.0,
1036,913,,,
1037,421,,,
1038,874,21.0,,


In [1]:

import pandas as pd 

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)



In [4]:
data = {
    "Student_ID": [101,102, 103, None, 105],
    "Name": ["Alice", "Bob", "None", "David", "Eva"],
    "Age": [20, None, 19, 22, 20],
    "Marks": [85, None, 78, 88, 92],
    "Department": ["AIML", "AIML", "AIML", "CSE", "None"]
}
df = pd.DataFrame(data)
df


pd.reset_option('display.max_rows')
pd.reset_option('display.max_columns')

df= pd.DataFrame(data)
df 



Unnamed: 0,Student_ID,Name,Age,Marks,Department
0,101.0,Alice,20.0,85.0,AIML
1,102.0,Bob,,,AIML
2,103.0,,19.0,78.0,AIML
3,,David,22.0,88.0,CSE
4,105.0,Eva,20.0,92.0,


In [7]:
import pandas as pd
import numpy as np
np.random.seed(41)
n=1000

df=pd.DataFrame ({

    "student_id": np.arange(101,101+n),
    
    "age":np.random.randint(18,24,size=n),
    "marks":np.random.randint(40,100,size=n),
    "department":np.random.choice(["AIML","AR","CSE","ME","automation"],size=n)


})

df=pd.DataFrame(df)
df

Unnamed: 0,student_id,age,marks,department
0,101,18,92,AIML
1,102,21,56,CSE
2,103,22,65,CSE
3,104,20,46,AIML
4,105,18,80,AIML
...,...,...,...,...
995,1096,21,48,AIML
996,1097,21,76,CSE
997,1098,20,42,AIML
998,1099,21,62,AR


In [8]:
#Display rows 500-600
df.iloc[500:601]

Unnamed: 0,student_id,age,marks,department
500,601,21,40,AIML
501,602,19,72,ME
502,603,20,90,automation
503,604,20,64,CSE
504,605,21,67,AIML
...,...,...,...,...
596,697,23,71,ME
597,698,19,81,AIML
598,699,21,90,AIML
599,700,22,59,AIML


In [10]:
df.to_csv("synthetic_student_data.csv",index=False)


In [11]:
df.to_excel("synthetic_student_data.xlsx",index=False)