### Sythentic Data Creation

In [2]:
import pandas as pd 
import numpy as np # to use random

In [3]:
# Create a Simple, Clean Synthetic Dataset
data = {
    "Student_ID": [101,102, 103, 104, 105],
    "Name": ["Alice", "Bob", "Charlie", "David", "Eva"],
    "Age": [20, 21, 19, 22, 20],
    "Marks": [85, 90, 78, 88, 92],
    "Department": ["AIML", "AIML", "AIML", "CSE", "CSE"]
}
df = pd.DataFrame(data)
df

Unnamed: 0,Student_ID,Name,Age,Marks,Department
0,101,Alice,20,85,AIML
1,102,Bob,21,90,AIML
2,103,Charlie,19,78,AIML
3,104,David,22,88,CSE
4,105,Eva,20,92,CSE


In [4]:
# Create a Simple, Clean Synthetic Dataset
data = {
    "Student_ID": [101,102, 103, 104,None, 104],
    "Name": ["Alice", "Bob", "Charlie", None, "David", "Eva"],
    "Age": [20,None, 21, None, 22, "20"],
    "Marks": [85, 90, None, 88, 92,1089],
    "Department": ["AIML", "AIML", None, "CSE", "CSE",42]
}
df = pd.DataFrame(data)
df

Unnamed: 0,Student_ID,Name,Age,Marks,Department
0,101.0,Alice,20.0,85.0,AIML
1,102.0,Bob,,90.0,AIML
2,103.0,Charlie,21.0,,
3,104.0,,,88.0,CSE
4,,David,22.0,92.0,CSE
5,104.0,Eva,20.0,1089.0,42


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Student_ID  5 non-null      float64
 1   Name        5 non-null      object 
 2   Age         4 non-null      object 
 3   Marks       5 non-null      float64
 4   Department  5 non-null      object 
dtypes: float64(2), object(3)
memory usage: 372.0+ bytes


In [6]:
#np.random.seed(42)  # For reproducibility
#n = 10
print(np.random.randint(1, 100, 10))

[65 21 92  2 50  2 34  1  8 47]


In [7]:
np.random.seed(41)
n=1000

df=pd.DataFrame( {
    "Student_ID": np.arange(101,101+n),
    "Age": np.random.randint(18,25,size=n),
    "Marks": np.random.randint(40,100, size=n),
    "Department": np.random.choice(["AIML", "ECE", "ME", "CSE", "AUTOMATION"],size= n)
})


In [8]:
df

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,18,72,AUTOMATION
1,102,21,83,AUTOMATION
2,103,22,67,ME
3,104,20,96,AUTOMATION
4,105,18,58,ECE
...,...,...,...,...
995,1096,24,54,ECE
996,1097,19,92,AIML
997,1098,23,98,ECE
998,1099,21,89,ME


In [9]:
df.head()

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,18,72,AUTOMATION
1,102,21,83,AUTOMATION
2,103,22,67,ME
3,104,20,96,AUTOMATION
4,105,18,58,ECE


In [10]:
# Introduce Missing Values in Age, Marks and Department

for col in ["Age", "Marks", "Department"]:
    df.loc[df.sample(frac=0.05).index, col] = np.nan


# Duplicate records/rows

pd.concat([df, df.sample(20)], ignore_index=True)


#Introduce invalid Ages

df.loc[df.sample(n=10).index, "Age"] = np.random.choice([-5,240], size=10)


#Introduce Outliers 

df.loc[df.sample(n=10).index, "Marks"] = np.random.choice([150,240], size=10)


# Incosistent Text Formatting

df["Department"]= df["Department"].astype(str)
df.loc[df.sample(frac=0.1).index, "Department"] = df["Department"].str.lower()



In [11]:
df.head()

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,18.0,72.0,AUTOMATION
1,102,21.0,83.0,AUTOMATION
2,103,22.0,67.0,ME
3,104,20.0,96.0,AUTOMATION
4,105,18.0,58.0,ECE


In [12]:
df.tail()

Unnamed: 0,Student_ID,Age,Marks,Department
995,1096,24.0,54.0,ECE
996,1097,19.0,92.0,AIML
997,1098,23.0,98.0,ECE
998,1099,21.0,89.0,me
999,1100,20.0,45.0,CSE


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Student_ID  1000 non-null   int64  
 1   Age         950 non-null    float64
 2   Marks       950 non-null    float64
 3   Department  1000 non-null   object 
dtypes: float64(2), int64(1), object(1)
memory usage: 31.4+ KB


In [14]:
df.isnull().sum()

Student_ID     0
Age           50
Marks         50
Department     0
dtype: int64

# DATA EXPLORATION

In [40]:
import pandas as pd

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)



In [None]:
df

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,18.0,72.0,AUTOMATION
1,102,21.0,83.0,AUTOMATION
2,103,22.0,67.0,ME
3,104,20.0,96.0,AUTOMATION
4,105,18.0,58.0,ECE
5,106,19.0,99.0,
6,107,21.0,,ME
7,108,19.0,79.0,ECE
8,109,23.0,46.0,ECE
9,110,20.0,48.0,ME


In [None]:
pd.reset_option('display.max_rows')
pd.reset_option('display.max_columns')

In [41]:
df

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,18.0,72.0,AUTOMATION
1,102,21.0,83.0,AUTOMATION
2,103,22.0,67.0,ME
3,104,20.0,96.0,AUTOMATION
4,105,18.0,58.0,ECE
5,106,19.0,99.0,
6,107,21.0,,ME
7,108,19.0,79.0,ECE
8,109,23.0,46.0,ECE
9,110,20.0,48.0,ME


In [49]:
pd.reset_option('display.max_rows')
pd.reset_option('display.max_columns')

In [50]:
df

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,18.0,72.0,AUTOMATION
1,102,21.0,83.0,AUTOMATION
2,103,22.0,67.0,ME
3,104,20.0,96.0,AUTOMATION
4,105,18.0,58.0,ECE
...,...,...,...,...
995,1096,24.0,54.0,ECE
996,1097,19.0,92.0,AIML
997,1098,23.0,98.0,ECE
998,1099,21.0,89.0,me


In [48]:
df.shape

(1000, 4)

In [19]:
df.columns

Index(['Student_ID', 'Age', 'Marks', 'Department'], dtype='object')

In [20]:
df.sample(10)

Unnamed: 0,Student_ID,Age,Marks,Department
683,784,21.0,83.0,CSE
935,1036,18.0,61.0,AUTOMATION
910,1011,23.0,62.0,ECE
694,795,22.0,66.0,ME
203,304,23.0,66.0,aiml
152,253,24.0,52.0,CSE
260,361,19.0,95.0,ECE
579,680,22.0,88.0,AUTOMATION
213,314,20.0,82.0,CSE
177,278,22.0,57.0,CSE


In [21]:
df.isnull().mean() * 100

Student_ID    0.0
Age           5.0
Marks         5.0
Department    0.0
dtype: float64

In [22]:
df[df.isnull().any(axis=1)].count()

Student_ID    99
Age           49
Marks         49
Department    99
dtype: int64

In [23]:
df.duplicated().sum()

np.int64(0)

In [24]:
# Show all rows that have duplicates (including the original occurrences)
df[df.duplicated(keep=False)]

Unnamed: 0,Student_ID,Age,Marks,Department


In [25]:
df.duplicated(subset=["Student_ID"]).sum()  

np.int64(0)

In [51]:
df.describe()

Unnamed: 0,Student_ID,Age,Marks
count,1000.0,950.0,950.0
mean,600.5,22.436842,70.752632
std,288.819436,18.914808,21.984736
min,101.0,-5.0,40.0
25%,350.75,19.0,55.0
50%,600.5,21.0,70.0
75%,850.25,23.0,86.0
max,1100.0,240.0,240.0


In [58]:
# Displauy roes from 500-600

df.iloc[500:600].to_string()

'     Student_ID   Age  Marks  Department\n500         601  22.0   76.0         CSE\n501         602  19.0   96.0  AUTOMATION\n502         603  23.0   92.0        AIML\n503         604  19.0   75.0          ME\n504         605  23.0   64.0  AUTOMATION\n505         606  21.0   78.0          ME\n506         607  19.0   83.0        AIML\n507         608  19.0   85.0        AIML\n508         609  24.0   58.0         ECE\n509         610  21.0   95.0         ECE\n510         611  23.0    NaN          ME\n511         612  24.0   74.0        AIML\n512         613  18.0   40.0  AUTOMATION\n513         614  19.0   54.0          ME\n514         615  23.0   41.0         CSE\n515         616  20.0   58.0        aiml\n516         617  23.0   65.0        AIML\n517         618  24.0   42.0  AUTOMATION\n518         619  23.0   75.0         ECE\n519         620  18.0   43.0          ME\n520         621  23.0   54.0         CSE\n521         622  23.0   86.0         CSE\n522         623  24.0   88.0     

In [59]:
df.to_csv("students.csv", index=False)