### Synthetic Data Creation

In [None]:
import pandas as pd #
import numpy as np  # Random() function used later

In [3]:
# Create a Simple, Clean Synthetic Dataset
data = {
    "Student_ID": [101, 102, 103, 104, 105],
    "Name": ["Alice", "Bob", "Charlie", "David", "Eva"],
    "Age": [20, 21, 19, 22, 20],
    "Marks": [85, 90, 78, 88, 92],
    "Department": ["AIML", "AIML", "AIML", "CSE", "CSE"]
}
df=pd.DataFrame(data)
df

Unnamed: 0,Student_ID,Name,Age,Marks,Department
0,101,Alice,20,85,AIML
1,102,Bob,21,90,AIML
2,103,Charlie,19,78,AIML
3,104,David,22,88,CSE
4,105,Eva,20,92,CSE


In [4]:
# Create a Synthetic Dataset with Noise/Which is not clean
data = {
    "Student_ID": [101, 102, 103, 104, None,104],
    "Name": ["Alice", "Bob", None, "David", "Eva","Sanjay"],
    "Age": [20, None, 19, None, 20,"25"],
    "Marks": [85, 90, 78, None, 92,1009],
    "Department": ["AIML", "AIML", "AIML", None, "CSE", "CSE"]
}
df=pd.DataFrame(data)
df

Unnamed: 0,Student_ID,Name,Age,Marks,Department
0,101.0,Alice,20.0,85.0,AIML
1,102.0,Bob,,90.0,AIML
2,103.0,,19.0,78.0,AIML
3,104.0,David,,,
4,,Eva,20.0,92.0,CSE
5,104.0,Sanjay,25.0,1009.0,CSE


In [5]:
np.random.seed(42)  # For reproducibility
n=10
np.random.randint(1, 101, size=n)  # Random integers between 1 and 100

array([52, 93, 15, 72, 61, 21, 83, 87, 75, 75])

In [6]:
np.random.seed(41)
n=1000
df=pd.DataFrame({
    "Student_ID": np.arange(101, 101+n),
    "Age": np.random.randint(18, 24,size=n),
    "Marks": np.random.randint(50, 100,size=n),
    "Department": np.random.choice(["AIML", "ECE", "ME", "CSE", "AR"], size=n)
})

In [7]:
df

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,18,66,CSE
1,102,21,75,CSE
2,103,22,56,ME
3,104,20,90,ME
4,105,18,82,ME
...,...,...,...,...
995,1096,21,88,AR
996,1097,21,80,ECE
997,1098,20,51,CSE
998,1099,21,53,AR


In [8]:
df.head() #show first 5 rows

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,18,66,CSE
1,102,21,75,CSE
2,103,22,56,ME
3,104,20,90,ME
4,105,18,82,ME


In [9]:
# Introduce Missing Values in Age, Marks, Department
for col in["Age", "Marks", "Department"]:
    df.loc[df.sample(frac=0.05).index, col] = np.nan  # Introduce 5% missing values
df  


Unnamed: 0,Student_ID,Age,Marks,Department
0,101,18.0,66.0,CSE
1,102,21.0,75.0,CSE
2,103,22.0,56.0,ME
3,104,20.0,90.0,ME
4,105,18.0,82.0,ME
...,...,...,...,...
995,1096,21.0,88.0,AR
996,1097,21.0,80.0,ECE
997,1098,,51.0,CSE
998,1099,21.0,53.0,AR


In [10]:
#Introdece Duplicate records/Rows
pd.concat([df, df.sample(20)], ignore_index=True) # Duplicate 20 random rows

#Introduce invalid ages
df.loc[df.sample(10).index, "Age"] = np.random.choice([-5, 240], size=10)  
df



Unnamed: 0,Student_ID,Age,Marks,Department
0,101,18.0,66.0,CSE
1,102,21.0,75.0,CSE
2,103,22.0,56.0,ME
3,104,20.0,90.0,ME
4,105,18.0,82.0,ME
...,...,...,...,...
995,1096,21.0,88.0,AR
996,1097,21.0,80.0,ECE
997,1098,,51.0,CSE
998,1099,21.0,53.0,AR


In [11]:
#Introduce Outliers
df.loc[df.sample(10).index, "Marks"] = np.random.choice([150, 300], size=10)
df

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,18.0,66.0,CSE
1,102,21.0,75.0,CSE
2,103,22.0,56.0,ME
3,104,20.0,90.0,ME
4,105,18.0,82.0,ME
...,...,...,...,...
995,1096,21.0,88.0,AR
996,1097,21.0,80.0,ECE
997,1098,,51.0,CSE
998,1099,21.0,53.0,AR


In [12]:
# Inconsistent Text Formatting
df["Department"] = df["Department"].astype(str)
df.loc[df.sample(frac=0.1).index, "Department"] = df["Department"].sample(10).str.lower()
df
df.head() #show first 5 rows
df.tail() #show last 5 rows

Unnamed: 0,Student_ID,Age,Marks,Department
995,1096,21.0,88.0,AR
996,1097,21.0,80.0,ECE
997,1098,,51.0,CSE
998,1099,21.0,53.0,AR
999,1100,23.0,90.0,AR


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Student_ID  1000 non-null   int32  
 1   Age         950 non-null    float64
 2   Marks       950 non-null    float64
 3   Department  900 non-null    object 
dtypes: float64(2), int32(1), object(1)
memory usage: 27.5+ KB


In [14]:
#column wise null value
df.isnull().sum()

Student_ID      0
Age            50
Marks          50
Department    100
dtype: int64

In [15]:
df.shape

(1000, 4)

In [16]:
df.columns

Index(['Student_ID', 'Age', 'Marks', 'Department'], dtype='object')

In [17]:
df.columns

Index(['Student_ID', 'Age', 'Marks', 'Department'], dtype='object')

In [18]:
df.sample(10)

Unnamed: 0,Student_ID,Age,Marks,Department
990,1091,21.0,86.0,ECE
288,389,23.0,64.0,CSE
100,201,19.0,55.0,AIML
244,345,19.0,87.0,ME
194,295,19.0,92.0,ECE
641,742,20.0,98.0,ME
904,1005,21.0,53.0,ECE
500,601,21.0,97.0,
481,582,,75.0,AIML
759,860,,82.0,ECE


In [19]:
df.isnull().sum()

Student_ID      0
Age            50
Marks          50
Department    100
dtype: int64

In [20]:
df.isnull().mean()*100  # Percentage of missing values per column

Student_ID     0.0
Age            5.0
Marks          5.0
Department    10.0
dtype: float64

In [21]:
df[df.isnull().any(axis=1)]

Unnamed: 0,Student_ID,Age,Marks,Department
13,114,21.0,,AIML
14,115,21.0,59.0,
15,116,20.0,67.0,
16,117,,78.0,AR
38,139,20.0,55.0,
...,...,...,...,...
972,1073,,74.0,CSE
974,1075,19.0,72.0,
981,1082,20.0,,AR
983,1084,23.0,,ECE


In [22]:
df[df.isnull().any(axis=1)].count()

Student_ID    191
Age           141
Marks         141
Department     91
dtype: int64

In [23]:
df.duplicated().sum()  # Count of duplicate rows

0

In [24]:
df.duplicated()

0      False
1      False
2      False
3      False
4      False
       ...  
995    False
996    False
997    False
998    False
999    False
Length: 1000, dtype: bool

In [25]:
df.duplicated().sum()  # Count of duplicate rows

0

In [26]:
df[df.duplicated(keep=False)]  # Show all duplicate rows

Unnamed: 0,Student_ID,Age,Marks,Department


In [27]:
df.duplicated(subset=["Student_ID"]).sum() # Duplicates based on specific columns

0

In [28]:
df.describe()

Unnamed: 0,Student_ID,Age,Marks
count,1000.0,950.0,950.0
mean,600.5,20.986316,76.233684
std,288.819436,12.639144,23.467533
min,101.0,-5.0,50.0
25%,350.75,19.0,62.0
50%,600.5,20.0,75.0
75%,850.25,22.0,87.0
max,1100.0,240.0,300.0


In [29]:
pd.set_option('display.max_rows', None)  # Show all rows
pd.set_option('display.max_columns', None)  # Show all columns   

In [30]:
df.head()

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,18.0,66.0,CSE
1,102,21.0,75.0,CSE
2,103,22.0,56.0,ME
3,104,20.0,90.0,ME
4,105,18.0,82.0,ME


In [33]:
pd.reset_option('display.max_rows')
pd.reset_option('display.max_columns')

In [35]:
df

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,18.0,66.0,CSE
1,102,21.0,75.0,CSE
2,103,22.0,56.0,ME
3,104,20.0,90.0,ME
4,105,18.0,82.0,ME
...,...,...,...,...
995,1096,21.0,88.0,AR
996,1097,21.0,80.0,ECE
997,1098,,51.0,CSE
998,1099,21.0,53.0,AR


In [39]:
# Display rows 500-600
df.iloc[500:600].to_string()

'     Student_ID   Age  Marks Department\n500         601  21.0   97.0        NaN\n501         602  19.0   57.0        CSE\n502         603  20.0   55.0        ECE\n503         604  20.0   76.0        NaN\n504         605  21.0   72.0         ME\n505         606  22.0   68.0        CSE\n506         607  21.0   90.0         AR\n507         608  18.0   72.0         ME\n508         609  22.0   76.0       AIML\n509         610  19.0   70.0         ME\n510         611  22.0   99.0        CSE\n511         612  21.0   50.0        NaN\n512         613  21.0   92.0        ECE\n513         614  21.0   94.0         AR\n514         615  23.0   63.0        NaN\n515         616  23.0   96.0       AIML\n516         617  20.0   66.0         AR\n517         618  19.0   69.0        ECE\n518         619  22.0   75.0         ME\n519         620  20.0   76.0        CSE\n520         621  20.0   88.0         ME\n521         622  18.0   81.0        nan\n522         623  21.0   65.0       AIML\n523         624

In [43]:
# convert dataframe into csv file
df.to_csv("synthetic_student_data.csv", index=False)

In [45]:
df

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,18.0,66.0,CSE
1,102,21.0,75.0,CSE
2,103,22.0,56.0,ME
3,104,20.0,90.0,ME
4,105,18.0,82.0,ME
...,...,...,...,...
995,1096,21.0,88.0,AR
996,1097,21.0,80.0,ECE
997,1098,,51.0,CSE
998,1099,21.0,53.0,AR


In [47]:
df.to_excel("synthetic_student_data.xlsx", index=False)


In [48]:
df

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,18.0,66.0,CSE
1,102,21.0,75.0,CSE
2,103,22.0,56.0,ME
3,104,20.0,90.0,ME
4,105,18.0,82.0,ME
...,...,...,...,...
995,1096,21.0,88.0,AR
996,1097,21.0,80.0,ECE
997,1098,,51.0,CSE
998,1099,21.0,53.0,AR
