### synthetic Data creation

In [2]:
import pandas as pd # for data manipulation
import numpy as np  # for numerical operations(random data generation)

In [3]:
# create a simple, clean synthetic dataset
data = {
    "Student_ID":[101,102,103,104,105],
    "Name": ['Amit', 'Sara', 'John', 'Anita', 'Raj'],
    "Age": [25, 30, 22, 28, 24],
    "Marks":[85, 90, 78, 88, 92],
    "Departrment": ['Physics', 'Chemistry', 'Mathematics', 'Biology', 'Computer Science'],
}
df = pd.DataFrame(data)
df


Unnamed: 0,Student_ID,Name,Age,Marks,Departrment
0,101,Amit,25,85,Physics
1,102,Sara,30,90,Chemistry
2,103,John,22,78,Mathematics
3,104,Anita,28,88,Biology
4,105,Raj,24,92,Computer Science


In [4]:
#create a synthetic data with noiser
import numpy as np
data = {
    "Student_ID":[101,102,103,104,None,104],
    "Name": ['Amit', 'Sara', None, 'Anita', 'Raj', 'Anita'],
    "Age": [25, None, 22, None, 24, '23'],
    "Marks":[85, None, 78, 88, 92, -9],
    "Departrment": ['Physics', 'Chemistry', 'Mathematics', None, 'Computer Science',42],
}
df = pd.DataFrame(data)
df


Unnamed: 0,Student_ID,Name,Age,Marks,Departrment
0,101.0,Amit,25.0,85.0,Physics
1,102.0,Sara,,,Chemistry
2,103.0,,22.0,78.0,Mathematics
3,104.0,Anita,,88.0,
4,,Raj,24.0,92.0,Computer Science
5,104.0,Anita,23.0,-9.0,42


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Student_ID   5 non-null      float64
 1   Name         5 non-null      object 
 2   Age          4 non-null      object 
 3   Marks        5 non-null      float64
 4   Departrment  5 non-null      object 
dtypes: float64(2), object(3)
memory usage: 372.0+ bytes


In [6]:
np.random.seed(42)  # for reproducibility
n=10
print(np.random.randint(1,100,size=n))

[52 93 15 72 61 21 83 87 75 75]


In [7]:
#np.random.seed(42)  # for reproducibility
n=10
print(np.random.randint(1,100,size=n))

[88 24  3 22 53  2 88 30 38  2]


In [8]:
np.random.seed(41)
n=1000
df = pd.DataFrame({    
    "Student_ID":np.arange(101,101+n),
    "Age": np.random.randint(18,30,size=n),
    "Marks":np.random.randint(40,100,size=n),
    "Departrment": np.random.choice(['Physics', 'Chemistry', 'Mathematics', 'Biology', 'Computer Science'], size=n)
})
df 



Unnamed: 0,Student_ID,Age,Marks,Departrment
0,101,18,81,Computer Science
1,102,21,56,Computer Science
2,103,20,99,Mathematics
3,104,18,59,Mathematics
4,105,19,67,Biology
...,...,...,...,...
995,1096,23,76,Physics
996,1097,24,42,Physics
997,1098,25,79,Chemistry
998,1099,26,52,Chemistry


In [9]:
df.head()


Unnamed: 0,Student_ID,Age,Marks,Departrment
0,101,18,81,Computer Science
1,102,21,56,Computer Science
2,103,20,99,Mathematics
3,104,18,59,Mathematics
4,105,19,67,Biology


In [10]:
#introduce missing values in age, Marks and Department columns
for col in ['Age', 'Marks', 'Departrment']:
    df.loc[df.sample(frac=0.1).index, col] = np. nan
df


Unnamed: 0,Student_ID,Age,Marks,Departrment
0,101,18.0,81.0,Computer Science
1,102,21.0,56.0,Computer Science
2,103,20.0,99.0,Mathematics
3,104,,59.0,Mathematics
4,105,19.0,67.0,Biology
...,...,...,...,...
995,1096,23.0,,Physics
996,1097,24.0,42.0,Physics
997,1098,25.0,79.0,Chemistry
998,1099,26.0,52.0,Chemistry


In [11]:
#Adding Duplicate rows 
df = pd.concat([df, df.sample(20)], ignore_index=True)

#Rename misspelled column to correct name if necessary
if 'Departrment' in df.columns and 'Department' not in df.columns:
	df = df.rename(columns={'Departrment': 'Department'})

#Introduce invalid age values
df.loc[df.sample(20).index, 'Age'] = np.random.choice([-5, 240], size=20)

#outliers in Marks
df.loc[df.sample(20).index, 'Marks'] = np.random.choice([-10, 150], size=20)

#Inconsistent text formatting
df['Department'] = df['Department'].astype(str)
df.loc[df.sample(20).index, 'Department'] = df['Department'].str.lower()

df
df.head()
df.tail()


Unnamed: 0,Student_ID,Age,Marks,Department
1015,915,29.0,94.0,Biology
1016,598,28.0,65.0,Chemistry
1017,1033,29.0,85.0,Computer Science
1018,252,18.0,70.0,Physics
1019,556,29.0,40.0,Biology


In [12]:
df.isnull().sum()

Student_ID      0
Age           103
Marks          99
Department      0
dtype: int64

In [13]:
df.shape

(1020, 4)

In [14]:
df.columns

Index(['Student_ID', 'Age', 'Marks', 'Department'], dtype='object')

In [15]:
df.sample(10)

Unnamed: 0,Student_ID,Age,Marks,Department
303,404,20.0,83.0,
1005,443,29.0,82.0,Physics
982,1083,23.0,61.0,Biology
448,549,20.0,,Physics
336,437,29.0,150.0,Chemistry
17,118,24.0,65.0,Chemistry
657,758,25.0,64.0,Computer Science
821,922,,58.0,Computer Science
901,1002,22.0,40.0,Physics
564,665,24.0,72.0,Biology


In [16]:
df.isnull().mean()*100

Student_ID     0.000000
Age           10.098039
Marks          9.705882
Department     0.000000
dtype: float64

In [17]:
df.isnull().any(axis=1)

0       False
1       False
2       False
3        True
4       False
        ...  
1015    False
1016    False
1017    False
1018    False
1019    False
Length: 1020, dtype: bool

In [18]:
df[df.isnull().any(axis=1)]

Unnamed: 0,Student_ID,Age,Marks,Department
3,104,,59.0,Mathematics
6,107,27.0,,Biology
10,111,,99.0,Mathematics
13,114,20.0,,Mathematics
21,122,29.0,,Computer Science
...,...,...,...,...
991,1092,,,Biology
995,1096,23.0,,Physics
1002,266,,51.0,Mathematics
1010,460,,78.0,Biology


In [19]:
df[df.isnull().any(axis=1)].count()

Student_ID    192
Age            89
Marks          93
Department    192
dtype: int64

In [20]:
df.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
1015     True
1016     True
1017     True
1018     True
1019     True
Length: 1020, dtype: bool

In [21]:
df[df.duplicated(keep=False)]

Unnamed: 0,Student_ID,Age,Marks,Department
58,159,,,Physics
151,252,18.0,70.0,Physics
165,266,,51.0,Mathematics
170,271,20.0,58.0,Mathematics
243,344,27.0,86.0,Computer Science
290,391,22.0,79.0,Mathematics
342,443,29.0,82.0,Physics
356,457,20.0,92.0,Mathematics
359,460,,78.0,Biology
455,556,29.0,40.0,Biology


In [22]:
df.describe()


Unnamed: 0,Student_ID,Age,Marks
count,1020.0,917.0,921.0
mean,600.728431,25.895311,70.37785
std,288.804075,25.048293,20.764636
min,101.0,-5.0,-10.0
25%,350.75,20.0,55.0
50%,599.5,23.0,70.0
75%,850.25,26.0,85.0
max,1100.0,240.0,150.0


In [23]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [25]:
pd.reset_option('display.max_rows')
pd.reset_option('display.max_columns')

In [26]:
df


Unnamed: 0,Student_ID,Age,Marks,Department
0,101,-5.0,81.0,Computer Science
1,102,21.0,56.0,Computer Science
2,103,20.0,99.0,Mathematics
3,104,,59.0,Mathematics
4,105,19.0,67.0,Biology
...,...,...,...,...
1015,915,29.0,94.0,Biology
1016,598,28.0,65.0,Chemistry
1017,1033,29.0,85.0,Computer Science
1018,252,18.0,70.0,Physics


In [35]:
pd.set_option('display.max_rows', 8)
pd.set_option('display.max_columns', 2)
df

Unnamed: 0,Student_ID,...,Department
0,101,...,Computer Science
1,102,...,Computer Science
2,103,...,Mathematics
3,104,...,Mathematics
...,...,...,...
1016,598,...,Chemistry
1017,1033,...,Computer Science
1018,252,...,Physics
1019,556,...,Biology


In [30]:
#display  rows 500-600
df.iloc[500:600].to_string()

'     Student_ID   Age  Marks        Department\n500         601  21.0   47.0           Biology\n501         602  21.0   50.0               nan\n502         603  28.0   93.0         Chemistry\n503         604   NaN   43.0           Biology\n504         605  22.0   74.0         Chemistry\n505         606  28.0   71.0         Chemistry\n506         607  25.0    NaN         Chemistry\n507         608  21.0   40.0           Physics\n508         609  20.0   52.0         Chemistry\n509         610  29.0   89.0           Physics\n510         611   NaN   89.0               nan\n511         612  20.0   40.0           physics\n512         613  25.0   96.0           Biology\n513         614  20.0   68.0  Computer Science\n514         615  24.0   66.0       Mathematics\n515         616  25.0   40.0           Physics\n516         617  21.0   72.0           Physics\n517         618  22.0   90.0  Computer Science\n518         619  29.0  150.0         Chemistry\n519         620  26.0    NaN  Computer 

In [31]:
#convert dataframe to csv
df.to_csv('synthetic_student_data.csv', index=False)


In [32]:
df.to_excel('synthetic_student_data.xlsx', index=False)