### synthetic Data creation

In [9]:
import pandas as pd # for data manipulation
import numpy as np  # for numerical operations(random data generation)

In [3]:
# create a simple, clean synthetic dataset
data = {
    "Student_ID":[101,102,103,104,105],
    "Name": ['Amit', 'Sara', 'John', 'Anita', 'Raj'],
    "Age": [25, 30, 22, 28, 24],
    "Marks":[85, 90, 78, 88, 92],
    "Departrment": ['Physics', 'Chemistry', 'Mathematics', 'Biology', 'Computer Science'],
}
df = pd.DataFrame(data)
df


Unnamed: 0,Student_ID,Name,Age,Marks,Departrment
0,101,Amit,25,85,Physics
1,102,Sara,30,90,Chemistry
2,103,John,22,78,Mathematics
3,104,Anita,28,88,Biology
4,105,Raj,24,92,Computer Science


In [7]:
#create a synthetic data with noiser
import numpy as np
data = {
    "Student_ID":[101,102,103,104,None,104],
    "Name": ['Amit', 'Sara', None, 'Anita', 'Raj', 'Anita'],
    "Age": [25, None, 22, None, 24, '23'],
    "Marks":[85, None, 78, 88, 92, -9],
    "Departrment": ['Physics', 'Chemistry', 'Mathematics', None, 'Computer Science',42],
}
df = pd.DataFrame(data)
df


Unnamed: 0,Student_ID,Name,Age,Marks,Departrment
0,101.0,Amit,25.0,85.0,Physics
1,102.0,Sara,,,Chemistry
2,103.0,,22.0,78.0,Mathematics
3,104.0,Anita,,88.0,
4,,Raj,24.0,92.0,Computer Science
5,104.0,Anita,23.0,-9.0,42


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Student_ID   5 non-null      float64
 1   Name         5 non-null      object 
 2   Age          4 non-null      object 
 3   Marks        6 non-null      int64  
 4   Departrment  5 non-null      object 
dtypes: float64(1), int64(1), object(3)
memory usage: 372.0+ bytes


In [12]:
np.random.seed(42)  # for reproducibility
n=10
print(np.random.randint(1,100,size=n))

[52 93 15 72 61 21 83 87 75 75]


In [15]:
#np.random.seed(42)  # for reproducibility
n=10
print(np.random.randint(1,100,size=n))

[59 42 92 60 80 15 62 62 47 62]


In [17]:
np.random.seed(41)
n=1000
df = pd.DataFrame({    
    "Student_ID":np.arange(101,101+n),
    "Age": np.random.randint(18,30,size=n),
    "Marks":np.random.randint(40,100,size=n),
    "Departrment": np.random.choice(['Physics', 'Chemistry', 'Mathematics', 'Biology', 'Computer Science'], size=n)
})
df 



Unnamed: 0,Student_ID,Age,Marks,Departrment
0,101,18,81,Computer Science
1,102,21,56,Computer Science
2,103,20,99,Mathematics
3,104,18,59,Mathematics
4,105,19,67,Biology
...,...,...,...,...
995,1096,23,76,Physics
996,1097,24,42,Physics
997,1098,25,79,Chemistry
998,1099,26,52,Chemistry


In [None]:
df.head()


Unnamed: 0,Student_ID,Age,Marks,Departrment
0,101,18,81,Computer Science
1,102,21,56,Computer Science
2,103,20,99,Mathematics
3,104,18,59,Mathematics
4,105,19,67,Biology


In [21]:
#introduce missing values in age, Marks and Department columns
for col in ['Age', 'Marks', 'Departrment']:
    df.loc[df.sample(frac=0.1).index, col] = np. nan
df


Unnamed: 0,Student_ID,Age,Marks,Departrment
0,101,18.0,81.0,
1,102,21.0,,Computer Science
2,103,20.0,99.0,Mathematics
3,104,,59.0,Mathematics
4,105,19.0,67.0,Biology
...,...,...,...,...
995,1096,23.0,,Physics
996,1097,24.0,42.0,Physics
997,1098,25.0,,
998,1099,,,Chemistry


In [34]:
#Adding Duplicate rows 
df = pd.concat([df, df.sample(20)], ignore_index=True)

#Rename misspelled column to correct name if necessary
if 'Departrment' in df.columns and 'Department' not in df.columns:
	df = df.rename(columns={'Departrment': 'Department'})

#Introduce invalid age values
df.loc[df.sample(20).index, 'Age'] = np.random.choice([-5, 240], size=20)

#outliers in Marks
df.loc[df.sample(20).index, 'Marks'] = np.random.choice([-10, 150], size=20)

#Inconsistent text formatting
df['Department'] = df['Department'].astype(str)
df.loc[df.sample(20).index, 'Department'] = df['Department'].str.lower()

df
df.head()
df.tail()


Unnamed: 0,Student_ID,Age,Marks,Department
1035,878,26.0,,Physics
1036,105,19.0,67.0,Biology
1037,801,,,Chemistry
1038,282,29.0,-10.0,Mathematics
1039,806,240.0,93.0,Chemistry


In [36]:
df.isnull().sum()

Student_ID      0
Age           217
Marks         238
Department      0
dtype: int64

In [37]:
df.shape()

TypeError: 'tuple' object is not callable

In [38]:
df.shape

(1040, 4)

In [39]:
df.columns

Index(['Student_ID', 'Age', 'Marks', 'Department'], dtype='object')

In [40]:
df.sample(10)

Unnamed: 0,Student_ID,Age,Marks,Department
1035,878,26.0,,Physics
847,948,29.0,69.0,Chemistry
590,691,,90.0,
509,610,-5.0,89.0,Physics
697,798,,97.0,Chemistry
292,393,-5.0,72.0,
910,1011,-5.0,,
503,604,,,Biology
480,581,,60.0,Mathematics
397,498,25.0,,physics


In [41]:
df.isnull().mean()*100

Student_ID     0.000000
Age           20.865385
Marks         22.884615
Department     0.000000
dtype: float64

In [42]:
df.isnull().any(axis=1)

0       False
1        True
2       False
3        True
4       False
        ...  
1035     True
1036    False
1037     True
1038    False
1039    False
Length: 1040, dtype: bool

In [43]:
df[df.isnull().any(axis=1)]

Unnamed: 0,Student_ID,Age,Marks,Department
1,102,240.0,,Computer Science
3,104,,59.0,Mathematics
5,106,,87.0,
6,107,27.0,,Biology
10,111,,99.0,Mathematics
...,...,...,...,...
1028,915,,94.0,Biology
1029,363,28.0,,Computer Science
1034,291,21.0,,Computer Science
1035,878,26.0,,Physics


In [44]:
df[df.isnull().any(axis=1)].count()

Student_ID    408
Age           191
Marks         170
Department    408
dtype: int64

In [46]:
df.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
1035     True
1036     True
1037     True
1038     True
1039     True
Length: 1040, dtype: bool

In [47]:
df[df.duplicated(keep=False)]

Unnamed: 0,Student_ID,Age,Marks,Department
4,105,19.0,67.0,Biology
64,165,21.0,,
65,166,18.0,,Chemistry
67,168,25.0,-10.0,Mathematics
181,282,29.0,-10.0,Mathematics
...,...,...,...,...
1035,878,26.0,,Physics
1036,105,19.0,67.0,Biology
1037,801,,,Chemistry
1038,282,29.0,-10.0,Mathematics


In [48]:
df.describe()


Unnamed: 0,Student_ID,Age,Marks
count,1040.0,823.0,802.0
mean,600.554808,46.432564,68.190773
std,289.526079,73.95555,41.536889
min,101.0,-5.0,-10.0
25%,350.75,19.0,49.0
50%,599.5,23.0,70.0
75%,852.25,27.0,89.0
max,1100.0,240.0,150.0
