**Synthetic Data Creation**

In [218]:
import pandas as pd   #Importing Library

In [219]:
# Create a simple , clean synthetic Dataset
data = {
    "Student_ID": [1, 2, 3, 4, 5],
    "Name": ["Ashna","Anuj","Amit","Arshia","Ishita"],
    "Age": [20, 21, 19, 22, 23],
    "Grade": ["A", "B", "C", "A", "B"],
    "City": ["New York", "London", "Paris", "Tokyo", "Sydney"],
    "Department" :["CSE","CSE","AIML","AIML","AIML"]
}



In [220]:
df = pd.DataFrame(data)
df

Unnamed: 0,Student_ID,Name,Age,Grade,City,Department
0,1,Ashna,20,A,New York,CSE
1,2,Anuj,21,B,London,CSE
2,3,Amit,19,C,Paris,AIML
3,4,Arshia,22,A,Tokyo,AIML
4,5,Ishita,23,B,Sydney,AIML


In [221]:
# Create a Synthetic Dataset with Noise/ehich is not clean
data = {
    "Student_ID": [1, 2,None,5,2],
    "Name": ["Ashna","Anuj",None,"Arshia","Ishita"], #Now its a keyword
    "Age": [20, 21, 19, 22, 23],
    "Grade": ["A", "B", "C",None, "25"],      #It contains format mismatch, missing values,duplicate data
    "City": ["New York", "London", "Paris", "Tokyo", "Sydney"],
    "Department" :["CSE","CSE","AIML",None,"42"]     #if written in double quotes it becomes a string.
}
df = pd.DataFrame(data)
df


Unnamed: 0,Student_ID,Name,Age,Grade,City,Department
0,1.0,Ashna,20,A,New York,CSE
1,2.0,Anuj,21,B,London,CSE
2,,,19,C,Paris,AIML
3,5.0,Arshia,22,,Tokyo,
4,2.0,Ishita,23,25,Sydney,42


In [222]:
#Creating Large Dataset
import numpy as np # Random
np.random.seed(15)  #one time randome number generated.
#n = 5
print(np.random.randint(1,100,5))

[73 13  6  1 29]


In [223]:
# Creating Synthetic clean data with 1000 entries
np.random.seed(41)
n=1000
data = ({
    "Student_ID": np.arange(101,101+n),
    "Age": np.random.randint(18,24,size=n),
    "Marks": np.random.randint(40,100,size=n),
    "Department" :np.random.choice(["CSE","ECE","A&R","ME","AIML"],size=n)
})
df = pd.DataFrame(data)
df

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,18,92,CSE
1,102,21,56,A&R
2,103,22,65,A&R
3,104,20,46,CSE
4,105,18,80,CSE
...,...,...,...,...
995,1096,21,48,CSE
996,1097,21,76,A&R
997,1098,20,42,CSE
998,1099,21,62,ECE


In [224]:
from numpy.random.mtrand import sample
# Introduce Missing values in Age, Marks and Department
for col in ["Age","Marks","Department"]:
    df.loc[df.sample(frac=0.5).index,col] = np.nan

df


Unnamed: 0,Student_ID,Age,Marks,Department
0,101,,92.0,CSE
1,102,21.0,,A&R
2,103,,,A&R
3,104,,46.0,
4,105,,80.0,
...,...,...,...,...
995,1096,,48.0,
996,1097,21.0,,A&R
997,1098,,42.0,CSE
998,1099,,62.0,ECE


In [248]:
# Duplicate Records/Rows
pd.concat([df,df.sample(50)], ignore_index=True)

df

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,18.0,92,CSE
1,102,21.0,56,A&R
2,103,22.0,65,A&R
3,104,20.0,46,CSE
4,105,18.0,80,CSE
...,...,...,...,...
995,1096,,48,CSE
996,1097,21.0,76,A&R
997,1098,20.0,42,CSE
998,1099,21.0,62,ECE


In [247]:
df.loc[df.sample(10).index,"Age"]= np.nan

df

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,18.0,92,CSE
1,102,21.0,56,A&R
2,103,22.0,65,A&R
3,104,20.0,46,CSE
4,105,18.0,80,CSE
...,...,...,...,...
995,1096,,48,CSE
996,1097,21.0,76,A&R
997,1098,20.0,42,CSE
998,1099,21.0,62,ECE


In [249]:
#Duplicate invalid ages
df.loc[df.sample(10).index,"Age"]=np.random.choice([-5, 240], size=10)

df

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,18.0,92,CSE
1,102,21.0,56,A&R
2,103,22.0,65,A&R
3,104,20.0,46,CSE
4,105,18.0,80,CSE
...,...,...,...,...
995,1096,,48,CSE
996,1097,21.0,76,A&R
997,1098,20.0,42,CSE
998,1099,21.0,62,ECE


In [252]:
df.loc[df.sample(60).index,"Marks"]=np.nan
df

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,18.0,92.0,CSE
1,102,21.0,56.0,A&R
2,103,22.0,65.0,A&R
3,104,20.0,,CSE
4,105,18.0,80.0,CSE
...,...,...,...,...
995,1096,,48.0,CSE
996,1097,21.0,76.0,A&R
997,1098,20.0,42.0,CSE
998,1099,21.0,62.0,ECE


In [253]:
#Introduce Outliers
df.loc[df.sample(10).index,"Marks"] = np.random.choice([150, 240], size=10)

df

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,18.0,92.0,CSE
1,102,21.0,56.0,A&R
2,103,22.0,65.0,A&R
3,104,20.0,,CSE
4,105,18.0,80.0,CSE
...,...,...,...,...
995,1096,,48.0,CSE
996,1097,21.0,76.0,A&R
997,1098,20.0,42.0,CSE
998,1099,21.0,62.0,ECE


In [260]:
#Inconsistent Text Formatting
df["Department"] = df["Department"].astype(str)
df.loc[df.sample(frac=0.5).index, "Department"]=df["Department"].str.lower()
df = pd.DataFrame(data)
df

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,18,92,CSE
1,102,21,56,A&R
2,103,22,65,A&R
3,104,20,46,CSE
4,105,18,80,CSE
...,...,...,...,...
995,1096,21,48,CSE
996,1097,21,76,A&R
997,1098,20,42,CSE
998,1099,21,62,ECE


In [261]:
df.tail()

Unnamed: 0,Student_ID,Age,Marks,Department
995,1096,21,48,CSE
996,1097,21,76,A&R
997,1098,20,42,CSE
998,1099,21,62,ECE
999,1100,23,54,ME


In [262]:
df.info()  #To get information of data stored.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Student_ID  1000 non-null   int64 
 1   Age         1000 non-null   int64 
 2   Marks       1000 non-null   int64 
 3   Department  1000 non-null   object
dtypes: int64(3), object(1)
memory usage: 31.4+ KB


In [233]:
df.isnull().sum()

Unnamed: 0,0
Student_ID,0
Age,0
Marks,0
Department,0


In [234]:
df.shape

(1000, 4)

In [235]:
df.columns

Index(['Student_ID', 'Age', 'Marks', 'Department'], dtype='object')

In [236]:
df.sample(10)

Unnamed: 0,Student_ID,Age,Marks,Department
608,709,22,98,A&R
916,1017,19,58,AIML
557,658,23,54,A&R
471,572,23,74,CSE
725,826,18,97,AIML
509,610,19,73,A&R
556,657,20,62,CSE
177,278,18,70,ECE
155,256,19,58,AIML
119,220,21,45,AIML


In [237]:
df.isnull().sum()

Unnamed: 0,0
Student_ID,0
Age,0
Marks,0
Department,0


In [238]:
df.isnull().mean()*100

Unnamed: 0,0
Student_ID,0.0
Age,0.0
Marks,0.0
Department,0.0


In [239]:
df[df.isnull().any(axis=1)].count()

Unnamed: 0,0
Student_ID,0
Age,0
Marks,0
Department,0
