**Synthetic Data Creation**

In [75]:
import pandas as pd   #Importing Library

In [43]:
# Create a simple , clean synthetic Dataset
data = {
    "Student_ID": [1, 2, 3, 4, 5],
    "Name": ["Ashna","Anuj","Amit","Arshia","Ishita"],
    "Age": [20, 21, 19, 22, 23],
    "Grade": ["A", "B", "C", "A", "B"],
    "City": ["New York", "London", "Paris", "Tokyo", "Sydney"],
    "Department" :["CSE","CSE","AIML","AIML","AIML"]
}



In [44]:
df = pd.DataFrame(data)
df

Unnamed: 0,Student_ID,Name,Age,Grade,City,Department
0,1,Ashna,20,A,New York,CSE
1,2,Anuj,21,B,London,CSE
2,3,Amit,19,C,Paris,AIML
3,4,Arshia,22,A,Tokyo,AIML
4,5,Ishita,23,B,Sydney,AIML


In [45]:
# Create a Synthetic Dataset with Noise/ehich is not clean
data = {
    "Student_ID": [1, 2,None,5,2],
    "Name": ["Ashna","Anuj",None,"Arshia","Ishita"], #Now its a keyword
    "Age": [20, 21, 19, 22, 23],
    "Grade": ["A", "B", "C",None, "25"],      #It contains format mismatch, missing values,duplicate data
    "City": ["New York", "London", "Paris", "Tokyo", "Sydney"],
    "Department" :["CSE","CSE","AIML",None,"42"]     #if written in double quotes it becomes a string.
}
df = pd.DataFrame(data)
df


Unnamed: 0,Student_ID,Name,Age,Grade,City,Department
0,1.0,Ashna,20,A,New York,CSE
1,2.0,Anuj,21,B,London,CSE
2,,,19,C,Paris,AIML
3,5.0,Arshia,22,,Tokyo,
4,2.0,Ishita,23,25,Sydney,42


In [46]:
#Creating Large Dataset
import numpy as np # Random
np.random.seed(15)  #one time randome number generated.
#n = 5
print(np.random.randint(1,100,5))

[73 13  6  1 29]


In [47]:
# Creating Synthetic clean data with 1000 entries
np.random.seed(41)
n=1000
data = ({
    "Student_ID": np.arange(101,101+n),
    "Age": np.random.randint(18,24,size=n),
    "Marks": np.random.randint(40,100,size=n),
    "Department" :np.random.choice(["CSE","ECE","A&R","ME","AIML"],size=n)
})
df = pd.DataFrame(data)
df

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,18,92,CSE
1,102,21,56,A&R
2,103,22,65,A&R
3,104,20,46,CSE
4,105,18,80,CSE
...,...,...,...,...
995,1096,21,48,CSE
996,1097,21,76,A&R
997,1098,20,42,CSE
998,1099,21,62,ECE


In [48]:
from numpy.random.mtrand import sample
# Introduce Missing values in Age, Marks and Department
for col in ["Age","Marks","Department"]:
    df.loc[df.sample(frac=0.5).index,col] = np.nan

df


Unnamed: 0,Student_ID,Age,Marks,Department
0,101,,92.0,CSE
1,102,21.0,,A&R
2,103,,,A&R
3,104,,46.0,
4,105,,80.0,
...,...,...,...,...
995,1096,,48.0,
996,1097,21.0,,A&R
997,1098,,42.0,CSE
998,1099,,62.0,ECE


In [49]:
# Duplicate Records/Rows
pd.concat([df,df.sample(50)], ignore_index=True)

df

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,,92.0,CSE
1,102,21.0,,A&R
2,103,,,A&R
3,104,,46.0,
4,105,,80.0,
...,...,...,...,...
995,1096,,48.0,
996,1097,21.0,,A&R
997,1098,,42.0,CSE
998,1099,,62.0,ECE


In [50]:
df.loc[df.sample(10).index,"Age"]= np.nan

df

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,,92.0,CSE
1,102,21.0,,A&R
2,103,,,A&R
3,104,,46.0,
4,105,,80.0,
...,...,...,...,...
995,1096,,48.0,
996,1097,21.0,,A&R
997,1098,,42.0,CSE
998,1099,,62.0,ECE


In [51]:
#Duplicate invalid ages
df.loc[df.sample(10).index,"Age"]=np.random.choice([-5, 240], size=10)

df

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,,92.0,CSE
1,102,21.0,,A&R
2,103,,,A&R
3,104,,46.0,
4,105,,80.0,
...,...,...,...,...
995,1096,,48.0,
996,1097,21.0,,A&R
997,1098,,42.0,CSE
998,1099,,62.0,ECE


In [52]:
df.loc[df.sample(60).index,"Marks"]=np.nan
df

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,,92.0,CSE
1,102,21.0,,A&R
2,103,,,A&R
3,104,,46.0,
4,105,,80.0,
...,...,...,...,...
995,1096,,48.0,
996,1097,21.0,,A&R
997,1098,,42.0,CSE
998,1099,,,ECE


In [53]:
#Introduce Outliers
df.loc[df.sample(10).index,"Marks"] = np.random.choice([150, 240], size=10)

df

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,,92.0,CSE
1,102,21.0,,A&R
2,103,,,A&R
3,104,,46.0,
4,105,,80.0,
...,...,...,...,...
995,1096,,48.0,
996,1097,21.0,,A&R
997,1098,,42.0,CSE
998,1099,,,ECE


In [54]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Student_ID  1000 non-null   int64  
 1   Age         502 non-null    float64
 2   Marks       474 non-null    float64
 3   Department  500 non-null    object 
dtypes: float64(2), int64(1), object(1)
memory usage: 31.4+ KB


In [55]:
#Inconsistent Text Formatting
df["Department"] = df["Department"].astype(str)
df.loc[df.sample(frac=0.1).index, "Department"]=df["Department"].str.lower()
df = pd.DataFrame(data)
df

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,18,92,CSE
1,102,21,56,A&R
2,103,22,65,A&R
3,104,20,46,CSE
4,105,18,80,CSE
...,...,...,...,...
995,1096,21,48,CSE
996,1097,21,76,A&R
997,1098,20,42,CSE
998,1099,21,62,ECE


In [56]:
df.tail()

Unnamed: 0,Student_ID,Age,Marks,Department
995,1096,21,48,CSE
996,1097,21,76,A&R
997,1098,20,42,CSE
998,1099,21,62,ECE
999,1100,23,54,ME


In [57]:
df.info()  #To get information of data stored.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Student_ID  1000 non-null   int64 
 1   Age         1000 non-null   int64 
 2   Marks       1000 non-null   int64 
 3   Department  1000 non-null   object
dtypes: int64(3), object(1)
memory usage: 31.4+ KB


In [58]:
df.isnull().sum()

Unnamed: 0,0
Student_ID,0
Age,0
Marks,0
Department,0


In [59]:
df.shape

(1000, 4)

In [60]:
df.columns

Index(['Student_ID', 'Age', 'Marks', 'Department'], dtype='object')

In [61]:
df.sample(10)

Unnamed: 0,Student_ID,Age,Marks,Department
608,709,22,98,A&R
916,1017,19,58,AIML
557,658,23,54,A&R
471,572,23,74,CSE
725,826,18,97,AIML
509,610,19,73,A&R
556,657,20,62,CSE
177,278,18,70,ECE
155,256,19,58,AIML
119,220,21,45,AIML


In [62]:
df.isnull().sum()

Unnamed: 0,0
Student_ID,0
Age,0
Marks,0
Department,0


In [63]:
df.isnull().mean()*100

Unnamed: 0,0
Student_ID,0.0
Age,0.0
Marks,0.0
Department,0.0


In [64]:
df[df.isnull().any(axis=1)].count()

Unnamed: 0,0
Student_ID,0
Age,0
Marks,0
Department,0


In [65]:
pd.set_option('display.max_rows',None)
df

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,18,92,CSE
1,102,21,56,A&R
2,103,22,65,A&R
3,104,20,46,CSE
4,105,18,80,CSE
5,106,19,72,ECE
6,107,21,82,ECE
7,108,19,88,CSE
8,109,23,51,A&R
9,110,20,41,AIML


In [66]:
pd.reset_option('display.max_rows')
df

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,18,92,CSE
1,102,21,56,A&R
2,103,22,65,A&R
3,104,20,46,CSE
4,105,18,80,CSE
...,...,...,...,...
995,1096,21,48,CSE
996,1097,21,76,A&R
997,1098,20,42,CSE
998,1099,21,62,ECE


In [67]:
#Display 500-600 Rows
df.iloc[500:600].to_string()

'     Student_ID  Age  Marks Department\n500         601   21     40        CSE\n501         602   19     72         ME\n502         603   20     90       AIML\n503         604   20     64        A&R\n504         605   21     67        CSE\n505         606   22     90        CSE\n506         607   21     49       AIML\n507         608   18     86        ECE\n508         609   22     66       AIML\n509         610   19     73        A&R\n510         611   22     72        CSE\n511         612   21     77       AIML\n512         613   21     59         ME\n513         614   21     71        CSE\n514         615   23     77         ME\n515         616   23     71       AIML\n516         617   20     57       AIML\n517         618   19     97        CSE\n518         619   22     75       AIML\n519         620   20     60        A&R\n520         621   20     45        ECE\n521         622   18     61        A&R\n522         623   21     83         ME\n523         624   18     56       AIML\

In [68]:
df.to_csv

In [69]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Student_ID  1000 non-null   int64 
 1   Age         1000 non-null   int64 
 2   Marks       1000 non-null   int64 
 3   Department  1000 non-null   object
dtypes: int64(3), object(1)
memory usage: 31.4+ KB


In [80]:
df.to_csv("synthetic_data.csv",index=False)
df


Unnamed: 0,Student_ID,Age,Marks,Department
0,101,18,92,CSE
1,102,21,56,A&R
2,103,22,65,A&R
3,104,20,46,CSE
4,105,18,80,CSE
...,...,...,...,...
995,1096,21,48,CSE
996,1097,21,76,A&R
997,1098,20,42,CSE
998,1099,21,62,ECE


In [82]:
df.to_excel("synthetic_data.xlsx",index=False)
df

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,18,92,CSE
1,102,21,56,A&R
2,103,22,65,A&R
3,104,20,46,CSE
4,105,18,80,CSE
...,...,...,...,...
995,1096,21,48,CSE
996,1097,21,76,A&R
997,1098,20,42,CSE
998,1099,21,62,ECE
