Synthetic Data Creation

In [3]:
import pandas as pd  #importing pandas

In [4]:
# Create a simple, clean synthetic dataset
data = { 'Roll_no': [1, 2, 3, 4, 5],
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
    'Age': [25, 30, 35, 40, 45],
    'marks': [85, 90, 95, 80, 75],
    'Department': ['CSE', 'ECE', 'BT', 'AIML', 'CSE']
    }

In [5]:
import pandas as pd
df = pd.DataFrame(data)
df

Unnamed: 0,Roll_no,Name,Age,marks,Department
0,1,Alice,25,85,CSE
1,2,Bob,30,90,ECE
2,3,Charlie,35,95,BT
3,4,David,40,80,AIML
4,5,Eva,45,75,CSE


In [6]:
data1 = { 'Roll_no': [1, 2, 3, 4, None ,4],
    'Name': ['Alice', 'Bob','David', None, 'Eva', 'David'],
    'Age': [25, 30, 35,45, None, 40],
    'marks': [85, 90, 95, 80, None, "Pass"],
    'Department': ['CSE', 'ECE', 'BT', 'AIML', 'CSE', 'AIML']
    }

In [7]:
import pandas as pd
import numpy as np  # to use random functions if needed


df = pd.DataFrame(data1)
df

Unnamed: 0,Roll_no,Name,Age,marks,Department
0,1.0,Alice,25.0,85,CSE
1,2.0,Bob,30.0,90,ECE
2,3.0,David,35.0,95,BT
3,4.0,,45.0,80,AIML
4,,Eva,,,CSE
5,4.0,David,40.0,Pass,AIML


In [8]:
np.random.seed(42)  # for reproducibility
n=10
np.random.randint(1,101,size=n)  # generate n random integers between 1 and 100     



array([52, 93, 15, 72, 61, 21, 83, 87, 75, 75])

In [9]:
np.random.seed(24)  # for reproducibility
n=10
np.random.randint(1,101,size=n)  # generat

array([ 35,   4,  65,  88,  18,  18,   2,  80,   5, 100])

In [10]:
#np.random.seed(42)  # for reproducibility
n=10
np.random.randint(1,101,size=n)  # generat

array([ 83,  12, 100,  16,  74,  19,   8,  26,  36,  96])

In [11]:
np.random.seed(42)  # for reproducibility
n=1000
df =pd.DataFrame(
    { 'Roll_no': np.random.randint(101,101+n),

    'Age': np.random.randint(18,24, size=n),
    'marks': np.random.randint(40,101,size=n),
    'Department': np.random.choice(['CSE', 'ECE', 'BT', 'Automation', 'ME', 'AIML'], size=n)
    })
#df['Random_Number'] = np.random.randint(1,2001,size=n)  # generate n random integers between 1 and 100
df



Unnamed: 0,Roll_no,Age,marks,Department
0,203,21,61,Automation
1,203,22,60,BT
2,203,20,45,CSE
3,203,22,45,Automation
4,203,22,94,AIML
...,...,...,...,...
995,203,23,43,Automation
996,203,22,59,ECE
997,203,18,71,CSE
998,203,18,75,ECE


In [12]:
df.head()

Unnamed: 0,Roll_no,Age,marks,Department
0,203,21,61,Automation
1,203,22,60,BT
2,203,20,45,CSE
3,203,22,45,Automation
4,203,22,94,AIML


In [13]:
#Introduce missing values in AGE, MARKS and Department columns randomly
for col in ['Age', 'marks', 'Department']:
    df.loc[df.sample(frac=0.05).index, col] = np.nan    
# Dupilcate Rows/records
pd.concat([df, df.sample(20)], ignore_index=True)
# Introduce invalid ages
df.loc[df.sample(10).index,'Age'] = np.random.choice([-5, 240], size=10)
# Introduce Outliers in marks
df.loc[df.sample(10).index,'marks'] = np.random.choice([-10, 150], size=10)
#Inconsistent entries in Department
df["Department"]= df["Department"].astype(str)
df.loc[df.sample(10).index,'Department'] = df["Department"].str.lower()    

df.tail()


Unnamed: 0,Roll_no,Age,marks,Department
995,203,,43.0,
996,203,22.0,59.0,ECE
997,203,18.0,71.0,CSE
998,203,18.0,75.0,ECE
999,203,22.0,89.0,AIML


In [14]:
# Initial data quality check
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Roll_no     1000 non-null   int64  
 1   Age         952 non-null    float64
 2   marks       951 non-null    float64
 3   Department  1000 non-null   object 
dtypes: float64(2), int64(1), object(1)
memory usage: 31.4+ KB


In [15]:
df.isnull().sum()

Roll_no        0
Age           48
marks         49
Department     0
dtype: int64

In [16]:
#Data Exploration
df.shape

(1000, 4)

In [17]:
df.columns

Index(['Roll_no', 'Age', 'marks', 'Department'], dtype='object')

In [18]:
#show random 10 records
df.sample(10)

Unnamed: 0,Roll_no,Age,marks,Department
428,203,23.0,,CSE
599,203,240.0,73.0,CSE
356,203,22.0,58.0,ME
116,203,18.0,80.0,
663,203,22.0,65.0,AIML
868,203,19.0,68.0,ECE
4,203,22.0,94.0,AIML
847,203,19.0,63.0,AIML
251,203,20.0,87.0,Automation
625,203,22.0,73.0,AIML


In [19]:
df.isnull().sum()

Roll_no        0
Age           48
marks         49
Department     0
dtype: int64

In [20]:
#how much % of missing values in each column
df.isnull().mean()*100

Roll_no       0.0
Age           4.8
marks         4.9
Department    0.0
dtype: float64

In [21]:
df[df.isnull().any(axis=1)].count()  # total rows with atleast one missing value

Roll_no       96
Age           48
marks         47
Department    96
dtype: int64

In [22]:
#check duplicate rows
df.duplicated().sum()

165

In [23]:
pd.concat([df, df.sample(20)], ignore_index=True)

Unnamed: 0,Roll_no,Age,marks,Department
0,203,21.0,61.0,Automation
1,203,22.0,60.0,BT
2,203,20.0,45.0,CSE
3,203,22.0,45.0,Automation
4,203,22.0,94.0,AIML
...,...,...,...,...
1015,203,23.0,57.0,AIML
1016,203,22.0,48.0,ECE
1017,203,18.0,92.0,ME
1018,203,19.0,68.0,ECE


In [24]:
df.describe()

Unnamed: 0,Roll_no,Age,marks
count,1000.0,952.0,951.0
mean,203.0,20.945378,69.626709
std,0.0,12.628692,19.202316
min,203.0,-5.0,-10.0
25%,203.0,19.0,54.0
50%,203.0,20.0,70.0
75%,203.0,22.0,85.0
max,203.0,240.0,150.0


In [35]:
pd.set_option('display.max_rows', None) 
pd.set_option('display.max_columns', 2)   
pd.reset_option('display.max_rows')
df

Unnamed: 0,Roll_no,...,Department
0,203,...,Automation
1,203,...,BT
2,203,...,CSE
3,203,...,Automation
4,203,...,AIML
...,...,...,...
995,203,...,
996,203,...,ECE
997,203,...,CSE
998,203,...,ECE


In [30]:
# Display rows 500 to 600 
df.iloc[500:601]    

Unnamed: 0,Roll_no,Age,marks,Department
500,203,18.0,84.0,ME
501,203,,64.0,ME
502,203,23.0,78.0,BT
503,203,22.0,48.0,ME
504,203,19.0,93.0,Automation
...,...,...,...,...
596,203,23.0,98.0,Automation
597,203,22.0,84.0,BT
598,203,20.0,82.0,ECE
599,203,240.0,73.0,CSE


In [31]:
df.iloc[500:601].to_string()

'     Roll_no    Age  marks  Department\n500      203   18.0   84.0          ME\n501      203    NaN   64.0          ME\n502      203   23.0   78.0          BT\n503      203   22.0   48.0          ME\n504      203   19.0   93.0  Automation\n505      203   19.0   61.0         CSE\n506      203   23.0   82.0  Automation\n507      203   22.0   85.0          BT\n508      203   21.0   43.0          BT\n509      203   19.0    NaN  Automation\n510      203   21.0   97.0          ME\n511      203   19.0   68.0          ME\n512      203   19.0   84.0         nan\n513      203   20.0   73.0          BT\n514      203   19.0    NaN         ECE\n515      203   18.0   76.0        AIML\n516      203   22.0   50.0          ME\n517      203   22.0   45.0  Automation\n518      203   21.0   57.0          ME\n519      203   19.0   60.0  Automation\n520      203    NaN   81.0         nan\n521      203   21.0   75.0          BT\n522      203   20.0   80.0          ME\n523      203   21.0   77.0  Automation\

In [33]:
df.to_csv('synthetic_student_data.csv', index=False)
df

Unnamed: 0,Roll_no,Age,marks,Department
0,203,21.0,61.0,Automation
1,203,22.0,60.0,BT
2,203,20.0,45.0,CSE
3,203,22.0,45.0,Automation
4,203,22.0,94.0,AIML
...,...,...,...,...
995,203,,43.0,
996,203,22.0,59.0,ECE
997,203,18.0,71.0,CSE
998,203,18.0,75.0,ECE


In [34]:
df.to_excel('synthetic_student_data.xlsx', index=False)