# synthetic data creation

In [39]:
import pandas as pd
import numpy as np

In [40]:
data = {"Student_ID":[101,102,103,104,105],
        "Name":["Amit","Suman","Ravi","Anjali","Priya"],
        "Age":[23,22,24,23,22],
       "Marks":[85,90,78,92,88],
        "Department":["CSE","ECE","ME","CSE","ECE"]
        }

In [41]:
df = pd.DataFrame(data)
df

Unnamed: 0,Student_ID,Name,Age,Marks,Department
0,101,Amit,23,85,CSE
1,102,Suman,22,90,ECE
2,103,Ravi,24,78,ME
3,104,Anjali,23,92,CSE
4,105,Priya,22,88,ECE


In [42]:
data = {"Student_ID":[101,102,103,104,None,],
        "Name":["Amit","Suman","Ravi",None,"Priya"],
        "Age":[23,22,24,23,24],
       "Marks":[85,90,78,92,"Pass"],
        "Department":["CSE","ECE","ME",None,"ECE"]
        }
df = pd.DataFrame(data)
df

Unnamed: 0,Student_ID,Name,Age,Marks,Department
0,101.0,Amit,23,85,CSE
1,102.0,Suman,22,90,ECE
2,103.0,Ravi,24,78,ME
3,104.0,,23,92,
4,,Priya,24,Pass,ECE


In [43]:
import numpy as np
np.random.seed(42)
#n = 10
print(np.random.randint(1, 100, 10))

[52 93 15 72 61 21 83 87 75 75]


In [44]:
np.random.seed(41)
n = 1000
data = {"Student_ID": np.arange(101,101+n), "Age": np.random.randint(18,24,size=n), "Marks": np.random.randint(40,100,size=n), "Department": np.random.choice(["CSE","ECE","ME","ECE"], size=n)}
df = pd.DataFrame(data)
print(len(df))

1000


In [45]:
df.head()

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,18,92,ECE
1,102,21,56,CSE
2,103,22,65,ME
3,104,20,46,ME
4,105,18,80,CSE


In [46]:
for col in ["Age","Marks","Department"]:
    df.loc[df.sample(frac=0.05).index, col] = np.nan

# add a small sample of duplicate rows
df = pd.concat([df, df.sample(20)], ignore_index=True)

# set some Age values to null
df.loc[df.sample(10).index, "Age"] = np.nan

df.head()

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,18.0,92.0,ECE
1,102,21.0,56.0,CSE
2,103,22.0,65.0,ME
3,104,,46.0,ME
4,105,18.0,80.0,CSE


In [47]:
df.loc[df.sample(10).index, "Marks"] = np.nan

df.head()

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,18.0,92.0,ECE
1,102,21.0,56.0,CSE
2,103,22.0,65.0,ME
3,104,,46.0,ME
4,105,18.0,80.0,CSE


In [48]:
df["Department"] = df["Department"].astype(str)
df.loc[df.sample(frac=0.1).index, "Department"] = df["Department"].str.lower()
df.head()


Unnamed: 0,Student_ID,Age,Marks,Department
0,101,18.0,92.0,ECE
1,102,21.0,56.0,CSE
2,103,22.0,65.0,ME
3,104,,46.0,ME
4,105,18.0,80.0,cse


In [49]:
df.tail()

Unnamed: 0,Student_ID,Age,Marks,Department
1015,995,,63.0,ECE
1016,152,22.0,90.0,CSE
1017,919,21.0,91.0,ece
1018,630,,91.0,ECE
1019,783,21.0,97.0,ECE


In [50]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1020 entries, 0 to 1019
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Student_ID  1020 non-null   int64  
 1   Age         957 non-null    float64
 2   Marks       961 non-null    float64
 3   Department  1020 non-null   object 
dtypes: float64(2), int64(1), object(1)
memory usage: 32.0+ KB


In [51]:
df.isnull().sum()

Student_ID     0
Age           63
Marks         59
Department     0
dtype: int64

In [52]:
df.describe()

Unnamed: 0,Student_ID,Age,Marks
count,1020.0,957.0,961.0
mean,599.191176,20.473354,69.703434
std,288.302316,1.704219,17.432085
min,101.0,18.0,40.0
25%,350.75,19.0,54.0
50%,598.5,20.0,70.0
75%,847.25,22.0,85.0
max,1100.0,23.0,99.0


In [53]:
pd.reset_option('display.max_rows')
df

Unnamed: 0,Student_ID,Age,Marks,Department
0,101,18.0,92.0,ECE
1,102,21.0,56.0,CSE
2,103,22.0,65.0,ME
3,104,,46.0,ME
4,105,18.0,80.0,cse
...,...,...,...,...
1015,995,,63.0,ECE
1016,152,22.0,90.0,CSE
1017,919,21.0,91.0,ece
1018,630,,91.0,ECE


In [54]:
df.iloc[500:600].to_string( )


'     Student_ID   Age  Marks Department\n500         601  21.0    NaN        ECE\n501         602  19.0   72.0         ME\n502         603  20.0   90.0         ME\n503         604  20.0   64.0        ECE\n504         605  21.0   67.0         ME\n505         606  22.0   90.0         ME\n506         607  21.0   49.0        ECE\n507         608  18.0   86.0        CSE\n508         609  22.0   66.0        ECE\n509         610  19.0   73.0        ECE\n510         611  22.0   72.0        CSE\n511         612  21.0   77.0        CSE\n512         613  21.0   59.0         ME\n513         614  21.0   71.0         ME\n514         615  23.0   77.0         ME\n515         616   NaN   71.0        ECE\n516         617  20.0   57.0        CSE\n517         618   NaN   97.0        ECE\n518         619  22.0    NaN        nan\n519         620  20.0   60.0        ECE\n520         621  20.0   45.0        ECE\n521         622  18.0   61.0         ME\n522         623  21.0   83.0        ECE\n523         624

In [55]:
df.to_csv()

',Student_ID,Age,Marks,Department\r\n0,101,18.0,92.0,ECE\r\n1,102,21.0,56.0,CSE\r\n2,103,22.0,65.0,ME\r\n3,104,,46.0,ME\r\n4,105,18.0,80.0,cse\r\n5,106,19.0,72.0,CSE\r\n6,107,21.0,,ECE\r\n7,108,19.0,88.0,ECE\r\n8,109,23.0,51.0,ECE\r\n9,110,20.0,41.0,ECE\r\n10,111,23.0,70.0,CSE\r\n11,112,22.0,77.0,ECE\r\n12,113,,73.0,ME\r\n13,114,21.0,50.0,ECE\r\n14,115,21.0,70.0,ME\r\n15,116,20.0,49.0,CSE\r\n16,117,18.0,57.0,CSE\r\n17,118,22.0,68.0,ECE\r\n18,119,23.0,52.0,ECE\r\n19,120,19.0,96.0,ECE\r\n20,121,21.0,77.0,ME\r\n21,122,22.0,82.0,ME\r\n22,123,18.0,46.0,CSE\r\n23,124,21.0,98.0,ECE\r\n24,125,22.0,81.0,CSE\r\n25,126,22.0,83.0,ece\r\n26,127,19.0,69.0,ME\r\n27,128,23.0,65.0,me\r\n28,129,20.0,,ME\r\n29,130,20.0,76.0,ECE\r\n30,131,19.0,48.0,cse\r\n31,132,22.0,40.0,ME\r\n32,133,23.0,95.0,ece\r\n33,134,21.0,80.0,CSE\r\n34,135,23.0,,ECE\r\n35,136,23.0,65.0,ME\r\n36,137,20.0,49.0,ECE\r\n37,138,20.0,48.0,ME\r\n38,139,20.0,71.0,nan\r\n39,140,22.0,97.0,ME\r\n40,141,21.0,85.0,ECE\r\n41,142,19.0,99.0,nan\r

In [58]:
df.to_csv('student.csv',index=False)
df


Unnamed: 0,Student_ID,Age,Marks,Department
0,101,18.0,92.0,ECE
1,102,21.0,56.0,CSE
2,103,22.0,65.0,ME
3,104,,46.0,ME
4,105,18.0,80.0,cse
...,...,...,...,...
1015,995,,63.0,ECE
1016,152,22.0,90.0,CSE
1017,919,21.0,91.0,ece
1018,630,,91.0,ECE


In [62]:
%pip install openpyxl

df.to_excel("synthetic_data.xlsx", index=False)


df.head()

Collecting openpyxl
  Obtaining dependency information for openpyxl from https://files.pythonhosted.org/packages/c0/da/977ded879c29cbd04de313843e76868e6e13408a94ed6b987245dc7c8506/openpyxl-3.1.5-py2.py3-none-any.whl.metadata
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Obtaining dependency information for et-xmlfile from https://files.pythonhosted.org/packages/c1/8b/5fe2cc11fee489817272089c4203e679c63b570a5aaeb18d852ae3cbba6a/et_xmlfile-2.0.0-py3-none-any.whl.metadata
  Downloading et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Downloading openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
   ---------------------------------------- 0.0/250.9 kB ? eta -:--:--
   ---------------------------------------- 0.0/250.9 kB ? eta -:--:--
   ---- ---------------------------------- 30.7/250.9 kB 435.7 kB/s eta 0:00:01
   ----------------- ---------------------- 112.6/250.9 kB 1.1 MB/s eta 0:00:01
   ------------------------------------


[notice] A new release of pip is available: 23.2.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


Unnamed: 0,Student_ID,Age,Marks,Department
0,101,18.0,92.0,ECE
1,102,21.0,56.0,CSE
2,103,22.0,65.0,ME
3,104,,46.0,ME
4,105,18.0,80.0,cse
