In [1]:
import pandas as pd

Dataset Creation

In [2]:
# Assemble the student table column by column; None marks a missing value
# (one in Student_ID, two in Marks).
student_ids = [*range(1, 10), None]
student_names = ['Aksha', 'Bickin', 'Clare', 'Dishon', 'Eva',
                 'Fiona', 'Griffin', 'Harry', 'Issac', 'Jones']
student_marks = [85, 92, 88, 90, 74, 79, None, 91, None, 94]
student_genders = list('FMFMFFMMMM')

data = dict(
    Student_ID=student_ids,
    Name=student_names,
    Marks=student_marks,
    Gender=student_genders,
)

# Persist the table (without the index column) for the cleaning section.
df = pd.DataFrame(data)
df.to_csv('Students.csv', index=False)

Data Cleaning

In [3]:
# Step 1: Load the dataset
# Read from the same relative path the creation cell wrote to
# ('Students.csv'); a hardcoded "/content/" prefix only resolves when the
# working directory happens to be /content.
df = pd.read_csv("Students.csv")
print("Original Data:")
print(df)

Original Data:
   Student_ID     Name  Marks Gender
0         1.0    Aksha   85.0      F
1         2.0   Bickin   92.0      M
2         3.0    Clare   88.0      F
3         4.0   Dishon   90.0      M
4         5.0      Eva   74.0      F
5         6.0    Fiona   79.0      F
6         7.0  Griffin    NaN      M
7         8.0    Harry   91.0      M
8         9.0    Issac    NaN      M
9         NaN    Jones   94.0      M


In [15]:
# Cast Student_ID to pandas' nullable integer dtype so the IDs display as
# whole numbers while the missing entry stays missing (<NA>).
df = df.assign(Student_ID=lambda frame: frame['Student_ID'].astype('Int64'))
print(df)

   Student_ID     Name  Marks Gender
0           1    Aksha   85.0      F
1           2   Bickin   92.0      M
2           3    Clare   88.0      F
3           4   Dishon   90.0      M
4           5      Eva   74.0      F
5           6    Fiona   79.0      F
6           7  Griffin    NaN      M
7           8    Harry   91.0      M
8           9    Issac    NaN      M
9        <NA>    Jones   94.0      M


In [19]:
# Keep only the rows that actually have a Student_ID; work on a copy so
# later edits to df_cleaned do not touch df.
has_student_id = df['Student_ID'].notna()
df_cleaned = df[has_student_id].copy()
print("\nAfter dropping rows with missing Student_ID:", df_cleaned, sep="\n")


After dropping rows with missing Student_ID:
   Student_ID     Name  Marks Gender
0           1    Aksha   85.0      F
1           2   Bickin   92.0      M
2           3    Clare   88.0      F
3           4   Dishon   90.0      M
4           5      Eva   74.0      F
5           6    Fiona   79.0      F
6           7  Griffin    NaN      M
7           8    Harry   91.0      M
8           9    Issac    NaN      M


In [20]:
# Replace missing Marks with the default value 0; other columns are untouched.
df_cleaned = df_cleaned.fillna({'Marks': 0})

In [22]:
# Count the remaining missing values in each column
df_cleaned.isna().sum()

Unnamed: 0,0
Student_ID,0
Name,0
Marks,0
Gender,0


In [23]:
# Display the cleaned DataFrame under its heading
print("\nFinal Cleaned Data:", df_cleaned, sep="\n")


Final Cleaned Data:
   Student_ID     Name  Marks Gender
0           1    Aksha   85.0      F
1           2   Bickin   92.0      M
2           3    Clare   88.0      F
3           4   Dishon   90.0      M
4           5      Eva   74.0      F
5           6    Fiona   79.0      F
6           7  Griffin    0.0      M
7           8    Harry   91.0      M
8           9    Issac    0.0      M


Cleaning Steps Summary:

1. Dropped rows with missing Student_ID

2. Filled missing Marks with a default value of 0 using fillna()

---



Dataset Creation

In [35]:
# Create second dataset: one department per student, with Student_IDs
# numbered from 1 in list order.
departments = ['Science', 'Commerce', 'Arts', 'Science', 'Commerce',
               'Arts', 'Science', 'Commerce', 'Science']

dept_data = {
    'Student_ID': [student_id for student_id, _ in enumerate(departments, start=1)],
    'Department': departments,
}

# Persist the table (without the index column) so it can be reloaded below.
df_dept = pd.DataFrame(dept_data)
df_dept.to_csv('departments.csv', index=False)

In [36]:
# Reload the department table from the relative path it was saved to
# ('departments.csv'); a hardcoded "/content/" prefix only resolves when the
# working directory happens to be /content.
df_dept = pd.read_csv('departments.csv')

In [37]:
# Show the department table before joining it with the student data
print(df_dept)

   Student_ID Department
0           1    Science
1           2   Commerce
2           3       Arts
3           4    Science
4           5   Commerce
5           6       Arts
6           7    Science
7           8   Commerce
8           9    Science


In [38]:
# Inner join on Student_ID: combine each cleaned student row with its
# department row, keeping only IDs found in both tables.
merged_df = df_cleaned.merge(df_dept, how='inner', on='Student_ID')

print("\nMerged Data (Inner Join):", merged_df, sep="\n")


Merged Data (Inner Join):
   Student_ID     Name  Marks Gender Department
0           1    Aksha   85.0      F    Science
1           2   Bickin   92.0      M   Commerce
2           3    Clare   88.0      F       Arts
3           4   Dishon   90.0      M    Science
4           5      Eva   74.0      F   Commerce
5           6    Fiona   79.0      F       Arts
6           7  Griffin    0.0      M    Science
7           8    Harry   91.0      M   Commerce
8           9    Issac    0.0      M    Science


In [47]:
# Left, right and outer joins of the cleaned students with their departments.
# The three merges differ only in `how`, so build them from one expression
# instead of three copy-pasted calls.
left_join, right_join, outer_join = (
    pd.merge(df_cleaned, df_dept, on='Student_ID', how=how)
    for how in ('left', 'right', 'outer')
)

# sep="\n" puts each frame on its own line below its label. Passing
# "Label:\n" and the frame as two print arguments inserts print's default
# space before the frame, which shifts the header row out of alignment.
print("Left Join:", left_join, sep="\n")
print("\nRight Join:", right_join, sep="\n")
print("\nOuter Join:", outer_join, sep="\n")

Left Join:
    Student_ID     Name  Marks Gender Department
0           1    Aksha   85.0      F    Science
1           2   Bickin   92.0      M   Commerce
2           3    Clare   88.0      F       Arts
3           4   Dishon   90.0      M    Science
4           5      Eva   74.0      F   Commerce
5           6    Fiona   79.0      F       Arts
6           7  Griffin    0.0      M    Science
7           8    Harry   91.0      M   Commerce
8           9    Issac    0.0      M    Science

Right Join:
    Student_ID     Name  Marks Gender Department
0           1    Aksha   85.0      F    Science
1           2   Bickin   92.0      M   Commerce
2           3    Clare   88.0      F       Arts
3           4   Dishon   90.0      M    Science
4           5      Eva   74.0      F   Commerce
5           6    Fiona   79.0      F       Arts
6           7  Griffin    0.0      M    Science
7           8    Harry   91.0      M   Commerce
8           9    Issac    0.0      M    Science

Outer Join:
 

Pandas Join (Merge):
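1. Inner: keeps only the Student_IDs present in both tables
2. Left: keeps every row of the left table, filling unmatched right-table columns with NaN
3. Right: keeps every row of the right table, filling unmatched left-table columns with NaN
4. Outer: keeps every row from both tables, filling unmatched columns with NaN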

---

