In [1]:
# Lab 1: Data Extraction
# Student Name: INGABIRE CATHERINE
# Date: October 2025

# Step 1: Import Libraries
import pandas as pd
import json
import os

# Step 2: Generate Sample Data Files
# Create "data" folder if it does not exist (exist_ok=True makes re-runs safe)
os.makedirs("data", exist_ok=True)

# (a) Create students.csv
# Column-oriented records: each list holds one value per student, in id order.
students = {
    "student_id": [1, 2, 3, 4, 5],
    "name": ["Alice", "Bob", "Charlie", "Diana", "Ethan"],
    "age": [20, 22, 21, 23, 22],
    "gender": ["F", "M", "M", "F", "M"],
}

students_df = pd.DataFrame(students)
# index=False keeps the DataFrame's row index out of the CSV file.
students_df.to_csv("data/students.csv", index=False)
print("students.csv created successfully\n")
print(students_df)

# (b) Create performance.json
# Row-oriented records that carry the same student_id values as students.csv.
performance = [
    {"student_id": 1, "math_score": 85, "science_score": 90, "english_score": 78},
    {"student_id": 2, "math_score": 76, "science_score": 83, "english_score": 88},
    {"student_id": 3, "math_score": 92, "science_score": 89, "english_score": 95},
    {"student_id": 4, "math_score": 65, "science_score": 72, "english_score": 70},
    {"student_id": 5, "math_score": 80, "science_score": 85, "english_score": 79},
]

# Explicit encoding so the written file does not depend on the platform's
# locale default when it is read back later.
with open("data/performance.json", "w", encoding="utf-8") as f:
    json.dump(performance, f, indent=4)

print("performance.json created successfully\n")

# Step 3: Extract Data
# Load each source file with the pandas reader that matches its format.
students_df = pd.read_csv("data/students.csv")
performance_df = pd.read_json("data/performance.json")

# Print a heading and then show the first rows of each loaded frame.
for heading, frame in (
    ("Students Data:", students_df),
    ("\nPerformance Data:", performance_df),
):
    print(heading)
    display(frame.head())

# Step 4: Inspect Data
# info() prints its own report, so it is called directly rather than wrapped in print().
print("\nStudents Info:")
students_df.info()

# Number of null entries in each column of the students frame.
print("\nMissing Values in Students:", students_df.isna().sum(), sep="\n")

# Summary statistics for the performance frame.
print("\nPerformance Summary:", performance_df.describe(), sep="\n")

# Step 5: Merge Both Datasets
# Join the two frames on their shared "student_id" column.
# how="inner" spells out the join type that was previously implicit: only ids
# present in both frames are kept.
# validate="one_to_one" makes pandas raise a MergeError if a student_id is
# duplicated on either side, instead of silently multiplying rows.
combined_df = pd.merge(
    students_df,
    performance_df,
    on="student_id",
    how="inner",
    validate="one_to_one",
)
print("\nCombined Extracted Data:")
display(combined_df)

# Step 6: Save Extracted Output
# Name the output location once and reuse it for the write and the read-back.
extracted_path = "data/extracted_data.csv"
combined_df.to_csv(extracted_path, index=False)
print("\nExtracted dataset saved as data/extracted_data.csv")

# Verify saved data
# Re-read the file from disk so the preview shows what was actually written.
print("\nPreview of Saved Extracted Data:")
saved_preview = pd.read_csv(extracted_path).head()
display(saved_preview)


# Step 7: Reflection
# NOTE: the reflection below is a bare triple-quoted string expression, not a
# comment. It is not assigned or printed; as the final expression of the cell,
# its repr is what appears as the cell's output.
"""
Reflection:
While working on this lab, I realized how important it is to organize data properly before analysis. 
One small challenge I faced was making sure the data files were saved in the right folder and that 
the JSON and CSV files matched correctly using 'student_id'. 

Inspecting the data before doing anything else also helped me confirm that everything was clean 
and consistent. I’ve learned that taking time to check for missing values or errors early 
saves a lot of trouble later when transforming or analyzing data.
"""


students.csv created successfully

   student_id     name  age gender
0           1    Alice   20      F
1           2      Bob   22      M
2           3  Charlie   21      M
3           4    Diana   23      F
4           5    Ethan   22      M
performance.json created successfully

Students Data:


Unnamed: 0,student_id,name,age,gender
0,1,Alice,20,F
1,2,Bob,22,M
2,3,Charlie,21,M
3,4,Diana,23,F
4,5,Ethan,22,M



Performance Data:


Unnamed: 0,student_id,math_score,science_score,english_score
0,1,85,90,78
1,2,76,83,88
2,3,92,89,95
3,4,65,72,70
4,5,80,85,79



Students Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   student_id  5 non-null      int64 
 1   name        5 non-null      object
 2   age         5 non-null      int64 
 3   gender      5 non-null      object
dtypes: int64(2), object(2)
memory usage: 292.0+ bytes

Missing Values in Students:
student_id    0
name          0
age           0
gender        0
dtype: int64

Performance Summary:
       student_id  math_score  science_score  english_score
count    5.000000    5.000000       5.000000        5.00000
mean     3.000000   79.600000      83.800000       82.00000
std      1.581139   10.114346       7.190271        9.66954
min      1.000000   65.000000      72.000000       70.00000
25%      2.000000   76.000000      83.000000       78.00000
50%      3.000000   80.000000      85.000000       79.00000
75%      4.000000   85.000000      89.000000       88.00000
max      5.000000   92.000000      90.000000       95.00000

Combined Extracted Data:


Unnamed: 0,student_id,name,age,gender,math_score,science_score,english_score
0,1,Alice,20,F,85,90,78
1,2,Bob,22,M,76,83,88
2,3,Charlie,21,M,92,89,95
3,4,Diana,23,F,65,72,70
4,5,Ethan,22,M,80,85,79



Extracted dataset saved as data/extracted_data.csv

Preview of Saved Extracted Data:


Unnamed: 0,student_id,name,age,gender,math_score,science_score,english_score
0,1,Alice,20,F,85,90,78
1,2,Bob,22,M,76,83,88
2,3,Charlie,21,M,92,89,95
3,4,Diana,23,F,65,72,70
4,5,Ethan,22,M,80,85,79


"\nReflection:\nWhile working on this lab, I realized how important it is to organize data properly before analysis. \nOne small challenge I faced was making sure the data files were saved in the right folder and that \nthe JSON and CSV files matched correctly using 'student_id'. \n\nInspecting the data before doing anything else also helped me confirm that everything was clean \nand consistent. I’ve learned that taking time to check for missing values or errors early \nsaves a lot of trouble later when transforming or analyzing data.\n"