In [None]:
# Scenario: AI Model for Detecting Defective Products in a Factory

# A manufacturing company wants to build an AI system that detects defective products from camera images on the production line.

# They have collected 100 product images, and each image is labeled as:

# 0 → Non-defective product

# 1 → Defective product

# To build a reliable AI model, the data scientist must divide the dataset into three parts:

# Training set (70%) → Used to train the AI model

# Validation set (15%) → Used to tune and improve the model

# Test set (15%) → Used to evaluate final performance

# Your task is to help the data scientist split the dataset correctly.

In [None]:
from sklearn.model_selection import train_test_split

# Inputs this cell relies on:
#   X -> product images
#   y -> labels (0 = non-defective, 1 = defective)
# NOTE(review): neither X nor y is assigned in this cell; confirm an earlier
# cell creates them, otherwise running this on a fresh kernel raises NameError.

# First pass: 70% goes to training, the other 30% is parked in a temporary pool.
# Stratifying on y asks scikit-learn to keep the class ratio in both parts.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.30,
    stratify=y,
    random_state=42,
)

# Second pass: halve the temporary pool -> 15% validation and 15% test overall.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.50,
    stratify=y_temp,
    random_state=42,
)

# Report how many samples landed in each partition.
print("Training set size:", len(X_train))
print("Validation set size:", len(X_val))
print("Test set size:", len(X_test))

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the housing data and separate the predictor columns from the target.
df = pd.read_csv('housing_dataset.csv')

X = df[['size','bedrooms','age','location']]
Y = df[['price']]  # double brackets keep the target as a one-column DataFrame

# Step 1: hold out 20% of the rows as the test set; the other 80% is the
# combined train+validation pool.
X_trainval, X_test, Y_trainval, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

# Step 2: carve validation out of the pool. 25% of the remaining 80% is about
# 20% of all rows, which leaves roughly 60% for training.
X_train, X_val, Y_train, Y_val = train_test_split(
    X_trainval, Y_trainval, test_size=0.25, random_state=42
)

# The test partition used to be stored under these names; keep them bound so
# any code still referring to X_temp / Y_temp sees the same objects.
X_temp, Y_temp = X_test, Y_test

# Sanity check: the three partitions together must cover every row exactly once.
assert len(X_train) + len(X_val) + len(X_test) == len(X)

print("Training set size:", len(X_train))
print("Validation set size:", len(X_val))
print("Test set size:", len(X_test))

Training set size: 6
Validation set size: 2
Test set size: 2


In [None]:
# Scenario: AI System for Music Genre Classification

# A music streaming company wants to build an AI model that automatically identifies the genre of a song (for example, Rock, Pop, or Classical).

# They have collected 100 audio clips, and each clip has already been labeled with its correct genre. To build a reliable AI model, the data science team must divide the dataset into three parts:

# Training set (70%) → Used to teach the AI model patterns in music

# Validation set (15%) → Used to tune model parameters and improve accuracy

# Test set (15%) → Used to evaluate how well the model works on completely new songs

# You are the AI engineer responsible for preparing the dataset.

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the music-genre dataset from disk.
df = pd.read_csv("music_genre_dataset.csv")

# The "genre_label" column is the target; every other column is a feature.
y = df["genre_label"]
X = df.drop(columns="genre_label")

# Pass 1: 70% for training, 30% set aside in a temporary pool.
# Stratifying on the label keeps the genre mix consistent across the parts.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.30, stratify=y, random_state=42)

# Pass 2: split the temporary pool evenly into validation (15%) and test (15%).
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.50, stratify=y_temp, random_state=42)

# Row counts per partition (the feature frames are DataFrames, so shape[0] is the row count).
print("Training:", X_train.shape[0])
print("Validation:", X_val.shape[0])
print("Test:", X_test.shape[0])

Training: 70
Validation: 15
Test: 15


In [None]:
# Scenario: AI System for Predicting Used Car Condition
# A car resale company wants to build an AI model that predicts whether a used car is in good condition or needs repairs based on inspection data.

# They have collected 100 car inspection records, and each record includes details like:

# Mileage

# Engine performance score

# Fuel efficiency

# Age of the car

# Each car is labeled as:

# 0 → Needs Repair

# 1 → Good Condition

# To ensure the AI model performs reliably on new cars, the dataset must be divided into:

# Training set (70%) → Teach the AI model

# Validation set (15%) → Tune and improve the model

# Test set (15%) → Evaluate final performance

# You are the data scientist responsible for splitting the dataset correctly.


In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the used-car inspection dataset from disk.
df = pd.read_csv("used_car_condition_dataset.csv")

# Target is the "condition_label" column (0 = needs repair, 1 = good condition);
# the remaining columns are used as features.
y = df["condition_label"]
X = df.drop(columns="condition_label")

# Pass 1: 70% training / 30% temporary pool, stratified on the condition label.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.30, stratify=y, random_state=42)

# Pass 2: halve the temporary pool into validation (15%) and test (15%).
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.50, stratify=y_temp, random_state=42)

# Row counts per partition (the feature frames are DataFrames, so shape[0] is the row count).
print("Training samples:", X_train.shape[0])
print("Validation samples:", X_val.shape[0])
print("Test samples:", X_test.shape[0])

Training samples: 70
Validation samples: 15
Test samples: 15
