Title: Data Splitting (Train-Test-Validation)


Task 1: House Prices Dataset (Regression)<br>
Use the House Prices dataset to predict house prices.<br>
Split the data into training, validation, and test sets (70% train, 15% validation, 15% test).

In [1]:
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import numpy as np

# Step 1: Load dataset
data = fetch_california_housing()
X, y = data.data, data.target

# Step 2: Initial split - 70% train, 30% temp
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.30, random_state=42)

# Step 3: Split temp into 15% validation and 15% test
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Step 4: Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Step 5: Train Linear Regression model
model = LinearRegression()
model.fit(X_train_scaled, y_train)

# Step 6: Evaluate on validation and test sets
y_val_pred = model.predict(X_val_scaled)
y_test_pred = model.predict(X_test_scaled)

val_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
val_r2 = r2_score(y_val, y_val_pred)

test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
test_r2 = r2_score(y_test, y_test_pred)

# Step 7: Print results
print("Validation RMSE:", round(val_rmse, 4))
print("Validation R² Score:", round(val_r2, 4))
print("Test RMSE:", round(test_rmse, 4))
print("Test R² Score:", round(test_r2, 4))


Validation RMSE: 0.7354
Validation R² Score: 0.5848
Test RMSE: 0.7213
Test R² Score: 0.6066


Task 2: Iris Dataset (Classification)<br>
Apply data splitting to the Iris dataset.<br>
Split it into train (70%), validation (15%), and test (15%).


In [2]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from collections import Counter

# Load Iris dataset
iris = load_iris()
X, y = iris.data, iris.target

# Step 1: Split 70% train and 30% temp
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y)

# Step 2: Split temp into 15% validation and 15% test (half-half of 30%)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

# Optional: Check class distribution in each set
print("Training set class distribution:", Counter(y_train))
print("Validation set class distribution:", Counter(y_val))
print("Test set class distribution:", Counter(y_test))


Training set class distribution: Counter({np.int64(1): 35, np.int64(0): 35, np.int64(2): 35})
Validation set class distribution: Counter({np.int64(0): 8, np.int64(1): 7, np.int64(2): 7})
Test set class distribution: Counter({np.int64(2): 8, np.int64(1): 8, np.int64(0): 7})



Task 3: Customer Churn Dataset (Classification)<br>
Predict customer churn using the telecom dataset.<br>
Split the data into training, validation, and test sets.

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import urllib.request

# Step 1: Download dataset (if not present)
url = "https://raw.githubusercontent.com/blastchar/telco-customer-churn/master/WA_Fn-UseC_-Telco-Customer-Churn.csv"
filename = "telco_churn.csv"

try:
    with open(filename, 'r') as f:
        print(f"{filename} already exists.")
except FileNotFoundError:
    print(f"Downloading {filename} ...")
    urllib.request.urlretrieve(url, filename)
    print("Download complete.")

# Step 2: Load dataset
data = pd.read_csv(filename)

# Step 3: Drop customerID column
if 'customerID' in data.columns:
    data = data.drop('customerID', axis=1)

# Step 4: Encode categorical variables (except target)
for col in data.select_dtypes(include=['object']).columns:
    if col != 'Churn':
        data[col] = LabelEncoder().fit_transform(data[col])

# Step 5: Encode target variable
data['Churn'] = data['Churn'].map({'Yes': 1, 'No': 0})

# Step 6: Handle missing values
data.fillna(data.median(), inplace=True)

# Step 7: Prepare features and target
X = data.drop('Churn', axis=1)
y = data['Churn']

# Step 8: Split into train (70%) and temp (30%)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y)

# Step 9: Split temp into validation (15%) and test (15%)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

# Step 10: Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Step 11: Train Random Forest Classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)

# Step 12: Evaluate on validation set
y_val_pred = model.predict(X_val_scaled)
print("Validation Accuracy:", accuracy_score(y_val, y_val_pred))
print("Validation Classification Report:\n", classification_report(y_val, y_val_pred))

# Step 13: Evaluate on test set
y_test_pred = model.predict(X_test_scaled)
print("Test Accuracy:", accuracy_score(y_test, y_test_pred))
print("Test Classification Report:\n", classification_report(y_test, y_test_pred))


Downloading telco_churn.csv ...


HTTPError: HTTP Error 404: Not Found