In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the '/kaggle/input' tree top-down and print the full path of every file in it.
# The middle element of each os.walk() tuple (the sub-directory names) is unused here.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Join the containing directory with the bare file name to get a usable path.
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/housing/housing.csv


In [8]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
import time

# Step 1: Load CSV using pandas
def load_csv_pandas(filepath):
    """Load a CSV file into a DataFrame, print a short summary, and drop incomplete rows.

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The file contents with every row that contains a missing value removed.

    Raises
    ------
    FileNotFoundError
        If ``filepath`` does not point to an existing regular file.
    """
    if not os.path.isfile(filepath):
        raise FileNotFoundError(f"File not found: {filepath}")
    df = pd.read_csv(filepath)

    print("Columns in dataset:", df.columns.tolist())
    print("Initial data info:")
    # DataFrame.info() writes its report to stdout itself and returns None, so
    # wrapping it in print() would emit a stray "None" line after the report.
    df.info()
    print("First 5 rows:")
    print(df.head())

    # Drop rows with missing values
    df = df.dropna()
    print(f"Data shape after dropping missing values: {df.shape}")

    return df

# Step 2: Prepare features and labels, handle categorical encoding
def prepare_data(df):
    """Split a housing DataFrame into a numeric feature matrix and a target vector.

    The target is the ``median_house_value`` column. Target entries that cannot
    be parsed as numbers are coerced to NaN and the corresponding rows are
    removed from both the features and the target. Remaining non-numeric
    feature columns are one-hot encoded with the first level of each dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; must contain a ``median_house_value`` column.

    Returns
    -------
    tuple[list[list[float]], list]
        ``(X, y)`` as plain Python lists: ``X`` is a list of feature rows whose
        entries are all floats, ``y`` is the list of target values.

    Raises
    ------
    ValueError
        If the target column is missing from ``df``.
    """
    target_col = 'median_house_value'

    if target_col not in df.columns:
        raise ValueError(f"Target column '{target_col}' not found")

    y = pd.to_numeric(df[target_col], errors='coerce')
    X = df.drop(columns=[target_col])

    # Keep only the rows whose target parsed to a real number.
    mask = y.notna()
    y = y[mask]
    X = X.loc[mask]

    # dtype=float: newer pandas versions emit boolean dummy columns by default,
    # which would turn the combined matrix into an object array mixing bools
    # and floats. Forcing floats keeps the feature matrix homogeneous.
    X = pd.get_dummies(X, drop_first=True, dtype=float)

    print(f"Feature columns after encoding: {X.columns.tolist()}")
    print(f"X shape: {X.shape}, y shape: {y.shape}")

    # Plain lists are returned because the downstream model is pure Python.
    return X.to_numpy(dtype=float).tolist(), y.values.tolist()

# Step 3: Scale features
def scale_features(X_train, X_val):
    """Standardize the training and validation features with one shared scaler.

    The scaler is fitted on ``X_train`` only; the validation features are then
    transformed with that same fitted scaler.

    Returns
    -------
    tuple
        ``(train_scaled, val_scaled)``, each converted back to nested lists.
    """
    standardizer = StandardScaler()
    train_std = standardizer.fit_transform(X_train)
    val_std = standardizer.transform(X_val)
    # Hand back nested lists rather than arrays.
    return tuple(block.tolist() for block in (train_std, val_std))

# Step 4: Pure Python Linear Regression Model
class LinearRegressionPurePython:
    """Linear regression fitted by batch gradient descent using only Python lists.

    Parameters
    ----------
    lr : float
        Step size applied to the averaged gradient on every iteration.
    n_iters : int
        Number of full passes over the training data.

    Attributes are ``weights`` (one float per feature, empty until ``fit`` is
    called) and ``bias`` (the intercept).
    """

    def __init__(self, lr=0.01, n_iters=500):
        self.lr = lr
        self.n_iters = n_iters
        self.weights = []
        self.bias = 0.0

    def fit(self, X, y):
        """Fit weights and bias on ``X`` / ``y`` and return the elapsed time.

        Parameters
        ----------
        X : sequence of sequences of numbers
            One row per sample; the feature count is taken from the first row.
        y : sequence of numbers
            One target per row of ``X``.

        Returns
        -------
        float
            Seconds spent in training.

        Raises
        ------
        ValueError
            If ``X`` is empty or ``X`` and ``y`` differ in length.
        """
        n_samples = len(X)
        if n_samples == 0:
            raise ValueError("X must contain at least one sample")
        if len(y) != n_samples:
            raise ValueError(
                f"X and y must have the same length (got {n_samples} and {len(y)})"
            )
        n_features = len(X[0])

        # perf_counter is monotonic, so the measured duration cannot be skewed
        # by system clock adjustments the way time.time() can.
        start = time.perf_counter()
        self.weights = [0.0] * n_features
        self.bias = 0.0

        for _ in range(self.n_iters):
            dw = [0.0] * n_features
            db = 0.0

            # Weights and bias are only updated after this loop, so every
            # sample's error is computed against the same parameters.
            for row, target in zip(X, y):
                error = self._predict(row) - target
                for j in range(n_features):
                    dw[j] += error * row[j]
                db += error

            for j in range(n_features):
                self.weights[j] -= self.lr * dw[j] / n_samples
            self.bias -= self.lr * db / n_samples

        return time.perf_counter() - start

    def _predict(self, x):
        """Return the prediction for a single feature row ``x``."""
        return sum(w * xi for w, xi in zip(self.weights, x)) + self.bias

    def predict(self, X):
        """Return a list with one prediction per row of ``X``."""
        return [self._predict(x) for x in X]

# Step 5: MAE and MSE calculation
def calculate_mae_mse_python(y_true, y_pred):
    """Compute mean absolute error and mean squared error in pure Python.

    Parameters
    ----------
    y_true, y_pred : sequences of numbers
        Ground-truth values and predictions; must have the same non-zero length.

    Returns
    -------
    tuple[float, float]
        ``(mae, mse)``.

    Raises
    ------
    ValueError
        If the inputs are empty or differ in length.
    """
    n = len(y_true)
    if n == 0:
        raise ValueError("y_true must contain at least one value")
    # zip() would silently stop at the shorter input while the sums are still
    # divided by len(y_true), so a length mismatch must be rejected up front.
    if len(y_pred) != n:
        raise ValueError(
            f"y_true and y_pred must have the same length (got {n} and {len(y_pred)})"
        )

    residuals = [y_t - y_p for y_t, y_p in zip(y_true, y_pred)]
    mae = sum(abs(r) for r in residuals) / n
    mse = sum(r ** 2 for r in residuals) / n
    return mae, mse

# Step 6: Run pipeline
# End-to-end run of the pure-Python pipeline: load -> encode -> split -> scale -> fit -> evaluate.
# Every name assigned below is a module-level (notebook-global) variable.
if __name__ == "__main__":
    filename = '/kaggle/input/housing/housing.csv'  # Change this path if needed

    # Load the CSV (rows with missing values are dropped) and build list-based X / y.
    df = load_csv_pandas(filename)
    X, y = prepare_data(df)

    # Hold out 20% of the rows for validation; the fixed random_state makes the split repeatable.
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    print(f"Train samples: {len(X_train)}, Validation samples: {len(X_val)}")

    # The scaler is fitted on the training split only and then applied to both splits.
    X_train_scaled, X_val_scaled = scale_features(X_train, X_val)

    # fit() returns the training time in seconds.
    model = LinearRegressionPurePython(lr=0.01, n_iters=500)
    duration = model.fit(X_train_scaled, y_train)

    # Score the held-out split: R² comes from scikit-learn, MAE/MSE from the pure-Python helper.
    y_pred = model.predict(X_val_scaled)
    r2 = r2_score(y_val, y_pred)
    mae, mse = calculate_mae_mse_python(y_val, y_pred)

    print(f"\n=== Evaluation Results ===")
    print(f"Training Time: {duration:.4f} seconds")
    print(f"R² Score: {r2:.4f}")
    print(f"MAE (Pure Python): {mae:.4f}")
    print(f"MSE (Pure Python): {mse:.4f}")


Columns in dataset: ['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income', 'median_house_value', 'ocean_proximity']
Initial data info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB
None
First 5 rows:
   longitude  latitude  ho

In [10]:
import numpy as np
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

# ----------------------------------------
# Load and Clean CSV Data
def load_and_clean_csv(filepath):
    """Read a CSV and return its complete numeric rows as a NumPy array.

    Non-numeric columns (such as 'ocean_proximity') are discarded first, then
    any row that still contains a missing value is removed.
    """
    numeric_only = pd.read_csv(filepath).select_dtypes(include=[np.number])
    complete_rows = numeric_only.dropna()
    return complete_rows.values

# ----------------------------------------
# Custom Linear Regression (NumPy)
class LinearRegressionNumpy:
    """Linear regression fitted by batch gradient descent using NumPy.

    Parameters
    ----------
    lr : float
        Step size applied to the averaged gradient on every iteration.
    n_iters : int
        Number of gradient-descent iterations.

    ``weights`` and ``bias`` are ``None`` until ``fit`` has been called.
    """

    def __init__(self, lr=0.01, n_iters=1000):
        self.lr = lr
        self.n_iters = n_iters
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        """Fit weights and bias on ``X`` / ``y`` and return the elapsed time.

        Parameters
        ----------
        X : array-like
            Two-dimensional feature matrix, one row per sample.
        y : array-like
            One target per sample; a column vector is flattened.

        Returns
        -------
        float
            Seconds spent in training.

        Raises
        ------
        ValueError
            If ``X`` is not two-dimensional, has no rows, or its row count does
            not match the number of targets.
        """
        X = np.asarray(X, dtype=float)
        # Flatten y: an (n, 1) target would otherwise broadcast against the
        # (n,) predictions into an (n, n) error matrix.
        y = np.asarray(y, dtype=float).ravel()
        if X.ndim != 2:
            raise ValueError(f"X must be 2-dimensional, got {X.ndim} dimension(s)")
        n_samples, n_features = X.shape
        if n_samples == 0:
            raise ValueError("X must contain at least one sample")
        if y.shape[0] != n_samples:
            raise ValueError(
                f"X and y disagree on sample count ({n_samples} vs {y.shape[0]})"
            )

        # perf_counter is monotonic, so the measured duration cannot be skewed
        # by system clock adjustments the way time.time() can.
        start = time.perf_counter()
        self.weights = np.zeros(n_features)
        self.bias = 0.0

        for _ in range(self.n_iters):
            y_pred = np.dot(X, self.weights) + self.bias
            error = y_pred - y

            # Gradients of the squared error, averaged over the samples.
            dw = (1 / n_samples) * np.dot(X.T, error)
            db = (1 / n_samples) * np.sum(error)

            self.weights -= self.lr * dw
            self.bias -= self.lr * db

        return time.perf_counter() - start

    def predict(self, X):
        """Return predictions for the rows of ``X`` using the fitted parameters."""
        return np.dot(np.asarray(X, dtype=float), self.weights) + self.bias

# ----------------------------------------
# MAE and MSE Evaluation using NumPy
def calculate_mae_mse_numpy(y_true, y_pred):
    """Compute mean absolute error and mean squared error with NumPy.

    Parameters
    ----------
    y_true, y_pred : array-like
        Ground-truth values and predictions. Both are converted to flat float
        arrays, so lists and column vectors are accepted.

    Returns
    -------
    tuple
        ``(mae, mse)`` as NumPy floating-point scalars.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of values.
    """
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    # Without this check, shapes such as (n,) and (n, 1) would broadcast into an
    # (n, n) residual matrix and produce plausible-looking but wrong metrics.
    if y_true.size != y_pred.size:
        raise ValueError(
            f"y_true and y_pred must have the same number of values "
            f"(got {y_true.size} and {y_pred.size})"
        )

    residuals = y_true - y_pred
    mae = np.mean(np.abs(residuals))
    mse = np.mean(residuals ** 2)
    return mae, mse

# ----------------------------------------
# Main Execution
# End-to-end run of the NumPy pipeline: load -> split -> scale -> fit -> evaluate.
# Every name assigned below is a module-level (notebook-global) variable.
if __name__ == "__main__":
    filepath = "/kaggle/input/housing/housing.csv"

    # Load and clean the dataset (numeric columns only, rows with missing values dropped)
    data = load_and_clean_csv(filepath)

    # Split into features and target: the last remaining column is used as the target.
    # NOTE(review): this relies on column order — with the column list printed by the
    # earlier cell that column is 'median_house_value'; confirm for other datasets.
    X = data[:, :-1]
    y = data[:, -1]

    # Train-validation split (20% held out, fixed seed for a repeatable split)
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # Feature scaling: fit on the training split only, then apply to both splits
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    # Model training; n_iters=500 overrides the class default of 1000.
    # fit() returns the training time in seconds.
    model = LinearRegressionNumpy(lr=0.01, n_iters=500)
    training_time = model.fit(X_train_scaled, y_train)

    # Prediction and evaluation on the held-out split
    y_pred = model.predict(X_val_scaled)
    r2 = r2_score(y_val, y_pred)
    mae, mse = calculate_mae_mse_numpy(y_val, y_pred)

    # Final output
    print("\n=== Evaluation Results (NumPy Linear Regression) ===")
    print(f"Training Time: {training_time:.4f} seconds")
    print(f"R² Score: {r2:.4f}")
    print(f"MAE: {mae:.4f}")
    print(f"MSE: {mse:.4f}")



=== Evaluation Results (NumPy Linear Regression) ===
Training Time: 0.0629 seconds
R² Score: 0.5870
MAE: 55162.3358
MSE: 5647828380.1286


In [5]:
import numpy as np
import pandas as pd
import time
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load and preprocess data
def load_and_prepare_data(filepath):
    """Read a CSV and return ``(X, y)`` arrays ready for model fitting.

    The 'ocean_proximity' column is removed when present, rows with missing
    values are dropped, and the last remaining column is returned as the
    target while all preceding columns form the feature matrix.
    """
    cleaned = (
        pd.read_csv(filepath)
        # errors='ignore' makes the drop a no-op when the column is absent.
        .drop(columns=['ocean_proximity'], errors='ignore')
        .dropna()
    )

    features = cleaned.iloc[:, :-1].values
    target = cleaned.iloc[:, -1].values
    return features, target

# Main execution
# End-to-end run of the scikit-learn baseline: load -> split -> scale -> fit -> evaluate.
# Every name assigned below is a module-level (notebook-global) variable.
if __name__ == "__main__":
    filepath = '/kaggle/input/housing/housing.csv'  # Ensure this path is correct

    # Features are every column but the last; the last column is the target.
    X, y = load_and_prepare_data(filepath)

    # Train/val split (20% held out, fixed seed for a repeatable split)
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # Feature scaling: fit on the training split only.
    # Note that the unscaled X_train / X_val are overwritten by their scaled versions.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)

    # Train model; only the fit() call itself is timed.
    model = LinearRegression()
    start = time.time()
    model.fit(X_train, y_train)
    end = time.time()

    # Predict and evaluate on the held-out split
    preds = model.predict(X_val)
    print(f"Sklearn R²: {r2_score(y_val, preds):.4f}")
    print(f"MAE: {mean_absolute_error(y_val, preds):.4f}")
    print(f"MSE: {mean_squared_error(y_val, preds):.4f}")
    print(f"Fit Time: {end - start:.4f} seconds")


Sklearn R²: 0.6401
MAE: 51372.6722
MSE: 4921881237.6281
Fit Time: 0.0802 seconds
