In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

# Set visualization style
sns.set(style="whitegrid")

In [2]:
df =pd.read_csv("HousingData.csv")
df

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3,222,18.7,396.90,,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1,273,21.0,391.99,,22.4
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1,273,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1,273,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1,273,21.0,393.45,6.48,22.0


In [16]:
from sklearn.datasets import fetch_california_housing
import pandas as pd

def load_boston_data():
    data = fetch_california_housing()
    df = pd.DataFrame(data.data, columns=data.feature_names)
    df['MEDV'] = data.target  # still calling it MEDV for consistency
    return df


In [3]:
class CustomLinearRegression:
    def __init__(self):
        self.weights = None
        self.bias = None
    
    def fit(self, X, y):
        X_b = np.c_[np.ones(X.shape[0]), X]
        XTX = X_b.T @ X_b
        XTy = X_b.T @ y
        self.weights = np.linalg.solve(XTX, XTy)
        self.bias = self.weights[0]
        self.weights = self.weights[1:]
    
    def predict(self, X):
        return X @ self.weights + self.bias

In [4]:

# Custom Decision Tree
class CustomDecisionTree:
    def __init__(self, max_depth=3, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.tree = None
    
    def fit(self, X, y, depth=0):
        if depth >= self.max_depth or len(y) < self.min_samples_split:
            return {'value': np.mean(y)}
        
        best_split = self.find_best_split(X, y)
        if not best_split:
            return {'value': np.mean(y)}
        
        feature, threshold = best_split['feature'], best_split['threshold']
        left_idx = X[:, feature] <= threshold
        right_idx = ~left_idx
        
        if sum(left_idx) < self.min_samples_split or sum(right_idx) < self.min_samples_split:
            return {'value': np.mean(y)}
        
        return {
            'feature': feature,
            'threshold': threshold,
            'left': self.fit(X[left_idx], y[left_idx], depth + 1),
            'right': self.fit(X[right_idx], y[right_idx], depth + 1)
        }
    
    def find_best_split(self, X, y):
        best_mse = float('inf')
        best_split = None
        n_features = X.shape[1]
        
        for feature in range(n_features):
            thresholds = np.percentile(X[:, feature], np.linspace(10, 90, 10))
            for threshold in thresholds:
                left_idx = X[:, feature] <= threshold
                right_idx = ~left_idx
                if sum(left_idx) < self.min_samples_split or sum(right_idx) < self.min_samples_split:
                    continue
                y_left, y_right = y[left_idx], y[right_idx]
                mse = (len(y_left) * np.var(y_left) + len(y_right) * np.var(y_right)) / len(y)
                if mse < best_mse:
                    best_mse = mse
                    best_split = {'feature': feature, 'threshold': threshold}
        
        return best_split
    
    def predict(self, X):
        self.tree = self.tree or {'value': 0}  # Ensure tree exists
        return np.array([self.predict_one(x, self.tree) for x in X])
    
    def predict_one(self, x, node):
        if 'value' in node:
            return node['value']
        if x[node['feature']] <= node['threshold']:
            return self.predict_one(x, node['left'])
        return self.predict_one(x, node['right'])





In [5]:
# Custom Random Forest
class CustomRandomForest:
    def __init__(self, n_trees=10, max_depth=3, min_samples_split=2):
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.trees = []
    
    def fit(self, X, y):
        n_samples = X.shape[0]
        for _ in range(self.n_trees):
            idx = np.random.choice(n_samples, n_samples, replace=True)
            X_sample, y_sample = X[idx], y[idx]
            tree = CustomDecisionTree(max_depth=self.max_depth, min_samples_split=self.min_samples_split)
            tree.tree = tree.fit(X_sample, y_sample)
            self.trees.append(tree)
    
    def predict(self, X):
        predictions = np.array([tree.predict(X) for tree in self.trees])
        return np.mean(predictions, axis=0)

In [6]:
class CustomXGBoost:
    def __init__(self, n_estimators=10, max_depth=3, learning_rate=0.1, min_samples_split=2):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.learning_rate = learning_rate
        self.min_samples_split = min_samples_split
        self.trees = []
        self.initial_prediction = None
    
    def fit(self, X, y):
        self.initial_prediction = np.mean(y)
        predictions = np.full_like(y, self.initial_prediction)
        
        for _ in range(self.n_estimators):
            residuals = y - predictions
            tree = CustomDecisionTree(max_depth=self.max_depth, min_samples_split=self.min_samples_split)
            tree.tree = tree.fit(X, residuals)
            self.trees.append(tree)
            tree_preds = tree.predict(X)
            predictions += self.learning_rate * tree_preds
    
    def predict(self, X):
        predictions = np.full(X.shape[0], self.initial_prediction)
        for tree in self.trees:
            predictions += self.learning_rate * tree.predict(X)
        return predictions

In [7]:
# Evaluation Metrics
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

def r2(y_true, y_pred):
    return r2_score(y_true, y_pred)

# Feature Importance
def get_feature_importance(model, X, feature_names):
    if isinstance(model, (CustomRandomForest, CustomXGBoost)):
        importance = np.zeros(X.shape[1])
        trees = model.trees
        n_trees = len(trees)
        for tree in trees:
            importance += compute_tree_importance(tree.tree, X.shape[1])
        importance /= n_trees
        return pd.Series(importance, index=feature_names)
    return None

def compute_tree_importance(node, n_features):
    importance = np.zeros(n_features)
    if 'value' in node:
        return importance
    feature = node['feature']
    importance[feature] += 1  # Count splits
    importance += compute_tree_importance(node['left'], n_features)
    importance += compute_tree_importance(node['right'], n_features)
    return importance



In [17]:
def main():
    # Load data
    df = load_boston_data()
    print("Dataset Shape:", df.shape)
    print("\nFirst 5 Rows:\n", df.head())
    
    # Preprocessing
    X = df.drop(columns=['MEDV']).values
    y = df['MEDV'].values
    
    # Normalize features
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print("\nTraining Set Shape:", X_train.shape)
    print("Test Set Shape:", X_test.shape)
    
    # Initialize models
    models = {
        'Linear Regression': CustomLinearRegression(),
        'Random Forest': CustomRandomForest(n_trees=10, max_depth=3, min_samples_split=2),
        'XGBoost': CustomXGBoost(n_estimators=10, max_depth=3, learning_rate=0.1, min_samples_split=2)
    }
    # Train and evaluate
    results = {}
    for name, model in models.items():
        print(f"\nTraining {name}...")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        results[name] = {
            'RMSE': rmse(y_test, y_pred),
            'R2': r2(y_test, y_pred)
        }
    
    # Print results
    print("\n=== Model Performance ===")
    for name, metrics in results.items():
        print(f"{name}:")
        print(f"  RMSE: {metrics['RMSE']:.4f}")
        print(f"  R2: {metrics['R2']:.4f}")
    
    # Visualize feature importance
    feature_names = df.drop(columns=['MEDV']).columns
    plt.figure(figsize=(12, 5))
    
    for i, name in enumerate(['Random Forest', 'XGBoost'], 1):
        importance = get_feature_importance(models[name], X, feature_names)
        if importance is not None:
            plt.subplot(1, 2, i)
            sns.barplot(x=importance.values, y=importance.index)
            plt.title(f'{name} Feature Importance')
            plt.xlabel('Importance (Split Count)')
    
    plt.tight_layout()
    plt.savefig('house_price_prediction/feature_importance.png')
    plt.show()

if __name__ == "__main__":
    main()

Dataset Shape: (20640, 9)

First 5 Rows:
    MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   

   Longitude   MEDV  
0    -122.23  4.526  
1    -122.22  3.585  
2    -122.24  3.521  
3    -122.25  3.413  
4    -122.25  3.422  


NameError: name 'train_test_split' is not defined