In [2]:
import pandas as pd
import numpy as np

In [3]:
df=pd.read_csv('CleanedData3.csv')

In [4]:
df.head()

Unnamed: 0,Price,Floors,Area,Road_Width,City_Bhaktapur,City_Kathmandu,City_Lalitpur,Road_Type_Blacktopped,Road_Type_Gravelled,Road_Type_Soil Stabilized
0,3.132908,2.0,16.0,20.0,0,1,0,1,0,0
1,2.576956,2.0,21.0,20.0,0,1,0,1,0,0
2,2.021003,2.0,17.0,20.0,0,1,0,1,0,0
3,3.688861,2.0,19.5,20.0,0,1,0,1,0,0
4,3.688861,3.0,12.8125,13.0,0,1,0,1,0,0


In [5]:
X = df.drop('Price', axis=1)
y = df['Price']

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=5)

In [23]:
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Initialize the MLPRegressor
mlp_regressor = MLPRegressor(hidden_layer_sizes=(100,), max_iter=1000, random_state=42)

# Fit the model to the training data
mlp_regressor.fit(X_train, y_train)

# Predict on the test data
y_pred = mlp_regressor.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')

Mean Squared Error: 0.787106120878515
R^2 Score: 0.21072428122034426


In [24]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPRegressor

# Initialize the MLPRegressor with desired parameters
mlp = MLPRegressor(
    hidden_layer_sizes=(50, 50),  # Two hidden layers with 50 neurons each
    activation='relu',            # Activation function for hidden layers
    solver='adam',                # Optimization algorithm
    alpha=0.01,                  # L2 regularization strength
    max_iter=1000,               # Maximum number of iterations
    random_state=42              # For reproducibility
)

# Define the cross-validation strategy
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)

# Perform cross-validation and get the scores
scores = cross_val_score(mlp, X, y, cv=cv, scoring='r2')

# Print the cross-validation scores
print("Cross-validation scores: %s" % scores)
print("Average cross-validation score: %.2f" % scores.mean())

Cross-validation scores: [0.74336967 0.11104259 0.73028506 0.63630999 0.61782073]
Average cross-validation score: 0.57


In [7]:
from sklearn.linear_model import LinearRegression
lr_clf = LinearRegression()
lr_clf.fit(X_train,y_train)
lr_clf.score(X_test,y_test)

-0.8336183581874999

# Use K Fold cross validation to measure accuracy of our LinearRegression model


In [8]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)

cross_val_score(LinearRegression(), X, y, cv=cv)

array([ 0.67290823, -1.05796964,  0.62014139,  0.54895838,  0.56768824])

We can see that in 5 iterations we get a score above 50 % all the time. This is not bad  but we want to test few other algorithms for regression to see if we can get even better score. We will use GridSearchCV or RandomizedSearchCV for this purpose

In [9]:
from sklearn.model_selection import RandomizedSearchCV, train_test_split, ShuffleSplit
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pandas as pd
import numpy as np

def find_best_model_using_randomizedsearchcv(X, y):
    # Split the data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
    
    algos = {
        'linear_regression': {
            'model': LinearRegression(),
            'params': {}
        },
        'lasso': {
            'model': Lasso(),
            'params': {
                'alpha': [1, 2],
                'selection': ['random', 'cyclic']
            }
        },
        'ridge': {
            'model': Ridge(),
            'params': {
                'alpha': [1, 2, 3]
            }
        },
        'decision_tree': {
            'model': DecisionTreeRegressor(),
            'params': {
                'criterion': ['mse', 'friedman_mse'],
                'splitter': ['best', 'random']
            }
        },
        'random_forest': {
            'model': RandomForestRegressor(),
            'params': {
                'n_estimators': [100, 200],
                'max_depth': [None, 5],
                'min_samples_split': [2, 5],
                'min_samples_leaf': [1, 2]
            }
        },
        'svr': {
            'model': SVR(),
            'params': {
                'kernel': ['linear', 'rbf'],
                'C': [0.1, 1],
                'epsilon': [0.1, 0.2]
            }
        },
        'k_neighbors': {
            'model': KNeighborsRegressor(),
            'params': {
                'n_neighbors': [3, 5],
                'weights': ['uniform', 'distance']
            }
        },
        'mlp': {
            'model': MLPRegressor(max_iter=1000),
            'params': {
                'hidden_layer_sizes': [(50,), (100,), (50, 50)],
                'activation': ['tanh', 'relu'],
                'solver': ['adam', 'sgd']
            }
        }
    }
    
    results = []
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    
    for algo_name, config in algos.items():
        # Perform randomized search to find the best parameters
        rs = RandomizedSearchCV(
            config['model'],
            config['params'],
            cv=cv,
            n_iter=10,
            n_jobs=-1,
            return_train_score=False,
            random_state=0
        )
        rs.fit(X_train, y_train)
        
        # Get the best model and make predictions on the test set
        best_model = rs.best_estimator_
        y_pred = best_model.predict(X_test)
        
        # Calculate evaluation metrics
        mae = mean_absolute_error(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        r2 = r2_score(y_test, y_pred)
        
        #  the best score from cross-validation
        best_score = rs.best_score_
        
        # Store the results
        results.append({
            'model': algo_name,
            'Best Score': best_score,
            'MAE': mae,
            'MSE': mse,
            'RMSE': rmse,
            'R² Score': r2,
            'Best Parameters': rs.best_params_
        })
    
    # Return the results as a DataFrame
    return pd.DataFrame(results, columns=['model', 'Best Score', 'MAE', 'MSE', 'RMSE', 'R² Score'])

# Example usage:
df = find_best_model_using_randomizedsearchcv(X, y)
print(df)

10 fits failed out of a total of 20.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "C:\ProgramData\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\ProgramData\anaconda3\Lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "C:\ProgramData\anaconda3\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "C:\ProgramData\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterE

               model  Best Score       MAE       MSE      RMSE  R² Score
0  linear_regression    0.373811  0.415479  0.279395  0.528578  0.672908
1              lasso    0.387712  0.515180  0.416076  0.645039  0.512894
2              ridge    0.374192  0.415659  0.279371  0.528555  0.672937
3      decision_tree    0.525167  0.464950  0.549540  0.741310  0.356645
4      random_forest    0.706689  0.317164  0.182046  0.426668  0.786876
5                svr    0.708130  0.322946  0.193465  0.439846  0.773508
6        k_neighbors    0.677622  0.350750  0.270193  0.519801  0.683681
7                mlp    0.719916  0.309923  0.177139  0.420879  0.792621


In [10]:
def predict_price(location, area, floors, road_width, road_type):
    """
    Predict house price based on location, area, floors, road width, and road type
    
    Parameters:
    location (str): One of 'City_Bhaktapur', 'City_Kathmandu', or 'City_Lalitpur'
    area (float): Area of the property in square units
    floors (float): Number of floors
    road_width (float): Width of the road in front of the property
    road_type (str): One of 'Road_Type_Blacktopped', 'Road_Type_Gravelled', or 'Road_Type_Soil Stabilized'
    
    Returns:
    float: Predicted price
    """
    # Initialize array with zeros for all features
    x = np.zeros(len(X.columns))
    
    # Set the feature values
    x[0] = floors  # Floors is the first column
    x[1] = area    # Area is the second column
    x[2] = road_width  # Road_Width is the third column
    
    # Set the location to 1 (one-hot encoding)
    if location == 'City_Bhaktapur':
        x[3] = 1
    elif location == 'City_Kathmandu':
        x[4] = 1
    elif location == 'City_Lalitpur':
        x[5] = 1
    
    # Set the road type to 1 (one-hot encoding)
    if road_type == 'Road_Type_Blacktopped':
        x[6] = 1
    elif road_type == 'Road_Type_Gravelled':
        x[7] = 1
    elif road_type == 'Road_Type_Soil Stabilized':
        x[8] = 1
    
    return lr_clf.predict([x])[0]

# Test the model with sample data including road type
predicted_price = predict_price('City_Kathmandu', 16.0, 2.0, 20.0, 'Road_Type_Blacktopped')
print(f"Predicted price: {predicted_price}")


Predicted price: 2.3747830712125193




In [36]:
import numpy as np

def predict_price_mlp(location, area, floors, road_width, road_type):
    """
    Predict house price using the fitted MLPRegressor
    
    Parameters:
    -----------
    location : str
        One of 'City_Bhaktapur', 'City_Kathmandu', or 'City_Lalitpur'
    area : float
        Area of the property in square units
    floors : float
        Number of floors
    road_width : float
        Width of the road in front of the property
    road_type : str
        One of 'Road_Type_Blacktopped', 'Road_Type_Gravelled', or 'Road_Type_Soil Stabilized'
    
    Returns:
    --------
    float
        The MLP‐predicted price
    """
    # Create a zero‐vector as long as the number of features in X
    x = np.zeros(len(X.columns))
    
    # Numeric features
    x[ X.columns.get_loc('Floors')      ] = floors
    x[ X.columns.get_loc('Area')        ] = area
    x[ X.columns.get_loc('Road_Width')  ] = road_width
    
    # One‐hot encoding for location
    loc_col = f"City_{location.split('_')[1]}"  # e.g. 'City_Kathmandu'
    if loc_col in X.columns:
        x[ X.columns.get_loc(loc_col) ] = 1
    
    # One‐hot encoding for road type
    if road_type in X.columns:
        x[ X.columns.get_loc(road_type) ] = 1
    
    # Predict with the MLP
    return mlp_regressor.predict([x])[0]

# ---- prediction ----
predicted_price = predict_price_mlp(
    location='City_Kathmandu', 
    area=16.0, 
    floors=2.0, 
    road_width=20.0, 
    road_type='Road_Type_Blacktopped'
)
print(f"Predicted price (MLP): {predicted_price}")


Predicted price (MLP): 2.458860105703709




In [41]:
import pandas as pd

# 1. Generate predictions on the test set
y_pred_lr  = lr_clf.predict(X_test)
y_pred_mlp = mlp_regressor.predict(X_test)

# 2. Build a comparison DataFrame
df_compare = pd.DataFrame({
    'Actual_Price':       y_test,                        # true values
    'Predicted_LR':       y_pred_lr,                     # linear model
    'Predicted_MLP':      y_pred_mlp                     # neural net
}, index=y_test.index)                                   # keep the same index as y_test

# 3. Inspect the first few rows
print(df_compare.head(10))

# 4. (Optional) Save to CSV if you want to review externally
df_compare.to_csv('price_comparison.csv', index=True)

     Actual_Price  Predicted_LR  Predicted_MLP
343     -0.814354     -0.621697      -0.691087
563     -0.480782     -0.544464      -0.652334
285     -0.258401     -0.514646      -0.619863
348     -0.869949     -0.715978      -0.883592
228      3.688861      3.166022       2.243669
134     -0.936663     -0.738177      -0.923778
796     -1.203521     -0.883281      -0.912727
84      -0.647568     -0.738565      -0.815093
382      2.021003      1.042044       1.194719
28      -0.619771     -0.389996      -0.546847


In [44]:
import pickle

# Save the trained MLP model
with open('realstate_prices_mlp_model.pickle', 'wb') as f:
    pickle.dump(mlp_regressor, f)


In [43]:
import json
columns = {
    'data_columns' : [col.lower() for col in X.columns]
}
with open("columns.json","w") as f:
    f.write(json.dumps(columns))