In [1]:
import pandas as pd
import numpy as np
from pandas import DataFrame
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score

In [9]:
import pandas as pd

data = pd.read_csv(r"c:\Users\Student\Desktop\Ml programs B-section\KNN PROBLEM DATASETS\Bengaluru_House_Data.csv")
print(data.head())

              area_type   availability                  location       size  \
0  Super built-up  Area         19-Dec  Electronic City Phase II      2 BHK   
1            Plot  Area  Ready To Move          Chikka Tirupathi  4 Bedroom   
2        Built-up  Area  Ready To Move               Uttarahalli      3 BHK   
3  Super built-up  Area  Ready To Move        Lingadheeranahalli      3 BHK   
4  Super built-up  Area  Ready To Move                  Kothanur      2 BHK   

   society total_sqft  bath  balcony   price  
0  Coomee        1056   2.0      1.0   39.07  
1  Theanmp       2600   5.0      3.0  120.00  
2      NaN       1440   2.0      3.0   62.00  
3  Soiewre       1521   3.0      1.0   95.00  
4      NaN       1200   2.0      1.0   51.00  


In [10]:
data.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [11]:
data.shape

(13320, 9)

In [12]:
data.info()

<class 'pandas.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  str    
 1   availability  13320 non-null  str    
 2   location      13319 non-null  str    
 3   size          13304 non-null  str    
 4   society       7818 non-null   str    
 5   total_sqft    13320 non-null  str    
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), str(6)
memory usage: 936.7 KB


In [14]:
for column in data.columns:
    print(data[column].value_counts())
    print("*" * 20)

area_type
Super built-up  Area    8790
Built-up  Area          2418
Plot  Area              2025
Carpet  Area              87
Name: count, dtype: int64
********************
availability
Ready To Move    10581
18-Dec             307
18-May             295
18-Apr             271
18-Aug             200
                 ...  
15-Aug               1
17-Jan               1
16-Nov               1
16-Jan               1
14-Jul               1
Name: count, Length: 81, dtype: int64
********************
location
Whitefield                                         540
Sarjapur  Road                                     399
Electronic City                                    302
Kanakpura Road                                     273
Thanisandra                                        234
                                                  ... 
Pattegarhpalya                                       1
Tilak Nagar                                          1
12th cross srinivas nagar banshankari 3rd stage      

In [15]:
print(data.isna().sum())

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64


In [None]:
# Dropping unnecessary columns
data.drop(columns=['area_type', 'availability', 'society', 'balcony'], inplace=True, errors='ignore')

In [17]:
print(data.describe())

               bath         price
count  13247.000000  13320.000000
mean       2.692610    112.565627
std        1.341458    148.971674
min        1.000000      8.000000
25%        2.000000     50.000000
50%        2.000000     72.000000
75%        3.000000    120.000000
max       40.000000   3600.000000


In [18]:
# Fill missing values
data['location'] = data['location'].fillna('Sarjapur Road')
data['size'] = data['size'].fillna('2 BHK')
data['bath'] = data['bath'].fillna(data['bath'].median())

In [19]:
# Convert BHK to an integer
data['bhk'] = data['size'].str.split().str.get(0).astype(int)

In [None]:
# Handle total_sqft, convert ranges to average and remove anomalies
import re
def convertRange(x):
    try:
        if pd.isnull(x):
            return None
        if isinstance(x, (int, float)):
            return float(x)
        s = str(x).strip()
        if '-' in s:
            parts = s.split('-')
            return (float(parts[0].strip()) + float(parts[1].strip())) / 2
        s = s.replace(',', '')
        m = re.match(r'^(?:\d+\.?\d*)', s)
        if m:
            return float(m.group())
        return None
    except Exception:
        return None
data['total_sqft'] = data['total_sqft'].apply(convertRange)
# Drop rows with invalid or zero total_sqft
data = data[~data['total_sqft'].isnull()]
data = data[data['total_sqft'] > 0]

In [23]:
# Price per square foot calculation
data['price_per_sqft'] = data['price'] * 1000000 / data['total_sqft']

In [None]:
# Clean location names by stripping whitespaces
data['location'] = data['location'].apply(lambda x: x.strip() if isinstance(x, str) else x)
# Handling rare locations
location_count = data['location'].value_counts()
location_count_less_10 = location_count[location_count <= 10]
location_small = set(location_count_less_10.index)
data['location'] = data['location'].apply(lambda x: 'other' if x in location_small else x)

In [26]:
# Remove outliers based on total_sqft per BHK
data = data[((data['total_sqft'] / data['bhk']) >= 300)]

In [None]:
# Remove outliers based on price per sqft within each location
def remove_outliers_sqft(df):
    df_output = pd.DataFrame()
    for key, subdf in df.groupby('location'):
        vals = subdf.price_per_sqft.dropna()
        if vals.empty:
            continue
        m = np.mean(vals)
        st = np.std(vals)
        gen_props = subdf[(subdf.price_per_sqft > (m - st)) & (subdf.price_per_sqft <= (m + st))]
        df_output = pd.concat([df_output, gen_props], ignore_index=True)
    return df_output
data = remove_outliers_sqft(data)

In [None]:
# Remove BHK outliers
def bhk_outlier_remover(df):
    exclude_indices = []
    for location, location_df in df.groupby('location'):
        bhk_stats = {}
        for bhk, bhk_df in location_df.groupby('bhk'):
            bhk_stats[bhk] = {
                'mean': np.mean(bhk_df.price_per_sqft),
                'std': np.std(bhk_df.price_per_sqft),
                'count': bhk_df.shape[0]
            }
        for bhk, bhk_df in location_df.groupby('bhk'):
            stats = bhk_stats.get(bhk - 1)
            if stats and stats['count'] > 5:
                bad_idx = bhk_df[bhk_df.price_per_sqft < stats['mean']].index.tolist()
                exclude_indices.extend(bad_idx)
    if exclude_indices:
        exclude_indices = list(set(exclude_indices))
        return df.drop(labels=exclude_indices, axis='index')
    return df
data = bhk_outlier_remover(data)

In [30]:
# Dropping unnecessary columns
data.drop(columns=['size', 'price_per_sqft'], inplace=True)

In [None]:
# Selecting only numerical features
features = ['total_sqft', 'bath', 'bhk']
# Drop rows with missing feature or target values
data = data.dropna(subset=features + ['price'])
X = data[features].astype(float)
y = data['price'].astype(float)

In [33]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
print(X_train.shape)
print(X_test.shape)

(3, 3)
(1, 3)


In [34]:
# Feature Scaling for numerical features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [35]:
from sklearn.neighbors import KNeighborsRegressor

In [36]:
# Initialize and train the KNN Regressor
knn_regressor = KNeighborsRegressor(n_neighbors=5) # You can tune n_neighbors
knn_regressor.fit(X_train_scaled, y_train)

0,1,2
,"n_neighbors  n_neighbors: int, default=5 Number of neighbors to use by default for :meth:`kneighbors` queries.",5
,"weights  weights: {'uniform', 'distance'}, callable or None, default='uniform' Weight function used in prediction. Possible values: - 'uniform' : uniform weights. All points in each neighborhood  are weighted equally. - 'distance' : weight points by the inverse of their distance.  in this case, closer neighbors of a query point will have a  greater influence than neighbors which are further away. - [callable] : a user-defined function which accepts an  array of distances, and returns an array of the same shape  containing the weights. Uniform weights are used by default. See the following example for a demonstration of the impact of different weighting schemes on predictions: :ref:`sphx_glr_auto_examples_neighbors_plot_regression.py`.",'uniform'
,"algorithm  algorithm: {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto' Algorithm used to compute the nearest neighbors: - 'ball_tree' will use :class:`BallTree` - 'kd_tree' will use :class:`KDTree` - 'brute' will use a brute-force search. - 'auto' will attempt to decide the most appropriate algorithm  based on the values passed to :meth:`fit` method. Note: fitting on sparse input will override the setting of this parameter, using brute force.",'auto'
,"leaf_size  leaf_size: int, default=30 Leaf size passed to BallTree or KDTree. This can affect the speed of the construction and query, as well as the memory required to store the tree. The optimal value depends on the nature of the problem.",30
,"p  p: float, default=2 Power parameter for the Minkowski metric. When p = 1, this is equivalent to using manhattan_distance (l1), and euclidean_distance (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.",2
,"metric  metric: str, DistanceMetric object or callable, default='minkowski' Metric to use for distance computation. Default is ""minkowski"", which results in the standard Euclidean distance when p = 2. See the documentation of `scipy.spatial.distance `_ and the metrics listed in :class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric values. If metric is ""precomputed"", X is assumed to be a distance matrix and must be square during fit. X may be a :term:`sparse graph`, in which case only ""nonzero"" elements may be considered neighbors. If metric is a callable function, it takes two arrays representing 1D vectors as inputs and must return one value indicating the distance between those vectors. This works for Scipy's metrics, but is less efficient than passing the metric name as a string. If metric is a DistanceMetric object, it will be passed directly to the underlying computation routines.",'minkowski'
,"metric_params  metric_params: dict, default=None Additional keyword arguments for the metric function.",
,"n_jobs  n_jobs: int, default=None The number of parallel jobs to run for neighbors search. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details. Doesn't affect :meth:`fit` method.",


In [44]:
y_pred = knn_regressor.predict(X_test_scaled)

ValueError: Expected n_neighbors <= n_samples_fit, but n_neighbors = 5, n_samples_fit = 3, n_samples = 1

In [45]:
# Evaluate the performance of the KNN Regressor
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

NameError: name 'mean_squared_error' is not defined

In [46]:
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R^2 Score: {r2:.2f}")

NameError: name 'mse' is not defined