In [None]:
import numpy as np
import pandas as pd

In [None]:
# Seed NumPy's global random generator so the random draws in the following
# cells produce the same values on every fresh run.
np.random.seed(202)


In [None]:
# Synthetic housing data: one NumPy array per column, all stored as floats so
# that NaN can later be written into them.
# The draw order (normal, randint, randint, normal) is kept fixed because every
# call consumes the shared global random stream.
n_samples = 1200
square_feet = np.random.normal(loc=2000, scale=500, size=n_samples).astype(float)
# randint's upper bound is exclusive: bedrooms fall in 1..5, bathrooms in 1..3.
num_bedrooms = np.random.randint(low=1, high=6, size=n_samples).astype(float)
num_bathrooms = np.random.randint(low=1, high=4, size=n_samples).astype(float)
price = np.random.normal(loc=300000, scale=75000, size=n_samples).astype(float)

In [None]:
# Re-seed so the positions chosen below are the same every time this cell runs,
# regardless of how many random draws happened before it.
np.random.seed(202)

# Pick distinct row positions per column (replace=False) and blank them out.
# The three draws stay in their original order; each NaN assignment only
# touches its own array, so pairing it with its draw does not alter the result.
missing_indices_square_feet = np.random.choice(n_samples, size=70, replace=False)
square_feet[missing_indices_square_feet] = np.nan

missing_indices_bedrooms = np.random.choice(n_samples, size=50, replace=False)
num_bedrooms[missing_indices_bedrooms] = np.nan

missing_indices_price = np.random.choice(n_samples, size=40, replace=False)
price[missing_indices_price] = np.nan

In [None]:
# Assemble the four column arrays into a single frame; keyword order fixes the
# column order (SquareFeet, NumBedrooms, NumBathrooms, Price).
data = pd.DataFrame(
    dict(
        SquareFeet=square_feet,
        NumBedrooms=num_bedrooms,
        NumBathrooms=num_bathrooms,
        Price=price,
    )
)


In [None]:
# Preview the first rows, then report how many NaNs each column holds.
print(data.head())
print(
    "Missing values in each column:\n",
    data.isna().sum(),
)

    SquareFeet  NumBedrooms  NumBathrooms          Price
0  2433.541377          3.0           2.0  377303.381143
1  1909.133034          2.0           1.0  275206.613039
2  1539.275650          2.0           3.0  274299.655131
3  2725.165181          2.0           1.0  128238.075187
4  1783.370346          2.0           1.0  282594.973096
Missing values in each column:
 SquareFeet      70
NumBedrooms     50
NumBathrooms     0
Price           40
dtype: int64


In [None]:
# Impute missing values with one statistic per column: the median for the
# continuous columns (SquareFeet, Price) and the most frequent value for the
# bedroom count.
#
# The fill is applied to the frame itself and the result is rebound to `data`.
# Calling `data[col].fillna(..., inplace=True)` instead operates on a temporary
# Series selected from the frame: recent pandas 2.x releases emit a
# FutureWarning for it, and with Copy-on-Write enabled (the default from
# pandas 3) `data` would be left unchanged.
fill_values = {
    'SquareFeet': data['SquareFeet'].median(),
    'NumBedrooms': data['NumBedrooms'].mode()[0],  # mode() returns a Series; take its first entry
    'Price': data['Price'].median(),
}
data = data.fillna(value=fill_values)

In [None]:
# Sanity check: count the NaNs remaining in each column after imputation.
print("Missing values after imputation:\n", data.isnull().sum())


Missing values after imputation:
 SquareFeet      0
NumBedrooms     0
NumBathrooms    0
Price           0
dtype: int64


In [None]:
def remove_outliers(df, column, factor=1.5):
    """Return the rows of ``df`` whose ``column`` value lies within the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1 and
    Q3 are the 25th and 75th percentiles of ``df[column]`` and
    ``IQR = Q3 - Q1``. Both fences are inclusive.

    Args:
        df: DataFrame to filter. It is not modified.
        column: Label of the numeric column used to compute the fences.
        factor: Non-negative multiplier applied to the IQR. Defaults to 1.5.

    Returns:
        A new, independent DataFrame holding only the rows inside the fences,
        with the original index labels preserved. Rows where ``column`` is NaN
        are dropped as well, because comparisons against NaN are False.

    Raises:
        ValueError: If ``factor`` is negative.
    """
    if factor < 0:
        raise ValueError("factor must be non-negative")

    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    margin = factor * (q3 - q1)

    within_fences = (values >= q1 - margin) & (values <= q3 + margin)
    # Return an explicit copy so later column assignments on the result do not
    # raise SettingWithCopyWarning or depend on the source frame.
    return df[within_fences].copy()


In [None]:
# Filter on SquareFeet first, then on Price. The Price fences are computed on
# the rows that survived the SquareFeet filter, so the order matters.
data = remove_outliers(
    remove_outliers(data, 'SquareFeet'),
    'Price',
)

print("Data shape after outlier removal:", data.shape)

Data shape after outlier removal: (1173, 4)


In [None]:
from sklearn.preprocessing import StandardScaler


In [None]:
# Column to predict, and the predictor columns used for scaling and modelling.
target = 'Price'
features = [
    'SquareFeet',
    'NumBedrooms',
    'NumBathrooms',
]

In [None]:
# Standardising transformer with default settings; it is fitted in a later cell.
scaler = StandardScaler()


In [None]:
# Fit the scaler on the feature columns and overwrite them with the scaled
# values; the Price column is left on its original scale.
# NOTE(review): the scaler is fitted on every row, before the train/test split
# performed later in this notebook, so test-set statistics influence the
# scaling. Consider splitting first and fitting on the training rows only.
data[features] = scaler.fit_transform(data[features])


In [None]:
# Preview the first rows after scaling the feature columns.
print(data.head())


   SquareFeet  NumBedrooms  NumBathrooms          Price
0    0.945534     0.007463     -0.011583  377303.381143
1   -0.206761    -0.722087     -1.246719  275206.613039
2   -1.019458    -0.722087      1.223554  274299.655131
3    1.586326    -0.722087     -1.246719  128238.075187
4   -0.483103    -0.722087     -1.246719  282594.973096


In [None]:
from sklearn.model_selection import train_test_split


In [None]:
# Separate the predictor matrix from the target column.
X, y = data[features], data[target]


In [None]:
# Hold out 25% of the rows for testing; the fixed random_state makes the
# partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=202,
)


In [None]:
# Report the (rows, columns) shape of each feature partition.
print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)

Testing set shape: (294, 3)
