In [1]:
import numpy as np
import pandas as pd
from geopy.geocoders import Nominatim
import matplotlib.pyplot as plt
import math

In [2]:
data = pd.read_csv('/content/raw_house_data.csv')

In [3]:
# Check for missing values
missing_values = data.isnull().sum()

In [4]:
# Print the number of missing values for each column
print(missing_values)

MLS                  0
sold_price           0
zipcode              0
longitude            0
latitude             0
lot_acres           10
taxes                0
year_built           0
bedrooms             0
bathrooms            0
sqrt_ft              0
garage               0
kitchen_features     0
fireplaces          25
floor_covering       0
HOA                  0
dtype: int64


In [5]:
# Fill in missing values in lot_acres with the median
median_lot_acres = data['lot_acres'].median()
data['lot_acres'].fillna(median_lot_acres, inplace=True)

In [6]:
# Fill in missing values in fireplaces with the mode
mode_fireplaces = data['fireplaces'].mode()[0]
data['fireplaces'].fillna(mode_fireplaces, inplace=True)

In [7]:
# Verify that there are no more missing values
missing_values = data.isnull().sum()
print(missing_values)

MLS                 0
sold_price          0
zipcode             0
longitude           0
latitude            0
lot_acres           0
taxes               0
year_built          0
bedrooms            0
bathrooms           0
sqrt_ft             0
garage              0
kitchen_features    0
fireplaces          0
floor_covering      0
HOA                 0
dtype: int64


In [8]:
data.dtypes

MLS                   int64
sold_price          float64
zipcode               int64
longitude           float64
latitude            float64
lot_acres           float64
taxes               float64
year_built            int64
bedrooms              int64
bathrooms            object
sqrt_ft              object
garage               object
kitchen_features     object
fireplaces          float64
floor_covering       object
HOA                  object
dtype: object

##How to use:

Standard usage for my library:

**To instantiate the class**

- preprocessor = DataPreprocessor()

**Using the convert_to_int method**

 - preprocessor.convert_to_int(**dataframe**, '**column**')

 - By default this should also run the **check_non_numeric** as a sub method.

**Using the resolve_categorical method**

- preprocessor.resolve_categorical(**dataframe**, '**existing_column_name**')

- existing_column_name = dataframe['existing_column_name'].unique()

- data['new_column_name'] = data['existing_column_name'].map({feature: i for i, feature in enumerate(kitchen_features)})

**Using the round floats method**

- preprocessor.round_floats(**dataframe**, '**column**')

**Using scale_features**
- data_scaled = preprocessor.scale_features(**dataframe**, **list_of_columns**)

**Using remove_outliers**

- data_no_outliers = preprocessor.remove_outliers(**data_scaled_dataframe**, **list_of_columns**k, threshold=3)

**Using display_correlation_matrix method**

- data_processor.display_correlation_matrix(**list_of_columns**)

In [9]:
#import pandas as pd
#import numpy as np
#import math
#import matplotlib.pyplot as plt

class DataPreprocessor:
    def __init__(self, data):
        self.data = data

    def remove_negative_sign(self, X):
        """
        Removes negative sign from numerical data.
        """
        X_copy = X.copy()
        if isinstance(X_copy, pd.Series):
            X_copy = X_copy.to_numpy().reshape(-1, 1)
        for i in range(X_copy.shape[1]):
            if np.issubdtype(X_copy[:, i].dtype, np.number):
                X_copy[:, i] = np.abs(X_copy[:, i])
        return X_copy

    def scale_features(self, data, columns, method='standardization'):
        """
        Scales or normalizes the features of a DataFrame.

        Parameters:
        - data: Pandas DataFrame containing the data.
        - columns: list of columns to scale or normalize.
        - method: scaling method to use. Can be 'standardization' or 'min-max'.

        Returns:
        - Pandas DataFrame with the scaled or normalized features.
        """
        # Make a copy of the data to avoid modifying the original DataFrame
        data_scaled = data.copy()

        # Scale or normalize the selected columns
        if method == 'standardization':
            for col in columns:
                data_scaled[col] = (data_scaled[col] - data_scaled[col].mean()) / data_scaled[col].std()

        elif method == 'min-max':
            for col in columns:
                data_scaled[col] = (data_scaled[col] - data_scaled[col].min()) / (data_scaled[col].max() - data_scaled[col].min())

        return data_scaled

    def remove_outliers(self, data, columns, threshold=2):
        """
        Removes outliers from a DataFrame using the z-score method.

        Parameters:
        - data: Pandas DataFrame containing the data.
        - columns: list of columns to check for outliers.
        - threshold: number of standard deviations from the mean to consider an outlier.

        Returns:
        - Pandas DataFrame without the outliers.
        """
        # Make a copy of the data to avoid modifying the original DataFrame
        data_clean = data.copy()

        # Iterate over the columns and remove outliers using the z-score method
        for col in columns:
            z_scores = np.abs((data_clean[col] - data_clean[col].mean()) / data_clean[col].std())
            data_clean = data_clean[z_scores <= threshold * data_clean[col].std()]

        return data_clean

    @staticmethod
    def check_non_numeric(data, column):
        # Select all rows in the column that cannot be converted to a numeric type
        non_numeric = data[pd.to_numeric(data[column], errors='coerce').isna()]

        # Check if there are any non-numeric values in the selected rows
        if len(non_numeric) > 0:
            # If there are non-numeric values, print them out
            print(f"Found non-numeric values in column '{column}':")
            print(non_numeric)
        else:
            # If there are no non-numeric values, print a message indicating so
            print(f"No non-numeric values found in column '{column}'")
            
    @staticmethod
    def resolve_categorical(data, column):
        # Check if the column has any missing values
        if data[column].isna().sum() > 0:
            # If there are missing values, drop them
            data.dropna(subset=[column], inplace=True)
        
        # Replace "None" with 0
        data[column] = data[column].replace("None", 0)
        
        # Count the categorical values
        cat_counts = data[column].value_counts().to_dict()

        # Replace the original values with their counts
        data[column] = data[column].map(cat_counts)

        # Verify the result
        print(data[column].unique())


    def convert_to_int(self, data, column):
        # Check for non-numeric values
        DataPreprocessor.check_non_numeric(data, column)

        # Replace "None" with NaN
        data[column] = data[column].replace("None", 0)

        # Convert the column to string
        data[column] = data[column].astype(str)

        # Remove commas from numbers
        data[column] = data[column].str.replace(",", "")

        # Convert the column to float
        data[column] = data[column].astype(float)

        # Replace NaN with 0
        data[column] = data[column].fillna(0)

        # Round down to the nearest integer
        data[column] = data[column].apply(lambda x: math.floor(x))

        # Convert to int
        data[column] = data[column].astype(int)

        # Verify the result
        print(data[column].unique())
        
    def convert_to_float(self, data, column):
        # Check for non-numeric values
        DataPreprocessor.check_non_numeric(data, column)

        # Replace "None" with NaN
        data[column] = data[column].replace("None", 0)


        # Convert the column to string
        data[column] = data[column].astype(str)

        # Remove commas from numbers
        data[column] = data[column].str.replace(",", "")

        # Check for non-numeric values
        DataPreprocessor.check_non_numeric(data, column)
        
        # Replace "None" with NaN
        data[column] = data[column].replace("None", np.nan)

        # Convert the column to float
        data[column] = data[column].astype(float)

        # Replace NaN with 0
        data[column] = data[column].fillna(0)

        # Verify the result
        print(data[column].unique())

    def round_floats(self, data, column):
        # Check if the column has any missing values
        if data[column].isna().sum() > 0:
            # If there are missing values, drop them
            data.dropna(subset=[column], inplace=True)
        
        # Round floats to 2 decimal points or add 0 if only one
        data[column] = data[column].apply(lambda x: '{:.2f}'.format(x) if isinstance(x, float) and x.is_integer() == False else '{:.2f}0'.format(x) if isinstance(x, float) and x.is_integer() else x)
        
        # Verify the result
        print(data[column].unique())

    def display_correlation_matrix(self, columns_to_check):
      
        # Calculate the correlation matrix
        correlation_matrix = self.data[columns_to_check].corr()

        fig, ax = plt.subplots(figsize=(10, 8))
        im = ax.imshow(correlation_matrix, cmap='coolwarm')

        # Display the colorbar
        cbar = ax.figure.colorbar(im, ax=ax)

        # Set the tick labels and axis labels
        ax.set_xticks(np.arange(len(columns_to_check)))
        ax.set_yticks(np.arange(len(columns_to_check)))
        ax.set_xticklabels(columns_to_check, fontsize=12)
        ax.set_yticklabels(columns_to_check, fontsize=12)
        ax.set_xlabel('Features', fontsize=14)
        ax.set_ylabel('Features', fontsize=14)

        # Rotate the tick labels and set their alignment
        plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
                 rotation_mode="anchor")

        # Display the correlation values in the heatmap
        for i in range(len(columns_to_check)):
            for j in range(len(columns_to_check)):
                text = ax.text(j, i, f'{correlation_matrix.iloc[i, j]:.2f}', ha="center", va="center", color="w", fontsize=10)

        # Set the title of the plot
        ax.set_title("Correlation Matrix", fontsize=16)

        # Show the plot
        plt.show()

In [10]:
preprocessor = DataPreprocessor(data)

In [11]:
data['sold_price'] = preprocessor.remove_negative_sign(data['sold_price'])

In [12]:
preprocessor.convert_to_float(data, 'sold_price')

No non-numeric values found in column 'sold_price'
No non-numeric values found in column 'sold_price'
[5300000. 4200000. 4500000. ...  539100.  534000.  526710.]


In [13]:
preprocessor.convert_to_int(data, 'year_built')

No non-numeric values found in column 'year_built'
[1941 1997 1930 1995 1999 2001 2011 2002 2007 1935 2009 2015 2008 2010
 1983 2005 1936 1998 2004 1985 2018 2019 2000    0 2012 2016 2003 1954
 1994 2014 2017 1967 2013 1943 1953 2006 1968 1982 1981 1993 1989 1928
 1900 1996 1977 1984 1992 1922 1952 1991 1959 1970 1911 1979 1986 1990
 1974 1964 1925 1972 1987 1980 1988 1929 1951 1958 1973 1961 1946 1963
 1969 1976 1947 1971 1940 1948 1937 1965 1966 1978 1960 1934 1956 1907
 1938 1950 1955 1932 1939 1957 1893 1962 1975 1924 1901 1949 1902 1942
 1945 1923 1913 1944 1931 1918 1921 1926 1919 1914 1917 1910 1905 1927]


In [14]:
preprocessor.convert_to_int(data, 'bathrooms')

Found non-numeric values in column 'bathrooms':
          MLS  sold_price  zipcode   longitude   latitude  lot_acres    taxes  \
2025  3044867    660000.0    85614 -110.969465  31.836723       3.60  5526.00   
2766  3042851    575000.0    85614 -110.960497  31.854446       0.87  4623.05   
3108  3047540    610000.0    85614 -111.002544  31.840061       1.70  3800.00   
3529  3046317    535000.0    85614 -110.986426  31.806614       4.27  3826.25   
3822  3045347    550000.0    85614 -111.008754  31.841141       0.99  3702.07   
4812  3046287    500000.0    85646 -111.051431  31.636207       1.03  8102.00   

      year_built  bedrooms bathrooms sqrt_ft garage  \
2025        2007         3      None    None      3   
2766        2002         3      None    None      3   
3108        2007         3      None    None      3   
3529        2006         2      None    None      3   
3822        2007         2      None    None      3   
4812        1999         4      None    None      2   

In [15]:
preprocessor.convert_to_int(data, 'garage')

Found non-numeric values in column 'garage':
          MLS  sold_price  zipcode   longitude   latitude  lot_acres  \
2     3054672   4200000.0    85646 -111.040707  31.594844    1707.00   
263   4115554   1200000.0    85646 -111.040612  31.594683      91.70   
590   4113651    900000.0    85646 -111.037399  31.574636      34.45   
2106  3056848    550000.0    85645 -111.047608  31.700763      50.00   
4273  3056944    490000.0    85601 -111.299661  31.584170      38.98   
4275  4116010    490000.0    85601 -111.299663  31.584173      38.98   
4999  4111490    450000.0    85621 -110.913054  31.385259       4.16   

         taxes  year_built  bedrooms  bathrooms sqrt_ft garage  \
2     10482.00        1997         2          3    None   None   
263    8473.86        1925         6          5    6884   None   
590    3354.10        1964         3          3    2759   None   
2106  25113.45        1936        36         35    None   None   
4273   5739.00        1977         4          3 

In [16]:
preprocessor.convert_to_int(data, 'sqrt_ft')

Found non-numeric values in column 'sqrt_ft':
          MLS  sold_price  zipcode   longitude   latitude  lot_acres  \
2     3054672   4200000.0    85646 -111.040707  31.594844    1707.00   
490   3055989    950000.0    85646 -111.073405  31.619537       4.40   
967   3058213    695000.0    85645 -111.183593  31.702330       0.99   
1064  3056708    785045.0    85646 -110.942060  31.552399      73.42   
1373  3059704    750000.0    85622 -111.001762  31.841975       2.72   
1659  3055188    700000.0    85646 -111.046366  31.623839       0.99   
1728  3057818    565000.0    85646 -111.050885  31.627210       0.72   
1729  3044500    675000.0    85629 -110.961128  31.869810       1.02   
1730  3053678    700000.0    85645 -111.239637  31.662369     172.76   
1731  3059581    715000.0    85622 -111.040615  31.804808       4.72   
1863  3052969    750000.0    85622 -111.002640  31.846861       4.58   
2025  3044867    660000.0    85614 -110.969465  31.836723       3.60   
2106  3056848    5

In [17]:
# check the unique values present in the column and investigate if there are any non-numeric values
data['kitchen_features'].unique()

array(['Dishwasher, Freezer, Refrigerator, Oven',
       'Dishwasher, Garbage Disposal',
       'Dishwasher, Garbage Disposal, Refrigerator', ...,
       'Dishwasher, Electric Range, Island, Refrigerator, Reverse Osmosis, Appliance Color: Stainless',
       'Dishwasher, Double Sink, Garbage Disposal, Gas Range, Pantry: Cabinet, Appliance Color: Stainless, Countertops: Granite slab, Microwave: Stainless over oven, Oven: wall',
       'Compactor, Dishwasher, Double Sink, Island, Appliance Color: Stainless'],
      dtype=object)

In [18]:
preprocessor.convert_to_float(data, 'taxes')

No non-numeric values found in column 'taxes'
No non-numeric values found in column 'taxes'
[ 5272.   10422.36 10482.   ...  1000.    5822.93  2814.48]


In [19]:
# Example usage on kitchen_features column
preprocessor.resolve_categorical(data, 'kitchen_features')
kitchen_features = data['kitchen_features'].unique()
data['kitchen_features_values'] = data['kitchen_features'].map({feature: i for i, feature in enumerate(kitchen_features)})

[   2   21   68    1 1719   71   22  189  127   97  181    5    9    4
    3   17    7   15   33  270   27   11   14   12   19   10    6   24
    8]


In [20]:
# Example usage on kitchen_features column
preprocessor.resolve_categorical(data, 'floor_covering')
floor_covering = data['floor_covering'].unique()
data['floor_covering_values'] = data['floor_covering'].map({feature: i for i, feature in enumerate(floor_covering)})

[  29    4    1    3   87   11  211   12   14  147  101  579   15  120
  242   67   45  247    2   19   13   27   92   31   41   23 1235  258
   44   57   25    8  115   16   30    7   10   22    6   33   37   21
   24   20    5    9]


In [21]:
# check the unique values present in the column and investigate if there are any non-numeric values
data['floor_covering'].unique()

array([  29,    4,    1,    3,   87,   11,  211,   12,   14,  147,  101,
        579,   15,  120,  242,   67,   45,  247,    2,   19,   13,   27,
         92,   31,   41,   23, 1235,  258,   44,   57,   25,    8,  115,
         16,   30,    7,   10,   22,    6,   33,   37,   21,   24,   20,
          5,    9])

In [22]:
preprocessor.convert_to_int(data, 'HOA')

Found non-numeric values in column 'HOA':
           MLS  sold_price  zipcode   longitude   latitude  lot_acres  \
2      3054672   4200000.0    85646 -111.040707  31.594844    1707.00   
3     21919321   4500000.0    85646 -111.035925  31.645878     636.67   
10    21900396   2776518.0    85640 -111.045441  31.562121     147.18   
14    21534099   3000000.0    85739 -110.883068  32.508778       0.00   
26    21830939   2600000.0    85739 -110.883055  32.508861       0.15   
...        ...         ...      ...         ...        ...        ...   
4990  21906583    526710.0    85748 -110.728390  32.221871       9.18   
4995  21810382    495000.0    85641 -110.661829  31.907917       4.98   
4997  21832452    475000.0    85192 -110.755428  32.964708      12.06   
4998  21900515    550000.0    85745 -111.055528  32.296871       1.01   
4999   4111490    450000.0    85621 -110.913054  31.385259       4.16   

         taxes  year_built  bedrooms  bathrooms  sqrt_ft  garage  \
2     10482.0

In [23]:
data.dtypes

MLS                          int64
sold_price                 float64
zipcode                      int64
longitude                  float64
latitude                   float64
lot_acres                  float64
taxes                      float64
year_built                   int64
bedrooms                     int64
bathrooms                    int64
sqrt_ft                      int64
garage                       int64
kitchen_features             int64
fireplaces                 float64
floor_covering               int64
HOA                          int64
kitchen_features_values      int64
floor_covering_values        int64
dtype: object

In [24]:
# Check for missing values
print(data.isnull().sum())

MLS                        0
sold_price                 0
zipcode                    0
longitude                  0
latitude                   0
lot_acres                  0
taxes                      0
year_built                 0
bedrooms                   0
bathrooms                  0
sqrt_ft                    0
garage                     0
kitchen_features           0
fireplaces                 0
floor_covering             0
HOA                        0
kitchen_features_values    0
floor_covering_values      0
dtype: int64


In [25]:
# Create a Linear Regression class
class LinearRegression:
    def __init__(self, learning_rate=0.01, n_iterations=1000):
        self.learning_rate = learning_rate
        self.n_iterations = n_iterations

    def fit(self, X, y):
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0

        for i in range(self.n_iterations):
            y_pred = np.dot(X, self.weights) + self.bias
            dw = (1 / n_samples) * np.dot(X.T, (y_pred - y))
            db = (1 / n_samples) * np.sum(y_pred - y)
            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

    def predict(self, X):
        return np.dot(X, self.weights) + self.bias

In [26]:
# Create a KNN Regressor class
class KNNRegressor():
    def fit(self, X, y):
        self.X = X
        self.y = y

    def predict(self, X, K, epsilon=1e-3):
        N = len(X)
        y_hat = np.zeros(N)

        for i in range(N):
            dist2 = np.sum((self.X - X[i])**2, axis=1)
            idxt = np.argsort(dist2)[:K]
            gamma_K = np.exp(-dist2[idxt]) / np.exp(-dist2[idxt]).sum()
            y_hat[i] = gamma_K.dot(self.y[idxt])

        return y_hat

In [27]:
def gradient_descent(m_now, b_now, data, learning_rate):
    m_gradient = 0
    b_gradient = 0

    n = len(data)

    for i in range(n):
        x = data.iloc[i].bedrooms
        y = data.iloc[i].sold_price - data.iloc[i].price_after_renovations
        
        m_gradient += -(2/n) * x * (y - (m_now * x + b_now))
        b_gradient += -(2/n) * (y - (m_now * x + b_now))

    m = m_now - m_gradient * learning_rate
    b = b_now - b_gradient * learning_rate
    return m, b


In [28]:
# Select the columns for linear regression
cols_lr = ['fireplaces', 'bathrooms', 'bedrooms', 'garage', 'floor_covering_values', 'bedrooms_new']
#X_lr = data[cols_lr].values
#y_lr = data['sold_price'].values


In [29]:
# Select the columns for KNN regression
cols_knn = ['fireplaces', 'bathrooms', 'bedrooms', 'garage', 'floor_covering_values']

In [30]:
data['bedrooms_new'] = data['bedrooms']

In [31]:
# Extract the features and target variable for linear regression
X_lr = data[cols_lr].values
y_lr = data['sold_price'].values

In [32]:
# Scale the features using min-max scaling
X_lr = (X_lr - X_lr.min(axis=0)) / (X_lr.max(axis=0) - X_lr.min(axis=0))

The LinearRegression class has two methods, fit and predict. The fit method takes in the input matrix X and the target vector y and updates the weights and bias using gradient descent. The predict method takes in a new input matrix X and returns the predictions for each sample.


To use the LinearRegression class, you can create an instance of the class and call the fit method to train the model on some data, and then call the predict method to make predictions on new data:

In [33]:
# Train the linear regression model
lr = LinearRegression()
lr.fit(X_lr, y_lr)

In [34]:
# Train the KNN regressor model
knn = KNNRegressor()

In [35]:
# Select the columns for KNN regression
cols_knn = ['fireplaces', 'bathrooms', 'bedrooms', 'garage', 'floor_covering_values']
X_knn = data[cols_knn].values
y_knn = data['sold_price'].values

In [36]:
# Scale the features using min-max scaling
X_knn = (X_knn - X_knn.min(axis=0)) / (X_knn.max(axis=0) - X_knn.min(axis=0))

In [37]:
# Train the KNN regressor model
knn.fit(X_knn, y_knn)

In [38]:
# Get the user input for the property address
address = input("Enter the address of the property: ")

Enter the address of the property: 3042 Placita, Tucson, AZ 85716


In [39]:
geolocator = Nominatim(user_agent="mAIstros")
location = geolocator.geocode(address)

In [40]:
if location is not None:
    lat = location.latitude
    lon = location.longitude
    print(f"Latitude: {lat}, Longitude: {lon}")
else:
    print(f"Could not find location for address: {address}")

Latitude: 32.27346847541859, Longitude: -110.92735088090271


In [41]:
#Create a new data point with user input and geocoded location
new_data = np.array([[1, 2, 3, 1, 0.5, 1], [2, 1, 4, 2, 0.2, 2], [0, 1, 2, 0, 0.8, 0]])

In [42]:
# Predict the sale price using linear regression
sale_price_lr = lr.predict(new_data)

In [43]:
# Remove the extra column from new_data
new_data = new_data[:, :-1]

# Predict the sale price using KNN regression
K = 3
sale_price_knn = knn.predict(new_data, K)

In [44]:
# Create a new data point with the user input
bedrooms_new = int(input("Enter the number of bedrooms: "))

# Create a new DataFrame with the user input
new_data = pd.DataFrame({'fireplaces': [0], 'bathrooms': [0], 'bedrooms': [0], 'garage': [0], 
                         'floor_covering_values': [0], 'bedrooms_new': [bedrooms_new]})

# Select the same columns as X_lr
new_X = new_data[cols_lr].values

# Scale the new data point
new_X_scaled = (new_X - X_lr.min(axis=0)) / (X_lr.max(axis=0) - X_lr.min(axis=0))

# Make a prediction
price_after_renovations = lr.predict(new_X_scaled.reshape(1,-1))[0]

print(f"The estimated price after renovations for a {bedrooms_new}-bedroom house is ${float(price_after_renovations):.2f}")

Enter the number of bedrooms: 5
The estimated price after renovations for a 5-bedroom house is $997572.86


In [45]:
#Print the predicted sale price using both models
print(f"Predicted sale price using Linear Regression: {sale_price_lr}")
print(f"Predicted sale price using KNN Regression: {sale_price_knn}")


Predicted sale price using Linear Regression: [1509975.19104714 1863049.39131337  941572.71238154]
Predicted sale price using KNN Regression: [598333.33333333 598333.33333333 598333.33333333]


In [48]:
data['price_after_renovations'] = data['sold_price']

In [None]:
m = 0
b = 0
learning_rate = 0.02
epochs = 226

for i in range(epochs):
    m, b = gradient_descent(m, b, data, learning_rate)

# calculate predicted prices for all data points using the learned slope and y-intercept
predicted_prices = [m*x + b for x in data['bedrooms']]

# calculate the price difference for all data points
price_diff = data['price_after_renovations'] - data['sold_price']

# plot the difference between the original sales price and the predicted price
plt.scatter(data['bedrooms'], 'sold_price', color='black', alpha=0.5)
plt.plot(data['new_bedrooms'], predicted_prices, color='red')
plt.xlabel('Number of Bedrooms')
plt.ylabel('Price Difference ($)')
plt.title('Difference between Original Sales Price and Predicted Price after Renovations')
plt.show()


In [None]:
#Plot the relationship between the number of bedrooms and the sold price

plt.scatter(data['bedrooms'], data['sold_price'], alpha=0.5)
plt.xlabel('Number of Bedrooms')
plt.ylabel('Sold Price ($)')
plt.title('Relationship between Number of Bedrooms and Sold Price')
plt.show()

In [None]:
data['price_diff'] = data['sold_price'] - data['price_after_renovations']

In [None]:
plt.scatter(data['bedrooms'], data['price_diff'], alpha=0.5)
plt.xlabel('Number of Bedrooms')
plt.ylabel('Price Difference ($)')
plt.title('Difference between Original Sales Price and Predicted Price after Renovations')
plt.show()
