<a href="https://colab.research.google.com/github/Anuhya0313/Machen-Learning/blob/main/HousepricePrediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Load and inspect data
import pandas as pd

# Read the housing records; the relative path is resolved against the
# current working directory of the kernel.
data = pd.read_csv('data.csv')

# Preview the first few rows of the dataset
print(data.head())

# Summary statistics to understand the distribution of each described column
print(data.describe())

# Column dtypes and non-null counts (used to spot missing values).
# DataFrame.info() writes its report straight to stdout and returns None,
# so it must not be wrapped in print() -- that would emit a stray "None".
data.info()


                  date      price  bedrooms  bathrooms  sqft_living  sqft_lot  \
0  2014-05-02 00:00:00   313000.0       3.0       1.50         1340      7912   
1  2014-05-02 00:00:00  2384000.0       5.0       2.50         3650      9050   
2  2014-05-02 00:00:00   342000.0       3.0       2.00         1930     11947   
3  2014-05-02 00:00:00   420000.0       3.0       2.25         2000      8030   
4  2014-05-02 00:00:00   550000.0       4.0       2.50         1940     10500   

   floors  waterfront  view  condition  sqft_above  sqft_basement  yr_built  \
0     1.5           0     0          3        1340              0      1955   
1     2.0           0     4          5        3370            280      1921   
2     1.0           0     0          4        1930              0      1966   
3     1.0           0     0          4        1000           1000      1963   
4     1.0           0     0          4        1140            800      1976   

   yr_renovated                    str

In [2]:
# Handle missing values and data quality issues
# Per-column count of missing entries before any cleaning
missing_values = data.isna().sum()
print(missing_values)

# A row without a target price cannot be used, so discard it
data = data.dropna(subset=['price'])

# Impute remaining gaps in the listed columns with that column's median
# Example: data['bedrooms'] = data['bedrooms'].fillna(data['bedrooms'].median())
columns_to_impute = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition']
for column in columns_to_impute:
    has_gaps = data[column].isna().any()
    if has_gaps:
        column_median = data[column].median()
        data[column] = data[column].fillna(column_median)

# Re-count missing entries to confirm the cleaning result
print(data.isna().sum())


date             0
price            0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
sqft_above       0
sqft_basement    0
yr_built         0
yr_renovated     0
street           0
city             0
statezip         0
country          0
dtype: int64
date             0
price            0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
sqft_above       0
sqft_basement    0
yr_built         0
yr_renovated     0
street           0
city             0
statezip         0
country          0
dtype: int64


In [3]:
#Encode Categorical Variables
# Treat 'waterfront' as a categorical flag and replace each value with its
# integer category code.
waterfront_categories = data['waterfront'].astype('category')
data['waterfront'] = waterfront_categories.cat.codes


In [4]:
#Split the Data
from sklearn.model_selection import train_test_split

# Predictor columns (X) and the value to be estimated (y)
feature_columns = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition']
X = data[feature_columns]
y = data['price']

# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore the split) repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"Training set size: {X_train.shape[0]}")
print(f"Testing set size: {X_test.shape[0]}")


Training set size: 3680
Testing set size: 920


In [5]:
#Train the Model
from sklearn.linear_model import LinearRegression

# Instantiate a linear regression estimator with default settings
model = LinearRegression()

# Fit it on the training features and their known prices
model.fit(X=X_train, y=y_train)


In [6]:
#Evaluate the Model
from sklearn.metrics import mean_squared_error, r2_score

# Predict prices for the held-out test rows
y_pred = model.predict(X=X_test)

# Compare predictions with the true test prices:
# mean squared error and the R-squared score
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared Score: {r2}")


Mean Squared Error: 986869414953.98
R-squared Score: 0.03233518995632512


In [7]:
#Make Predictions
# Two example houses, one record per house; the keys match the feature
# columns the model was trained on, in the same order.
example_houses = [
    {
        'bedrooms': 3,
        'bathrooms': 2,
        'sqft_living': 2000,
        'sqft_lot': 5000,
        'floors': 1,
        'waterfront': 0,
        'view': 0,
        'condition': 3,
    },
    {
        'bedrooms': 4,
        'bathrooms': 3,
        'sqft_living': 3000,
        'sqft_lot': 7500,
        'floors': 2,
        'waterfront': 1,
        'view': 1,
        'condition': 4,
    },
]
new_data = pd.DataFrame(example_houses)

# Estimate a price for each example house
predictions = model.predict(new_data)
print(predictions)


[ 473110.24950225 1250433.61667921]


In [8]:
# Calculate "accuracy" within a tolerance level
def accuracy_within_tolerance(y_true, y_pred, tolerance=0.10):
    """Return the fraction of predictions whose relative error is below ``tolerance``.

    The relative error of one prediction is ``|y_pred - y_true| / |y_true|``.

    Parameters
    ----------
    y_true : array-like
        Observed target values (list, tuple, NumPy array or pandas Series).
    y_pred : array-like
        Predicted values, paired with ``y_true`` by position.
    tolerance : float, default 0.10
        Strict upper bound on the relative error for a prediction to count
        as accurate (0.10 means "within 10%").

    Returns
    -------
    float
        Share of predictions within tolerance, between 0.0 and 1.0.
        ``nan`` when the inputs are empty.

    Notes
    -----
    A prediction for a zero target is never counted as within tolerance,
    because its relative error is infinite or undefined.
    """
    # Function-scope import so the helper works without a top-level numpy import.
    import numpy as np

    # Convert to plain float arrays so that lists are accepted and values are
    # paired by position, not by pandas index labels (which could silently
    # introduce NaNs when two Series carry different indexes).
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    # Divide by the magnitude of the target: dividing by a signed negative
    # target would make the error negative and always "within tolerance".
    # Division by a zero target yields inf/nan, which compares False below.
    with np.errstate(divide='ignore', invalid='ignore'):
        relative_error = np.abs(predicted - actual) / np.abs(actual)

    within = relative_error < tolerance
    if within.size == 0:
        # The mean of an empty selection is undefined; report nan explicitly.
        return float('nan')
    return float(within.mean())

# Relative-error threshold for counting a prediction as accurate (0.10 = 10%)
tolerance = 0.10
accuracy = accuracy_within_tolerance(y_true=y_test, y_pred=y_pred, tolerance=tolerance)
print(f"Accuracy within {tolerance*100}% tolerance: {accuracy * 100:.2f}%")


Accuracy within 10.0% tolerance: 20.54%
