In [58]:
pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [90]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error 


In [91]:
# Download a sample dataset
url = "https://raw.githubusercontent.com/ageron/handson-ml/master/datasets/housing/housing.csv"
data = pd.read_csv(url)

In [92]:
#Define features and target
x = data.drop(columns=["median_house_value"])
y = data["median_house_value"]

In [93]:
X = data.drop(columns=['median_house_value'])  # Replace 'sales_price' with the actual column name

In [94]:
print(X.head())  # See the first few rows
print(X.dtypes)  # Check data types of all columns
print(X.isnull().sum())  # Look for missing values


   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.23     37.88                41.0        880.0           129.0   
1    -122.22     37.86                21.0       7099.0          1106.0   
2    -122.24     37.85                52.0       1467.0           190.0   
3    -122.25     37.85                52.0       1274.0           235.0   
4    -122.25     37.85                52.0       1627.0           280.0   

   population  households  median_income ocean_proximity  
0       322.0       126.0         8.3252        NEAR BAY  
1      2401.0      1138.0         8.3014        NEAR BAY  
2       496.0       177.0         7.2574        NEAR BAY  
3       558.0       219.0         5.6431        NEAR BAY  
4       565.0       259.0         3.8462        NEAR BAY  
longitude             float64
latitude              float64
housing_median_age    float64
total_rooms           float64
total_bedrooms        float64
population            float64
households     

In [95]:
# Identify categorical columns
categorical_cols = X.select_dtypes(include=['object']).columns
print("Categorical Columns:", categorical_cols)

# One-hot encode categorical columns
X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

# Remove spaces from column names
X.columns = [col.replace(" ", "_") for col in X.columns]

# Convert all columns to float
X = X.astype(float)

print(X.head())  # Verify the changes


Categorical Columns: Index(['ocean_proximity'], dtype='object')
   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.23     37.88                41.0        880.0           129.0   
1    -122.22     37.86                21.0       7099.0          1106.0   
2    -122.24     37.85                52.0       1467.0           190.0   
3    -122.25     37.85                52.0       1274.0           235.0   
4    -122.25     37.85                52.0       1627.0           280.0   

   population  households  median_income  ocean_proximity_INLAND  \
0       322.0       126.0         8.3252                     0.0   
1      2401.0      1138.0         8.3014                     0.0   
2       496.0       177.0         7.2574                     0.0   
3       558.0       219.0         5.6431                     0.0   
4       565.0       259.0         3.8462                     0.0   

   ocean_proximity_ISLAND  ocean_proximity_NEAR_BAY  \
0                    

In [96]:
X = X.astype(float)  # Convert everything to numerical format

In [97]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)


In [98]:
print(model)  # Check if the model exists

RandomForestRegressor(random_state=42)


In [99]:
print(X_test.shape)  # Check if it has rows & columns
print(type(X_test))  # Should be a DataFrame or NumPy array

(4128, 12)
<class 'pandas.core.frame.DataFrame'>


In [100]:
# Retrain model if necessary
if not hasattr(model, "estimators_"):
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

# Ensure X_test is in the correct format
X_test = X_test.fillna(X_test.mean())

# Make predictions
y_pred = model.predict(X_test)

# Display first 10 predictions
print(y_pred[:10])

[ 54088.    69552.   467524.39 247720.   267291.   162848.   245680.03
 169663.   286710.03 484536.68]


In [101]:
# Example: Predict for a new house (modify values as needed)
new_house = [[-122.23, 37.88, 41.0, 880.0, 129.0, 322.0, 126.0, 8.3252, 0, 0, 1, 0]]
price_prediction = model.predict(new_house)

print("Predicted price:", price_prediction[0])

Predicted price: 431413.33




In [102]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Make predictions
y_pred = model.predict(X_test)

# Calculate evaluation metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Print results
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"R-squared (R²): {r2:.4f}")


Mean Absolute Error (MAE): 31666.16
Mean Squared Error (MSE): 2403100098.18
Root Mean Squared Error (RMSE): 49021.42
R-squared (R²): 0.8166
