<a href="https://colab.research.google.com/github/4708740807/New-website/blob/main/Untitled3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
import xgboost as xgb
from sklearn.preprocessing import StandardScaler

# Step 1: Load the dataset
data = pd.read_csv('/house_prices (1).csv')

In [3]:
# Step 2: Data Cleaning
# Check for missing values
missing_values = data.isnull().sum()
print("Missing values in each column:\n", missing_values)

numeric_cols = data.select_dtypes(include=[np.number]).columns
data[numeric_cols].fillna(data[numeric_cols].mean(), inplace=True)

# Fill categorical/object columns with their mode
object_cols = data.select_dtypes(include=[object]).columns
for col in object_cols:
    data[col].fillna(data[col].mode()[0], inplace=True)

# Step 3: Remove duplicates
data = data.drop_duplicates()

Missing values in each column:
 id               0
date             0
price            0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
grade            0
sqft_above       0
sqft_basement    0
yr_built         0
yr_renovated     0
zipcode          0
lat              0
long             0
sqft_living15    0
sqft_lot15       0
dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[numeric_cols].fillna(data[numeric_cols].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].mode()[0], inplace=True)  # Fill with the most frequent value


In [6]:

# Step 3: Feature Engineering
# Assume 'price' is the target variable and the rest are features
X = data.drop('price', axis=1)  # Features
y = data['price']                # Target variable
# Step 1: Inspect the DataFrame
print("Data types of each column:\n", X.dtypes)
print("First few rows of the DataFrame:\n", X.head())

# Step 2: Identify non-numeric columns
non_numeric_cols = X.select_dtypes(include=[object]).columns
print("Non-numeric columns:", non_numeric_cols)

# Step 3: Handle non-numeric data
for col in non_numeric_cols:
    # Example mapping for categorical variables
    if X[col].nunique() == 2:  # If there are only two unique values
        X[col] = X[col].map({'Y': 1, 'N': 0})  # Adjust mapping based on your data
    else:
        # For other categorical variables, you might want to use one-hot encoding
        X = pd.get_dummies(X, columns=[col], drop_first=True)

# Step 4: Check for missing values
missing_values = X.isnull().sum()
print("Missing values in each column:\n", missing_values)

# Fill missing values if any (example: fill with mean for numerical columns)
X.fillna(X.mean(), inplace=True)

# Step 5: Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Now X_scaled is ready for use in your model
print("Scaled features:\n", X_scaled)

Data types of each column:
 id                 int64
date              object
bedrooms           int64
bathrooms        float64
sqft_living        int64
sqft_lot           int64
floors           float64
waterfront        object
view               int64
condition         object
grade              int64
sqft_above         int64
sqft_basement      int64
yr_built           int64
yr_renovated       int64
zipcode            int64
lat              float64
long             float64
sqft_living15      int64
sqft_lot15         int64
dtype: object
First few rows of the DataFrame:
            id             date  bedrooms  bathrooms  sqft_living  sqft_lot  \
0  7129300520  20141013T000000         3       1.00         1180      5650   
1  6414100192  20141209T000000         3       2.25         2570      7242   
2  5631500400  20150225T000000         2       1.00          770     10000   
3  2487200875  20141209T000000         4       3.00         1960      5000   
4  1954400510  20150218T000000    

In [7]:
# Step 4: Split the Data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Step 5: Build the XGBoost Model
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100)
xgb_model.fit(X_train, y_train)

In [8]:
# Step 6: Evaluate the XGBoost Model
y_pred_xgb = xgb_model.predict(X_test)
xgb_r2 = r2_score(y_test, y_pred_xgb)
print(f'XGBoost R-squared: {xgb_r2}')

XGBoost R-squared: 0.8750913285568624


In [10]:
# Step 7: Compare with Other Algorithms

# Linear Regression
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)
lr_r2 = r2_score(y_test, y_pred_lr)
print(f'Linear Regression R-squared: {lr_r2}')

# Random Forest
rf_model = RandomForestRegressor(n_estimators=100)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
rf_r2 = r2_score(y_test, y_pred_rf)
print(f'Random Forest R-squared: {rf_r2}')


Linear Regression R-squared: 0.6968616708535988
Random Forest R-squared: 0.858752823483942


In [11]:
# Step 8: Summarize Performance
performance_summary = pd.DataFrame({
    'Model': ['XGBoost', 'Linear Regression', 'Random Forest'],
    'R-squared Value': [xgb_r2, lr_r2, rf_r2]
})

print("\nPerformance Summary:")
print(performance_summary)


Performance Summary:
               Model  R-squared Value
0            XGBoost         0.875091
1  Linear Regression         0.696862
2      Random Forest         0.858753
