In [None]:
import pandas as pd
import numpy as np
from scipy import stats

# Read the raw housing data from the local CSV export
housing_csv_path = 'C:/Users/USER/Downloads/Housing.csv'
df = pd.read_csv(housing_csv_path)

In [None]:
# Display basic info about the dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
print("Dataset Information:")
df.info()

In [None]:
# Preview the first rows under a heading (heading and table on separate lines)
print("\nInitial Rows of the Dataset:", df.head(), sep="\n")

In [None]:
# Remove exact duplicate rows, retaining the first occurrence of each
df = df.drop_duplicates(subset=None, keep='first')

In [None]:
# Count the missing entries in every column and report them
missing_per_column = df.isna().sum()
print("\nMissing Values:")
print(missing_per_column)

In [None]:
# Missing-value strategy used here: discard every row that has at least one NaN
df = df.dropna(axis=0, how='any')

In [None]:
# Handle outliers: keep only rows whose numeric values all lie within 3
# standard deviations of their column mean.
numeric_part = df.select_dtypes(include=[np.number])  # any numeric dtype, not just int64/float64
abs_z = np.abs(np.asarray(stats.zscore(numeric_part)))
# A constant column has zero variance, so its z-scores are NaN; NaN < 3 is
# False and would discard every row. Treat NaN z-scores as "not an outlier".
row_is_inlier = ((abs_z < 3) | np.isnan(abs_z)).all(axis=1)
# .copy() makes the result an independent frame, so later column assignments
# on df do not raise SettingWithCopyWarning.
df = df[row_is_inlier].copy()

In [None]:
# Display cleaned dataset info.
# DataFrame.info() prints its own report and returns None, so it is called
# directly instead of being passed to print() (which would also print "None").
print("\nCleaned Dataset Information:")
df.info()

In [None]:
# Write the cleaned frame, without the index column, to the working directory
cleaned_csv_name = 'cleaned_traindata.csv'
df.to_csv(cleaned_csv_name, index=False)

In [None]:
# Confirmation message for the save step
completion_message = "Data cleaning complete. Cleaned data saved to 'cleaned_traindata.csv'."
print(completion_message)

In [None]:
# Descriptive statistics for the columns of the cleaned frame
overall_summary = df.describe()
print(overall_summary)

In [None]:
# Descriptive statistics restricted to the target column (price)
price_summary = df['price'].describe()
print(price_summary)

In [None]:
# Distribution of house prices
import matplotlib.pyplot as plt
import seaborn as sns
# Histogram of house prices with a KDE curve overlaid
price_ax = sns.histplot(df['price'], kde=True)
price_ax.set_title('Distribution of Price')
price_ax.set_xlabel('Price')
price_ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Skewness and kurtosis
from scipy.stats import skew, kurtosis

# Skewness and kurtosis of the price column, computed after dropping NaNs
price_observed = df['price'].dropna()
print('Skewness:', skew(price_observed))
print('Kurtosis:', kurtosis(price_observed))

In [None]:
# Show the distinct values of every object-typed column
for column in df.columns:
    if df[column].dtype != object:
        continue  # numeric column, nothing to report
    print(f"Non-numeric values in column {column}:")
    print(df[column].unique())

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace every object-typed column with integer codes.
# One LabelEncoder instance is re-fitted for each column in turn.
le = LabelEncoder()
text_columns = df.select_dtypes(include=[object]).columns
for column in text_columns:
    column_as_text = df[column].astype(str)
    df[column] = le.fit_transform(column_as_text)

In [None]:
# Confirm the column dtypes after encoding
print(df.dtypes)

# Pairwise correlations over the numeric columns only
df_numeric = df.select_dtypes(include=[np.number])
corr_matrix = df_numeric.corr()

# Annotated heatmap on a fixed [-1, 1] colour scale
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

In [None]:
import pandas as pd

# Re-read the CSV into a separate frame used for the plots below
data = pd.read_csv('C:/Users/USER/Downloads/Housing.csv')

# Sanity check: first rows, then the column labels
for preview in (data.head(), data.columns):
    print(preview)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Scatter plot of 'bedrooms' (x) against 'price' (y)
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=data, x='bedrooms', y='price', ax=ax)
ax.set_title('Bedrooms vs SalePrice')
ax.set_xlabel('Bedrooms')
ax.set_ylabel('SalePrice')
plt.show()

In [None]:
# Histogram of 'price' with a KDE curve
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data['price'], kde=True, ax=ax)
ax.set_title('Distribution of price')
ax.set_xlabel('price')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Box plot of 'price' across ranges of 'area'.
# 'area' is a continuous measurement, so using it directly as the categorical
# axis draws one box per distinct value. It is grouped into equal-frequency
# bins (quintiles) first so that each box summarises many houses.
area_quintile = pd.qcut(data['area'], q=5, duplicates='drop')
plt.figure(figsize=(10, 6))
sns.boxplot(x=area_quintile, y=data['price'])
plt.xticks(rotation=30)  # interval labels are long; tilt them to avoid overlap
plt.title('area vs SalePrice')
plt.xlabel('area (quintile range)')
plt.ylabel('price')
plt.show()

In [None]:
# Pairplot for a few selected features.
# pairplot only draws numeric columns, so a text-valued 'airconditioning'
# column would be dropped from the grid without notice. Passing it as `hue`
# keeps it in the figure by colouring the points per category.
# NOTE(review): assumes 'airconditioning' holds category labels (it is listed
# among the nominal features later in this notebook) - confirm.
selected_features = ['price', 'bathrooms', 'area', 'airconditioning']
sns.pairplot(data[selected_features], hue='airconditioning')
plt.show()

In [None]:
# Box plot of 'bathrooms' to make outlying values visible
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x=data['bathrooms'], ax=ax)
ax.set_title('Box plot of bathrooms')
ax.set_xlabel('bathrooms')
plt.show()

In [None]:
# Box plot of 'price' to make outlying values visible
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x=data['price'], ax=ax)
ax.set_title('Box plot of price')
ax.set_xlabel('price')
plt.show()

In [None]:
# Fences for 'bathrooms': 1.5 * IQR below the first / above the third quartile
Q1, Q3 = (data['bathrooms'].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1
fence_width = 1.5 * IQR
lower_bound = Q1 - fence_width
upper_bound = Q3 + fence_width

In [None]:
# Keep rows whose 'bathrooms' value lies inside the fences (both ends inclusive)
bathrooms_in_range = data['bathrooms'].between(lower_bound, upper_bound)
data = data[bathrooms_in_range]

In [None]:
# Fences for 'price': 1.5 * IQR below the first / above the third quartile
Q1, Q3 = (data['price'].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1
fence_width = 1.5 * IQR
lower_bound = Q1 - fence_width
upper_bound = Q3 + fence_width

In [None]:
# Keep rows whose 'price' value lies inside the fences (both ends inclusive)
price_in_range = data['price'].between(lower_bound, upper_bound)
data = data[price_in_range]

In [None]:
# Report (rows, columns) remaining after both IQR filters
remaining_shape = data.shape
print("Size after removing outliers:", remaining_shape)

In [None]:
# Box plot of 'bathrooms' on the filtered data
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x=data['bathrooms'], ax=ax)
ax.set_title('Box plot of bathrooms (after removing outliers)')
ax.set_xlabel('bathrooms')
plt.show()

In [None]:
# Box plot of 'price' on the filtered data
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x=data['price'], ax=ax)
ax.set_title('Box plot of price (after removing outliers)')
ax.set_xlabel('price')
plt.show()

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
# Fresh copy of the raw data for feature engineering
data = pd.read_csv('C:/Users/USER/Downloads/Housing.csv')

# Show the available columns, then a sample of rows
print("Column names in the dataset:", data.columns)
print(data.head())

# Ratio feature, computed only when both source columns are present.
# NOTE(review): the column is named 'RoomsPerArea' but holds
# bedrooms / bathrooms; 'area' is not involved - confirm the intended feature.
required_columns = {'bedrooms', 'bathrooms'}
if not required_columns.issubset(data.columns):
    print("Columns 'bedrooms' and 'bathrooms' are missing. Check the dataset.")
else:
    data['RoomsPerArea'] = data['bedrooms'] / data['bathrooms']

In [None]:
# Ratio of bedrooms to bathrooms, stored under the name 'RoomsPerArea'
bedrooms_per_bathroom = data['bedrooms'].div(data['bathrooms'])
data['RoomsPerArea'] = bedrooms_per_bathroom

In [None]:
# Polynomial feature: the square of the bathroom count
data['bathrooms_Squared'] = data['bathrooms'].pow(2)

In [None]:
# Create bins from a numerical feature to capture non-linear relationships:
# categorize house sizes ('area') into five labelled ranges.
# The bins are cut on 'area' (the column the feature is named after), not on
# 'bathrooms', whose small counts would all land in the first bin.
# The last bin is open-ended so areas above 4000 are labelled 'Very Large'
# instead of becoming NaN.
area_bin_edges = [0, 1000, 2000, 3000, 4000, float('inf')]
area_bin_labels = ['Very Small', 'Small', 'Medium', 'Large', 'Very Large']
data['areaBin'] = pd.cut(data['area'], bins=area_bin_edges, labels=area_bin_labels)

In [None]:
#Normalization
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression

# The normalization step below needs MinMaxScaler; imported here so the block
# is self-contained.
from sklearn.preprocessing import MinMaxScaler

# Load the dataset
df = pd.read_csv('C:/Users/USER/Downloads/Housing.csv')

# Identify nominal and numerical features
nominal_features = ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea', 'furnishingstatus']
numerical_features = ['area', 'bedrooms', 'bathrooms', 'stories', 'parking']

# Nominal features: fill gaps with the most frequent value, then one-hot encode
# (categories unseen at fit time are ignored; output is a dense array)
onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
nominal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', onehot_encoder)])

# Numerical features: median imputation followed by min-max normalization.
# This cell is the "Normalization" variant, but the numeric pipeline previously
# only imputed; the scaler step is what actually rescales the features.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler())])

# Combine numerical and categorical transformers
full_preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('nominal', nominal_transformer, nominal_features)])

# Separate the target column from the predictors
y = df['price']
X = df.drop(columns='price')

# Fit the preprocessor on X and produce the model matrix
X_transformed = full_preprocessor.fit_transform(X)

# The target is label-encoded only when it is stored as strings
if y.dtype != 'object':
    y_encoded = y
else:
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)

# Report the resulting shapes
print("Transformed feature matrix shape:", X_transformed.shape)
print("Encoded target variable shape:", y_encoded.shape)

# Example model: ordinary least squares fitted on all rows
model = LinearRegression()
model.fit(X_transformed, y_encoded)

# Confirmation message
print("Model training complete.")

In [None]:
#Standardization (Z-score Normalization)
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression

# Read the housing data
df = pd.read_csv('C:/Users/USER/Downloads/Housing.csv')

# Column groups handled by the two sub-pipelines
numerical_features = ['area', 'bedrooms', 'bathrooms', 'stories', 'parking']
nominal_features = [
    'mainroad', 'guestroom', 'basement', 'hotwaterheating',
    'airconditioning', 'prefarea', 'furnishingstatus',
]

# Nominal columns: most-frequent imputation, then dense one-hot encoding
# that ignores categories not seen at fit time
onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
nominal_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', onehot_encoder),
]
nominal_transformer = Pipeline(steps=nominal_steps)

# Numeric columns: median imputation, then z-score standardization
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numerical_transformer = Pipeline(steps=numeric_steps)

# Route each column group through its own pipeline
column_routes = [
    ('num', numerical_transformer, numerical_features),
    ('nominal', nominal_transformer, nominal_features),
]
full_preprocessor = ColumnTransformer(transformers=column_routes)

# Target column and predictor frame
y = df['price']
X = df.drop(columns='price')

# Fit the preprocessor on X and build the model matrix
X_transformed = full_preprocessor.fit_transform(X)

# Label-encode the target only if it is stored as strings
if y.dtype != 'object':
    y_encoded = y
else:
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)

# Shapes of the model inputs
print("Transformed feature matrix shape:", X_transformed.shape)
print("Encoded target variable shape:", y_encoded.shape)

# Example model: linear regression trained on all rows
model = LinearRegression()
model.fit(X_transformed, y_encoded)

# Confirmation message
print("Model training complete.")

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 train/test split with a fixed seed.
# NOTE(review): X_transformed comes from a preprocessor that was fitted on all
# rows in the previous cell, so the scaler statistics already include the rows
# that end up in the test set.
split_parts = train_test_split(X_transformed, y_encoded, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Report the shape of each piece
shape_report = (
    ("Training feature matrix shape:", X_train),
    ("Testing feature matrix shape:", X_test),
    ("Training target variable shape:", y_train),
    ("Testing target variable shape:", y_test),
)
for heading, part in shape_report:
    print(heading, part.shape)

In [None]:
#Choose and Justify the Selection of Machine Learning Algorithms
#1. Linear Regression:
#Justification: It is simple and interpretable, suitable for a baseline model.
#2. Decision Tree:
#Justification: It can capture non-linear relationships, but may overfit if not tuned properly.
#3. Gradient Boosting (e.g., XGBoost):
#Justification: It often provides high performance and is effective for both linear and non-linear data, but can be more complex to tune.
#4. Random Forest:
#Justification: It reduces overfitting by averaging multiple decision trees. Handles both linear and non-linear data well.


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# Read the housing data
df = pd.read_csv('C:/Users/USER/Downloads/Housing.csv')

# Column groups handled by the two sub-pipelines
numerical_features = ['area', 'bedrooms', 'bathrooms', 'stories', 'parking']
nominal_features = [
    'mainroad', 'guestroom', 'basement', 'hotwaterheating',
    'airconditioning', 'prefarea', 'furnishingstatus',
]

# Nominal columns: most-frequent imputation, then dense one-hot encoding
# that ignores categories not seen at fit time
onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
nominal_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', onehot_encoder),
]
nominal_transformer = Pipeline(steps=nominal_steps)

# Numeric columns: median imputation, then min-max scaling
# (swap in StandardScaler() for standardization)
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler()),
]
numerical_transformer = Pipeline(steps=numeric_steps)

# Route each column group through its own pipeline
column_routes = [
    ('num', numerical_transformer, numerical_features),
    ('nominal', nominal_transformer, nominal_features),
]
full_preprocessor = ColumnTransformer(transformers=column_routes)

# Split data into features and target
X = df.drop('price', axis=1)  # Features
y = df['price']  # Target variable

# Encode the target variable if it's categorical
if y.dtype == 'object':  # Check if target is categorical
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)
else:
    y_encoded = y

# Split the RAW rows first. Fitting the imputers/scaler/encoder before the
# split would let statistics of the test rows leak into the training features.
# The same test_size and random_state select the same rows as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42)

# Fit the preprocessor on the training rows only, then apply it to both parts
X_train = full_preprocessor.fit_transform(X_train_raw)
X_test = full_preprocessor.transform(X_test_raw)

# Full matrix kept under its original name for any code that still reads it;
# it is produced with the train-fitted preprocessor.
X_transformed = full_preprocessor.transform(X)

# Initialize models.
# The tree-based estimators are randomized; a fixed random_state makes the
# reported metrics reproducible from run to run.
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42)
}

# Train and evaluate each model on the same train/test split
for name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)

    # Predict on the held-out test set
    y_pred = model.predict(X_test)

    # Evaluate performance: RMSE and MAE are in the units of the target,
    # R² is unitless
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"Model: {name}")
    print(f"  RMSE: {rmse:.2f}")
    print(f"  MAE: {mae:.2f}")
    print(f"  R²: {r2:.2f}\n")


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# Read the housing data
df = pd.read_csv('C:/Users/USER/Downloads/Housing.csv')

# Column groups handled by the two sub-pipelines
numerical_features = ['area', 'bedrooms', 'bathrooms', 'stories', 'parking']
nominal_features = [
    'mainroad', 'guestroom', 'basement', 'hotwaterheating',
    'airconditioning', 'prefarea', 'furnishingstatus',
]

# Nominal columns: most-frequent imputation, then dense one-hot encoding
# that ignores categories not seen at fit time
onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
nominal_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', onehot_encoder),
]
nominal_transformer = Pipeline(steps=nominal_steps)

# Numeric columns: median imputation, then min-max scaling
# (swap in StandardScaler() for standardization)
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler()),
]
numerical_transformer = Pipeline(steps=numeric_steps)

# Route each column group through its own pipeline
column_routes = [
    ('num', numerical_transformer, numerical_features),
    ('nominal', nominal_transformer, nominal_features),
]
full_preprocessor = ColumnTransformer(transformers=column_routes)

# Split data into features and target
X = df.drop('price', axis=1)  # Features
y = df['price']  # Target variable

# Encode the target variable if it's categorical
if y.dtype == 'object':  # Check if target is categorical
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)
else:
    y_encoded = y

# Split the RAW rows first. Fitting the imputers/scaler/encoder before the
# split would let statistics of the test rows leak into the training features.
# The same test_size and random_state select the same rows as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42)

# Fit the preprocessor on the training rows only, then apply it to both parts
X_train = full_preprocessor.fit_transform(X_train_raw)
X_test = full_preprocessor.transform(X_test_raw)

# Full matrix kept under its original name for any code that still reads it;
# it is produced with the train-fitted preprocessor.
X_transformed = full_preprocessor.transform(X)

# Define the parameter grid for Random Forest
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# 5-fold grid search scored by negative MSE (higher is better).
# The forest is seeded so the selected parameters and the scores are
# reproducible between runs.
grid_search_rf = GridSearchCV(RandomForestRegressor(random_state=42), param_grid_rf, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search_rf.fit(X_train, y_train)

print("Best parameters for Random Forest:", grid_search_rf.best_params_)
print("Best score (neg MSE) for Random Forest:", grid_search_rf.best_score_)

# Evaluate the best Random Forest model (refitted on the whole training set
# by GridSearchCV) on the held-out test set
best_rf_model = grid_search_rf.best_estimator_
y_pred_rf = best_rf_model.predict(X_test)
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
mae_rf = mean_absolute_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print("Random Forest Evaluation")
print(f"  RMSE: {rmse_rf:.2f}")
print(f"  MAE: {mae_rf:.2f}")
print(f"  R²: {r2_rf:.2f}")

# Define the parameter grid for Gradient Boosting
param_grid_gb = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7]
}

# 5-fold grid search scored by negative MSE (higher is better).
# The estimator is seeded so the selected parameters and the scores are
# reproducible between runs.
grid_search_gb = GridSearchCV(GradientBoostingRegressor(random_state=42), param_grid_gb, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search_gb.fit(X_train, y_train)

print("Best parameters for Gradient Boosting:", grid_search_gb.best_params_)
print("Best score (neg MSE) for Gradient Boosting:", grid_search_gb.best_score_)

# Evaluate the best Gradient Boosting model (refitted on the whole training
# set by GridSearchCV) on the held-out test set
best_gb_model = grid_search_gb.best_estimator_
y_pred_gb = best_gb_model.predict(X_test)
rmse_gb = np.sqrt(mean_squared_error(y_test, y_pred_gb))
mae_gb = mean_absolute_error(y_test, y_pred_gb)
r2_gb = r2_score(y_test, y_pred_gb)

print("Gradient Boosting Evaluation")
print(f"  RMSE: {rmse_gb:.2f}")
print(f"  MAE: {mae_gb:.2f}")
print(f"  R²: {r2_gb:.2f}")

In [None]:
#Selecting the Best-Performing Model
#1. Compare Metrics: Compare the RMSE, MAE, and R² values of the Random Forest and Gradient Boosting models.
#2. Choose the Best Model: Select the model with the lowest RMSE (or the highest R² if preferred).
#3. Evaluate on Testing Set: Assess the performance of the selected model on the testing set.

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# Read the housing data
df = pd.read_csv('C:/Users/USER/Downloads/Housing.csv')

# Column groups handled by the two sub-pipelines
numerical_features = ['area', 'bedrooms', 'bathrooms', 'stories', 'parking']
nominal_features = [
    'mainroad', 'guestroom', 'basement', 'hotwaterheating',
    'airconditioning', 'prefarea', 'furnishingstatus',
]

# Nominal columns: most-frequent imputation, then dense one-hot encoding
# that ignores categories not seen at fit time
onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
nominal_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', onehot_encoder),
]
nominal_transformer = Pipeline(steps=nominal_steps)

# Numeric columns: median imputation, then min-max scaling
# (swap in StandardScaler() for standardization)
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler()),
]
numerical_transformer = Pipeline(steps=numeric_steps)

# Route each column group through its own pipeline
column_routes = [
    ('num', numerical_transformer, numerical_features),
    ('nominal', nominal_transformer, nominal_features),
]
full_preprocessor = ColumnTransformer(transformers=column_routes)

# Split data into features and target
X = df.drop('price', axis=1)  # Features
y = df['price']  # Target variable

# Encode the target variable if it's categorical
if y.dtype == 'object':  # Check if target is categorical
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)
else:
    y_encoded = y

# Split the RAW rows first. Fitting the imputers/scaler/encoder before the
# split would let statistics of the test rows leak into the training features.
# The same test_size and random_state select the same rows as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42)

# Fit the preprocessor on the training rows only, then apply it to both parts
X_train = full_preprocessor.fit_transform(X_train_raw)
X_test = full_preprocessor.transform(X_test_raw)

# Full matrix kept under its original name for any code that still reads it;
# it is produced with the train-fitted preprocessor.
X_transformed = full_preprocessor.transform(X)

# Define the parameter grid for Random Forest
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# 5-fold grid search for Random Forest, scored by negative MSE.
# Both estimators below are seeded so that the comparison between them does
# not change from run to run.
grid_search_rf = GridSearchCV(RandomForestRegressor(random_state=42), param_grid_rf, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search_rf.fit(X_train, y_train)

# Define the parameter grid for Gradient Boosting
param_grid_gb = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7]
}

# 5-fold grid search for Gradient Boosting, scored by negative MSE
grid_search_gb = GridSearchCV(GradientBoostingRegressor(random_state=42), param_grid_gb, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search_gb.fit(X_train, y_train)

# Test-set metrics for the tuned Random Forest
best_rf_model = grid_search_rf.best_estimator_
y_pred_rf = best_rf_model.predict(X_test)
mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

# Test-set metrics for the tuned Gradient Boosting model
best_gb_model = grid_search_gb.best_estimator_
y_pred_gb = best_gb_model.predict(X_test)
mse_gb = mean_squared_error(y_test, y_pred_gb)
rmse_gb = np.sqrt(mse_gb)
mae_gb = mean_absolute_error(y_test, y_pred_gb)
r2_gb = r2_score(y_test, y_pred_gb)

# Side-by-side report, one heading and three metric lines per model
print(
    "Random Forest Evaluation",
    f"  RMSE: {rmse_rf:.2f}",
    f"  MAE: {mae_rf:.2f}",
    f"  R²: {r2_rf:.2f}\n",
    sep="\n",
)

print(
    "Gradient Boosting Evaluation",
    f"  RMSE: {rmse_gb:.2f}",
    f"  MAE: {mae_gb:.2f}",
    f"  R²: {r2_gb:.2f}\n",
    sep="\n",
)

# Select the best-performing model: the one with the lower test RMSE.
# Both outcomes are handled so best_model / best_model_name / best_rmse are
# always defined, and best_rmse is taken from the winning model's own RMSE
# (rmse_gb or rmse_rf) rather than from a name this cell never assigns.
if rmse_gb < rmse_rf:
    best_model = best_gb_model
    best_model_name = 'Gradient Boosting'
    best_rmse = rmse_gb
else:
    best_model = best_rf_model
    best_model_name = 'Random Forest'
    best_rmse = rmse_rf

print(f"Best model: {best_model_name} (RMSE: {best_rmse:.2f})")

In [None]:
#Model Interpretation and Reporting

In [None]:
#The best-performing model provides insights into which features most influence house prices. By understanding these critical features, stakeholders such as real estate agents, property developers, and buyers can make informed decisions. The model's performance metrics confirm its reliability and accuracy in predicting house prices, making it a valuable tool in the real estate market.

In [None]:
#Interpreting Feature Importances
#Area: High importance might indicate that larger areas significantly increase house prices.
#Bedrooms/Bathrooms: More bedrooms and bathrooms might correspond to higher house prices, indicating their value to buyers.
#Parking: Availability of parking space can be a crucial factor in urban areas.
#Airconditioning/Hotwaterheating: These features can enhance living conditions, thus increasing house prices.
#Mainroad/Guestroom/Basement: Presence of these features might contribute to higher property values due to added convenience or space.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import GradientBoostingRegressor

# Read the housing data
df = pd.read_csv('C:/Users/USER/Downloads/Housing.csv')

# Column groups handled by the two sub-pipelines
numerical_features = ['area', 'bedrooms', 'bathrooms', 'stories', 'parking']
nominal_features = [
    'mainroad', 'guestroom', 'basement', 'hotwaterheating',
    'airconditioning', 'prefarea', 'furnishingstatus',
]

# Nominal columns: most-frequent imputation, then dense one-hot encoding
# that ignores categories not seen at fit time
onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
nominal_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', onehot_encoder),
]
nominal_transformer = Pipeline(steps=nominal_steps)

# Numeric columns: median imputation, then min-max scaling
# (swap in StandardScaler() for standardization if needed)
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler()),
]
numerical_transformer = Pipeline(steps=numeric_steps)

# Single preprocessor that routes each column group through its pipeline
column_routes = [
    ('num', numerical_transformer, numerical_features),
    ('nominal', nominal_transformer, nominal_features),
]
full_preprocessor = ColumnTransformer(transformers=column_routes)

# Split data into features and target variable
X = df.drop('price', axis=1)  # Features
y = df['price']  # Target variable

# Encode the target variable if it's categorical
if y.dtype == 'object':  # Check if target is categorical
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)
else:
    y_encoded = y

# Split the RAW rows first. Fitting the imputers/scaler/encoder before the
# split would let statistics of the test rows leak into the training features.
# The same test_size and random_state select the same rows as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42)

# Fit the preprocessor on the training rows only, then apply it to both parts
X_train = full_preprocessor.fit_transform(X_train_raw)
X_test = full_preprocessor.transform(X_test_raw)

# Full matrix kept under its original name for any code that still reads it;
# it is produced with the train-fitted preprocessor.
X_transformed = full_preprocessor.transform(X)

# Train the best model (example with GradientBoostingRegressor and example
# parameters). random_state is fixed so the metrics and the feature
# importances derived from this model are reproducible.
best_model = GradientBoostingRegressor(n_estimators=200, learning_rate=0.1, max_depth=5, random_state=42)
best_model.fit(X_train, y_train)

# Predict on the held-out test set
y_pred = best_model.predict(X_test)

# Calculate performance metrics
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Plain string: there is nothing to interpolate in the heading
print("Best Model: Gradient Boosting")
print(f"  RMSE: {rmse:.2f}")
print(f"  MAE: {mae:.2f}")
print(f"  R²: {r2:.2f}")

# Importances reported by the fitted model, one per transformed column
feature_importances = best_model.feature_importances_

# Fitted one-hot encoder, looked up inside the 'nominal' sub-pipeline
nominal_pipeline = full_preprocessor.named_transformers_['nominal']
fitted_onehot_encoder = nominal_pipeline.named_steps['onehot']

# Column labels in transformer order: numeric names first, then the
# one-hot names generated from the nominal features
onehot_feature_names = fitted_onehot_encoder.get_feature_names_out(nominal_features)
feature_names = numerical_features + list(onehot_feature_names)

# Table of (feature, importance), largest importance first
importance_table = {'Feature': feature_names, 'Importance': feature_importances}
importance_df = pd.DataFrame(importance_table)
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# Horizontal bar chart; the y-axis is inverted so the top feature is on top
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(importance_df['Feature'], importance_df['Importance'])
ax.invert_yaxis()
ax.set_title('Feature Importances in Gradient Boosting Model')
ax.set_xlabel('Importance')
plt.show()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import GradientBoostingRegressor

# Read the housing data
df = pd.read_csv('C:/Users/USER/Downloads/Housing.csv')

# Column groups handled by the two sub-pipelines
numerical_features = ['area', 'bedrooms', 'bathrooms', 'stories', 'parking']
nominal_features = [
    'mainroad', 'guestroom', 'basement', 'hotwaterheating',
    'airconditioning', 'prefarea', 'furnishingstatus',
]

# Nominal columns: most-frequent imputation, then dense one-hot encoding
# that ignores categories not seen at fit time
onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
nominal_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', onehot_encoder),
]
nominal_transformer = Pipeline(steps=nominal_steps)

# Numeric columns: median imputation, then min-max scaling
# (swap in StandardScaler() for standardization if needed)
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler()),
]
numerical_transformer = Pipeline(steps=numeric_steps)

# Single preprocessor that routes each column group through its pipeline
column_routes = [
    ('num', numerical_transformer, numerical_features),
    ('nominal', nominal_transformer, nominal_features),
]
full_preprocessor = ColumnTransformer(transformers=column_routes)

# Split data into features and target variable
X = df.drop('price', axis=1)  # Features
y = df['price']  # Target variable

# Encode the target variable if it's categorical
if y.dtype == 'object':  # Check if target is categorical
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)
else:
    y_encoded = y

# Split the RAW rows first. Fitting the imputers/scaler/encoder before the
# split would let statistics of the test rows leak into the training features.
# The same test_size and random_state select the same rows as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42)

# Fit the preprocessor on the training rows only, then apply it to both parts
X_train = full_preprocessor.fit_transform(X_train_raw)
X_test = full_preprocessor.transform(X_test_raw)

# Full preprocessed matrix, still needed by the correlation heatmap further
# down; it is produced with the train-fitted preprocessor.
X_transformed = full_preprocessor.transform(X)

# Train the best model (example with GradientBoostingRegressor and example
# parameters). random_state is fixed so the metrics and the plots derived
# from this model are reproducible.
best_model = GradientBoostingRegressor(n_estimators=200, learning_rate=0.1, max_depth=5, random_state=42)
best_model.fit(X_train, y_train)

# Predict on the held-out test set
y_pred = best_model.predict(X_test)

# Calculate performance metrics
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Plain string: there is nothing to interpolate in the heading
print("Best Model: Gradient Boosting")
print(f"  RMSE: {rmse:.2f}")
print(f"  MAE: {mae:.2f}")
print(f"  R²: {r2:.2f}")

# Importances reported by the fitted model, one per transformed column
feature_importances = best_model.feature_importances_

# Fitted one-hot encoder, looked up inside the 'nominal' sub-pipeline
nominal_pipeline = full_preprocessor.named_transformers_['nominal']
fitted_onehot_encoder = nominal_pipeline.named_steps['onehot']

# Column labels in transformer order: numeric names first, then the
# one-hot names generated from the nominal features
onehot_feature_names = fitted_onehot_encoder.get_feature_names_out(nominal_features)
feature_names = numerical_features + list(onehot_feature_names)

# Table of (feature, importance), largest importance first
importance_table = {'Feature': feature_names, 'Importance': feature_importances}
importance_df = pd.DataFrame(importance_table)
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# Horizontal bar chart; the y-axis is inverted so the top feature is on top
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(importance_df['Feature'], importance_df['Importance'])
ax.invert_yaxis()
ax.set_title('Feature Importances in Gradient Boosting Model')
ax.set_xlabel('Importance')
plt.show()

# Preprocessed features as a labelled frame, with the target appended as 'price'
X_preprocessed_df = pd.DataFrame(X_transformed, columns=feature_names)
X_preprocessed_df['price'] = y_encoded

# Correlation heatmap over all preprocessed columns plus the target
correlation_matrix = X_preprocessed_df.corr()
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5, ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

# Actual vs. predicted prices; the dashed red diagonal marks perfect predictions
actual_low, actual_high = min(y_test), max(y_test)
diagonal = [actual_low, actual_high]
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred, alpha=0.3)
ax.plot(diagonal, diagonal, color='red', linestyle='--')
ax.set_title('Actual vs. Predicted Values')
ax.set_xlabel('Actual Prices')
ax.set_ylabel('Predicted Prices')
plt.show()

# Residuals (actual minus predicted) against the predictions; the dashed red
# line marks zero error
residuals = y_test - y_pred
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_pred, residuals, alpha=0.3)
ax.axhline(y=0, color='red', linestyle='--')
ax.set_title('Residual Plot')
ax.set_xlabel('Predicted Prices')
ax.set_ylabel('Residuals')
plt.show()

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline

# Load the dataset
df = pd.read_csv('C:/Users/USER/Downloads/Housing.csv')

# Split the columns by dtype: numeric columns are scaled, object (string) columns are one-hot encoded
numerical_features = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = df.select_dtypes(include=['object']).columns.tolist()

# The target must never be passed to the model as a feature
if 'price' in numerical_features:
    numerical_features.remove('price')
if 'price' in categorical_features:
    categorical_features.remove('price')

# Define the preprocessing for numerical data (scaling)
numerical_transformer = StandardScaler()

# Define the preprocessing for categorical data (one-hot encoding);
# handle_unknown='ignore' encodes categories unseen during fit as all zeros instead of raising
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Split the data into features and target
X = df.drop('price', axis=1)
y = df['price']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the model. The seed matters: the grid below includes subsample=0.8, which makes
# boosting stochastic, so without random_state the selected parameters and the reported
# scores would change from run to run.
gbr = GradientBoostingRegressor(random_state=42)

# Create a pipeline that first transforms the data then fits the model, so the scaler and
# encoder are re-fitted inside every cross-validation fold
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', gbr)
])

# Define the parameter grid for Gradient Boosting Regressor
# (the 'model__' prefix routes each entry to the pipeline step named 'model')
param_grid = {
    'model__n_estimators': [100, 200, 300],
    'model__learning_rate': [0.01, 0.1, 0.2],
    'model__max_depth': [3, 4, 5],
    'model__subsample': [0.8, 1.0],
    'model__min_samples_split': [2, 5, 10]
}

# Set up the GridSearchCV with 5-fold cross-validation
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, n_jobs=-1, scoring='neg_mean_squared_error')

# Fit the model to the training data
grid_search.fit(X_train, y_train)

# Get the best parameters from GridSearchCV
best_params = grid_search.best_params_
print("Best parameters found: ", best_params)

# best_estimator_ is the whole pipeline (preprocessor + model) refitted with the best parameters
best_model = grid_search.best_estimator_

# Evaluate the best model on the test set
y_pred = best_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Best Model Performance on Test Set: RMSE={rmse}, MAE={mae}, R²={r2}")

In [None]:
#Final Evaluation
# Import necessary libraries
import joblib
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# best_model, X_test and y_test are expected to exist already from an earlier cell

# Score the final model on the held-out rows
y_pred = best_model.predict(X_test)

# Error metrics: RMSE is the square root of the mean squared error
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report the three metrics rounded to two decimals, one per line
print("Final Model Performance on Test Set:")
print(f"RMSE: {rmse:.2f}\nMAE: {mae:.2f}\nR²: {r2:.2f}")

# Persist the complete fitted model object
joblib.dump(value=best_model, filename='final_model.joblib')
print("Model saved as 'final_model.joblib'")

# Persist only the 'preprocessor' step of the pipeline as its own artifact
joblib.dump(value=best_model.named_steps['preprocessor'], filename='preprocessor.joblib')
print("Preprocessor saved as 'preprocessor.joblib'")

In [None]:
import os

# File whose presence is checked, relative to the current working directory
file_path = 'house_price_model.pkl'

# Show where relative paths are being resolved from
print("Current working directory:", os.getcwd())

# Report whether the file is present
print(f"File '{file_path}' {'found' if os.path.exists(file_path) else 'not found'}.")

In [None]:
import joblib
from sklearn.ensemble import RandomForestRegressor

# Example model: constructed with fixed hyper-parameters but not fitted in this cell
model = RandomForestRegressor(max_depth=10, n_estimators=100)

# Write the estimator object to disk in the current working directory
joblib.dump(value=model, filename='house_price_model.pkl')

In [None]:
import joblib
import os
from sklearn.ensemble import RandomForestRegressor

# Destination file, relative to the current working directory
model_path = 'house_price_model.pkl'

# Example model: constructed with fixed hyper-parameters but not fitted in this cell
model = RandomForestRegressor(max_depth=10, n_estimators=100)

# Save the model
joblib.dump(value=model, filename=model_path)

# Confirm that the file now exists on disk
if not os.path.exists(model_path):
    print(f"{model_path} was not saved.")
else:
    print(f"{model_path} has been successfully saved.")

In [None]:
import joblib
from sklearn.ensemble import RandomForestRegressor

# Example model: constructed with fixed hyper-parameters but not fitted in this cell
model = RandomForestRegressor(max_depth=10, n_estimators=100)

# Save to an absolute location (raw string so the backslashes are taken literally)
model_path = r'C:\Users\USER\Documents\house_price_model.pkl'
joblib.dump(value=model, filename=model_path)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
# OneHotEncoder is used below; importing it here keeps the cell runnable on a fresh kernel
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib
import pandas as pd

# Example DataFrame
df = pd.read_csv('C:/Users/USER/Downloads/Housing.csv')

# Split the data into features and target
X = df.drop('price', axis=1)
y = df['price']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Columns routed to each preprocessing branch
numerical_features = ['area', 'bedrooms', 'bathrooms', 'stories', 'parking']
categorical_features = ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea', 'furnishingstatus']

# Define transformers: scale the numeric columns, one-hot encode the categorical ones
# (handle_unknown='ignore' encodes categories unseen during fit as all zeros instead of raising)
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Combine into a preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Create the pipeline; the forest is seeded so that repeated runs produce the same model
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

# Train the model
model.fit(X_train, y_train)

# Save the fitted pipeline (preprocessor and regressor together)
joblib.dump(model, 'model.pkl')

In [None]:
import joblib
import pandas as pd

# Load the trained model.
# joblib.load reads the object back from disk. The previous joblib.dump call here wrote the
# file again and rebound `model` to dump's return value (a list of file names), so later
# cells no longer had an estimator to work with.
# Only load files produced by this notebook: joblib files are pickle-based and can execute
# code when loaded.
try:
    model = joblib.load('model.pkl')
    print(f"Model type: {type(model)}")
except FileNotFoundError:
    print("Error: 'model.pkl' not found. Please check the file path.")
    # Re-raise rather than exit(1): exiting from a notebook cell shuts the kernel down
    raise
except Exception as e:
    print(f"Error loading model: {e}")
    raise

In [None]:
import joblib

# Write whatever `model` currently refers to into 'model.joblib' (current working directory)
joblib.dump(value=model, filename='model.joblib')

In [None]:
import joblib

# Read the object stored in 'model.joblib' back into `model` and show its type
model = joblib.load(filename='model.joblib')
print("Loaded model type: {}".format(type(model)))

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
import joblib

# Load the dataset
file_path = 'C:/Users/USER/Downloads/Housing.csv'
data = pd.read_csv(file_path)

# Example preprocessing
# Identify categorical and numeric columns
categorical_features = ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea', 'furnishingstatus']
numeric_features = ['area', 'bedrooms', 'bathrooms', 'stories', 'parking']

# Create transformers for preprocessing: scale the numeric columns, one-hot encode the
# categorical ones (categories unseen during fit are encoded as all zeros)
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Combine preprocessing steps into a ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Define the model pipeline; the forest is seeded so that the saved artifact can be
# reproduced exactly by re-running this cell
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

# Prepare data
X = data.drop('price', axis=1)  # Assuming 'price' is your target variable
y = data['price']

# Fit the model on every row: no hold-out split is made in this cell
model.fit(X, y)

# Save the fitted pipeline
joblib.dump(model, 'model.joblib')

In [None]:
import pandas as pd
import joblib

# Read the saved pipeline back from 'model.joblib'; predict_from_input below uses it
model = joblib.load(filename='model.joblib')

def predict_from_input(area, bedrooms, bathrooms, stories, parking,
                       mainroad, guestroom, basement, hotwaterheating,
                       airconditioning, prefarea, furnishingstatus,
                       estimator=None):
    """Predict the price of a single house described by individual feature values.

    The twelve feature arguments are placed, in the order given, into a one-row
    DataFrame whose column names match the argument names, and that frame is passed
    to the estimator's ``predict`` method.

    Parameters
    ----------
    area, bedrooms, bathrooms, stories, parking
        Values for the numeric feature columns.
    mainroad, guestroom, basement, hotwaterheating, airconditioning, prefarea, furnishingstatus
        Values for the categorical feature columns.
    estimator : object with a ``predict(DataFrame)`` method, optional
        Model used for the prediction. When omitted, the module-level ``model``
        is used, which keeps existing calls working unchanged.

    Returns
    -------
    The first element of the estimator's prediction, i.e. the predicted price for the
    row. The value is also printed, as before; previously nothing was returned, so the
    result could not be used programmatically.
    """
    if estimator is None:
        estimator = model  # fall back to the notebook-level model

    # One record -> one row; dict order fixes the column order
    features = pd.DataFrame([{
        'area': area,
        'bedrooms': bedrooms,
        'bathrooms': bathrooms,
        'stories': stories,
        'parking': parking,
        'mainroad': mainroad,
        'guestroom': guestroom,
        'basement': basement,
        'hotwaterheating': hotwaterheating,
        'airconditioning': airconditioning,
        'prefarea': prefarea,
        'furnishingstatus': furnishingstatus,
    }])

    # Make predictions
    prediction = estimator.predict(features)
    predicted_price = prediction[0]
    print(f'Predicted price: {predicted_price}')
    return predicted_price

# Example usage: positional arguments follow the function's parameter order
# (area, bedrooms, bathrooms, stories, parking, mainroad, guestroom, basement,
#  hotwaterheating, airconditioning, prefarea, furnishingstatus).
# NOTE(review): the last argument (furnishingstatus) is 'no', which looks like a yes/no flag
# rather than a furnishing category — confirm against the categories in the training data.
predict_from_input(1500, 3, 2, 2, 1, 'yes', 'no', 'no', 'no', 'yes', 'yes', 'no')

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# `model`, X_test and y_test are expected to exist already from earlier cells
y_pred = model.predict(X_test)

# Evaluate the three metrics in the order MAE, MSE, R²
mae, mse, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

# One line per metric, unrounded
print(f"Mean Absolute Error: {mae}\nMean Squared Error: {mse}\nR-squared: {r2}")

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of `model` on X and y (all three come from earlier cells).
# The scorer returns negated MSE, so the sign is flipped when printing.
cv_scores = cross_val_score(estimator=model, X=X, y=y, cv=5, scoring='neg_mean_squared_error')

print("Cross-Validation Scores: {}".format(-cv_scores))
print("Mean CV Score: {}".format(-cv_scores.mean()))

In [None]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import joblib

# Load and prepare the data
data = pd.read_csv('C:/Users/USER/Downloads/Housing.csv')

# Define feature columns and target
X = data[['area', 'bedrooms', 'bathrooms', 'stories', 'parking', 'mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea', 'furnishingstatus']]
y = data['price']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define preprocessing steps: scale the numeric columns, one-hot encode the categorical ones.
# handle_unknown='ignore' matches the other pipelines in this notebook: a category that was
# not present when the encoder was fitted is encoded as all zeros instead of making
# predict() raise.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['area', 'bedrooms', 'bathrooms', 'stories', 'parking']),
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea', 'furnishingstatus'])
    ])

# Define the pipeline; the forest is seeded so that repeated runs produce the same model
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(random_state=42))
])

# Train the model
pipeline.fit(X_train, y_train)

# Save the fitted pipeline (this overwrites any existing 'model.joblib')
joblib.dump(pipeline, 'model.joblib')

In [None]:
import pandas as pd
import joblib

# Read the saved pipeline back from disk
pipeline = joblib.load(filename='model.joblib')

# Single-row input built from one record; the keys are the feature column names
input_data = pd.DataFrame([{
    'area': 1500,
    'bedrooms': 3,
    'bathrooms': 2,
    'stories': 2,
    'parking': 1,
    'mainroad': 'yes',  # categorical values are passed as plain strings
    'guestroom': 'no',
    'basement': 'no',
    'hotwaterheating': 'no',
    'airconditioning': 'yes',
    'prefarea': 'yes',
    'furnishingstatus': 'furnished',  # Example category name
}])

# No manual encoding is done here: the frame is handed to the pipeline as-is

# Predict, reporting any failure as a message instead of a traceback
try:
    predictions = pipeline.predict(input_data)
    print(f'Predicted price: {predictions[0]}')
except Exception as e:
    print(f'An error occurred: {e}')

In [None]:
# Implementing a feedback loop: this involves several steps, including capturing new data, retraining the model, and updating the system with the improved model.
# 1. Capture new data: you'll need a way to capture and store new data. This could be through a user interface, an automated data collection system, or other means.
import pandas as pd

# Function to capture new data
def capture_new_data(data_source):
    """Read newly collected records from a CSV source.

    ``data_source`` is handed straight to ``pandas.read_csv``; the resulting
    DataFrame is returned unchanged.
    """
    return pd.read_csv(data_source)  # Replace with your actual data source

# Example usage: read the new-data CSV and preview its first rows.
# NOTE(review): the path points inside a Temp directory, so the file may not exist on a
# later run — confirm and move the CSV to a stable location.
data_source = 'C:/Users/USER/AppData/Local/Temp/Rar$DI80.288/nigeria_houses_data.csv'
new_data = capture_new_data(data_source=data_source)
print(new_data.head(n=5))

In [None]:
#Update the Training Dataset
#Combine the new data with existing training data.
# Load existing training data
existing_data = pd.read_csv('C:/Users/USER/Downloads/Housing.csv')

# Capture new data
new_data = capture_new_data('C:/Users/USER/AppData/Local/Temp/Rar$DI80.288/nigeria_houses_data.csv')

# Combine datasets. ignore_index=True renumbers the rows 0..n-1; without it both inputs keep
# their own 0-based labels and the combined frame ends up with duplicate index values.
# Columns present in only one of the two files are filled with NaN for the other file's rows.
updated_data = pd.concat([existing_data, new_data], ignore_index=True)

# Optionally, save the updated dataset (index=False: the row labels are not written)
updated_data.to_csv('updated_training_data.csv', index=False)

In [None]:
#Retrain the Model
#Retrain model using the updated dataset. 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
import joblib

# Path to your CSV file
file_path = 'C:/Users/USER/AppData/Local/Temp/Rar$DI80.288/nigeria_houses_data.csv'

# Load the data
# NOTE(review): only the new CSV is loaded here, not the combined
# 'updated_training_data.csv' written by the previous cell — confirm which dataset the
# retraining is meant to use.
new_data = pd.read_csv(file_path)

# Identify and print non-numeric columns
non_numeric_cols = new_data.select_dtypes(include=['object']).columns
print("Non-numeric columns:", non_numeric_cols)

# Convert categorical columns to integer codes, keeping one fitted encoder per column so
# the same mapping can be applied to new rows later
label_encoders = {}
for col in non_numeric_cols:
    le = LabelEncoder()
    new_data[col] = le.fit_transform(new_data[col])
    label_encoders[col] = le

# Assuming 'price' is the target variable
X = new_data.drop('price', axis=1)
y = new_data['price']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the model; the forest is seeded so that retraining on the same data
# reproduces the same model
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Save the retrained model (the label encoders are kept in memory only)
joblib.dump(model, 'path_to_your_saved_model.pkl')

print("Model retrained and saved successfully.")

In [None]:
#Update the System
#Update system to use the new model for predictions.
import os

# Report whether the retrained model file is present (path relative to the working directory)
file_path = 'path_to_your_saved_model.pkl'
print(f"File found: {file_path}" if os.path.isfile(file_path) else f"File not found: {file_path}")


In [None]:
# Write the current `model` object to 'path_to_your_saved_model.pkl'
joblib.dump(value=model, filename='path_to_your_saved_model.pkl')

In [None]:
import joblib
import os

# Location of the retrained model file
model_path = 'path_to_your_saved_model.pkl'

# Stop immediately when the file is missing
if not os.path.isfile(model_path):
    raise FileNotFoundError(f"Model file not found: {model_path}")

print("File found:", model_path)
try:
    # Replace the in-memory model with the one stored on disk
    model = joblib.load(model_path)
    print("Model loaded successfully.")
except Exception as e:
    # A failed load is only reported; `model` keeps whatever value it had before
    print("Error loading model:", e)

In [None]:
# Show the class of the object currently bound to `model`
print(type(model))

In [None]:
def make_predictions(new_data):
    """Run the notebook-level ``model`` on ``new_data``.

    Returns whatever ``model.predict`` returns. If prediction raises, the error is
    printed and ``None`` is returned instead.
    """
    # No preprocessing is applied here; new_data is passed to the model as received
    try:
        predictions = model.predict(new_data)
    except Exception as e:
        print("Error making predictions:", e)
        return None
    return predictions

In [None]:
import pandas as pd

# One sample row, built from a single record (keys become the column names)
sample_data = pd.DataFrame([{
    'bedrooms': 3,
    'bathrooms': 2,
    'toilets': 2,
    'parking_space': 1,
    'title': 'Sample Title',
    'town': 'Sample Town',
    'state': 'Sample State',
}])

# Run the sample through make_predictions and show the result
predictions = make_predictions(sample_data)
print(predictions)

In [None]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# Example label encoders (these should match those used during training)
# NOTE(review): every LabelEncoder() below is newly constructed and is not fitted anywhere
# in this cell, and this assignment rebinds `label_encoders`, replacing the dict of fitted
# encoders built in the retraining cell earlier in the notebook. Calling .transform on
# these instances is expected to fail until they are fitted — confirm, and reuse the
# trained encoders instead.
label_encoders = {
    'title': LabelEncoder(),
    'town': LabelEncoder(),
    'state': LabelEncoder()
}

# Function to preprocess new data
def preprocess_new_data(new_data, encoders=None):
    """Encode the categorical columns of ``new_data`` with per-column encoders.

    Parameters
    ----------
    new_data : pandas.DataFrame
        Rows to prepare for prediction. The frame is not modified.
    encoders : dict mapping column name -> object with ``transform``, optional
        Encoders to apply. When omitted, the notebook-level ``label_encoders``
        dict is used, which keeps existing calls working unchanged.

    Returns
    -------
    pandas.DataFrame
        A copy of ``new_data`` in which every column that has an encoder is replaced
        by ``encoder.transform(column)``. Columns without an encoder, and encoders
        whose column is absent, are left alone.
    """
    if encoders is None:
        encoders = label_encoders

    # Work on a copy: encoding the caller's frame in place would turn its string columns
    # into codes, so passing the same frame a second time would fail or encode garbage
    processed = new_data.copy()

    # Convert categorical columns to numerical using the label encoders
    for column, encoder in encoders.items():
        if column in processed.columns:
            processed[column] = encoder.transform(processed[column])
    return processed

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
import joblib

# Load training data
train_data = pd.read_csv('C:/Users/USER/updated_training_data.csv')

# Define features and target
features = ['title', 'town', 'state', 'area', 'bedrooms', 'bathrooms', 'stories', 'mainroad', 'guestroom', 
            'basement', 'hotwaterheating', 'airconditioning', 'parking', 'prefarea', 'furnishingstatus', 'toilets', 
            'parking_space']
target = 'price'

# Keep only the requested features that are actually present in the file
existing_features = [col for col in features if col in train_data.columns]
if not existing_features:
    raise ValueError("None of the features are in the dataset columns")

# .copy() so the assignments below modify X and not a view of train_data
X = train_data[existing_features].copy()
y = train_data[target]

# Separate numerical and categorical features (by dtype, before any encoding)
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# Impute numerical features with mean
# (the imputers are fitted on all rows, i.e. before the train/test split below)
num_imputer = SimpleImputer(strategy='mean')
X[numerical_features] = num_imputer.fit_transform(X[numerical_features])

# Impute categorical features with the most frequent value
cat_imputer = SimpleImputer(strategy='most_frequent')
X[categorical_features] = cat_imputer.fit_transform(X[categorical_features])

# Encode categorical features, keeping one fitted encoder per column.
# Plain column assignment replaces the column, so it takes the integer dtype of the codes;
# assigning through X.loc[:, column] can write the codes into the existing object-dtype
# column instead.
label_encoders = {}
for column in categorical_features:
    le = LabelEncoder()
    X[column] = le.fit_transform(X[column])
    label_encoders[column] = le

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the model; the forest is seeded so that repeated runs produce the
# same model
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Save the model and the fitted preprocessing objects needed to prepare new rows
joblib.dump(model, 'model.pkl')
joblib.dump(label_encoders, 'label_encoders.pkl')
joblib.dump(num_imputer, 'num_imputer.pkl')
joblib.dump(cat_imputer, 'cat_imputer.pkl')

# Load them back, so the prediction helpers below use the objects as stored on disk
model = joblib.load('model.pkl')
label_encoders = joblib.load('label_encoders.pkl')
num_imputer = joblib.load('num_imputer.pkl')
cat_imputer = joblib.load('cat_imputer.pkl')

def preprocess_new_data(new_data):
    """Apply the training-time imputation and label encoding to new rows.

    Uses the notebook-level ``numerical_features``, ``categorical_features``,
    ``num_imputer``, ``cat_imputer`` and ``label_encoders``.

    Parameters
    ----------
    new_data : pandas.DataFrame
        Rows to prepare; must contain the numerical and categorical feature
        columns. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``new_data`` with missing values imputed and every column that
        has a label encoder replaced by its integer codes. Values the encoder has
        not seen are mapped to the encoder's first class before encoding.
    """
    # Work on a copy. Encoding the caller's frame in place meant that a second call on the
    # same frame saw integer codes instead of the original labels, treated all of them as
    # unseen, and silently mapped every value to the fallback class.
    processed = new_data.copy()

    # Impute missing values
    processed[numerical_features] = num_imputer.transform(processed[numerical_features])
    processed[categorical_features] = cat_imputer.transform(processed[categorical_features])

    # Encode categorical features
    for column, encoder in label_encoders.items():
        if column in processed.columns:
            # Handle unseen labels: keep known values, substitute the first known class otherwise
            fallback = encoder.classes_[0]
            is_known = processed[column].isin(encoder.classes_)
            processed[column] = encoder.transform(processed[column].where(is_known, fallback))

    return processed

def make_predictions(new_data):
    """Preprocess ``new_data`` and run the notebook-level ``model`` on it.

    Returns the model's predictions. If ``model.predict`` raises, the error is
    printed and ``None`` is returned. Errors raised while preprocessing are not
    caught here.
    """
    prepared = preprocess_new_data(new_data)
    try:
        predictions = model.predict(prepared)
    except Exception as e:
        print(f"Error making predictions: {e}")
        return None
    return predictions

# Example new data for prediction: one record, keys are the feature column names
new_data = pd.DataFrame([{
    'title': 'Sample Title',
    'town': 'Sample Town',
    'state': 'Sample State',
    'area': 1000,
    'bedrooms': 3,
    'bathrooms': 2,
    'stories': 1,
    'mainroad': 0,
    'guestroom': 1,
    'basement': 0,
    'hotwaterheating': 1,
    'airconditioning': 0,
    'parking': 1,
    'prefarea': 1,
    'furnishingstatus': 0,
    'toilets': 2,
    'parking_space': 1,
}])

# Run the row through preprocessing and the model, then show the result
predictions = make_predictions(new_data)
print(predictions)

In [None]:
#Task 5.3: Write a comprehensive report summarizing the project, including the methodology, results, and conclusions.