In [None]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import pickle


In [None]:
# Load the dataset.
# NOTE(review): absolute path on one specific machine — the cell only runs
# where this exact file exists. Consider a project-relative, configurable path.
DATA_PATH = 'C:/Users/16192/OneDrive/Documents/GitHub/ANA680Final/Data/test.csv'
data = pd.read_csv(DATA_PATH)

# Sanity check: (rows, columns) followed by the first ten rows.
print(data.shape, data.head(10), sep='\n')

In [None]:
# Guard: stop immediately if the loaded frame has no 'SalePrice' column,
# because every later cell uses it as the regression target.
target_present = 'SalePrice' in data.columns
if not target_present:
    raise ValueError("'SalePrice' column is missing from the data")

In [None]:
# Predictor columns (order kept as originally listed) and the column to predict
features = [
    'Neighborhood',
    'LotArea',
    'YearBuilt',
    'BldgType',
    'CentralAir',
    'GarageCars',
    'TotRmsAbvGrd',
    'FullBath',
    'HalfBath',
]
target = 'SalePrice'

# X: DataFrame restricted to the predictor columns; y: the target column
X, y = data[features], data[target]

In [None]:
# Column groups: each group is routed to its own preprocessing branch below
categorical_features = ['Neighborhood', 'BldgType', 'CentralAir']
numerical_features = ['LotArea', 'YearBuilt', 'GarageCars', 'TotRmsAbvGrd', 'FullBath', 'HalfBath']

# Numeric branch: fill missing values with the column median, then scale
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode; handle_unknown='ignore' avoids errors on categories
# that were not seen during fitting
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Apply each branch to its own column group
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

In [None]:
# Fit the preprocessor on the entire data
# NOTE(review): this same `preprocessor` object is placed inside the `model`
# pipeline in the next cell, and `model.fit(X_train, y_train)` fits that
# pipeline afterwards — so this full-data fit looks like it is superseded by
# a train-only fit. Confirm which fit the separately pickled preprocessor is
# meant to carry, and whether this cell is still needed.
preprocessor.fit(X)

In [None]:
# Chain preprocessing and the regressor so they are fitted and applied together
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
]
model = Pipeline(pipeline_steps)

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Train the model
# Fits the whole `model` pipeline (preprocessor step, then regressor) using
# only the training split produced by train_test_split.
model.fit(X_train, y_train)

In [None]:
# Score the fitted pipeline on the held-out split
y_pred = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f'Mean Squared Error: {mse}')

In [None]:
# Make predictions on the test set
# NOTE(review): this repeats the `model.predict(X_test)` call already made in
# the evaluation cell; it re-assigns `y_pred`, which the residual, plotting
# and printing cells below read.
y_pred = model.predict(X_test)

In [None]:
# Calculate residuals
# Actual minus predicted: a positive residual means the model under-predicted
# that row, a negative one means it over-predicted.
residuals = y_test - y_pred

In [None]:
# Scatter Plot of Actual vs. Predicted Prices
scatter_fig, scatter_ax = plt.subplots(figsize=(10, 6))
scatter_ax.scatter(y_test, y_pred, alpha=0.5)

# Dashed y = x reference line spanning the actual-price range; points lying on
# it are exact predictions
actual_range = [y_test.min(), y_test.max()]
scatter_ax.plot(actual_range, actual_range, 'k--', lw=3)

scatter_ax.set_xlabel('Actual')
scatter_ax.set_ylabel('Predicted')
scatter_ax.set_title('Actual vs Predicted Prices')
scatter_ax.grid(True)

# Save before showing; the file name is relative, so it is written to the
# current working directory
scatter_fig.savefig('actual_vs_predicted.png')
plt.show()

In [None]:
# Histogram of Residuals
hist_fig, hist_ax = plt.subplots(figsize=(10, 6))
hist_ax.hist(residuals, bins=30, edgecolor='k', alpha=0.7)

hist_ax.set_xlabel('Prediction Error (Residual)')
hist_ax.set_ylabel('Number of Houses')
hist_ax.set_title('Distribution of Prediction Errors')
hist_ax.grid(True)

# Save before showing; the file name is relative, so it is written to the
# current working directory
hist_fig.savefig('histogram_of_residuals.png')
plt.show()

In [None]:
# Show the first ten actual/predicted pairs side by side for a quick eyeball check
first_pairs = zip(y_test[:10], y_pred[:10])
for actual, predicted in first_pairs:
    print(f"Actual: {actual}, Predicted: {predicted}")

In [None]:
# Persist the fitted pipeline to disk with pickle.
# The file name is relative, so it is written to the current working directory.
model_path = 'house_price_model.pkl'
with open(model_path, mode='wb') as out_file:
    pickle.dump(model, file=out_file)


In [None]:
# Persist the preprocessor on its own, next to the pickled model file.
# The file name is relative, so it is written to the current working directory.
preprocessor_path = 'preprocessor.pkl'
with open(preprocessor_path, mode='wb') as out_file:
    pickle.dump(preprocessor, file=out_file)