# Linear Regression Analysis on House Prices

In [1]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

# The real dataset has not been supplied yet. Once it is available as a
# preprocessed CSV, load it here instead of generating synthetic data:
# dataset = pd.read_csv('path_to_dataset.csv')


def _make_placeholder_dataset(n_rows):
    """Return a synthetic house-price DataFrame with ``n_rows`` rows.

    Every column is drawn from NumPy's global random state, in the order the
    columns appear below, so the caller must seed that state beforehand to get
    reproducible values. All columns (including ``SalePrice``) are sampled
    independently of one another.
    """
    columns = {}

    # Categorical zoning code, sampled uniformly from five labels.
    columns['MSZoning'] = np.random.choice(['RL', 'RM', 'FV', 'RH', 'C'], size=n_rows)
    # Normal draws are unbounded, so an occasional negative value is possible
    # for this and the other normally distributed columns.
    columns['LotFrontage'] = np.random.normal(60, 20, n_rows)
    columns['Street'] = np.random.choice(['Pave', 'Grvl'], size=n_rows)
    # randint excludes its upper bound: ratings span 1..9, years 1900..2020.
    columns['OverallQual'] = np.random.randint(1, 10, n_rows)
    columns['YearBuilt'] = np.random.randint(1900, 2021, n_rows)
    # 'NA' here is a literal string category, not a missing value.
    columns['BsmtQual'] = np.random.choice(['Ex', 'Gd', 'TA', 'Fa', 'Po', 'NA'], size=n_rows)
    columns['TotalBsmtSF'] = np.random.normal(1050, 400, n_rows)
    columns['GrLivArea'] = np.random.normal(1500, 500, n_rows)
    columns['GarageArea'] = np.random.normal(500, 200, n_rows)
    # Target column.
    columns['SalePrice'] = np.random.normal(200000, 50000, n_rows)

    return pd.DataFrame(columns)


# Demonstration data: fix the seed so every run produces the same frame.
np.random.seed(0)
dataset = _make_placeholder_dataset(1000)

# Preprocessing (placeholder): a full pipeline would also handle missing
# values and scale the numeric features. This demonstration performs only the
# one step the model cannot do without: encoding the categorical columns.

# Separate the target (y) from the predictor columns (X).
y = dataset['SalePrice']
X = dataset.drop(columns='SalePrice')

# One-hot encode the non-numeric columns. drop_first=True omits one indicator
# per categorical column, so the remaining indicators are not perfectly
# collinear with the model's intercept.
X_encoded = pd.get_dummies(X, drop_first=True)

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=0.2,
    random_state=0,
)

# Fit an ordinary least-squares linear model on the training rows
# (fit() returns the estimator itself, so construction and fitting chain).
model = LinearRegression().fit(X_train, y_train)

# Predict on the held-out rows and score those predictions.
y_pred = model.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

# Report the metrics. With the synthetic placeholder data, SalePrice is drawn
# independently of every feature, so an R-squared near (or slightly below)
# zero is the expected outcome rather than a sign of a modelling error.
print('Mean Squared Error:', mse)
print('R-squared Score:', r2)


Mean Squared Error: 2840845406.968538
R-squared Score: -0.04084064913057994
