# Lab Task: Simple and Multiple Linear Regression

## Objective
Predict house prices based on dataset features using Linear Regression.

In [None]:
# --- Task 1: Data Loading and Exploration ---

# 1. Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# 2. Read the housing data from the CSV file into a DataFrame
df = pd.read_csv('usa_housing.csv')

# 3. Preview the first 5 rows to check the data structure
print("Data Sample:", df.head(5), sep="\n")

# 4. Summary statistics for each numeric column (mean, std, min, max, ...)
print("\nData Statistics:", df.describe(), sep="\n")

# 5. Correlation heatmap.
#    Only numeric columns are kept, then their pairwise correlations are
#    computed and drawn with the value annotated in every cell.
numerical_df = df.select_dtypes(include=[np.number])
correlations = numerical_df.corr()

plt.figure(figsize=(10, 6))
sns.heatmap(correlations, cmap='coolwarm', annot=True)
plt.title("Correlation Matrix")
plt.show()

In [None]:
# --- Task 2: Simple Linear Regression ---
# One-feature baseline: Price modelled from 'Avg. Area Income' alone.

# 1. Target vector and single-column feature frame
#    (the double brackets keep X as a DataFrame rather than a Series).
y_simple = df['Price']
X_simple = df[['Avg. Area Income']]

# 2. Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_simple,
    y_simple,
    test_size=0.2,
    random_state=42,
)

# 3. Build and train the model in one step (fit() returns the estimator itself).
lm_simple = LinearRegression().fit(X_train, y_train)

# 4. Predict prices for the held-out rows.
predictions = lm_simple.predict(X_test)

# 5. Score the predictions against the true held-out prices.
r2_simple = r2_score(y_test, predictions)
mse = mean_squared_error(y_test, predictions)

print(
    "Simple Linear Regression Results:",
    f"Mean Squared Error: {mse}",
    f"R2 Score: {r2_simple}",
    sep="\n",
)

# 6. Plot the held-out points together with the fitted line.
plt.scatter(X_test, y_test, label='Actual Data', color='blue')
plt.plot(X_test, predictions, label='Prediction Line', color='red')
plt.title('Simple Linear Regression')
plt.ylabel('Price')
plt.xlabel('Avg. Area Income')
plt.legend()
plt.show()

In [None]:
# --- Task 3: Multiple Linear Regression ---
# Predicting Price using ALL numerical features

# 1. Select Features and Target (Price).
#    Keep every numeric column except the target, instead of dropping
#    'Address' by name. This matches the "all numerical features" intent,
#    excludes any other text column as well, and does not raise a KeyError
#    when the file has no 'Address' column.
X_multi = df.select_dtypes(include=[np.number]).drop(columns=['Price'])
y_multi = df['Price']

# 2. Split data into Training (70%) and Testing (30%); fixed seed for repeatability
X_train_m, X_test_m, y_train_m, y_test_m = train_test_split(X_multi, y_multi, test_size=0.3, random_state=42)

# 3. Train the model
lm_multi = LinearRegression()
lm_multi.fit(X_train_m, y_train_m)

# 4. Make predictions on the held-out rows
predictions_multi = lm_multi.predict(X_test_m)

# 5. Evaluate performance on the held-out rows
mse_multi = mean_squared_error(y_test_m, predictions_multi)
r2_multi = r2_score(y_test_m, predictions_multi)

print("Multiple Linear Regression Results:")
print(f"Mean Squared Error: {mse_multi}")
print(f"R2 Score: {r2_multi}")

# 6. Show the fitted coefficients, one row per feature.
#    NOTE: each coefficient is the change in predicted Price per one-unit
#    change in that feature, so magnitudes are only comparable between
#    features measured on the same scale.
coeff_df = pd.DataFrame(lm_multi.coef_, index=X_multi.columns, columns=['Coefficient'])
print("\nFeature Coefficients:")
print(coeff_df)

In [None]:
# --- Task 4: Model Comparison ---

# Report both R2 scores side by side, formatted to four decimals.
for summary_line in (
    f"Simple Linear Regression R2:   {r2_simple:.4f}",
    f"Multiple Linear Regression R2: {r2_multi:.4f}",
):
    print(summary_line)

# The multiple model wins only with a strictly higher R2;
# every other outcome is reported in favour of the simple model.
if not (r2_multi > r2_simple):
    print("\nConclusion: Simple Linear Regression is better.")
else:
    print("\nConclusion: Multiple Linear Regression is better.")
    print("Reason: Using more relevant features helps the model predict prices more accurately.")

### Bonus Questions Answers

1. **What does R2 score tell you?**
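   It is the proportion of the variance in the target (Price) that the model explains. A score of 1.0 means perfect predictions, 0.0 means the model does no better than always predicting the mean price, and the score can be negative when the model does worse than that.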

2. **Why is Multiple Regression better?**
   It considers multiple factors (like Age of House, Number of Rooms) instead of just one (Income), so it can explain more of the variation in price and predict it more accurately.

3. **Strongest Feature?**
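   'Avg. Area Income' usually has the strongest effect on price. Note that raw coefficients are expressed per unit of each feature (per dollar of income vs. per room), so the largest coefficient does not by itself identify the most important feature; compare each feature's correlation with Price, or coefficients fitted on standardized features, instead.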

4. **Limitations of Linear Regression?**
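   It assumes a linear (straight-line) relationship between the features and the target. If the real relationship is curved (e.g., exponential), the model will fit poorly. It is also sensitive to outliers, because squared errors give extreme points a large influence on the fit.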