# Linear Regression

## Objective
Build a linear regression model to predict real estate prices using numerical features. Evaluate performance using cross-validation and standard regression metrics.

# Import & Install Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
import hvplot.pandas




import warnings
# Suppress every warning for the rest of the session. This keeps cell output
# tidy, but it also hides deprecation warnings raised by the libraries above.
warnings.filterwarnings('ignore')
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline


# Checking Data

In [None]:
# Load the dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("Real estate.csv")

In [None]:
# Preview the first rows of the frame (head() shows 5 rows by default).
df.head()

In [None]:
# Dataset size as (number of rows, number of columns).
df.shape

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# Count the missing values in each column.
df.isnull().sum()

In [None]:
# Summary statistics for the columns.
df.describe()

In [None]:
# Pairwise correlation between columns (Pearson is the pandas default).
# NOTE(review): recent pandas versions raise here if any column is
# non-numeric — confirm the CSV is all-numeric.
df.corr()

# Exploratory Data Analysis (EDA)

In [None]:
# Annotated heatmap of the pairwise column correlations.
sns.heatmap(data=df.corr(), cmap='Reds', annot=True)
plt.show()

In [None]:
# Columns worth drawing as count plots: only those with a small number of
# distinct values. The previous filter (`if df[i].nunique()`) was true for any
# column holding at least one non-null value, so high-cardinality columns were
# selected as well and produced one bar per distinct value.
MAX_COUNTPLOT_CATEGORIES = 20  # upper bound on distinct values; tune per dataset
count_plot_fields = [
    col for col in df.columns
    if 0 < df[col].nunique() <= MAX_COUNTPLOT_CATEGORIES
]
count_plot_fields

In [None]:
# Draw one count plot per selected column on a grid that is 4 columns wide.
# Keep at least 4 rows (the original 4x4 layout) and add rows when there are
# more than 16 fields, which a fixed 4x4 grid cannot hold.
n_cols = 4
n_rows = max(4, -(-len(count_plot_fields) // n_cols))  # ceiling division
fig = plt.figure(figsize=(15, 14))
for position, column in enumerate(count_plot_fields, start=1):
    plt.subplot(n_rows, n_cols, position)
    sns.countplot(x=df[column])
# Adjust the spacing once, after all subplots exist, rather than per iteration.
if count_plot_fields:
    plt.tight_layout()
plt.show()

In [None]:
# Scatter-matrix of every column pair in the frame.
sns.pairplot(data=df)
plt.show()

# Training a Linear Regression Model

### X and y arrays

In [None]:
# Feature matrix: every column except the target.
# NOTE(review): any identifier-like column in the CSV stays in X — confirm
# that is intended.
X = df.drop(columns=['Y house price of unit area'])

# Target vector: the house price per unit area.
y = df.loc[:, 'Y house price of unit area']

In [None]:
# Report the dimensions of the feature matrix and the target vector.
print(
    "X=", X.shape,
    "\ny=", y.shape,
    sep=" ",
)

In [None]:
# Standardise the features to zero mean and unit variance.
# The scaler is fit on the full feature matrix, before any train/test split.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)


## Train Test Split
Now let's split the data into a training set and a testing set. We will train our model on the training set and then use the test set to evaluate the model.

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible. The split is taken from the unscaled feature matrix X.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Size of the training feature matrix as (rows, columns).
X_train.shape

In [None]:
# Size of the test feature matrix as (rows, columns).
X_test.shape

## Linear Regression

In [None]:
from sklearn.pipeline import make_pipeline

# Estimator that is fit on the training split in the next cell.
model = LinearRegression()

# 5-fold cross-validated R^2. Scaling lives inside the pipeline so the scaler
# is re-fit on the training part of each fold; previously the scores were
# computed on X_scaled, whose scaler had already seen every row (including the
# rows held out in each fold). cross_val_score clones the estimator, so the
# pipeline and `model` both stay unfitted here.
cv_pipeline = make_pipeline(StandardScaler(), LinearRegression())
cv_scores = cross_val_score(cv_pipeline, X, y, cv=5, scoring='r2')

In [None]:
# Fit the linear regression on the training split (unscaled features).
model.fit(X_train, y_train)

## Model Evaluation

In [None]:
# Fitted coefficients, one per feature column.
model.coef_

In [None]:
# Tabulate the fitted coefficients, one row per feature column of X.
# (The column header was previously misspelled as 'Coeficients'.)
pd.DataFrame(model.coef_, index=X.columns, columns=['Coefficients'])

## Predictions from model

In [None]:
# Predict the target for the held-out test rows.
y_pred = model.predict(X_test)

## Evaluating Model's Performance

### Regression Evaluation Metrics

#### 1. MSE (Mean Squared Error)
*  Measures the average of the squared differences between predicted and actual values.
* Significance: Penalizes larger errors more heavily. Lower is better.

#### 2. RMSE (Root Mean Squared Error)
* Square root of MSE. Same units as the target variable.
* Significance: Easier to interpret than MSE; shows the typical magnitude of the prediction error, with large errors weighted more heavily.

#### 3. **MAE (Mean Absolute Error)**
* Average of absolute differences between predicted and actual values.
* Significance: Less sensitive to outliers than MSE. Lower is better.

#### 4. **R² Score (Coefficient of Determination)**
* Measures how much of the variance in the target is explained by the model.
* Significance: At most 1, and negative when the model fits worse than always predicting the mean of the target. **Closer to 1 means a better fit.**


In [None]:
# Hold-out metrics for the fitted model on the test split.
mse = metrics.mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # same units as the target
r2 = metrics.r2_score(y_test, y_pred)
mae = metrics.mean_absolute_error(y_test, y_pred)

# cv_scores holds the per-fold scores from the earlier cross_val_score call.
print("Cross-validation scores:", cv_scores)
print("Mean cross-validation score:", np.mean(cv_scores))
print("Mean squared error [mse]:", mse)
print("Root mean squared error [rmse]:", rmse)
print("R^2 score [r2]:", r2)
# The label previously ended with a stray "mae" ("[mae]:mae").
print("Mean absolute error [mae]:", mae)

In [None]:
# Actual vs. predicted prices on the test split.
plt.figure(figsize=(8, 6))
# Label the artists so plt.legend() has entries to show; it was previously
# called with no labelled artists and produced an empty legend.
plt.scatter(y_test, y_pred, color='blue', alpha=0.7, label='predictions')

# Reference diagonal: a point on this line was predicted exactly.
axis_min = min(y_test.min(), y_pred.min())
axis_max = max(y_test.max(), y_pred.max())
plt.plot([axis_min, axis_max], [axis_min, axis_max],
         color='red', linestyle='--', label='perfect prediction')

plt.xlabel("actual house prices")
plt.ylabel("predicted house price")
plt.title("actual vs predicted house prices")
plt.legend()
plt.show()

## **Residual Histogram**

* **For Linear Regression it is a good idea to separately evaluate the residuals $(y-\hat{y})$ and not just calculate performance metrics (e.g. RMSE).**

* **The residual errors should be random and close to a normal distribution.**

In [None]:
# Residuals on the test split: actual value minus predicted value.
test_residual= y_test - y_pred

In [None]:
# Kernel density estimate of the test residuals (interactive hvplot chart).
test_residual.to_frame(name='Error Values').hvplot.kde()

In [None]:
# Histogram of the test residuals (25 bins) with a KDE curve overlaid.
sns.displot(data=test_residual, kde=True, bins=25)
plt.show()

In [None]:
# Residuals against the actual values, with a dashed red zero line as reference.
sns.scatterplot(x=y_test, y=test_residual)
plt.axhline(0, color='r', linestyle='--')
plt.show()