This is a simple implementation of Linear Regression on the Boston Housing dataset.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Load the Boston Housing data from the CMU StatLib text file into `boston_df`.
# The file is free-form text rather than CSV: a prose header is followed by the
# records, and each record is wrapped across two physical lines.

# Step 1: Fetch the dataset as plain text
url = "http://lib.stat.cmu.edu/datasets/boston"
response = requests.get(url, timeout=30)  # do not hang the kernel on a dead server
response.raise_for_status()  # fail loudly instead of parsing an HTTP error page
data = response.text

# Step 2: Split the data into lines (splitlines also copes with \r\n endings)
lines = data.splitlines()

# Step 3: Locate the data section (after metadata)
# The first HEADER_LINE_COUNT lines are the prose description of the dataset.
HEADER_LINE_COUNT = 22
# Drop blank lines up front so a stray empty line cannot shift the pairing below.
data_lines = [line for line in lines[HEADER_LINE_COUNT:] if line.strip()]

# Column names are known from the metadata
columns = [
    "CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE", "DIS",
    "RAD", "TAX", "PTRATIO", "B", "LSTAT", "MEDV"
]

# Step 4: Process the data
# Data is split across two lines for each record
if len(data_lines) % 2 != 0:
    raise ValueError(
        f"Expected an even number of data lines (two per record), got {len(data_lines)}"
    )

data_array = []
for first_half, second_half in zip(data_lines[0::2], data_lines[1::2]):
    # Join both halves of the wrapped record and parse every field as a float
    combined = np.array(first_half.split() + second_half.split(), dtype=float)
    if combined.size != len(columns):
        raise ValueError(
            f"Record {len(data_array)} has {combined.size} values, expected {len(columns)}"
        )
    data_array.append(combined)

if not data_array:
    raise ValueError(f"No records could be parsed from {url}")

# Step 5: Convert to a DataFrame
boston_df = pd.DataFrame(data_array, columns=columns)

# Step 6: Display the DataFrame
boston_df.head()

 <pre>
 CRIM     per capita crime rate by town
 ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
 INDUS    proportion of non-retail business acres per town
 CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
 NOX      nitric oxides concentration (parts per 10 million)
 RM       average number of rooms per dwelling
 AGE      proportion of owner-occupied units built prior to 1940
 DIS      weighted distances to five Boston employment centres
 RAD      index of accessibility to radial highways
 TAX      full-value property-tax rate per $10,000
 PTRATIO  pupil-teacher ratio by town
 B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
 LSTAT    % lower status of the population
 MEDV     Median value of owner-occupied homes in $1000's
 </pre>

In [None]:
# Count the missing entries in every column of the dataset
missing_per_column = boston_df.isna().sum()
print(missing_per_column)

In [None]:
# Annotated heatmap of the pairwise correlations between all columns
correlations = boston_df.corr()
plt.figure(figsize=(10, 10))
sns.heatmap(data=correlations, annot=True)
plt.show()

In [None]:
# One point per row: average room count against median home value
rooms = boston_df['RM']
median_value = boston_df['MEDV']
plt.scatter(x=rooms, y=median_value, color='blue')
plt.title('Relationship between RM and MEDV')
plt.xlabel('Average Number of Rooms per Dwelling (RM)')
plt.ylabel('Median Value of Owner-occupied Homes (MEDV)')
plt.show()

In [None]:
# AGE is a continuous per-row proportion, so draw one point per row. A bar chart
# would stack hundreds of overlapping bars and hide the relationship.
plt.scatter(boston_df['AGE'], boston_df['MEDV'], color='green')
# AGE is the share of owner-occupied units built before 1940, not a property age
plt.xlabel('Proportion of Owner-occupied Units Built Prior to 1940 (AGE)')
plt.ylabel('Median Value of Owner-occupied Homes (MEDV)')
plt.title('Relationship between MEDV and Age')
plt.show()  # render the figure without echoing the Text repr of the last call

In [None]:
# INDUS is a continuous per-row proportion, so draw one point per row. A bar chart
# would stack overlapping bars at repeated x positions and hide the relationship.
plt.scatter(boston_df['INDUS'], boston_df['MEDV'], color='red')
plt.xlabel('Proportion of Non-Retail Business Acres per Town (INDUS)')
plt.ylabel('Median Value of Owner-occupied Homes (MEDV)')
plt.title('Relationship between INDUS and MEDV')
plt.show()  # render the figure without echoing the Text repr of the last call

In [None]:
# LSTAT is a continuous percentage, so draw one point per row. A bar chart
# would stack hundreds of overlapping bars and hide the relationship.
plt.scatter(boston_df['LSTAT'], boston_df['MEDV'], color='orange')
plt.xlabel('Lower Status of the Population (%)')
plt.ylabel('Median Value of Owner-occupied Homes (MEDV)')
plt.title('Relationship between LSTAT and MEDV')
plt.show()  # render the figure without echoing the Text repr of the last call

In [None]:
# Segregating into dependent and independent values
x = boston_df.drop('MEDV', axis=1)
y = boston_df['MEDV']

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Build the linear regression estimator and fit it on the training split
model = LinearRegression().fit(X_train, y_train)
model

In [None]:
# Score of the fitted model, evaluated on the held-out test split
test_score = model.score(X_test, y_test)
test_score

In [None]:
# Mean squared error between the true test targets and the model's predictions
mse_predictions = model.predict(X_test)
mean_squared_error(y_test, mse_predictions)

In [None]:
# r2_score between the true test targets and the model's predictions
r2_predictions = model.predict(X_test)
r2_score(y_test, r2_predictions)