In [None]:
import pandas as pd

In [None]:
# Read the housing CSV; the path is relative to the notebook's working directory.
csv_path = 'datasets/HousingData.csv'
df = pd.read_csv(csv_path)

In [None]:
%%capture
"""
CRIM - per capita crime rate by town
ZN - proportion of residential land zoned for lots over 25,000 sq.ft.
INDUS - proportion of non-retail business acres per town.
CHAS - Charles River dummy variable (1 if tract bounds river; 0 otherwise)
NOX - nitric oxides concentration (parts per 10 million)
RM - average number of rooms per dwelling
AGE - proportion of owner-occupied units built prior to 1940
DIS - weighted distances to five Boston employment centres
RAD - index of accessibility to radial highways
TAX - full-value property-tax rate per $10,000
PTRATIO - pupil-teacher ratio by town
B - 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
LSTAT - percentage of the population considered lower status
MEDV - Median value of owner-occupied homes in $1000's
"""

In [None]:
# Preview the first five rows to sanity-check the loaded columns.
df.head(n=5)

In [None]:
# Print a concise summary of the frame: columns, non-null counts, dtypes and memory usage.
df.info()

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Drop every row that contains at least one missing value. `df` is rebound to
# the cleaned copy instead of using `inplace=True`: in-place mutation silently
# changes a frame that earlier cells already displayed, and it brings no
# performance benefit.
df = df.dropna()

# Confirm the cleanup: every per-column missing count should now be 0.
df.isnull().sum()

In [None]:
"""
X (Independent Variable) - Independent variables, also known as predictor variables or features, are the variables that 
are used to predict or explain the outcome of interest.

Y (Target Variable) - The target variable is the variable that is being predicted or explained by the independent variables.
"""

"""
We will plot scatter plots and a correlation matrix to determine which columns can be used as X
"""
import seaborn as sb
import matplotlib.pyplot as plt

# Plot every candidate column against the target MEDV. The target itself is
# skipped: MEDV against MEDV is always a perfect diagonal and carries no
# information for choosing features.
for column in df.columns.drop('MEDV'):
    fig, ax = plt.subplots()
    sb.scatterplot(data=df, x='MEDV', y=column, ax=ax)
    ax.set_title(f'{column} vs MEDV')
    plt.show()
    plt.close(fig)  # release the figure so open figures do not pile up across iterations

# Pearson correlation of every column with MEDV, shown as the cell output.
df.corr(method='pearson')['MEDV']

In [None]:
"""
Based on the scatter plots and correlations above, we choose the columns to use as X
"""
# Feature matrix: the predictor columns, selected by label.
X = df.loc[:, ['INDUS', 'TAX', 'NOX', 'RM', 'AGE', 'PTRATIO', 'LSTAT']]
# Target, kept as a one-column DataFrame (list selector) rather than a Series.
Y = df.loc[:, ['MEDV']]

In [None]:
"""
Split the datasets into train and test
"""
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The fixed random_state makes the
# shuffle deterministic, so the split (and every metric computed from it) is
# the same after a kernel restart.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [None]:
"""
Train the model using train data
"""
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the training split. `fit` returns the estimator
# itself, so construction and fitting are chained into one expression.
model = LinearRegression().fit(X_train, Y_train)
model  # show the fitted estimator as the cell output

In [None]:
"""
Predict on the test data and evaluate the model
"""
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Predict the target for the held-out rows.
Y_pred = model.predict(X_test)

# MSE is in squared target units; RMSE and MAE are in the target's own units
# (MEDV is expressed in $1000's), which makes them easier to interpret.
mse = mean_squared_error(Y_test, Y_pred)
rmse = mse ** 0.5
mae = mean_absolute_error(Y_test, Y_pred)
# For regressors, `score` returns the coefficient of determination (R^2).
r2 = model.score(X_test, Y_test)

print("Mean Squared Error : ", mse)
print("Root Mean Squared Error : ", rmse)
print("Mean Absolute Error : ", mae)
print("R^2 Score : ", r2)

In [None]:
"""
Plotting actual vs predicted
"""
import matplotlib.pyplot as plt

# Scatter the held-out targets against the model's predictions, using the
# explicit figure/axes interface instead of the pyplot state machine.
fig, ax = plt.subplots()
ax.scatter(Y_test, Y_pred)
ax.set(
    xlabel='Actual Prices',
    ylabel='Predicted Prices',
    title='Actual vs Predicted Prices',
)
plt.show()