                                             Housing-price-predictor


Importing libraries

In [None]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt

Importing the dataset and understanding the data

In [None]:
# Load the dataset; the relative path means data.csv must sit in the working directory.
housing = pd.read_csv("data.csv")
# Exploration calls to run interactively, one at a time:
# housing.head(), housing.info(), housing['CHAS'].value_counts(), housing.describe()

Plotting the data in order to understand it better

In [None]:

# Histograms of the frame's columns: 50 bins each, on a 20x15 figure.
housing.hist(bins = 50, figsize=(20,15))

Train-test splitting

1. Using a user-defined function

In [None]:

def split_train_test(data, test_ratio, seed=42):
    """Randomly split ``data`` into a train part and a test part.

    Args:
        data: Object supporting ``len()`` and positional ``.iloc`` indexing
            (for example a pandas DataFrame).
        test_ratio: Fraction of the rows, between 0 and 1, to put in the test
            part. The test size is ``int(len(data) * test_ratio)``, i.e. it is
            rounded down.
        seed: Seed for the shuffle. The default of 42 reproduces the split the
            function has always produced.

    Returns:
        A ``(train, test)`` tuple of row subsets of ``data``.

    Raises:
        ValueError: If ``test_ratio`` is outside ``[0, 1]``.
    """
    if not 0 <= test_ratio <= 1:
        raise ValueError(f"test_ratio must be between 0 and 1, got {test_ratio!r}")
    # A private RandomState gives the same permutation as seeding the global
    # generator would, but leaves NumPy's global random state untouched.
    rng = np.random.RandomState(seed)
    shuffled = rng.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled[:test_set_size]
    train_indices = shuffled[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

In [None]:
#train_set, test_set = split_train_test(housing, 0.2)

2. Using scikit-learn

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows; the fixed random_state keeps the split repeatable.
train_set, test_set = train_test_split(
    housing,
    test_size=0.2,
    random_state=42,
)


Stratified Split - attempts to keep the same percentages of classes in each split

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# A single 80/20 shuffle split, stratified on the CHAS column.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# split() yields positional row indices, so rows are selected with .iloc;
# .loc would treat the numbers as index labels and is only right by
# coincidence when the frame has a default RangeIndex.
train_index, test_index = next(split.split(housing, housing['CHAS']))
strat_train_set = housing.iloc[train_index]
strat_test_set = housing.iloc[test_index]

In [None]:
# From here on `housing` is a copy of the stratified training split only.
housing = strat_train_set.copy()

#Looking for correlations

In [None]:
# Pairwise correlations between columns, then the MEDV column sorted from highest to lowest.
corr_matrix = housing.corr()
corr_matrix['MEDV'].sort_values(ascending=False)

Scatter matrix and Scatter plots

In [None]:
from pandas.plotting import scatter_matrix
# Columns plotted against each other in the scatter matrix.
attributes = ["RM","ZN","MEDV","LSTAT"]
scatter_matrix(housing[attributes],figsize=(12,8))

In [None]:
# Scatter plot of MEDV (y) against RM (x), drawn with alpha 0.8.
housing.plot(kind="scatter",x="RM", y = "MEDV",alpha = 0.8)

#Trying out different attributes

In [None]:
# Show the first rows of the training copy.
housing.head()

In [None]:
# Recompute the correlations with MEDV, sorted from highest to lowest.
# NOTE(review): no column is added to `housing` in the cells shown before this one,
# so this repeats the earlier correlation output — confirm whether a new attribute was intended.
corr_matrix = housing.corr()
corr_matrix['MEDV'].sort_values(ascending=False)

In [None]:
# Separate features from the target: `housing` keeps every training column except MEDV,
# `housing_labels` is a copy of the training split's MEDV column.
housing = strat_train_set.drop("MEDV",axis=1)
housing_labels = strat_train_set["MEDV"].copy()

Handling Missing attributes

In [None]:
from sklearn.impute import SimpleImputer

# Replace NaN entries with the per-column median learned from the training features.
imputer = SimpleImputer(missing_values=np.nan, strategy="median")
X = imputer.fit_transform(housing)
# The imputer returns a plain array, so restore both the column names and the
# original row index; without the index the frame would no longer line up
# row-for-row with `housing` / `housing_labels`.
housing_tr = pd.DataFrame(X, columns=housing.columns, index=housing.index)
housing_tr.describe()


##Scikit-learn Design
#Primarily, three types of objects
1. Estimators - An estimator estimates some parameters based on a dataset.
2. Transformers - The transform() method takes input and returns output based on what was learned in fit().
3. Predictors - A linear regression model is an example of a predictor. fit() and predict() are its two common functions. It also provides a score() function, which evaluates the predictions.

Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

Creating a pipeline

In [None]:
from sklearn.pipeline import Pipeline

# Preprocessing chain: median imputation first, then standard scaling.
my_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])

In [None]:
# Fit the preprocessing pipeline on the training features and transform them in the same call.
housing_num_tr = my_pipeline.fit_transform(housing)

In [None]:
# Display the preprocessed training features.
housing_num_tr

Selecting and training the model

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# LinearRegression and DecisionTreeRegressor are not used below; presumably they
# are kept so they can be swapped in here for comparison.
# random_state is fixed, like the train/test splits in this notebook, so that
# the trained forest and every score derived from it are reproducible.
model = RandomForestRegressor(random_state=42)
model.fit(housing_num_tr, housing_labels)

In [None]:
# Compare predictions with the true labels on the first five training rows.
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
prepared_data = my_pipeline.transform(some_data)
# A notebook cell only displays its last expression, so the predictions must be
# printed explicitly or they are silently discarded.
print(model.predict(prepared_data))
list(some_labels)

Evaluating the model

In [None]:
from sklearn.metrics import mean_squared_error

# NOTE: this scores the model on the same data it was fitted on, so the error
# is an optimistic estimate.
housing_predictions = model.predict(housing_num_tr)
mse = mean_squared_error(housing_labels, housing_predictions)
# Keep the root in its own name so `mse` really holds the mean squared error.
rmse = np.sqrt(mse)
rmse

Using a better evaluation technique - cross-validation

In [None]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, housing_num_tr,housing_labels,scoring="neg_mean_squared_error", cv = 10)
rmse_scores = np.sqrt(-scores)
rmse_scores

In [None]:
def print_scores(scores):
    """Print the given scores followed by their mean and standard deviation.

    ``scores`` must provide ``mean()`` and ``std()`` methods (for example a
    NumPy array).
    """
    print("Scores:", scores)
    average = scores.mean()
    print("Mean: ", average)
    spread = scores.std()
    print("Standard deviation: ", spread)

# Summarise the cross-validated RMSE values.
print_scores(rmse_scores)

Saving the model

In [None]:

from joblib import dump, load
# Persist the fitted model to Dragon.joblib.
# NOTE: only `model` is saved, not `my_pipeline`; data fed to a reloaded model
# must be preprocessed the same way first.
dump(model , 'Dragon.joblib')


Model Testing

In [None]:
# Final evaluation on the held-out stratified test split.
Y_test = strat_test_set["MEDV"].copy()
X_test = strat_test_set.drop(columns="MEDV")

# Reuse the already-fitted pipeline: transform only, never refit on test data.
X_test_prepared = my_pipeline.transform(X_test)
final_predictions = model.predict(X_test_prepared)

final_mse = mean_squared_error(Y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
final_rmse