In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
my_linear_regression = LinearRegression()
my_linear_regression.fit(X_train, Y_train)
import numpy as np
import pandas as pd

In [None]:
df = pd.read_csv('/content/california_housing.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,0,-122.05,37.37,27.0,3885.0,661.0,1537.0,606.0,6.6085,344700.0
1,1,-118.3,34.26,43.0,1510.0,310.0,809.0,277.0,3.599,176500.0
2,2,-117.81,33.78,27.0,3589.0,507.0,1484.0,495.0,5.7934,270500.0
3,3,-118.36,33.82,28.0,67.0,15.0,49.0,11.0,6.1359,330000.0
4,4,-119.67,36.33,19.0,1241.0,244.0,850.0,237.0,2.9375,81700.0


In [None]:
features_names = df.columns.to_list()[:-1]
X = df[features_names] # features
Y = df['median_house_value'] # response or target variable

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.25, random_state = 0)

In [None]:
my_linear_regression = LinearRegression()
my_linear_regression.fit(X_train, Y_train)

In [None]:
print("R^2 =", my_linear_regression.score(X_train, Y_train))

R^2 = 0.6401549846633445


In [None]:
print('The intercept (b_0) is  {}'.format(my_linear_regression.intercept_))
print('The coef (b_1) is  {}'.format(my_linear_regression.coef_[0]))
print('The coef (b_2) is  {}'.format(my_linear_regression.coef_[1]))

The intercept (b_0) is  -3361263.4608528726
The coef (b_1) is  0.8912800982923328
The coef (b_2) is  -40858.91627299241


In [None]:
house_price_pred = my_linear_regression.predict(X_test)
house_price_pred

array([251347.23196202,  66700.3451833 , 105042.4562009 , ...,
       320302.54356955,  61471.33576644, 299086.18631992])

In [None]:
from sklearn import metrics
print('Mean Absolute Error:', metrics.mean_absolute_error(Y_test, house_price_pred))
print('Mean Squared Error:', metrics.mean_squared_error(Y_test, house_price_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(Y_test, house_price_pred)))

Mean Absolute Error: 50320.82616860233
Mean Squared Error: 4836018152.499041
Root Mean Squared Error: 69541.48511858976


In [None]:
def MAPE(Y, pred): # MAPE: Mean Abs Percentage Error
  l1 = list(Y)
  l2 = list(pred)
  er = []
  for i in range(len(l1)):
    e = np.abs(l1[i]-l2[i])
    er.append(e/l1[i])
  return np.mean(er)

In [None]:
print('MAPE:', MAPE(Y_test, house_price_pred)*100, "%")

MAPE: 29.658795519400694 %


In [None]:
"""
This code performs a simple linear regression on a dataset of housing prices.

First, it extracts the names of the features from the pandas DataFrame df and assigns them to the features_names variable. It then extracts the feature data from df using the features_names list and assigns it to the variable X, while the response variable data (i.e., the median house value) is extracted and assigned to the variable Y.

Next, an instance of the LinearRegression() class from scikit-learn is created and fitted to the training data using the .fit() method. The intercept and coefficients of the linear regression model are then printed using the intercept_ and coef_ attributes of the LinearRegression object.

The model is then used to predict the housing prices on the test data using the .predict() method and the predicted values are stored in the variable house_price_pred.

The code then calculates several metrics to evaluate the performance of the model on the test data, including the mean absolute error, mean squared error, and root mean squared error. These metrics are computed using functions from the scikit-learn library.

Finally, the code defines a custom function MAPE() to calculate the mean absolute percentage error (MAPE) of the model's predictions. This function takes two arguments (Y and pred) and returns the mean absolute percentage error.
"""