## SARiSC Laboratorium

In [18]:
import pandas as pd
from pandas_profiling import ProfileReport
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
from sklearn.preprocessing import StandardScaler

1. longitude: A measure of how far west a house is; values are negative (degrees west of the prime meridian), so a more negative value is farther west

2. latitude: A measure of how far north a house is; a higher value is farther north

3. housingMedianAge: Median age of a house within a block; a lower number is a newer building

4. totalRooms: Total number of rooms within a block

5. totalBedrooms: Total number of bedrooms within a block

6. population: Total number of people residing within a block

7. households: Total number of households (a household being a group of people residing within a home unit) for a block

8. medianIncome: Median income for households within a block of houses (measured in tens of thousands of US Dollars)

9. medianHouseValue: Median house value for households within a block (measured in US Dollars)

10. oceanProximity: Location of the house relative to the ocean/sea

In [7]:
# Read the raw housing table from disk; `df` is the frame later cells start from.
housing_csv_path = 'housing.csv'
df = pd.read_csv(housing_csv_path)

# Build the exploratory profiling report and export it as a standalone HTML file.
eda_report_path = "california_housing_eda_report.html"
profile = ProfileReport(
    df,
    title='California Housing Prices - EDA Report',
    explorative=True,
)
profile.to_file(eda_report_path)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [21]:
# --- Missing values ---------------------------------------------------------
# Count NaNs per column and report only the columns that contain any.
nan_values = df.isna().sum()

print("Kolumny z wartościami NaN oraz ich liczba:")
print(nan_values[nan_values > 0])

# Drop every row with a NaN in any column; `df` itself is left unmodified.
df_cleaned = df.dropna()

# Repeat the same check on the cleaned frame to confirm nothing is left.
nan_values = df_cleaned.isna().sum()

print("Kolumny z wartościami NaN oraz ich liczba:")
print(nan_values[nan_values > 0])

# --- Categorical encoding ---------------------------------------------------
# One-hot encode `ocean_proximity` and append the indicator columns after the
# remaining columns. The original text column is dropped without `inplace`,
# so re-running this cell always rebuilds `df_encoded` from `df_cleaned`.
ocean_proximity_encoded = pd.get_dummies(df_cleaned['ocean_proximity'], prefix='ocean_proximity')
df_encoded = pd.concat(
    [df_cleaned.drop(columns='ocean_proximity'), ocean_proximity_encoded],
    axis=1,
)

# --- Feature scaling --------------------------------------------------------
scaler = StandardScaler()

# Only the continuous input features are standardized; the target
# (`median_house_value`) and the one-hot indicator columns are left as they are.
columns_to_standardize = ['longitude','latitude','housing_median_age','total_rooms','total_bedrooms','population','households','median_income']

# NOTE(review): the scaler is fit on the full dataset, i.e. before the
# train/test split performed later in the notebook, so test-set statistics
# leak into the training features. Consider fitting on the training part only.
df_encoded[columns_to_standardize] = scaler.fit_transform(df_encoded[columns_to_standardize])

# `DataFrame.info()` writes its summary to stdout itself and returns None;
# wrapping it in print() would emit an extra "None" line.
df_encoded.info()
print(df_encoded.describe())


Kolumny z wartościami NaN oraz ich liczba:
total_bedrooms    207
dtype: int64
Kolumny z wartościami NaN oraz ich liczba:
Series([], dtype: int64)
<class 'pandas.core.frame.DataFrame'>
Index: 20433 entries, 0 to 20639
Data columns (total 14 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   longitude                   20433 non-null  float64
 1   latitude                    20433 non-null  float64
 2   housing_median_age          20433 non-null  float64
 3   total_rooms                 20433 non-null  float64
 4   total_bedrooms              20433 non-null  float64
 5   population                  20433 non-null  float64
 6   households                  20433 non-null  float64
 7   median_income               20433 non-null  float64
 8   median_house_value          20433 non-null  float64
 9   ocean_proximity_<1H OCEAN   20433 non-null  bool   
 10  ocean_proximity_INLAND      20433 non-null  bool   
 11  ocea

In [29]:


# Features are every column except the target; the target is the median house value.
X = df_encoded.drop(columns=['median_house_value'])
y = df_encoded['median_house_value']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# The tree and the neural network are stochastic estimators (feature
# permutation at splits, weight initialisation, batch shuffling). They are
# seeded like the split so that Restart & Run All reproduces the same metrics.
linear_reg = LinearRegression()
decision_tree_reg = DecisionTreeRegressor(max_depth=5, min_samples_split=2, min_samples_leaf=1, random_state=42)
neural_network_reg = MLPRegressor(hidden_layer_sizes=(100, 50), activation='relu', solver='adam', max_iter=1000, random_state=42)

# Fit each model on the training part only.
linear_reg.fit(X_train, y_train)
decision_tree_reg.fit(X_train, y_train)
neural_network_reg.fit(X_train, y_train)

# Predictions on the held-out rows; these names are reused by the metrics cell.
linear_reg_pred = linear_reg.predict(X_test)
decision_tree_reg_pred = decision_tree_reg.predict(X_test)
neural_network_reg_pred = neural_network_reg.predict(X_test)

# Mean squared error of each model on the held-out rows.
linear_reg_mse = mean_squared_error(y_test, linear_reg_pred)
decision_tree_reg_mse = mean_squared_error(y_test, decision_tree_reg_pred)
neural_network_reg_mse = mean_squared_error(y_test, neural_network_reg_pred)

print("Mean Squared Error (MSE) dla regresji liniowej:", linear_reg_mse)
print("Mean Squared Error (MSE) dla drzewa decyzyjnego:", decision_tree_reg_mse)
print("Mean Squared Error (MSE) dla modelu opartego na sieciach neuronowych:", neural_network_reg_mse)


Mean Squared Error (MSE) dla regresji liniowej: 4802173538.604159
Mean Squared Error (MSE) dla drzewa decyzyjnego: 5089544884.890054
Mean Squared Error (MSE) dla modelu opartego na sieciach neuronowych: 3570185353.0107675




In [31]:
# Test-set predictions in a fixed order: linear regression, decision tree,
# neural network. Every metric below is evaluated in this same order.
model_predictions = (linear_reg_pred, decision_tree_reg_pred, neural_network_reg_pred)

# Mean absolute error per model.
linear_reg_mae, decision_tree_reg_mae, neural_network_reg_mae = (
    mean_absolute_error(y_test, pred) for pred in model_predictions
)

# Root mean squared error per model (square root of the MSE).
linear_reg_rmse, decision_tree_reg_rmse, neural_network_reg_rmse = (
    np.sqrt(mean_squared_error(y_test, pred)) for pred in model_predictions
)

# Coefficient of determination per model.
linear_reg_r2, decision_tree_reg_r2, neural_network_reg_r2 = (
    r2_score(y_test, pred) for pred in model_predictions
)

# Print one section per metric, listing the three models in the same order.
model_labels = ("Regresja liniowa:", "Drzewo decyzyjne:", "Sieć neuronowa:")
report_sections = (
    ("Mean Absolute Error (MAE):",
     (linear_reg_mae, decision_tree_reg_mae, neural_network_reg_mae)),
    ("\nRoot Mean Squared Error (RMSE):",
     (linear_reg_rmse, decision_tree_reg_rmse, neural_network_reg_rmse)),
    ("\nR^2 score:",
     (linear_reg_r2, decision_tree_reg_r2, neural_network_reg_r2)),
)
for heading, scores in report_sections:
    print(heading)
    for label, score in zip(model_labels, scores):
        print(label, score)

Mean Absolute Error (MAE):
Regresja liniowa: 50413.43330810035
Drzewo decyzyjne: 50571.18306701999
Sieć neuronowa: 41405.62684873046

Root Mean Squared Error (RMSE):
Regresja liniowa: 69297.7166911303
Drzewo decyzyjne: 71341.04628395951
Sieć neuronowa: 59751.028049823275

R^2 score:
Regresja liniowa: 0.6488402154431994
Drzewo decyzyjne: 0.6278261352067561
Sieć neuronowa: 0.7389295681814283
