In [212]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score 
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [213]:
# 1. Load the data set (home_prices.csv) into a DataFrame.
# NOTE(review): the task text spells the file name in lower case while the
# code opens 'Home_prices.csv'; on a case-sensitive filesystem only one of the
# two spellings can exist — confirm which one is on disk.
df = pd.read_csv(filepath_or_buffer='Home_prices.csv')
print(df)

# 2. Show all columns of the file (data_frame.columns).
print(df.columns)

        Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0        1          60       RL         65.0     8450   Pave   NaN      Reg   
1        2          20       RL         80.0     9600   Pave   NaN      Reg   
2        3          60       RL         68.0    11250   Pave   NaN      IR1   
3        4          70       RL         60.0     9550   Pave   NaN      IR1   
4        5          60       RL         84.0    14260   Pave   NaN      IR1   
...    ...         ...      ...          ...      ...    ...   ...      ...   
1455  1456          60       RL         62.0     7917   Pave   NaN      Reg   
1456  1457          20       RL         85.0    13175   Pave   NaN      Reg   
1457  1458          70       RL         66.0     9042   Pave   NaN      Reg   
1458  1459          20       RL         68.0     9717   Pave   NaN      Reg   
1459  1460          20       RL         75.0     9937   Pave   NaN      Reg   

     LandContour Utilities  ... PoolArea PoolQC  Fe

In [214]:
# 3. Remove missing values (data_frame.dropna(...)).
# how='all' discards only the rows in which every single column is empty;
# rows with some empty cells are kept.
df = df.dropna(how='all')

# Per-column count of the cells that are still empty after the drop.
missing_data = df.isna().sum()
print("Missing data", missing_data, sep="\n")

# 4. Pick the column that serves as the prediction "target" (the Y matrix).
target = df['SalePrice']
print(target)

Missing data
Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64
0       208500
1       181500
2       223500
3       140000
4       250000
         ...  
1455    175000
1456    210000
1457    266500
1458    142125
1459    147500
Name: SalePrice, Length: 1460, dtype: int64


In [215]:
# 5. Decide which features form the feature matrix X (mind the feature types).
# 6. Select those features into the set.
feature_columns = [
    'MSSubClass',
    'LotArea',
    'YearBuilt',
    'GrLivArea',
    # 'GarageYrBlt',  # left disabled, exactly as in the original selection
    'PoolArea',
    'MoSold',
    '1stFlrSF',
    '2ndFlrSF',
]
X = df[feature_columns]

# Summary statistics of the chosen features; kept as the cell's last
# expression so the notebook renders the table.
X.describe()

Unnamed: 0,MSSubClass,LotArea,YearBuilt,GrLivArea,PoolArea,MoSold,1stFlrSF,2ndFlrSF
count,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,56.89726,10516.828082,1971.267808,1515.463699,2.758904,6.321918,1162.626712,346.992466
std,42.300571,9981.264932,30.202904,525.480383,40.177307,2.703626,386.587738,436.528436
min,20.0,1300.0,1872.0,334.0,0.0,1.0,334.0,0.0
25%,20.0,7553.5,1954.0,1129.5,0.0,5.0,882.0,0.0
50%,50.0,9478.5,1973.0,1464.0,0.0,6.0,1087.0,0.0
75%,70.0,11601.5,2000.0,1776.75,0.0,8.0,1391.25,728.0
max,190.0,215245.0,2010.0,5642.0,738.0,12.0,4692.0,2065.0


In [216]:
# 8. Build the training and test sets: test_size=0.2 holds out 20% of the
# rows for testing and the fixed random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    target,
    test_size=0.2,
    random_state=42,
)

# Count the empty cells per training feature before any model is fitted.
missing_data = X_train.isna().sum()
print("Missing data", missing_data, sep="\n")

Missing data
MSSubClass    0
LotArea       0
YearBuilt     0
GrLivArea     0
PoolArea      0
MoSold        0
1stFlrSF      0
2ndFlrSF      0
dtype: int64


In [217]:
# 9. Try several algorithms (LinearRegression(), DecisionTreeRegressor(),
#    RandomForestRegressor()), plus AdaBoost and a multi-layer perceptron.

# Seed shared by every stochastic estimator below. Without it the tree,
# forest, boosting and MLP scores change on each run even though the
# train/test split itself is seeded (with the same value, 42).
RANDOM_STATE = 42

# (display name, unfitted estimator) pairs, evaluated in this order.
regressors = [
    ('Linear Regression', LinearRegression()),
    ('Decision Tree Regressor', DecisionTreeRegressor(random_state=RANDOM_STATE)),
    ('Random Forest Regressor', RandomForestRegressor(random_state=RANDOM_STATE)),
    ('ADA Boost Regressor', AdaBoostRegressor(random_state=RANDOM_STATE)),
    # NOTE(review): scikit-learn documents multi-layer perceptrons as sensitive
    # to feature scaling; the features are passed in unscaled here, exactly as
    # before — consider standardising them if this model is to be compared fairly.
    ('MLP Regressor', MLPRegressor(random_state=RANDOM_STATE)),
]

# One dict per model; the reporting cell reads these exact keys.
results = []

for regressor_name, regressor in regressors:
    # Fit on the training split only; the test split is used purely for scoring.
    regressor.fit(X_train, Y_train)

    y_pred = regressor.predict(X_test)
    # For scikit-learn regressors .score() is the coefficient of
    # determination (R^2) on the given data, not a classification accuracy.
    score = regressor.score(X_test, Y_test)

    # Error metrics for the held-out predictions.
    mse = mean_squared_error(Y_test, y_pred)
    mae = mean_absolute_error(Y_test, y_pred)

    results.append({
        'Regressor': regressor_name,
        'Score': score,
        'Mean Square Error': mse,
        'Mean Absolute Error': mae
    })
        



In [218]:
# 10. Present the results of all tests and report the metric values.
# Each model's report is emitted as one newline-separated block framed by two
# 50-character rules; no cross-validation figure is printed.
for res in results:
    print(
        '-'*50,
        f"Regressor: {res['Regressor']}",
        "Test Accuracy:",
        f"\tScore: {res['Score']:.4f}",
        f"\tMean Square Error: {res['Mean Square Error']:.4f}",
        f"\tMean Absolute Error: {res['Mean Absolute Error']:.4f}",
        '-'*50,
        sep='\n',
    )

--------------------------------------------------
Regressor: Linear Regression
Test Accuracy:
	Score: 0.7210
	Mean Square Error: 2139984102.7674
	Mean Absolute Error: 29627.3143
--------------------------------------------------
--------------------------------------------------
Regressor: Decision Tree Regressor
Test Accuracy:
	Score: 0.8106
	Mean Square Error: 1452380263.8524
	Mean Absolute Error: 26506.0594
--------------------------------------------------
--------------------------------------------------
Regressor: Random Forest Regressor
Test Accuracy:
	Score: 0.8614
	Mean Square Error: 1062952916.8662
	Mean Absolute Error: 21803.3485
--------------------------------------------------
--------------------------------------------------
Regressor: ADA Boost Regressor
Test Accuracy:
	Score: 0.7644
	Mean Square Error: 1807267662.4076
	Mean Absolute Error: 29994.7409
--------------------------------------------------
--------------------------------------------------
Regressor: MLP 