In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [2]:
data_path = '/content/Housing.csv'
df = pd.read_csv(data_path)

In [3]:
categorical_cols = df.select_dtypes(include=['object']).columns.to_list()
print(categorical_cols)

['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea', 'furnishingstatus']


In [4]:
ordinal_encoder = OrdinalEncoder()
encoded_categorical_cols = ordinal_encoder.fit_transform(df[categorical_cols])

encoded_categorical_df = pd.DataFrame(encoded_categorical_cols, columns=categorical_cols)

numerical_df = df.drop(categorical_cols, axis=1)
encoded_df = pd.concat([encoded_categorical_df, numerical_df], axis=1)

In [5]:
normalizer = StandardScaler()
dataset_arr = normalizer.fit_transform(encoded_df)

In [6]:
X, y = dataset_arr[:, 1:], dataset_arr[:, 0]

In [7]:
test_size = 0.3
random_state = 1
is_shuffle = True
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=test_size, random_state=random_state, shuffle=is_shuffle)

In [8]:
regressor = RandomForestRegressor(random_state=random_state)
regressor.fit(X_train, y_train)

In [9]:
regressor =  AdaBoostRegressor(random_state=random_state)
regressor.fit(X_train, y_train)

In [10]:
regressor = GradientBoostingRegressor(
    random_state = random_state
)
regressor.fit(X_train, y_train)

In [11]:
y_pred = regressor.predict(X_val)

In [12]:
mae = mean_absolute_error(y_val, y_pred)
mse = mean_squared_error(y_val, y_pred)

print('Evaluation results on validation set:')
print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')

Evaluation results on validation set:
Mean Absolute Error: 0.6209042448262554
Mean Squared Error: 0.9609208197729747


#Queston 6:

In [23]:
def calculate_mse(x):
  group1 = df[df['X']<= x]
  group2 = df[df['X']> x]
  mse_group1 = mean_squared_error(group1['Y'],[group1['Y'].mean()] * len(group1)) if not group1.empty else 0
  mse_group2 = mean_squared_error(group2['Y'], [group2['Y'].mean()] * len(group2)) if not group2.empty else 0

  total_mse = (len(group1)*mse_group1 + len(group2)*mse_group2) / len(df)
  return total_mse

In [24]:
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error
# Create a dictionary with your data
data = {
    'X': [3, 5, 8, 10, 12],
    'Y': [12, 20, 28, 32, 36]
}
# Create a DataFrame
df = pd.DataFrame(data)
mse_a = calculate_mse(3)
mse_b = calculate_mse(8)
mse_c = calculate_mse(5)
mse_d = calculate_mse(10)

# In kết quả
print(f'MSE for X ≤ 3: {mse_a}')
print(f'MSE for X ≤ 8: {mse_b}')
print(f'MSE for X ≤ 5: {mse_c}')
print(f'MSE for X ≤ 10: {mse_d}')

MSE for X ≤ 3: 28.0
MSE for X ≤ 8: 27.2
MSE for X ≤ 5: 12.8
MSE for X ≤ 10: 47.2


In [27]:
def predict(X):
    if X <= 3:
        # Chỉ có một điểm (3, 12) trong nhóm này
        return df[df['X'] <= 3]['Y'].mean()  # Sử dụng giá trị trung bình
    elif X <= 5:
        return df[df['X'] <= 5]['Y'].mean()  # Nhóm bao gồm (3, 12) và (5, 20)
    elif X <= 8:
        return df[df['X'] <= 8]['Y'].mean()  # Nhóm bao gồm (3, 12), (5, 20), (8, 28)
    elif X <= 10:
        return df[df['X'] <= 10]['Y'].mean()  # Nhóm bao gồm (3, 12), (5, 20), (8, 28), (10, 32)
    else:
        return df['Y'].mean()  # Tất cả các điểm

predicted_value = predict(15)

print(f'Predicted value for X = 2: {predicted_value}')

Predicted value for X = 2: 25.6
