# In [3]:
from numpy import square
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


# Load the housing dataset from the current working directory.
df = pd.read_csv('melb_data.csv')

print("First 5 rows of the dataset:")
print(df.head())

print("\nInfo about the dataset:")
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would append a stray "None" line.
df.info()

print("\nDescriptive statistics:")
print(df.describe())

# Visualization: histogram of the target column (Price), 30 bins.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(df['Price'], bins=30, ax=ax)
ax.set_title('Distribution of House Prices')
ax.set_xlabel('Price (AUD)')
ax.set_ylabel('Count')
plt.show()

# Column analysis: keep only the candidate features plus the target (Price).
important_cols = [
    'Suburb', 'Rooms', 'Type', 'Distance', 'Bedroom2', 'Bathroom', 'Car',
    'Landsize', 'BuildingArea', 'YearBuilt', 'Regionname', 'Propertycount',
]
# .copy() makes the subset an independent frame. The script later assigns
# to its columns in place; on a bare selection pandas may flag that with
# SettingWithCopyWarning (depending on the pandas version and on whether
# the parent frame is still referenced).
df = df[important_cols + ['Price']].copy()

print("\nSelected columns:")
print(df.columns)

# Preprocessing data
# Checking for incorrect values and processing them
missing_per_column = df.isna().sum()
print("\nMissing values:", missing_per_column)

# For every column: number of distinct values and the percentage of rows
# that are not accounted for by a distinct value.
total_rows = len(df)
for col in df.columns:
    unique_count = df[col].nunique()
    dup_pct = ((total_rows - unique_count) / total_rows) * 100
    print(f"\n{col}: Unique = {unique_count}, Duplicate % = {dup_pct:.1f}%")

# Object-dtype columns: show the ten most frequent values and the number
# of distinct values for each.
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    frequencies = df[col].value_counts()
    print(f"\nUnique values in '{col}' (top 10):")
    print(frequencies.head(10))
    print(f"Total unique: {df[col].nunique()}")

# Outlier report for every int64/float64 column except the target (Price).
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns.drop('Price')
for col in numeric_cols:
    values = df[col]
    # The column name is wrapped in a balanced pair of quotes in the header.
    print(f"\nOutliers in '{col}':")
    print(values.describe())
    # Z-score: values more than 3 standard deviations away from the mean
    z_scores = (values - values.mean()) / values.std()
    outliers_z = values[z_scores.abs() > 3]
    print(f"Z-score outliers (>3 std): {len(outliers_z)} values, e.g. {outliers_z.head().tolist()}")
    # IQR: values beyond 1.5 * IQR below Q1 or above Q3
    Q1 = values.quantile(0.25)
    Q3 = values.quantile(0.75)
    IQR = Q3 - Q1
    outliers_iqr = values[(values < Q1 - 1.5*IQR) | (values > Q3 + 1.5*IQR)]
    print(f"IQR outliers: {len(outliers_iqr)} values")

print("\nChecking for extra characters in text columns:")
for col in categorical_cols:
    # True for rows containing any character that is not an ASCII letter
    # or whitespace; missing values count as "no match" (na=False).
    flagged = df[col].str.contains(r'[^a-zA-Z\s]', na=False)
    has_extra_chars = flagged.sum()
    if has_extra_chars <= 0:
        print(f"{col}: No extra characters found.")
        continue
    print(f"{col}: Found {has_extra_chars} rows with extra characters, e.g.:")
    print(df[col][flagged].head().tolist())

# Boxplots for visualization: the first 4 numeric columns on a 2x2 grid.
fig = plt.figure(figsize=(12, 8))
for position, col in enumerate(numeric_cols[:4], start=1):
    ax = fig.add_subplot(2, 2, position)
    sns.boxplot(y=df[col], ax=ax)
    ax.set_title(f"Boxplot of {col}")

fig.tight_layout()
plt.show()

# Correlation
# Pairwise correlations involving Rooms, Bedroom2, Propertycount and Price
# are computed first and then reported in a fixed order.
price_col = df['Price']
rooms_col = df['Rooms']
bedroom2_col = df['Bedroom2']

corr_rooms_price = rooms_col.corr(price_col)
corr_bedroom2_price = bedroom2_col.corr(price_col)
corr_rooms_bedroom2 = rooms_col.corr(bedroom2_col)
corr_propertycount_price = df['Propertycount'].corr(price_col)

print(f"Correlation Rooms vs Price: {corr_rooms_price:.3f}")
print(f"Correlation Bedroom2 vs Price: {corr_bedroom2_price:.3f}")

print(f"Correlation Rooms vs Bedroom2: {corr_rooms_bedroom2:.3f}")

print(f"Correlation Propertycount vs Price: {corr_propertycount_price:.3f}")

# Propertycount and Bedroom2 are removed from the feature set.
df = df.drop(columns=['Propertycount', 'Bedroom2'])

# Handling missing values
# Car: gaps are filled with the column median.
car_median = df['Car'].median()
df['Car'] = df['Car'].fillna(car_median)

# BuildingArea: KNN imputation (5 neighbours) over Rooms, Bathroom,
# Landsize and BuildingArea; the imputer output is written back to all
# four columns.
cols_for_building = ['Rooms', 'Bathroom', 'Landsize']
building_block = [*cols_for_building, 'BuildingArea']
imputer_building = KNNImputer(n_neighbors=5)
df[building_block] = imputer_building.fit_transform(df[building_block])

# YearBuilt: same approach over Rooms, BuildingArea, Landsize and YearBuilt,
# run after BuildingArea has been filled in.
cols_for_year = ['Rooms', 'BuildingArea', 'Landsize']
year_block = [*cols_for_year, 'YearBuilt']
imputer_year = KNNImputer(n_neighbors=5)
df[year_block] = imputer_year.fit_transform(df[year_block])

print("\nMissing values after KNN:")
print(df.isna().sum())

# Handling outliers
# Bathroom: cap at the 99th percentile. The percentile serves as both the
# threshold and the replacement value, the same way Landsize and
# BuildingArea are handled below. A fixed threshold combined with a
# percentile replacement could push large values below smaller untouched
# ones whenever the percentile is lower than the threshold.
bathroom_99 = df['Bathroom'].quantile(0.99)
df.loc[df['Bathroom'] > bathroom_99, 'Bathroom'] = bathroom_99

# Car: values above 4 are replaced with the column median (not capped).
median_car = df['Car'].median()
df.loc[df['Car'] > 4, 'Car'] = median_car

# Landsize: cap at the 99th percentile.
landsiz_99 = df['Landsize'].quantile(0.99)
df.loc[df['Landsize'] > landsiz_99, 'Landsize'] = landsiz_99

# BuildingArea: cap at the 99th percentile.
building_99 = df['BuildingArea'].quantile(0.99)
df.loc[df['BuildingArea'] > building_99, 'BuildingArea'] = building_99

# Repeat the outlier report on the current frame for every int64/float64
# column except the target (Price).
numeric_cols2 = df.select_dtypes(include=['int64', 'float64']).columns.drop('Price')
for col in numeric_cols2:
    values = df[col]
    # The column name is wrapped in a balanced pair of quotes in the header.
    print(f"\nOutliers in '{col}':")
    print(values.describe())
    # Z-score: values more than 3 standard deviations away from the mean
    z_scores = (values - values.mean()) / values.std()
    outliers_z = values[z_scores.abs() > 3]
    print(f"Z-score outliers (>3 std): {len(outliers_z)} values, e.g. {outliers_z.head().tolist()}")
    # IQR: values beyond 1.5 * IQR below Q1 or above Q3
    Q1 = values.quantile(0.25)
    Q3 = values.quantile(0.75)
    IQR = Q3 - Q1
    outliers_iqr = values[(values < Q1 - 1.5*IQR) | (values > Q3 + 1.5*IQR)]
    print(f"IQR outliers: {len(outliers_iqr)} values")

# Normalization of categorical data
# Grouping for high cardinality: suburbs whose share of rows is below
# `threshold` are merged into a single 'Other' label.
threshold = 0.01
value_counts = df["Suburb"].value_counts(normalize=True)
rare_suburbs = value_counts.loc[value_counts < threshold].index
is_rare = df["Suburb"].isin(rare_suburbs)
df["Suburb"] = df["Suburb"].mask(is_rare, 'Other')
print(f"Suburb unique after grouping: {df['Suburb'].nunique()}")

# LabelEncoder: replace each remaining suburb label with an integer code.
le = LabelEncoder()
suburb_codes = le.fit_transform(df['Suburb'])
df['Suburb'] = suburb_codes

# One-hot encoding of Type and Regionname; the first level of each is dropped.
df = pd.get_dummies(df, columns=['Type', 'Regionname'], drop_first=True)

# Normalization of numerical data
# Min-max scaling of the listed numeric feature columns.
# NOTE(review): the scaler is fitted on every row, before the train/test
# split further down, so test rows contribute to the fitted min/max.
numeric_cols_to_scale = [
    'Rooms', 'Bathroom', 'Car', 'Landsize',
    'BuildingArea', 'YearBuilt', 'Distance',
]
scaler = MinMaxScaler()
scaler.fit(df[numeric_cols_to_scale])
df[numeric_cols_to_scale] = scaler.transform(df[numeric_cols_to_scale])

print("\nFirst 5 rows after encoding and normalization:")
print(df.head())

# Training and evaluation phase of models
# Features are every column except Price; Price is the target.
X, y = df.drop(columns='Price'), df['Price']

# 75% / 25% train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# Candidate regressors, keyed by the display name used in the report.
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'XGBoost': XGBRegressor(n_estimators=100, random_state=42),
}

# Fit each model on the training split and score it on the test split.
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    # RMSE is taken as the square root of the MSE rather than through the
    # `squared=False` keyword, which newer scikit-learn releases no longer
    # accept; this form works on old and new versions alike.
    rmse = mean_squared_error(y_test, y_pred) ** 0.5
    r2 = r2_score(y_test, y_pred)

    results[name] = {'MAE': mae, 'RMSE': rmse, 'R²': r2}

    print(f"\n{name}:")
    print(f"MAE = {mae:.2f}")
    print(f"RMSE = {rmse:.2f}")
    print(f"R² = {r2:.3f}")

# The model with the lowest test RMSE is reported as the best one.
best_model = min(results, key=lambda x: results[x]['RMSE'])
print(f"\nThe best model by RMSE: {best_model} from RMSE = {results[best_model]['RMSE']:.2f}")

# Cell output: ModuleNotFoundError: No module named 'xgboost' (the xgboost package must be installed for this script to run)