In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [None]:
# Read the raw table from the CSV file sitting next to the notebook
data = pd.read_csv(filepath_or_buffer='data.csv')

In [None]:
# Separate the target column from the predictors
y = data['price']
x = data.drop(columns=['price'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Function to apply log transform and feature engineering
def prepare_features(df):
    """Return a copy of ``df`` with log-scaled columns and derived features.

    The derived features are computed from the raw (untransformed) values so
    that they keep their stated meaning; only afterwards are the columns in
    ``cols_to_log`` replaced by ``log(1 + value)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column listed in ``cols_to_log``. Any other
        column is carried through unchanged. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the listed columns log-scaled and four columns
        appended, in this order: ``bathrooms_ratio``, ``total_rooms_est``,
        ``households_est``, ``household_rooms``.

    Raises
    ------
    KeyError
        If a required column is missing from ``df``.
    """
    df = df.copy()

    # Derived features are built from the raw values *before* the log
    # transform overwrites them: a sum of logs is not a room count, and
    # log(0 + 1) == 0 would turn a zero area into a zero denominator even
    # though the "+1" exists precisely because zeros are expected.
    bathrooms_ratio = df['bathrooms'] / df['area']
    total_rooms_est = df['bathrooms'] + df['balconies'] + 1  # rough estimate

    # Log transformations; log1p(v) == log(v + 1) but stays accurate for tiny v
    cols_to_log = ['age', 'amneties', 'area', 'atmDistance', 'balconies', 'bathrooms',
                   'hospitalDistance', 'restrauntDistance', 'schoolDistance', 'shoppingDistance', 'status']
    for col in cols_to_log:
        df[col] = np.log1p(df[col])

    # A zero raw area makes the ratio undefined; record it as NaN (the usual
    # missing-value marker) instead of +/-inf so it can be dropped or imputed.
    df['bathrooms_ratio'] = bathrooms_ratio.replace([np.inf, -np.inf], np.nan)
    df['total_rooms_est'] = total_rooms_est
    # Placeholder household count of 1: 'household_rooms' therefore equals
    # 'total_rooms_est'. Both columns are kept so the output schema is unchanged.
    df['households_est'] = 1
    df['household_rooms'] = df['total_rooms_est'] / df['households_est']

    return df

In [None]:
# Re-attach the target to each split and run the shared feature pipeline
train_data = prepare_features(x_train.join(y_train))
test_data = prepare_features(x_test.join(y_test))

# Pull predictors and target back apart for the training split
y_train_prepared = train_data['price']
x_train_prepared = train_data.drop(columns=['price'])

# Same for the test split, with its columns aligned to the training order
y_test_prepared = test_data['price']
x_test_prepared = test_data.drop(columns=['price']).loc[:, x_train_prepared.columns]

In [None]:
# Fit an ordinary least-squares model on the engineered training features
reg = LinearRegression().fit(X=x_train_prepared, y=y_train_prepared)
reg  # show the fitted estimator as the cell output

In [None]:
# Score the fitted model on the held-out split (coefficient of determination)
r2 = reg.score(X=x_test_prepared, y=y_test_prepared)
print("R² on test data:", r2)