In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Read the real-estate CSV and discard incomplete rows in a single step.
file_path = '/content/Real estate.csv'
data = pd.read_csv(file_path).dropna()

# Predictors (X1-X6) and the response column used by the regression.
feature_columns = [
    'X1 transaction date',
    'X2 house age',
    'X3 distance to the nearest MRT station',
    'X4 number of convenience stores',
    'X5 latitude',
    'X6 longitude',
]
X = data[feature_columns]  # Independent variables
y = data['Y house price of unit area']  # Dependent variable

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Ordinary least squares fitted on the training split only.
model = LinearRegression().fit(X_train, y_train)

# Score the fitted model on the held-out rows.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R^2 Score:", r2)

# One row per predictor with its fitted coefficient.
coefficients = pd.DataFrame({'Variable': X.columns, 'Coefficient': model.coef_})
print("\nCoefficients:")
print(coefficients)


Mean Squared Error: 53.505619124504506
R^2 Score: 0.6810580555095801

Coefficients:
                                 Variable  Coefficient
0                     X1 transaction date     5.440742
1                            X2 house age    -0.270791
2  X3 distance to the nearest MRT station    -0.004759
3         X4 number of convenience stores     1.091425
4                             X5 latitude   229.043054
5                            X6 longitude   -29.492591


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Read the dataset and drop incomplete rows in the same expression.
file_path = '/content/Real estate.csv'
data = pd.read_csv(file_path).dropna()

# Full predictor set (X1-X6) and the response column.
all_feature_columns = [
    'X1 transaction date',
    'X2 house age',
    'X3 distance to the nearest MRT station',
    'X4 number of convenience stores',
    'X5 latitude',
    'X6 longitude',
]
X_all = data[all_feature_columns]
y = data['Y house price of unit area']

# Function to fit a model and calculate R² score
def fit_and_evaluate(X, y, test_size=0.2, random_state=42):
    """Fit a linear regression on a train/test split of ``X``/``y`` and score it.

    Parameters
    ----------
    X : predictors, passed straight to ``train_test_split``.
    y : response values aligned with ``X``.
    test_size : forwarded to ``train_test_split``; defaults to 0.2, the value
        that was previously hard-coded.
    random_state : seed forwarded to ``train_test_split``; defaults to 42, the
        value that was previously hard-coded.

    Returns
    -------
    tuple
        ``(mse, r2, model)``: the mean squared error and R² computed on the
        held-out split, and the fitted ``LinearRegression`` instance.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Metrics are computed on the test split only, never on the training rows.
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    return mse, r2, model

# Baseline: regression on the full predictor set.
all_mse, all_r2, all_model = fit_and_evaluate(X_all, y)
print("Regression with All Variables:")
print("Mean Squared Error:", all_mse)
print("R² Score:", all_r2)

# Comparison: keep only X1, X4 and X5 (i.e. leave out X2, X3 and X6) and refit.
kept_columns = [
    'X1 transaction date',
    'X4 number of convenience stores',
    'X5 latitude',
]
X_reduced = data[kept_columns]
reduced_mse, reduced_r2, reduced_model = fit_and_evaluate(X_reduced, y)
print("\nRegression with Specified Variables Removed (X2, X3, X6):")
print("Mean Squared Error:", reduced_mse)
print("R² Score:", reduced_r2)

# Coefficient table for the full model.
all_coefficients = pd.DataFrame(
    {'Variable': X_all.columns, 'Coefficient': all_model.coef_}
)
print("\nCoefficients with All Variables:")
print(all_coefficients)

# Coefficient table for the reduced model.
reduced_coefficients = pd.DataFrame(
    {'Variable': X_reduced.columns, 'Coefficient': reduced_model.coef_}
)
print("\nCoefficients with Specified Variables Removed:")
print(reduced_coefficients)


Regression with All Variables:
Mean Squared Error: 53.505619124504506
R² Score: 0.6810580555095801

Regression with Specified Variables Removed (X2, X3, X6):
Mean Squared Error: 81.72067199091401
R² Score: 0.5128707889689842

Coefficients with All Variables:
                                 Variable  Coefficient
0                     X1 transaction date     5.440742
1                            X2 house age    -0.270791
2  X3 distance to the nearest MRT station    -0.004759
3         X4 number of convenience stores     1.091425
4                             X5 latitude   229.043054
5                            X6 longitude   -29.492591

Coefficients with Specified Variables Removed:
                          Variable  Coefficient
0              X1 transaction date     4.178191
1  X4 number of convenience stores     1.864308
2                      X5 latitude   392.337992


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, r2_score

# Read the dataset and drop incomplete rows in the same expression.
file_path = '/content/Real estate.csv'
data = pd.read_csv(file_path).dropna()

# Predictors (X1-X6) and the response column.
feature_columns = [
    'X1 transaction date',
    'X2 house age',
    'X3 distance to the nearest MRT station',
    'X4 number of convenience stores',
    'X5 latitude',
    'X6 longitude',
]
X = data[feature_columns]
y = data['Y house price of unit area']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 5-fold grid search over the Lasso regularisation strength, run on the
# training split only so the test rows stay unseen during tuning.
# NOTE(review): the predictors are passed on their raw scales; because the
# Lasso penalty depends on coefficient magnitude, confirm whether
# standardising the features first was intended.
lasso = Lasso()
parameters = {'alpha': [0.01, 0.1, 1, 10, 100]}
lasso_regressor = GridSearchCV(estimator=lasso, param_grid=parameters, cv=5)
lasso_regressor.fit(X_train, y_train)

# Estimator configured with the winning alpha.
best_lasso_model = lasso_regressor.best_estimator_

# Score the selected model on the held-out rows.
y_pred = best_lasso_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Best Alpha:", lasso_regressor.best_params_['alpha'])
print("Mean Squared Error:", mse)
print("R^2 Score:", r2)

# One row per predictor with its fitted coefficient.
coefficients = pd.DataFrame(
    {'Variable': X.columns, 'Coefficient': best_lasso_model.coef_}
)
print("\nCoefficients:")
print(coefficients)


Best Alpha: 0.01
Mean Squared Error: 53.38708327895188
R^2 Score: 0.6817646364947361

Coefficients:
                                 Variable  Coefficient
0                     X1 transaction date     5.434905
1                            X2 house age    -0.264976
2  X3 distance to the nearest MRT station    -0.004938
3         X4 number of convenience stores     1.153346
4                             X5 latitude   134.123150
5                            X6 longitude    -0.000000


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Lasso, Ridge
from sklearn.metrics import mean_squared_error, r2_score

# Read the dataset and drop incomplete rows in the same expression.
file_path = '/content/Real estate.csv'
data = pd.read_csv(file_path).dropna()

# Predictors (X1-X6) and the response column.
feature_columns = [
    'X1 transaction date',
    'X2 house age',
    'X3 distance to the nearest MRT station',
    'X4 number of convenience stores',
    'X5 latitude',
    'X6 longitude',
]
X = data[feature_columns]
y = data['Y house price of unit area']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Function to fit a model and calculate R^2 score
def fit_and_evaluate(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    Returns ``(mse, r2, model)``: the mean squared error and R^2 of the
    predictions for ``X_test`` against ``y_test``, plus the same ``model``
    object that was passed in, after ``fit`` has been called on it.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    return (
        mean_squared_error(y_test, predictions),
        r2_score(y_test, predictions),
        model,
    )

def _print_results(title, search, mse_value, r2_value, fitted_model):
    """Print the tuned alpha, test metrics and coefficient table for one model."""
    print(title)
    print("Best Alpha:", search.best_params_['alpha'])
    print("Mean Squared Error:", mse_value)
    print("R^2 Score:", r2_value)
    print("\nCoefficients:")
    print(pd.DataFrame({'Variable': X.columns, 'Coefficient': fitted_model.coef_}))


# --- Lasso: 5-fold grid search over alpha on the training split ---
lasso = Lasso()
lasso_parameters = {'alpha': [0.01, 0.1, 1, 10, 100]}
lasso_regressor = GridSearchCV(estimator=lasso, param_grid=lasso_parameters, cv=5)
lasso_regressor.fit(X_train, y_train)

# Estimator configured with the winning alpha.
best_lasso_model = lasso_regressor.best_estimator_

# fit_and_evaluate fits the estimator on the training split again before scoring it.
lasso_mse, lasso_r2, best_lasso_model = fit_and_evaluate(
    best_lasso_model, X_train, y_train, X_test, y_test
)

_print_results("Lasso Regression Results:", lasso_regressor, lasso_mse, lasso_r2, best_lasso_model)

# --- Ridge: same alpha grid and the same 5-fold search ---
ridge = Ridge()
ridge_parameters = {'alpha': [0.01, 0.1, 1, 10, 100]}
ridge_regressor = GridSearchCV(estimator=ridge, param_grid=ridge_parameters, cv=5)
ridge_regressor.fit(X_train, y_train)

# Estimator configured with the winning alpha.
best_ridge_model = ridge_regressor.best_estimator_

# As above, the estimator is fitted again on the training split before scoring.
ridge_mse, ridge_r2, best_ridge_model = fit_and_evaluate(
    best_ridge_model, X_train, y_train, X_test, y_test
)

_print_results("\nRidge Regression Results:", ridge_regressor, ridge_mse, ridge_r2, best_ridge_model)


Lasso Regression Results:
Best Alpha: 0.01
Mean Squared Error: 53.38708327895188
R^2 Score: 0.6817646364947361

Coefficients:
                                 Variable  Coefficient
0                     X1 transaction date     5.434905
1                            X2 house age    -0.264976
2  X3 distance to the nearest MRT station    -0.004938
3         X4 number of convenience stores     1.153346
4                             X5 latitude   134.123150
5                            X6 longitude    -0.000000

Ridge Regression Results:
Best Alpha: 0.01
Mean Squared Error: 53.377710891581515
R^2 Score: 0.6818205044110658

Coefficients:
                                 Variable  Coefficient
0                     X1 transaction date     5.516721
1                            X2 house age    -0.268184
2  X3 distance to the nearest MRT station    -0.004988
3         X4 number of convenience stores     1.121934
4                             X5 latitude   177.003090
5                            X6