# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score

# Load the Excel file and read the first sheet. The context manager closes
# the workbook's file handle as soon as the sheet has been parsed.
file_path = 'NY_regression.xlsx'
with pd.ExcelFile(file_path) as xls:
    df = pd.read_excel(xls, sheet_name='Sheet1')

# Keep only the target (price) and the predictor columns
df_cleaned = df[['price', 'beds', 'baths', 'sqft', 'lat', 'lon']]

# Drop every row that has a missing value in any retained column
df_cleaned = df_cleaned.dropna()

# Trim outliers: keep rows whose price AND sqft both lie inside the
# 1st-99th percentile range. Both pairs of bounds are computed up front,
# before any row is removed, so the two filters are independent.
q_low = df_cleaned[['price', 'sqft']].quantile(0.01)
q_high = df_cleaned[['price', 'sqft']].quantile(0.99)
price_in_range = df_cleaned['price'].between(q_low['price'], q_high['price'])
sqft_in_range = df_cleaned['sqft'].between(q_low['sqft'], q_high['sqft'])

# The price is log-transformed further down; a zero or negative price would
# turn into -inf/NaN there and break model fitting, so drop such rows here.
price_is_positive = df_cleaned['price'] > 0

df_cleaned = df_cleaned[price_in_range & sqft_in_range & price_is_positive]

# Assemble the design matrix (predictors) and the regression target
predictor_columns = ['beds', 'baths', 'sqft', 'lat', 'lon']
X = df_cleaned[predictor_columns]

# The models are fitted on log(price) rather than on the raw price
y = np.log(df_cleaned['price'])

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# First Model: plain linear regression on the raw predictors
model = LinearRegression()
model.fit(X_train, y_train)

# Score the fitted model on the held-out rows
y_pred = model.predict(X_test)
r_squared = r2_score(y_test, y_pred)

# Adjusted R-squared penalises R-squared for the number of predictors,
# using the test-set sample size and the column count of X_test.
n_test_rows = len(y_test)
n_predictors = X_test.shape[1]
adjusted_r_squared = 1 - (1 - r_squared) * (n_test_rows - 1) / (n_test_rows - n_predictors - 1)
print(f"Linear Regression - R-squared: {r_squared}, Adjusted R-squared: {adjusted_r_squared}")

# Second Model: Pipeline with Polynomial Features and Feature Selection
# Steps: standardise -> expand to degree-2 terms -> keep the 8 terms with the
# highest univariate F-score -> ordinary least squares.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    # include_bias=False: LinearRegression already fits an intercept, and a
    # constant column is a zero-variance feature that f_regression cannot
    # score meaningfully.
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('select', SelectKBest(score_func=f_regression, k=8)),
    ('model', LinearRegression())
])
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
r_squared = r2_score(y_test, y_pred)

# Adjusted R-squared must be penalised by the number of predictors the final
# regressor actually receives (the selected polynomial terms), not by the
# number of raw input columns.
n_selected = int(pipeline.named_steps['select'].get_support().sum())
adjusted_r_squared = 1 - (1-r_squared) * (len(y_test)-1)/(len(y_test)-n_selected-1)
print(f"Polynomial Regression - R-squared: {r_squared}, Adjusted R-squared: {adjusted_r_squared}")

# Final Model: Gradient Boosting Regressor with Cross-Validation
gbr_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

# Cross-validate on the training split only, so the hold-out rows stay unseen
# until the final evaluation. train_test_split shuffles by default, so the
# 10 consecutive folds are drawn from randomly ordered rows rather than from
# whatever order the spreadsheet happens to be in.
cv_scores_gbr = cross_val_score(gbr_model, X_train, y_train, cv=10, scoring='r2')
mean_cv_r_squared_gbr = np.mean(cv_scores_gbr)
std_cv_r_squared_gbr = np.std(cv_scores_gbr)
print(f"Gradient Boosting - Mean R-squared: {mean_cv_r_squared_gbr}, Std Dev: {std_cv_r_squared_gbr}")

# Train the final model
gbr_model.fit(X_train, y_train)

# Score the final model on the same hold-out rows used for the other models,
# so the three R-squared figures are directly comparable.
gbr_test_r_squared = r2_score(y_test, gbr_model.predict(X_test))
print(f"Gradient Boosting - Test R-squared: {gbr_test_r_squared}")

# Predict new data (example); column names and order match the training features
new_data = pd.DataFrame({
    'beds': [5, 3],
    'baths': [4, 3],
    'sqft': [1500, 1000],
    'lat': [40.7128, 40.7306],
    'lon': [-74.0060, -73.9352]
})
# The model predicts log(price); exponentiate to return to the price scale
predictions = np.exp(gbr_model.predict(new_data))  # Inverse the log transformation
print("Predicted Prices:\n", predictions)

