In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import LabelEncoder

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found.
KAGGLE_INPUT_ROOT = '/kaggle/input'
for dirname, _, filenames in os.walk(KAGGLE_INPUT_ROOT):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from lime.lime_tabular import LimeTabularExplainer

In [None]:
# Load the NY-House-Dataset CSV (path is relative to the working directory)
# into the frame that the later cells read from and add columns to.
df = pd.read_csv('NY-House-Dataset.csv')
# Bare last expression: the notebook renders the frame as this cell's output.
df

In [None]:

# Map each column name to how many distinct values that column holds.
distinct_value_counts = {
    column_name: column.nunique()
    for column_name, column in df.items()
}
distinct_value_counts

In [None]:
# For every column, keep the first five rows of its value_counts() result,
# i.e. the five most frequent values together with their counts.
unique_value_counts = {
    column_name: column.value_counts().head()
    for column_name, column in df.items()
}
unique_value_counts

In [None]:
from sklearn.preprocessing import StandardScaler

# Number of highest-priced rows left out of the sorted view used for plotting.
N_TOP_OUTLIERS = 2

# Standardize PRICE. fit_transform returns an (n, 1) array, so flatten it to
# 1-D before storing it as a single column.
scaler = StandardScaler()
df['PRICE_standardized'] = scaler.fit_transform(df[['PRICE']]).ravel()

# Most expensive first, then skip the first N_TOP_OUTLIERS rows.
# sorted_df is built before BROKERTITLE is dropped, so it still carries it.
sorted_df = df.sort_values(by='PRICE_standardized', ascending=False)
sorted_df = sorted_df[N_TOP_OUTLIERS:]

# errors='ignore' keeps the cell re-runnable: on a second execution the column
# is already gone and a plain drop would raise KeyError.
df = df.drop(columns='BROKERTITLE', errors='ignore')

In [None]:
# Preview the first rows of the price-sorted frame built in the previous cell.
sorted_df.head()

In [None]:
import matplotlib.pyplot as plt

# One box per sublocality, drawn from the sorted frame that already has the
# top-priced rows removed.
grouped = sorted_df.groupby('SUBLOCALITY')

fig, ax = plt.subplots(figsize=(10, 6))

# Tick positions and their labels, collected while the boxes are drawn.
xticklabels = []
positions = []

for i, (name, group) in enumerate(grouped):
    # 'whis' controls how far the whiskers reach (in IQR multiples); points
    # beyond them are drawn as fliers.
    ax.boxplot(group['PRICE_standardized'], positions=[i], whis=1.5)

    xticklabels.append(name)
    positions.append(i)

ax.set_xlabel('SUBLOCALITY')

# The plotted values are the standardized prices, not raw PRICE, so the axis
# label has to say so.
ax.set_ylabel('PRICE (standardized)')

# Anchor the rotated labels at their right end so each one sits under its box.
ax.set_xticks(positions)
ax.set_xticklabels(xticklabels, rotation=45, ha='right')

ax.set_title('Boxplot of Prices for Each Sublocality (Adjusted for Outliers)')

plt.show()

In [None]:
import seaborn as sns

# Mean PRICE per sublocality, computed on df (not on the trimmed sorted_df).
average_prices = df.groupby('SUBLOCALITY')['PRICE'].mean()

sns.set(style="whitegrid")

# Draw the bars on a fresh 10x6 figure.
plt.figure(figsize=(10, 6))
barplot = sns.barplot(x=average_prices.index, y=average_prices.values, palette="viridis")

# Shared text styling for the two axis labels.
axis_label_style = dict(fontsize=14, fontweight='bold', color='navy')
plt.xlabel('Sublocality', **axis_label_style)
plt.ylabel('Average Price', **axis_label_style)
plt.title('Average Prices by Sublocality', fontsize=16, fontweight='bold', color='darkred')

# Shared text styling for the tick labels; x labels are turned vertical.
tick_style = dict(fontsize=12, fontweight='bold')
plt.xticks(rotation=90, **tick_style)
plt.yticks(**tick_style)

# Write each bar's height, to two decimals, just above the bar.
for bar in barplot.patches:
    bar_height = bar.get_height()
    bar_center_x = bar.get_x() + bar.get_width() / 2.
    barplot.annotate(f'{bar_height:.2f}',
                     (bar_center_x, bar_height),
                     ha='center', va='center',
                     xytext=(0, 9),
                     textcoords='offset points',
                     fontsize=12)

plt.tight_layout()
plt.show()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Integer-encode SUBLOCALITY so it can be used as a model feature.
label_encoder = LabelEncoder()
df['SUBLOCALITY_encoded'] = label_encoder.fit_transform(df['SUBLOCALITY'])

# Selecting the features (X) and the target (Y)
X = df[['SUBLOCALITY_encoded', 'PRICE', 'BEDS', 'BATH']]
Y = df['TYPE']

# Preview only: drop() returns a new frame and the result is not assigned, so
# df itself keeps PRICE (later cells still read it). Show a few rows rather
# than the whole frame.
df.drop(columns='PRICE').head()

In [None]:
# Class balance of the classification target: number of rows per TYPE value.
Y.value_counts()

In [None]:
# Preview the first rows of the classifier's feature matrix.
X.head()

In [None]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# split is the same on every run.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)


In [None]:
# Fit a random forest on the training split and predict TYPE for the test split.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, Y_train)
Y_pred = clf.predict(X_test)

# Fraction of test rows whose predicted TYPE matches the true TYPE.
accuracy = accuracy_score(Y_test, Y_pred)
print(f"Accuracy: {accuracy}")

# Sorted union of the labels seen in the truth and in the predictions. Passing
# it explicitly pins the row/column order of the matrix so the same array can
# be reused as tick labels on the heatmap.
class_labels = np.unique(np.concatenate([np.asarray(Y_test), np.asarray(Y_pred)]))
conf_matrix = confusion_matrix(Y_test, Y_pred, labels=class_labels)

# Plot the matrix with class names on both axes instead of bare positions.
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='g',
            xticklabels=class_labels, yticklabels=class_labels)
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()

In [None]:
# List the columns currently present on df before choosing which to drop.
df.columns

In [None]:
# Columns excluded from the price model's input frame.
columns_to_remove = ['ADDRESS', 'STATE', 'MAIN_ADDRESS', 'STREET_NAME', 'LONG_NAME', 'FORMATTED_ADDRESS','PRICE_standardized','LATITUDE','LONGITUDE','SUBLOCALITY_encoded']
df_predict_price = df.drop(columns=columns_to_remove)
categorical_columns = ['ADMINISTRATIVE_AREA_LEVEL_2', 'LOCALITY', 'SUBLOCALITY', 'TYPE']

# Use a separate encoder per column and keep each one, so any encoded column
# can be mapped back to its original values with inverse_transform. Re-fitting
# a single shared encoder would leave only the last column's mapping.
# After the loop, label_encoder is still bound to the encoder fitted on the
# last entry of categorical_columns.
label_encoders = {}
for col in categorical_columns:
    label_encoder = LabelEncoder()
    df_predict_price[col] = label_encoder.fit_transform(df_predict_price[col])
    label_encoders[col] = label_encoder

df_predict_price.head()

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# PRICE is the regression target; every other remaining column is a feature.
# These assignments replace the classifier's X and train/test splits.
y = df_predict_price['PRICE']
X = df_predict_price.drop(columns='PRICE')

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the forest on the training rows, then score it on the held-out rows.
regressor = RandomForestRegressor(random_state=42)
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)

mae, mse, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")
print(f"R² Score: {r2}")


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Importance scores from the fitted `regressor`, paired with the column names
# of the current feature frame X.
feature_importances = regressor.feature_importances_
features = X.columns

# Table of (feature, importance), most important first.
importances_df = (
    pd.DataFrame({'Feature': features, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)

print(importances_df)

# Horizontal bar chart of the same table.
plt.figure(figsize=(10, 6))
sns.barplot(data=importances_df, x='Importance', y='Feature')
plt.title('Feature Importances')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.show()

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Default-parameter gradient boosting model on the current train/test split.
gb_regressor = GradientBoostingRegressor(random_state=42)
gb_regressor.fit(X_train, y_train)

y_pred_gb = gb_regressor.predict(X_test)

# Same three metrics as reported for the random forest.
mae_gb, mse_gb, r2_gb = (
    metric(y_test, y_pred_gb)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

print(f"Mean Absolute Error (Gradient Boosting): {mae_gb}")
print(f"Mean Squared Error (Gradient Boosting): {mse_gb}")
print(f"R² Score (Gradient Boosting): {r2_gb}")


In [None]:
from sklearn.preprocessing import StandardScaler

# Columns to standardize; df_predict_price is expected to be label-encoded
# already by the earlier cell.
numerical_columns = ['BEDS', 'BATH','PROPERTYSQFT']  # Adjust this list based on your actual numerical columns

# NOTE(review): the scaler is fit on every row of df_predict_price, and the
# train/test split happens in a later cell, so test rows contribute to the
# fitted mean and variance. Confirm this is acceptable.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df_predict_price[numerical_columns])

# Overwrite the raw columns with their standardized versions.
df_predict_price[numerical_columns] = scaled_values

df_predict_price.head()


In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Rebuild features/target and the split from df_predict_price as it stands
# now, i.e. after the numerical columns were standardized in place.
X = df_predict_price.drop('PRICE', axis=1)
y = df_predict_price['PRICE']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Base estimator whose hyperparameters are tuned below.
gb_regressor = GradientBoostingRegressor(random_state=42)

# 3 x 3 x 3 = 27 candidate settings.
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5]
}

# 5-fold CV over 27 candidates is 135 fits. n_jobs=-1 spreads them over all
# available cores; with the fixed random_state the selected model is the same
# as in a serial run.
grid_search = GridSearchCV(gb_regressor, param_grid, cv=5, scoring='r2', n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)

# Estimator refit on the whole training split with the best parameters.
best_model = grid_search.best_estimator_

# Score the tuned model on the held-out test split.
y_pred = best_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R² Score: {r2}")

In [None]:
# Explain one test-set prediction of the tuned model (best_model) with LIME.
from lime.lime_tabular import LimeTabularExplainer

# The explainer samples perturbations around the instance; random_state makes
# those samples, and therefore the explanation, repeatable between runs.
# NOTE(review): the label-encoded categorical columns are not declared via
# categorical_features, so LIME treats them as continuous. Confirm intended.
explainer = LimeTabularExplainer(training_data=X_train.values,
                                 feature_names=X.columns.tolist(),
                                 class_names=['Price'],
                                 mode='regression',
                                 random_state=42)

# Select an instance to explain
instance_index = 0  # Example index, choose appropriately
instance = X_test.iloc[instance_index]

# Pass a plain 1-D NumPy array: LIME indexes data_row by integer position,
# which is deprecated on a pandas Series with string labels.
exp = explainer.explain_instance(data_row=instance.to_numpy(), predict_fn=best_model.predict)

# Render the explanation, including the feature-value table, in the notebook.
exp.show_in_notebook(show_table=True)


In [None]:
# Refit a default-parameter gradient boosting model on the current split
# (rebuilt from the standardized frame in the grid-search cell) and report
# the same three metrics as before.
gb_regressor = GradientBoostingRegressor(random_state=42)
gb_regressor.fit(X_train, y_train)

y_pred_gb = gb_regressor.predict(X_test)

gb_scores = [
    score_fn(y_test, y_pred_gb)
    for score_fn in (mean_absolute_error, mean_squared_error, r2_score)
]
mae_gb, mse_gb, r2_gb = gb_scores

print(f"Mean Absolute Error (Gradient Boosting): {mae_gb}")
print(f"Mean Squared Error (Gradient Boosting): {mse_gb}")
print(f"R² Score (Gradient Boosting): {r2_gb}")
