In [31]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
from sklearn.utils import resample
from sklearn.metrics import precision_recall_curve

In [2]:
data = pd.read_csv(r'C:\Users\Loaner\Downloads\Real_Estate_Sales_2001-2020_GL.csv\Real_Estate_Sales_2001-2020_GL.csv',low_memory=False)

In [3]:
if 'Date Recorded' in data.columns:
    data['Date Recorded'] = pd.to_datetime(data['Date Recorded'], errors='coerce').dt.year

In [4]:
data['ReadyToSell'] = (data['Sales Ratio'] > 0.5).astype(int)

In [5]:
data['Property Type'] = data['Property Type'].fillna('Unknown')
data['Residential Type'] = data['Residential Type'].fillna('Unknown')
data['Location'] = data['Location'].fillna('Unknown')
data['Address'] = data['Address'].fillna('Unknown')
data['Sales Ratio'] = data['Sales Ratio'].fillna(data['Sales Ratio'].mean())
data['Assessed Value'] = data['Assessed Value'].fillna(data['Assessed Value'].mean())


In [6]:
data


Unnamed: 0,Serial Number,List Year,Date Recorded,Town,Address,Assessed Value,Sale Amount,Sales Ratio,Property Type,Residential Type,Non Use Code,Assessor Remarks,OPM remarks,Location,ReadyToSell
0,2020177,2020,2021.0,Ansonia,323 BEAVER ST,133000.0,248400.0,0.535400,Residential,Single Family,,,,POINT (-73.06822 41.35014),1
1,2020225,2020,2021.0,Ansonia,152 JACKSON ST,110500.0,239900.0,0.460600,Residential,Three Family,,,,Unknown,0
2,2020348,2020,2021.0,Ansonia,230 WAKELEE AVE,150500.0,325000.0,0.463000,Commercial,Unknown,,,,Unknown,0
3,2020090,2020,2020.0,Ansonia,57 PLATT ST,127400.0,202500.0,0.629100,Residential,Two Family,,,,Unknown,1
4,200500,2020,2021.0,Avon,245 NEW ROAD,217640.0,400000.0,0.544100,Residential,Single Family,,,,Unknown,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
997208,190272,2019,2020.0,New London,4 BISHOP CT,60410.0,53100.0,1.137665,Single Family,Single Family,14 - Foreclosure,,,Unknown,1
997209,190284,2019,2019.0,Waterbury,126 PERKINS AVE,68280.0,76000.0,0.898400,Single Family,Single Family,25 - Other,PRIVATE SALE,,Unknown,1
997210,190129,2019,2020.0,Windsor Locks,19 HATHAWAY ST,121450.0,210000.0,0.578300,Single Family,Single Family,,,,Unknown,1
997211,190504,2019,2020.0,Middletown,8 BYSTREK DR,203360.0,280000.0,0.726300,Single Family,Single Family,,,,Unknown,1


In [7]:
data_encoded = pd.get_dummies(data, columns=['Property Type', 'Residential Type'], drop_first=True)


In [8]:
X = data_encoded.drop(columns=['ReadyToSell', 'Sale Amount','Assessed Value','Sales Ratio']) 
y = data_encoded['ReadyToSell'] 
X = X.apply(pd.to_numeric, errors='coerce').fillna(0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [23]:
train_data = pd.concat([X_train, y_train], axis=1)

majority = train_data[train_data['ReadyToSell'] == 1]
minority = train_data[train_data['ReadyToSell'] == 0]

minority_oversampled = resample(
    minority,
    replace=True,                  
    n_samples=len(majority),       
    random_state=42                
)

balanced_train_data = pd.concat([majority, minority_oversampled])


X_train_balanced = balanced_train_data.drop(columns=['ReadyToSell'])
y_train_balanced = balanced_train_data['ReadyToSell']


In [25]:
gb_classifier = GradientBoostingClassifier(random_state=42)
gb_classifier.fit(X_train_balanced, y_train_balanced)

y_pred = gb_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

print("Classification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 75.56%
Classification Report:
              precision    recall  f1-score   support

           0       0.53      0.82      0.65     54251
           1       0.91      0.73      0.81    145192

    accuracy                           0.76    199443
   macro avg       0.72      0.77      0.73    199443
weighted avg       0.81      0.76      0.77    199443



In [35]:
gb_classifier = GradientBoostingClassifier(
    max_depth=5,
    learning_rate=0.05,
    n_estimators=200,
    subsample=0.8,
    random_state=42
)
gb_classifier.fit(X_train_balanced, y_train_balanced)
y_probs = gb_classifier.predict_proba(X_test)[:, 1]

threshold = 0.4  
y_pred_adjusted = (y_probs >= threshold).astype(int)

print("Classification Report with Adjusted Threshold:")
print(classification_report(y_test, y_pred_adjusted))

Classification Report with Adjusted Threshold:
              precision    recall  f1-score   support

           0       0.59      0.72      0.65     54251
           1       0.89      0.82      0.85    145192

    accuracy                           0.79    199443
   macro avg       0.74      0.77      0.75    199443
weighted avg       0.81      0.79      0.80    199443



In [38]:
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(gb_classifier, X, y, cv=5, scoring='accuracy')
print(f"Cross-Validation Accuracy: {np.mean(cv_scores):.2f} ± {np.std(cv_scores):.2f}")

Cross-Validation Accuracy: 0.36 ± 0.22
