In [1]:
# Install necessary libraries
!pip install pandas numpy scikit-learn matplotlib seaborn




In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [3]:
# Load the dataset with low_memory=False to avoid dtype warnings
df = pd.read_csv(r"C:\Users\Jhanvi\Downloads\Real_Estate_Sales_2001-2020_GL.csv", low_memory=False)



In [4]:
# Check the first few rows of the dataset
df.head()


Unnamed: 0,Serial Number,List Year,Date Recorded,Town,Address,Assessed Value,Sale Amount,Sales Ratio,Property Type,Residential Type,Non Use Code,Assessor Remarks,OPM remarks,Location
0,2020177,2020,04/14/2021,Ansonia,323 BEAVER ST,133000.0,248400.0,0.5354,Residential,Single Family,,,,POINT (-73.06822 41.35014)
1,2020225,2020,05/26/2021,Ansonia,152 JACKSON ST,110500.0,239900.0,0.4606,Residential,Three Family,,,,
2,2020348,2020,09/13/2021,Ansonia,230 WAKELEE AVE,150500.0,325000.0,0.463,Commercial,,,,,
3,2020090,2020,12/14/2020,Ansonia,57 PLATT ST,127400.0,202500.0,0.6291,Residential,Two Family,,,,
4,200500,2020,09/07/2021,Avon,245 NEW ROAD,217640.0,400000.0,0.5441,Residential,Single Family,,,,


In [5]:
# Check for missing values
df.isna().sum()


Serial Number            0
List Year                0
Date Recorded            2
Town                     0
Address                 51
Assessed Value           0
Sale Amount              0
Sales Ratio              0
Property Type       382446
Residential Type    388309
Non Use Code        707532
Assessor Remarks    847349
OPM remarks         987279
Location            799516
dtype: int64

In [6]:
# Define the keywords for the target flag
keywords = ['ESTATE SALE', 'SHORT SALE', 'RENOVATED', 'TOTAL RENOVATION', 'MUST SELL', 'MOVING SALE', 'DISTRESSED']

# Function to flag likely to sell based on remarks
def flag_likely_to_sell(remarks):
    if pd.isna(remarks) or remarks.strip() == "":
        return 0
    remarks = str(remarks).upper()
    if any(keyword in remarks for keyword in keywords):
        return 1
    return 0

# Apply the function to create the 'Likely_to_Sell_Flag' column
df['Likely_to_Sell_Flag'] = df['Assessor Remarks'].apply(flag_likely_to_sell) | df['OPM remarks'].apply(flag_likely_to_sell)

# Check the first few rows to verify the flag creation
df[['Assessor Remarks', 'OPM remarks', 'Likely_to_Sell_Flag']].head()


Unnamed: 0,Assessor Remarks,OPM remarks,Likely_to_Sell_Flag
0,,,0
1,,,0
2,,,0
3,,,0
4,,,0


In [7]:
# Check the number of missing values in each column
print(df.isnull().sum())

# Drop rows where the 'Likely_to_Sell_Flag' column is missing
df = df.dropna(subset=['Likely_to_Sell_Flag'])

# Fill missing numerical columns with the median (or mean, depending on your preference)
df['Assessed Value'] = df['Assessed Value'].fillna(df['Assessed Value'].median())
df['Sale Amount'] = df['Sale Amount'].fillna(df['Sale Amount'].median())
df['Sales Ratio'] = df['Sales Ratio'].fillna(df['Sales Ratio'].median())

# Fill missing categorical columns with the most frequent value
df['Town'] = df['Town'].fillna(df['Town'].mode()[0])
df['Property Type'] = df['Property Type'].fillna(df['Property Type'].mode()[0])
df['Residential Type'] = df['Residential Type'].fillna(df['Residential Type'].mode()[0])


Serial Number               0
List Year                   0
Date Recorded               2
Town                        0
Address                    51
Assessed Value              0
Sale Amount                 0
Sales Ratio                 0
Property Type          382446
Residential Type       388309
Non Use Code           707532
Assessor Remarks       847349
OPM remarks            987279
Location               799516
Likely_to_Sell_Flag         0
dtype: int64


In [8]:
# One-hot encoding for categorical columns
df = pd.get_dummies(df, columns=['Town', 'Property Type', 'Residential Type'], drop_first=True)

# Feature scaling for numerical columns
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# List of numerical features to scale
numerical_features = ['Assessed Value', 'Sale Amount', 'Sales Ratio']

df[numerical_features] = scaler.fit_transform(df[numerical_features])

# Check the updated dataframe
df.head()


Unnamed: 0,Serial Number,List Year,Date Recorded,Address,Assessed Value,Sale Amount,Sales Ratio,Non Use Code,Assessor Remarks,OPM remarks,...,Property Type_Public Utility,Property Type_Residential,Property Type_Single Family,Property Type_Three Family,Property Type_Two Family,Property Type_Vacant Land,Residential Type_Four Family,Residential Type_Single Family,Residential Type_Three Family,Residential Type_Two Family
0,2020177,2020,04/14/2021,323 BEAVER ST,-0.087479,-0.026696,-0.005243,,,,...,False,True,False,False,False,False,False,True,False,False
1,2020225,2020,05/26/2021,152 JACKSON ST,-0.100947,-0.028286,-0.005283,,,,...,False,True,False,False,False,False,False,False,True,False
2,2020348,2020,09/13/2021,230 WAKELEE AVE,-0.077004,-0.012371,-0.005282,,,,...,False,False,False,False,False,False,False,True,False,False
3,2020090,2020,12/14/2020,57 PLATT ST,-0.090831,-0.03528,-0.005194,,,,...,False,True,False,False,False,False,False,False,False,True
4,200500,2020,09/07/2021,245 NEW ROAD,-0.036815,0.001655,-0.005239,,,,...,False,True,False,False,False,False,False,True,False,False


In [9]:
from sklearn.model_selection import train_test_split

# Define the features (X) and target (y)
X = df.drop(columns=['Likely_to_Sell_Flag', 'Serial Number', 'Address', 'Date Recorded', 'Non Use Code', 'Assessor Remarks', 'OPM remarks', 'Location'])
y = df['Likely_to_Sell_Flag']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check the shapes of the splits
print(f"Training set: {X_train.shape}, Test set: {X_test.shape}")


Training set: (797770, 187), Test set: (199443, 187)


In [10]:
from sklearn.ensemble import RandomForestClassifier


In [11]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [12]:
# Initialize the Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model with the training data
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

# Confusion Matrix to check how well the model is classifying
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

# Classification Report for more detailed performance metrics (precision, recall, f1-score)
from sklearn.metrics import classification_report
print("Classification Report:")
print(classification_report(y_test, y_pred))


Model Accuracy: 98.38%
Confusion Matrix:
[[196101    216]
 [  3014    112]]
Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99    196317
           1       0.34      0.04      0.06      3126

    accuracy                           0.98    199443
   macro avg       0.66      0.52      0.53    199443
weighted avg       0.97      0.98      0.98    199443



In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [14]:
pip install scikit-learn==1.2.2 imbalanced-learn==0.10.1





In [15]:
pip install --upgrade pip 

Note: you may need to restart the kernel to use updated packages.


In [16]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)


In [17]:
from collections import Counter

# Check the class distribution before and after resampling
print("Original class distribution:", Counter(y))
print("Resampled class distribution:", Counter(y_resampled))


Original class distribution: Counter({0: 981733, 1: 15480})
Resampled class distribution: Counter({0: 981733, 1: 981733})


In [18]:
from sklearn.model_selection import train_test_split

# Split the resampled data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

print(f"Training data size: {len(X_train)}")
print(f"Test data size: {len(X_test)}")


Training data size: 1570772
Test data size: 392694


In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Initialize the model (you can change to other classifiers)
model = LogisticRegression(random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Classification Report:
               precision    recall  f1-score   support

           0       0.65      0.73      0.69    196150
           1       0.69      0.60      0.64    196544

    accuracy                           0.67    392694
   macro avg       0.67      0.67      0.67    392694
weighted avg       0.67      0.67      0.67    392694

Confusion Matrix:
 [[144014  52136]
 [ 78402 118142]]


In [20]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Scale the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the model on the scaled data
model = LogisticRegression(random_state=42, max_iter=500)
model.fit(X_train_scaled, y_train)


In [21]:
model = LogisticRegression(max_iter=500, random_state=42)
model.fit(X_train, y_train)


In [22]:
y_pred = model.predict(X_test)


In [None]:
from sklearn.metrics import classification_report, confusion_matrix

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


In [None]:
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt

# Predicted class probabilities
y_prob = model.predict_proba(X_test)[:, 1]  # Getting probabilities for the positive class


# ROC Curve and AUC
fpr, tpr, _ = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)
plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC Curve)')
plt.legend(loc='lower right')
plt.show()


In [None]:
from sklearn.model_selection import GridSearchCV

# Set hyperparameters for tuning
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'lbfgs'],
    'max_iter': [100, 200, 500]
}

grid_search = GridSearchCV(LogisticRegression(random_state=42), param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Best parameters
print("Best Hyperparameters:", grid_search.best_params_)
best_model = grid_search.best_estimator_

# Reevaluate the best model
y_pred_best = best_model.predict(X_test)
print(classification_report(y_test, y_pred_best))
