In [None]:
### original code on which the test has been run

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from imblearn.over_sampling import SMOTE

# Step 1: Load the dataset
# NOTE(review): absolute, machine-specific path; the notebook only runs where this file exists.
df = pd.read_csv(r"C:\Users\91727\OneDrive\Desktop\hodson\vishesh codes and files\hodson.csv", low_memory=False)

# Step 2: Data Preprocessing (Handling Missing Values)
# Numeric columns: missing entries take the column median.
for numeric_col in ('Assessed Value', 'Sale Amount', 'Sales Ratio'):
    df[numeric_col] = df[numeric_col].fillna(df[numeric_col].median())

# Categorical columns: missing entries take the most frequent value (first mode).
for categorical_col in ('Town', 'Property Type', 'Residential Type'):
    df[categorical_col] = df[categorical_col].fillna(df[categorical_col].mode()[0])

# Step 3: Feature Engineering (creating Likely_to_Sell_Flag from remarks)
# Phrases whose presence in a remarks field marks the sale as "likely to sell".
# Matching is done against the upper-cased remark text.
keywords = ['ESTATE SALE', 'SHORT SALE', 'RENOVATED', 'TOTAL RENOVATION', 'MUST SELL', 'MOVING SALE', 'DISTRESSED']

def flag_likely_to_sell(remarks):
    """Return 1 if ``remarks`` contains any phrase from ``keywords``, else 0.

    Parameters
    ----------
    remarks : scalar
        One cell of a remarks column. Missing values (``None``/NaN) and
        blank strings yield 0. Non-string values are converted with ``str``
        before matching.

    Returns
    -------
    int
        1 when a keyword occurs in the upper-cased text, otherwise 0.
    """
    if pd.isna(remarks):
        return 0
    # Convert to text *before* calling string methods: the earlier version
    # called ``remarks.strip()`` first and raised AttributeError on any
    # non-string, non-missing value (e.g. a number parsed from the CSV).
    text = str(remarks).strip().upper()
    if not text:
        return 0
    return int(any(keyword in text for keyword in keywords))

# A row is flagged when either remarks column contains a keyword.
assessor_hits = df['Assessor Remarks'].apply(flag_likely_to_sell)
opm_hits = df['OPM remarks'].apply(flag_likely_to_sell)
df['Likely_to_Sell_Flag'] = assessor_hits | opm_hits

# Step 4: Feature and Target Selection
# The target, identifiers, free text and location columns are not used as features.
non_feature_columns = ['Likely_to_Sell_Flag', 'Serial Number', 'Address', 'Date Recorded',
                       'Non Use Code', 'Assessor Remarks', 'OPM remarks', 'Location']
y = df['Likely_to_Sell_Flag']
X = df.drop(columns=non_feature_columns)

df.head(10)  # not the last expression of the cell, so this preview is not displayed

# Step 5: Handle Class Imbalance with SMOTE (Optional)
# Rebuild the feature frame from df (same columns as above).
X = df.drop(columns=non_feature_columns)

# Replace missing categorical values with 'Unknown'
for categorical_col in ('Town', 'Property Type', 'Residential Type'):
    X[categorical_col] = X[categorical_col].fillna('Unknown')

# Replace missing numerical values with median
for numeric_col in ('Assessed Value', 'Sale Amount', 'Sales Ratio'):
    X[numeric_col] = X[numeric_col].fillna(X[numeric_col].median())

# Convert categorical columns to numeric using one-hot encoding
X = pd.get_dummies(X, drop_first=True)

# Apply SMOTE
# NOTE(review): resampling happens before the train/test split further down,
# so synthetic rows also end up in the test set.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Step 6: Split the Data into Training and Testing Sets
# 80/20 hold-out split of the resampled data, seeded for repeatability.
split_parts = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts


# Step 7: Feature Scaling
# The scaler learns mean/variance from the training part only and is then
# applied unchanged to both parts.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


# Step 8: Define Models to Evaluate
# Insertion order is the order in which the models are trained and reported.
models = {}
models["Logistic Regression"] = LogisticRegression(random_state=42, max_iter=500)
models["Random Forest"] = RandomForestClassifier(n_estimators=100, random_state=42)
models["Support Vector Machine (SVM)"] = SVC(random_state=42)
models["K-Nearest Neighbors (KNN)"] = KNeighborsClassifier()
models["Decision Tree"] = DecisionTreeClassifier(random_state=42)
models["Naive Bayes"] = GaussianNB()
models["Gradient Boosting"] = GradientBoostingClassifier(random_state=42)
models["AdaBoost"] = AdaBoostClassifier(random_state=42)
models["Ridge Classifier"] = RidgeClassifier(random_state=42)

# Step 9: Function to Evaluate Models
def evaluate_models(X, y):
    """Train and score every estimator in the module-level ``models`` dict.

    ``X`` and ``y`` are split 80/20 (``random_state=42``), standardised with
    a scaler fitted on the training part only, and each model is fitted on
    the training part and scored on the held-out part. Accuracy, the
    classification report and the confusion matrix are printed per model.

    The previous body ignored both arguments and read the global
    ``X_train_scaled`` / ``y_train`` / ``X_test_scaled`` / ``y_test``. The
    split here uses the same parameters as the notebook-level split, so
    calling it with ``X_resampled, y_resampled`` evaluates on the same
    partition.

    Parameters
    ----------
    X : array-like
        Numeric feature matrix.
    y : array-like
        Class labels aligned with ``X``.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    X_fit, X_eval, y_fit, y_eval = train_test_split(X, y, test_size=0.2, random_state=42)

    # Fit the scaler on the training part only so no test statistics leak in.
    local_scaler = StandardScaler()
    X_fit_scaled = local_scaler.fit_transform(X_fit)
    X_eval_scaled = local_scaler.transform(X_eval)

    for model_name, model in models.items():
        print(f"\nEvaluating {model_name}...")

        # Train the model
        model.fit(X_fit_scaled, y_fit)

        # Make predictions
        y_pred = model.predict(X_eval_scaled)

        # Calculate accuracy
        accuracy = accuracy_score(y_eval, y_pred)
        print(f"Accuracy of {model_name}: {accuracy * 100:.2f}%")

        # Print classification report
        print("Classification Report:")
        print(classification_report(y_eval, y_pred))

        # Print confusion matrix
        print("Confusion Matrix:")
        print(confusion_matrix(y_eval, y_pred))
        print("-" * 50)


# Step 10: Run the Model Evaluation
evaluate_models(X_resampled, y_resampled)
        
        


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from imblearn.over_sampling import SMOTE


In [2]:
# Step 1: Load the dataset
# NOTE(review): absolute, machine-specific path; the notebook only runs where this file exists.
csv_path = r"C:\Users\91727\OneDrive\Desktop\hodson\vishesh codes and files\hodson.csv"
df = pd.read_csv(csv_path, low_memory=False)

In [3]:
# Step 2: Data Preprocessing (Handling Missing Values)
# Numeric columns: missing entries take the column median.
for numeric_col in ('Assessed Value', 'Sale Amount', 'Sales Ratio'):
    df[numeric_col] = df[numeric_col].fillna(df[numeric_col].median())

# Categorical columns: missing entries take the most frequent value (first mode).
for categorical_col in ('Town', 'Property Type', 'Residential Type'):
    df[categorical_col] = df[categorical_col].fillna(df[categorical_col].mode()[0])

In [4]:
# Step 3: Feature Engineering (creating Likely_to_Sell_Flag from remarks)
# Phrases whose presence in a remarks field marks the sale as "likely to sell".
# Matching is done against the upper-cased remark text.
keywords = ['ESTATE SALE', 'SHORT SALE', 'RENOVATED', 'TOTAL RENOVATION', 'MUST SELL', 'MOVING SALE', 'DISTRESSED']

def flag_likely_to_sell(remarks):
    """Return 1 if ``remarks`` contains any phrase from ``keywords``, else 0.

    Parameters
    ----------
    remarks : scalar
        One cell of a remarks column. Missing values (``None``/NaN) and
        blank strings yield 0. Non-string values are converted with ``str``
        before matching.

    Returns
    -------
    int
        1 when a keyword occurs in the upper-cased text, otherwise 0.
    """
    if pd.isna(remarks):
        return 0
    # Convert to text *before* calling string methods: the earlier version
    # called ``remarks.strip()`` first and raised AttributeError on any
    # non-string, non-missing value (e.g. a number parsed from the CSV).
    text = str(remarks).strip().upper()
    if not text:
        return 0
    return int(any(keyword in text for keyword in keywords))

# A row is flagged when either remarks column contains a keyword.
assessor_hits = df['Assessor Remarks'].apply(flag_likely_to_sell)
opm_hits = df['OPM remarks'].apply(flag_likely_to_sell)
df['Likely_to_Sell_Flag'] = assessor_hits | opm_hits


In [5]:
# Step 4: Feature and Target Selection
# The target, identifiers, free text and location columns are not used as features.
non_feature_columns = ['Likely_to_Sell_Flag', 'Serial Number', 'Address', 'Date Recorded',
                       'Non Use Code', 'Assessor Remarks', 'OPM remarks', 'Location']
y = df['Likely_to_Sell_Flag']
X = df.drop(columns=non_feature_columns)

In [6]:
# Preview the first ten rows, including the newly added Likely_to_Sell_Flag column.
df.head(n=10)

Unnamed: 0,Serial Number,List Year,Date Recorded,Town,Address,Assessed Value,Sale Amount,Sales Ratio,Property Type,Residential Type,Non Use Code,Assessor Remarks,OPM remarks,Location,Likely_to_Sell_Flag
0,190176,2019,01-06-2020,Seymour,236 ROOSEVELT DR,52570,75000.0,0.7009,Single Family,Single Family,25 - Other,2 PROPERTIES,,POINT (-73.1376 41.35798),0
1,190711,2019,29-04-2020,Norwalk,11 COLUMBINE LN,311630,555000.0,0.5615,Single Family,Single Family,,5-31-126-0,,,0
2,190509,2019,01-09-2020,Wethersfield,3 WHEELER RD,162230,225000.0,0.721,Single Family,Single Family,10 - A Will,ESTATE SALE,,,1
3,20058,2020,01-06-2021,Barkhamsted,46 RATLUM MTN RD,203530,415000.0,0.490434,Residential,Single Family,,"2003 COLONIAL, 2140 SFLA, 2.99 AC",,,0
4,200207,2020,23-11-2020,Bristol,BROAD PLACE LOTS 225-6 AND 224,8400,38500.0,0.2181,Vacant Land,Single Family,25 - Other,MULTIPLE LOT SALE,,POINT (-72.90406 41.66996),0
5,20093,2020,02-02-2021,Burlington,57 ROCK RD,194670,275000.0,0.7078,Residential,Single Family,10 - A Will,ESTATE SALE,,,1
6,200142,2020,04-03-2021,Coventry,SQUIRREL TR,35600,42750.0,0.8327,Vacant Land,Single Family,,R/C/8,,,0
7,200110,2020,14-01-2021,Cromwell,25 MIDWAY DR,128200,208000.0,0.6163,Residential,Condo,25 - Other,BAA OVERRIDE,,,0
8,200485,2020,13-01-2021,Danbury,27 CROWS NEST LA 6A,110300,195000.0,0.5656,Residential,Condo,,L15008-61,,,0
9,200915,2020,10-05-2021,Danbury,51 OLD BOSTON POST RD,191600,394000.0,0.4862,Residential,Single Family,,H22043,,,0


In [7]:
# Step 1: Drop unnecessary columns (already done earlier)
non_feature_columns = ['Likely_to_Sell_Flag', 'Serial Number', 'Address', 'Date Recorded',
                       'Non Use Code', 'Assessor Remarks', 'OPM remarks', 'Location']
X = df.drop(columns=non_feature_columns)

# Step 2: Handle missing values
# Replace missing categorical values with 'Unknown'
for categorical_col in ('Town', 'Property Type', 'Residential Type'):
    X[categorical_col] = X[categorical_col].fillna('Unknown')

# Replace missing numerical values with median
for numeric_col in ('Assessed Value', 'Sale Amount', 'Sales Ratio'):
    X[numeric_col] = X[numeric_col].fillna(X[numeric_col].median())

# Step 3: Convert categorical columns to numeric using one-hot encoding
X = pd.get_dummies(X, drop_first=True)

# Step 4: Apply SMOTE
# NOTE(review): resampling happens before the train/test split in the next
# cell, so synthetic rows also end up in the test set.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)


In [8]:
# Step 6: Split the Data into Training and Testing Sets
# 80/20 hold-out split of the resampled data, seeded for repeatability.
split_parts = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [9]:
# Step 7: Feature Scaling
# The scaler learns mean/variance from the training part only and is then
# applied unchanged to both parts.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [10]:

# Step 8: Define Models to Evaluate
# Insertion order is the order in which the models are trained and reported.
models = {}
models["Logistic Regression"] = LogisticRegression(random_state=42, max_iter=500)
models["Random Forest"] = RandomForestClassifier(n_estimators=100, random_state=42)
models["Support Vector Machine (SVM)"] = SVC(random_state=42)
models["K-Nearest Neighbors (KNN)"] = KNeighborsClassifier()
models["Decision Tree"] = DecisionTreeClassifier(random_state=42)
models["Naive Bayes"] = GaussianNB()
models["Gradient Boosting"] = GradientBoostingClassifier(random_state=42)
models["AdaBoost"] = AdaBoostClassifier(random_state=42)
models["Ridge Classifier"] = RidgeClassifier(random_state=42)


In [11]:
# Step 9: Function to Evaluate Models
def evaluate_models(X, y):
    """Train and score every estimator in the module-level ``models`` dict.

    ``X`` and ``y`` are split 80/20 (``random_state=42``), standardised with
    a scaler fitted on the training part only, and each model is fitted on
    the training part and scored on the held-out part. Accuracy, the
    classification report and the confusion matrix are printed per model.

    The previous body ignored both arguments and read the global
    ``X_train_scaled`` / ``y_train`` / ``X_test_scaled`` / ``y_test``. The
    split here uses the same parameters as the notebook-level split, so
    calling it with ``X_resampled, y_resampled`` evaluates on the same
    partition.

    Parameters
    ----------
    X : array-like
        Numeric feature matrix.
    y : array-like
        Class labels aligned with ``X``.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    X_fit, X_eval, y_fit, y_eval = train_test_split(X, y, test_size=0.2, random_state=42)

    # Fit the scaler on the training part only so no test statistics leak in.
    local_scaler = StandardScaler()
    X_fit_scaled = local_scaler.fit_transform(X_fit)
    X_eval_scaled = local_scaler.transform(X_eval)

    for model_name, model in models.items():
        print(f"\nEvaluating {model_name}...")

        # Train the model
        model.fit(X_fit_scaled, y_fit)

        # Make predictions
        y_pred = model.predict(X_eval_scaled)

        # Calculate accuracy
        accuracy = accuracy_score(y_eval, y_pred)
        print(f"Accuracy of {model_name}: {accuracy * 100:.2f}%")

        # Print classification report
        print("Classification Report:")
        print(classification_report(y_eval, y_pred))

        # Print confusion matrix
        print("Confusion Matrix:")
        print(confusion_matrix(y_eval, y_pred))
        print("-" * 50)


In [12]:
# Step 10: Run the Model Evaluation
# Arguments are passed by keyword to make explicit which dataset is evaluated.
evaluate_models(X=X_resampled, y=y_resampled)


Evaluating Logistic Regression...
Accuracy of Logistic Regression: 93.01%
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.99      0.93     27312
           1       0.98      0.87      0.93     27310

    accuracy                           0.93     54622
   macro avg       0.94      0.93      0.93     54622
weighted avg       0.94      0.93      0.93     54622

Confusion Matrix:
[[26917   395]
 [ 3421 23889]]
--------------------------------------------------

Evaluating Random Forest...
Accuracy of Random Forest: 95.18%
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.96      0.95     27312
           1       0.96      0.94      0.95     27310

    accuracy                           0.95     54622
   macro avg       0.95      0.95      0.95     54622
weighted avg       0.95      0.95      0.95     54622

Confusion Matrix:
[[26252  1060]
 [ 1572 25738]]
-------------

In [None]:
### Code with modifications, which I will run later

# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from imblearn.over_sampling import SMOTE
from collections import Counter  # <CHANGE>

# Step 1: Load the dataset
# NOTE(review): absolute, machine-specific path; the notebook only runs where this file exists.
df = pd.read_csv(r"C:\Users\91727\OneDrive\Desktop\hodson\vishesh codes and files\hodson.csv", low_memory=False)

# Step 2: Data Preprocessing (Handling Missing Values)
# Numeric columns: missing entries take the column median.
for numeric_col in ('Assessed Value', 'Sale Amount', 'Sales Ratio'):
    df[numeric_col] = df[numeric_col].fillna(df[numeric_col].median())

# Categorical columns: missing entries take the most frequent value (first mode).
for categorical_col in ('Town', 'Property Type', 'Residential Type'):
    df[categorical_col] = df[categorical_col].fillna(df[categorical_col].mode()[0])

# Step 3: Feature Engineering (creating Likely_to_Sell_Flag from remarks)
# Phrases whose presence in a remarks field marks the sale as "likely to sell".
# Matching is done against the upper-cased remark text.
keywords = ['ESTATE SALE', 'SHORT SALE', 'RENOVATED', 'TOTAL RENOVATION', 'MUST SELL', 'MOVING SALE', 'DISTRESSED']

def flag_likely_to_sell(remarks):
    """Return 1 if ``remarks`` contains any phrase from ``keywords``, else 0.

    Parameters
    ----------
    remarks : scalar
        One cell of a remarks column. Missing values (``None``/NaN) and
        blank strings yield 0. Non-string values are converted with ``str``
        before matching.

    Returns
    -------
    int
        1 when a keyword occurs in the upper-cased text, otherwise 0.
    """
    if pd.isna(remarks):
        return 0
    # Convert to text *before* calling string methods: the earlier version
    # called ``remarks.strip()`` first and raised AttributeError on any
    # non-string, non-missing value (e.g. a number parsed from the CSV).
    text = str(remarks).strip().upper()
    if not text:
        return 0
    return int(any(keyword in text for keyword in keywords))

# A row is flagged when either remarks column contains a keyword.
df['Likely_to_Sell_Flag'] = df['Assessor Remarks'].apply(flag_likely_to_sell) | df['OPM remarks'].apply(flag_likely_to_sell)

# Step 4: Feature and Target Selection
# The target, identifiers, free text and location columns are not used as
# features. The numeric/categorical feature columns were already imputed on
# df in Step 2, so no further filling is needed here.
X = df.drop(columns=['Likely_to_Sell_Flag', 'Serial Number', 'Address', 'Date Recorded',
                     'Non Use Code', 'Assessor Remarks', 'OPM remarks', 'Location'])
y = df['Likely_to_Sell_Flag']

# Print dataset shapes before SMOTE
print(f"Dataset shape before SMOTE: {X.shape}, Target shape: {y.shape}")  # <CHANGE>

# Step 5: Convert categorical columns to numeric using one-hot encoding
X = pd.get_dummies(X, drop_first=True)

# Step 6: Split the Data into Training and Testing Sets
# The split must come BEFORE resampling. Running SMOTE on the full dataset
# (as the earlier version did) puts synthetic rows into the test set and lets
# test rows seed synthetic training rows, which inflates every reported metric.
# stratify=y keeps the class ratio of the imbalanced target in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Step 7: Handle Class Imbalance with SMOTE (training part only)
# The "before" count below refers to the training part.
print(f"Class distribution before SMOTE: {Counter(y_train)}")  # <CHANGE>
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
print(f"Class distribution after SMOTE: {Counter(y_resampled)}")  # <CHANGE>

# From here on the models train on the balanced training data; the test part
# keeps its original, untouched class distribution.
X_train, y_train = X_resampled, y_resampled

# Step 8: Feature Scaling (fit on training data only, apply to both parts)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Check scaling
print("First 5 rows of scaled training data:")  # <CHANGE>
print(X_train_scaled[:5])  # <CHANGE>

# Step 8: Define Models to Evaluate
# Insertion order is the order in which the models are trained and reported.
models = {}
models["Logistic Regression"] = LogisticRegression(random_state=42, max_iter=500)
models["Random Forest"] = RandomForestClassifier(n_estimators=100, random_state=42)
models["Support Vector Machine (SVM)"] = SVC(random_state=42)
models["K-Nearest Neighbors (KNN)"] = KNeighborsClassifier()
models["Decision Tree"] = DecisionTreeClassifier(random_state=42)
models["Naive Bayes"] = GaussianNB()
models["Gradient Boosting"] = GradientBoostingClassifier(random_state=42)
models["AdaBoost"] = AdaBoostClassifier(random_state=42)
models["Ridge Classifier"] = RidgeClassifier(random_state=42)

# Step 9: Evaluate every model and collect its metrics
results = {}  # <CHANGE>
for model_name, model in models.items():
    print(f"\nEvaluating {model_name}...")  # <CHANGE>

    # Fit on the scaled training data, predict the held-out part.
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)

    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy of {model_name}: {accuracy * 100:.2f}%")

    # Keep a JSON-friendly copy of the metrics (dict report, nested-list matrix).
    report_as_dict = classification_report(y_test, y_pred, output_dict=True)
    matrix = confusion_matrix(y_test, y_pred)
    results[model_name] = {  # <CHANGE>
        "accuracy": accuracy,
        "classification_report": report_as_dict,
        "confusion_matrix": matrix.tolist()
    }

    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:")
    print(matrix)
    print("-" * 50)

# Save evaluation results to JSON (written to the current working directory)
import json  # <CHANGE>
with open("model_evaluation_results.json", "w") as out_file:  # <CHANGE>
    json.dump(results, out_file)  # <CHANGE>
    
# Feature importance analysis
# Reuse the forest that the evaluation loop already fitted on
# X_train_scaled / y_train. The earlier version trained a second, unseeded
# RandomForestClassifier(), so the ranking changed from run to run and did
# not describe the model whose metrics were reported.
feature_importances = models["Random Forest"].feature_importances_  # <CHANGE>
# X.columns holds the one-hot encoded feature names, in the same order as the
# columns of the scaled training matrix.
important_features = pd.Series(feature_importances, index=X.columns).sort_values(ascending=False)  # <CHANGE>
print("Top 10 important features:")  # <CHANGE>
print(important_features.head(10))  # <CHANGE>
