In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from imblearn.over_sampling import SMOTE

# Step 1: Load the dataset
df = pd.read_csv(r"C:\Users\91727\OneDrive\Desktop\hodson\vishesh codes and files\hodson.csv", low_memory=False)

# Step 2: Data Preprocessing (Handling Missing Values)
# Example: Fill missing numerical columns with median and categorical columns with mode
df['Assessed Value'] = df['Assessed Value'].fillna(df['Assessed Value'].median())
df['Sale Amount'] = df['Sale Amount'].fillna(df['Sale Amount'].median())
df['Sales Ratio'] = df['Sales Ratio'].fillna(df['Sales Ratio'].median())
df['Town'] = df['Town'].fillna(df['Town'].mode()[0])
df['Property Type'] = df['Property Type'].fillna(df['Property Type'].mode()[0])
df['Residential Type'] = df['Residential Type'].fillna(df['Residential Type'].mode()[0])

# Step 3: Feature Engineering (creating Likely_to_Sell_Flag from remarks)
keywords = ['ESTATE SALE', 'SHORT SALE', 'RENOVATED', 'TOTAL RENOVATION', 'MUST SELL', 'MOVING SALE', 'DISTRESSED']

def flag_likely_to_sell(remarks):
    if pd.isna(remarks) or remarks.strip() == "":
        return 0
    remarks = str(remarks).upper()
    if any(keyword in remarks for keyword in keywords):
        return 1
    return 0

df['Likely_to_Sell_Flag'] = df['Assessor Remarks'].apply(flag_likely_to_sell) | df['OPM remarks'].apply(flag_likely_to_sell)

# Step 4: Feature and Target Selection
X = df.drop(columns=['Likely_to_Sell_Flag', 'Serial Number', 'Address', 'Date Recorded', 'Non Use Code', 'Assessor Remarks', 'OPM remarks', 'Location'])
y = df['Likely_to_Sell_Flag']

# Step 5: Handle Class Imbalance with SMOTE (Optional)
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Step 6: Split the Data into Training and Testing Sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Step 7: Feature Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 8: Define Models to Evaluate
models = {
    "Logistic Regression": LogisticRegression(random_state=42, max_iter=500),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Support Vector Machine (SVM)": SVC(random_state=42),
    "K-Nearest Neighbors (KNN)": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Naive Bayes": GaussianNB(),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "Ridge Classifier": RidgeClassifier(random_state=42)
}

# Step 9: Function to Evaluate Models
def evaluate_models(X, y):
    for model_name, model in models.items():
        print(f"\nEvaluating {model_name}...")
        
        # Train the model
        model.fit(X_train_scaled, y_train)
        
        # Make predictions
        y_pred = model.predict(X_test_scaled)
        
        # Calculate accuracy
        accuracy = accuracy_score(y_test, y_pred)
        print(f"Accuracy of {model_name}: {accuracy * 100:.2f}%")
        
        # Print classification report
        print("Classification Report:")
        print(classification_report(y_test, y_pred))
        
        # Print confusion matrix
        print("Confusion Matrix:")
        print(confusion_matrix(y_test, y_pred))
        print("-" * 50)

# Step 10: Run the Model Evaluation
evaluate_models(X_resampled, y_resampled)


ValueError: could not convert string to float: 'Seymour'