In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
#read in and clean data
dataFrame = pd.read_csv("Housing.csv")
data_cleaned = dataFrame.dropna(axis='index', how='any')

In [3]:
#select features, 
X = data_cleaned[["area", "prefarea", "furnishingstatus"]]
X = pd.get_dummies(X, columns=["prefarea", "furnishingstatus"])
Y = data_cleaned["price"]

In [4]:
# put in buckets
price_ranges = [(0, 1000000), (1000000, 2000000), (2000000, 3000000), (3000000, 4000000), 
                (4000000, 5000000), (5000000, 6000000), (6000000, 7000000), (7000000, 8000000),
                (8000000, 9000000), (9000000, 10000000), (10000000, 11000000), (11000000, 12000000),
                (12000000, 13000000), (13000000, float('inf'))]
labels = ['<1M', '1M-2M', '2M-3M', '3M-4M', '4M-5M', '5M-6M', '6M-7M', '7M-8M', '8M-9M', '9M-10M',
          '10M-11M', '11M-12M', '12M-13M', '>=13M']
data_cleaned['price_category'] = pd.cut(data_cleaned['price'], bins=[rng[0] for rng in price_ranges] + [float('inf')], labels=labels)

In [5]:
#do the split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

In [6]:
#do the decision tree
tree_classifier = DecisionTreeClassifier()
tree_classifier.fit(X_train, y_train)

In [7]:
y_pred = tree_classifier.predict(X_test)

In [8]:
# Compute evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Accuracy: 0.009174311926605505
Precision: 0.0030581039755351682
Recall: 0.009174311926605505
F1 Score: 0.0045871559633027525


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [9]:
matrix = confusion_matrix(data_cleaned['price_category'], data_cleaned['price_category'], labels=labels)
print(matrix)

[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   9   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0  62   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0 148   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0 131   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0  78   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0  53   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0  27   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0  20   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   9   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   3   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   1   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   3   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   1]]


In [10]:
# Find the lowest price for bucketing
lowest_price = dataFrame['price'].min()

# Find the highest price for bucketing
highest_price = dataFrame['price'].max()

print("Lowest Price:", lowest_price)
print("Highest Price:", highest_price)

Lowest Price: 1750000
Highest Price: 13300000


In [11]:
import pandas as pd

# Define more granular price ranges and corresponding labels
price_ranges = [(0, 1000000), (1000000, 2000000), (2000000, 3000000), (3000000, 4000000), 
                (4000000, 5000000), (5000000, 6000000), (6000000, 7000000), (7000000, 8000000),
                (8000000, 9000000), (9000000, 10000000), (10000000, 11000000), (11000000, 12000000),
                (12000000, 13000000), (13000000, float('inf'))]
labels = ['<1M', '1M-2M', '2M-3M', '3M-4M', '4M-5M', '5M-6M', '6M-7M', '7M-8M', '8M-9M', '9M-10M',
          '10M-11M', '11M-12M', '12M-13M', '>=13M']

# Bucketize prices using cut() function
dataFrame['price_category'] = pd.cut(dataFrame['price'], bins=[rng[0] for rng in price_ranges] + [float('inf')], labels=labels)

# Display the first few rows to verify
print(dataFrame[['price', 'price_category']].head())


      price price_category
0  13300000          >=13M
1  12250000        12M-13M
2  12250000        12M-13M
3  12215000        12M-13M
4  11410000        11M-12M


In [12]:
from sklearn.metrics import confusion_matrix

# Predict target variable Y using the decision tree model
Y_predicted = tree_classifier.predict(X)

# Bucketize actual prices into categories
actual_price_categories = pd.cut(Y, bins=[rng[0] for rng in price_ranges] + [float('inf')], labels=labels)

# Bucketize predicted prices into categories
predicted_price_categories = pd.cut(Y_predicted, bins=[rng[0] for rng in price_ranges] + [float('inf')], labels=labels)

# Compute confusion matrix
matrix = confusion_matrix(actual_price_categories, predicted_price_categories, labels=labels)
print(matrix)


[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   6   2   1   0   0   0   0   0   0   0   0   0   0]
 [  0   3  58   1   0   0   0   0   0   0   0   0   0   0]
 [  0   4  24 107  10   0   3   0   0   0   0   0   0   0]
 [  0   0  11  14 101   3   0   1   0   1   0   0   0   0]
 [  0   1   5   5   6  59   0   1   0   1   0   0   0   0]
 [  0   0   5   5   6   6  28   1   0   1   0   0   1   0]
 [  0   0   5   0   4   5   1  12   0   0   0   0   0   0]
 [  0   0   1   2   1   2   1   0  13   0   0   0   0   0]
 [  0   0   2   0   1   1   1   0   0   4   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   3   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   1   0   0]
 [  0   0   0   0   0   0   1   0   0   0   0   0   2   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   1]]


In [13]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print ("Accuracy is ", accuracy_score(Y, Y_predicted))

# We have to specify how to combine for the multiclassifications
print ("Precision is ", precision_score(Y, Y_predicted, average="weighted"))
print ("Sensitivity is ", recall_score(Y, Y_predicted, average="weighted"))
print ("F1 is ", f1_score(Y, Y_predicted, average="weighted"))


Accuracy is  0.6238532110091743
Precision is  0.6446266119201899
Sensitivity is  0.6238532110091743
F1 is  0.5973323453610823


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
