In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from math import log2
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error

def load_data(filepath):
    """Read a CSV file into a DataFrame.

    No cleaning happens here: rows with missing values are kept exactly as
    read (dropping them is done separately in ``preprocess_data``).

    Parameters
    ----------
    filepath : str, path-like or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed file contents, unmodified.
    """
    return pd.read_csv(filepath)

def preprocess_data(df):
    """Drop incomplete rows and one-hot encode the remaining frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame in which every row containing a missing value has been
        removed and ``pandas.get_dummies`` has been applied with
        ``drop_first=True`` (the first level of each encoded column is
        omitted).
    """
    complete_rows = df.dropna()
    return pd.get_dummies(complete_rows, drop_first=True)

def split_data(df, target_column='price'):
    """Separate features from the target and make a train/test split.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both the feature columns and the target column.
    target_column : str, default 'price'
        Name of the column to predict.

    Returns
    -------
    list
        The result of ``train_test_split``: ``X_train, X_test, y_train,
        y_test``, using a 20% test share and ``random_state=42``.
    """
    target = df[target_column]
    features = df.drop(columns=target_column)
    return train_test_split(features, target, test_size=0.2, random_state=42)

def train_model(X_train, y_train):
    """Fit a ``LinearRegression`` with default settings.

    Parameters
    ----------
    X_train : array-like
        Training features.
    y_train : array-like
        Training targets.

    Returns
    -------
    LinearRegression
        The fitted estimator.
    """
    # fit() returns the estimator itself, so construction and fitting chain.
    return LinearRegression().fit(X_train, y_train)

def evaluate_model(model, X_test, y_test):
    """Print regression metrics for ``model`` on a held-out set.

    Four lines are printed, in this order: mean squared error, mean absolute
    percentage error, root mean squared error and R², each formatted to two
    decimals. The percentage error is printed exactly as returned by
    ``mean_absolute_percentage_error``; no scaling is applied here.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``.
    X_test : array-like
        Held-out features.
    y_test : array-like
        True target values for ``X_test``.

    Returns
    -------
    array-like
        The predictions ``model.predict(X_test)``.
    """
    predictions = model.predict(X_test)

    squared_err = mean_squared_error(y_test, predictions)
    pct_err = mean_absolute_percentage_error(y_test, predictions)
    root_err = np.sqrt(squared_err)
    fit_quality = r2_score(y_test, predictions)

    print(f"Mean Squared Error: {squared_err:.2f}")
    print(f"Mean Absolute % Error: {pct_err:.2f}")
    print(f"Root Mean Squared Error: {root_err:.2f}")
    print(f"R² Score: {fit_quality:.2f}")
    return predictions

def accuracy_model(model, X_test, y_test):
    """Return the model's own ``score`` on the given test data.

    This simply forwards to ``model.score(X_test, y_test)``. What the number
    means depends on the estimator — NOTE(review): for scikit-learn regressors
    such as the ``LinearRegression`` used in this notebook, ``score`` is R²,
    not a classification accuracy; confirm the intended metric.
    """
    return model.score(X_test, y_test)

def equal_width_binning(series, bins=4):
    """Discretize ``series`` into equal-width intervals.

    Parameters
    ----------
    series : array-like
        Numeric values to bin.
    bins : int or sequence, default 4
        Passed to ``pandas.cut`` as the bin specification.

    Returns
    -------
    Integer bin codes (0-based) instead of interval labels, because
    ``labels=False`` is used; duplicate bin edges are dropped.
    """
    bin_codes = pd.cut(series, bins, labels=False, duplicates='drop')
    return bin_codes

def equal_frequency_binning(series, bins=4):
    """Discretize ``series`` into quantile-based (equal-frequency) bins.

    Parameters
    ----------
    series : array-like
        Numeric values to bin.
    bins : int or sequence, default 4
        Passed to ``pandas.qcut`` as ``q`` (number of quantiles, or the
        quantile edges).

    Returns
    -------
    Integer bin codes (0-based). Quantile edges that coincide are merged
    (``duplicates='drop'``), so fewer than ``bins`` distinct codes may result.
    """
    quantile_codes = pd.qcut(series, bins, labels=False, duplicates='drop')
    return quantile_codes

def entropy(target_col):
    """Shannon entropy (in bits) of the class distribution in ``target_col``.

    Parameters
    ----------
    target_col : array-like
        Discrete class labels (e.g. a binned target column).

    Returns
    -------
    numpy.float64
        ``-sum(p * log2(p))`` over the relative frequency ``p`` of each
        distinct value. A single-class or empty input yields ``0.0``.
    """
    _, counts = np.unique(target_col, return_counts=True)
    # Compute the total once and work on whole arrays instead of re-summing
    # the counts for every class inside a Python loop.
    probabilities = counts / counts.sum()
    entropy_val = -np.sum(probabilities * np.log2(probabilities))
    # Negating an exact zero gives -0.0, which prints as "-0.0"; adding +0.0
    # normalizes it to 0.0 and leaves every other value unchanged.
    return entropy_val + 0.0


def calculate_entropy(data):
    """Shannon entropy (in bits) of the values in ``data``.

    Parameters
    ----------
    data : array-like
        Discrete class labels.

    Returns
    -------
    numpy.float64
        ``-sum(p * log2(p))`` over the relative frequency ``p`` of each
        distinct value. A single-class or empty input yields ``0.0``.
    """
    _, counts = np.unique(data, return_counts=True)
    probabilities = counts / counts.sum()
    # Named `entropy_bits` rather than `entropy` so the local does not shadow
    # the module-level `entropy` function.
    entropy_bits = -np.sum(probabilities * np.log2(probabilities))
    # Negating an exact zero gives -0.0; adding +0.0 normalizes the sign so a
    # pure column reports 0.0 instead of "-0.0".
    return entropy_bits + 0.0

def calculate_gini(series):
    """Gini impurity of the value distribution in ``series``.

    Parameters
    ----------
    series : pandas.Series
        Discrete class labels.

    Returns
    -------
    ``1 - sum(p_i ** 2)``, where ``p_i`` are the relative frequencies given by
    ``series.value_counts(normalize=True)``.
    """
    class_shares = series.value_counts(normalize=True)
    squared_shares = class_shares ** 2
    return 1 - sum(squared_shares)

def information_gain(df, feature_name, target_column):
    """Information gain of splitting ``df`` on every distinct value of a feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both the feature and the target column.
    feature_name : str
        Column whose distinct values define the partitions.
    target_column : str
        Column holding the (discrete) class labels.

    Returns
    -------
    float
        ``entropy(target) - sum_v (|S_v| / |S|) * entropy(target | feature == v)``.
        Rows whose feature value is NaN form no partition and therefore add
        nothing to the weighted term (they still count towards ``|S|``).
    """
    total_entropy = entropy(df[target_column])
    n_rows = len(df)

    # One groupby pass partitions the target by feature value. Building a
    # boolean mask over the whole frame for every distinct value is quadratic
    # for high-cardinality columns. Groups are visited in sorted key order.
    target_by_value = df.groupby(feature_name, sort=True, observed=True)[target_column]

    weighted_entropy = 0
    for _, target_subset in target_by_value:
        weighted_entropy += (len(target_subset) / n_rows) * entropy(target_subset)

    return total_entropy - weighted_entropy

# --- Find best root node ---
def find_root_node(df, target_column):
    """Pick the feature with the largest information gain w.r.t. the target.

    Every column other than ``target_column`` is scored with
    ``information_gain``; the first column reaching the highest score wins
    (later ties do not replace it).

    Returns
    -------
    tuple
        ``(best_feature, max_ig)``. If there is no candidate column, or no
        score exceeds the initial ``-1``, the result is ``(None, -1)``.
    """
    best_feature, max_ig = None, -1

    candidate_columns = (name for name in df.columns if name != target_column)
    for name in candidate_columns:
        gain = information_gain(df, name, target_column)
        # Strict comparison keeps the earliest column on ties.
        if gain > max_ig:
            best_feature, max_ig = name, gain

    return best_feature, max_ig

In [2]:
# A1: Entropy calculation
# Loads the listings CSV, drops incomplete rows and one-hot encodes it, bins the
# numeric target into 4 equal-width classes, then prints the target's entropy.

# NOTE(review): absolute, machine-specific path — the notebook cannot be re-run
# on another machine without editing this line.
filepath = r"C:\Users\dhruv\OneDrive\Desktop\ml\listings.csv"
df = load_data(filepath)
df = preprocess_data(df)

target_column='price'
# A numeric target is replaced by its bin codes so it can be treated as a set
# of discrete classes. This overwrites the column in the notebook-level `df`,
# which the A2 cell reads afterwards.
if pd.api.types.is_numeric_dtype(df[target_column]):
    df[target_column] = equal_width_binning(df[target_column], bins=4)

# Both entropy helpers are printed under the same label; they compute the same
# quantity, so the two lines are expected to match.
# NOTE(review): the recorded value (~0.031 bits, versus a maximum of 2 bits for
# 4 classes) means almost every row falls into a single equal-width bin —
# presumably a few extreme prices stretch the range; TODO confirm.
print("\nEntropy of target:", calculate_entropy(df[target_column]))

print("\nEntropy of target:", entropy(df[target_column]))

# Disabled alternative: equal-frequency (quantile) binning of the target, which
# would give roughly balanced classes instead of equal-width intervals.
#----frequency binning----#
#if pd.api.types.is_numeric_dtype(df[target_column]):
#   df[target_column] = equal_frequency_binning(df[target_column], bins=4)

#print(df[target_column].value_counts())  # should be roughly equal counts
#print("\nEntropy of target:", calculate_entropy(df[target_column]))


Entropy of target: 0.030753173912389797

Entropy of target: 0.030753173912389797


In [3]:
#A2: Gini index
# Gini impurity of the target column.
# Depends on kernel state from the A1 cell: `df` must already hold the binned
# target and `target_column` must be defined, so A1 has to run first.

gini_value = calculate_gini(df[target_column])
print("Gini index of target:", gini_value)

Gini index of target: 0.006289245589069514


In [None]:
#A3: root node feature for decision tree
# Reloads and re-preprocesses the data from scratch (same steps as the A1 cell),
# so this cell does not depend on `df` left behind by earlier cells.

# NOTE(review): absolute, machine-specific path duplicated from the A1 cell.
filepath = r"C:\Users\dhruv\OneDrive\Desktop\ml\listings.csv"
df = load_data(filepath)
df = preprocess_data(df)

target_column = 'price'

# Equal-width binning for target
# Only the target is discretized; feature columns are left as they are.
if pd.api.types.is_numeric_dtype(df[target_column]):
    df[target_column] = equal_width_binning(df[target_column], bins=4)

# Run A3
# Scores every non-target column by information gain and reports the best one.
# NOTE(review): information_gain partitions on each distinct feature value, so
# unbinned columns with many unique values (ids, coordinates, ...) will tend to
# get the highest gain — consider binning features first; TODO confirm intent.
root_feature, ig_value = find_root_node(df, target_column)
print(f"Best Root Node: {root_feature} (Information Gain = {ig_value:.4f})")

In [None]:
#A4: 