In [2]:
#information gain
import pandas as pd
import numpy as np

def entropy(data):
    """Return the Shannon entropy (base 2) of the values in ``data``.

    The relative frequency ``p`` of every distinct value is derived from its
    count, and the result is ``-sum(p * log2(p))``.
    """
    _, occurrences = np.unique(data, return_counts=True)
    proportions = occurrences / len(data)
    return -(proportions * np.log2(proportions)).sum()

def information_gain(data, attribute_index):
    """Return the information gain obtained by splitting on one attribute.

    Parameters
    ----------
    data : 2-D array whose last column holds the class labels.
    attribute_index : column index of the attribute to split on.

    Returns
    -------
    The entropy of the label column minus the entropy of the labels within
    each attribute-value group, weighted by the share of rows in that group.
    """
    labels = data[:, -1]
    total_entropy = entropy(labels)

    # Group rows through the inverse index instead of `column == value`:
    # NaN never compares equal to itself, so an equality mask would select no
    # rows for a missing value and silently drop those rows' labels while
    # still counting them in the weight.
    _, group_ids, counts = np.unique(
        data[:, attribute_index], return_inverse=True, return_counts=True
    )

    weighted_entropy = sum(
        (counts[group] / len(data)) * entropy(labels[group_ids == group])
        for group in range(len(counts))
    )
    return total_entropy - weighted_entropy

# Read the dataset from a user-supplied CSV path
file_path = input("Enter the path to the CSV file: ")
df = pd.read_csv(file_path)
data = df.values

# Echo the loaded table so the user can see the available columns
print("Dataset loaded successfully:")
print(df)

# Prompt for the attribute column; every column except the last one is accepted
try:
    attribute_index = int(
        input(
            f"Enter the index of the attribute (0 to {data.shape[1] - 2}) for which you want to calculate information gain: "
        )
    )
    if attribute_index < 0 or attribute_index >= data.shape[1] - 1:
        # Out-of-range column: report the accepted range instead of computing
        print(f"Invalid attribute index. Please enter a number between 0 and {data.shape[1] - 2}.")
    else:
        ig = information_gain(data, attribute_index)
        print(f"Information Gain for attribute {attribute_index}: {ig}")
except ValueError:
    # int() could not parse what the user typed
    print("Invalid input. Please enter a valid integer for the attribute index.")
except Exception as e:
    # Any other failure during the computation is reported rather than raised
    print(f"An error occurred: {e}")


Enter the path to the CSV file: homeprices multiple.csv
Dataset loaded successfully:
   AREA  BEDROOMS  AGE   PRICE
0  2600       3.0   20  550000
1  3000       4.0   15  565000
2  3200       NaN   18  610000
3  3600       3.0   30  595000
4  4000       5.0    8  760000
5  4100       6.0    8  810000
Enter the index of the attribute (0 to 2) for which you want to calculate information gain: 1
Information Gain for attribute 1: 2.2516291673878226


In [6]:
#Gini index
import pandas as pd
import numpy as np

def gini_index(data):
    """Return the Gini index of the class labels in the last column of ``data``.

    Computed as ``1 - sum(p ** 2)`` where ``p`` is the relative frequency of
    each distinct label.
    """
    targets = data[:, -1]
    _, frequencies = np.unique(targets, return_counts=True)
    shares = frequencies / len(targets)
    return 1 - (shares ** 2).sum()

def gini_index_attribute(data, attribute_index):
    """Return the weighted Gini index of a split on one attribute.

    Parameters
    ----------
    data : 2-D array whose last column holds the class labels.
    attribute_index : column index of the attribute to split on.

    Returns
    -------
    The sum over attribute-value groups of
    ``(rows in group / total rows) * gini_index(group)``; 0 when ``data`` has
    no rows.
    """
    total_instances = len(data)

    # Assign rows to groups via the inverse index rather than an equality
    # mask: NaN != NaN, so `column == value` would leave rows with a missing
    # attribute out of every subset and the weights would not sum to 1.
    attribute_values, group_ids = np.unique(
        data[:, attribute_index], return_inverse=True
    )

    gini_attribute = 0
    for group in range(len(attribute_values)):
        subset = data[group_ids == group]
        gini_attribute += (len(subset) / total_instances) * gini_index(subset)
    return gini_attribute

# Read the dataset from a user-supplied CSV path
file_path = input("Enter the path to the CSV file: ")
df = pd.read_csv(file_path)
data = df.values

# Echo the loaded table so the user can see the available columns
print("Dataset loaded successfully:")
print(df)

# Prompt for the attribute column; every column except the last one is accepted
try:
    attribute_index = int(
        input(
            f"Enter the index of the attribute (0 to {data.shape[1] - 2 }) for which you want to calculate Gini index: "
        )
    )
    if attribute_index < 0 or attribute_index >= data.shape[1] - 1:
        # Out-of-range column: report the accepted range instead of computing
        print(f"Invalid attribute index. Please enter a number between 0 and {data.shape[1] - 2}.")
    else:
        gini_attr = gini_index_attribute(data, attribute_index)
        print(f"Gini index for attribute {attribute_index}: {gini_attr}")
except ValueError:
    # int() could not parse what the user typed
    print("Invalid input. Please enter a valid integer for the attribute index.")
except Exception as e:
    # Any other failure during the computation is reported rather than raised
    print(f"An error occurred: {e}")


Enter the path to the CSV file: homeprices multiple.csv
Dataset loaded successfully:
   AREA  BEDROOMS  AGE   PRICE
0  2600       3.0   20  550000
1  3000       4.0   15  565000
2  3200       NaN   18  610000
3  3600       3.0   30  595000
4  4000       5.0    8  760000
5  4100       6.0    8  810000
Enter the index of the attribute (0 to 2) for which you want to calculate Gini index: 1
Gini index for attribute 1: 0.16666666666666666


In [17]:
#different distance measures
import pandas as pd
import numpy as np

def euclidean_distance(instance1, instance2):
    """Return the Euclidean (L2) distance between two instances.

    Parameters
    ----------
    instance1, instance2 : array-likes (NumPy arrays, lists, tuples) of the
        same shape.

    Returns
    -------
    ``np.linalg.norm(instance1 - instance2)`` as a float.
    """
    first = np.asarray(instance1)
    second = np.asarray(instance2)
    # Integer arrays are promoted to float before subtracting so that unsigned
    # or narrow integer dtypes cannot wrap around (e.g. uint8 1 - 2 == 255).
    # The norm is a float either way, so the return type is unaffected.
    if np.issubdtype(first.dtype, np.integer):
        first = first.astype(np.float64)
    if np.issubdtype(second.dtype, np.integer):
        second = second.astype(np.float64)
    return np.linalg.norm(first - second)

def manhattan_distance(instance1, instance2):
    """Return the Manhattan (L1) distance between two instances.

    Parameters
    ----------
    instance1, instance2 : array-likes (NumPy arrays, lists, tuples) of the
        same shape.

    Returns
    -------
    The sum of the absolute element-wise differences.
    """
    operands = []
    for instance in (instance1, instance2):
        values = np.asarray(instance)
        if np.issubdtype(values.dtype, np.integer):
            # Widen integers before subtracting so unsigned or narrow dtypes
            # cannot wrap around (e.g. uint8 1 - 2 == 255). int64 keeps the
            # result an integer; uint64 does not fit, so it falls back to float.
            wide = np.int64 if np.can_cast(values.dtype, np.int64) else np.float64
            values = values.astype(wide)
        operands.append(values)
    first, second = operands
    return np.sum(np.abs(first - second))

def cosine_similarity(instance1, instance2):
    """Return the cosine similarity between two instances.

    Parameters
    ----------
    instance1, instance2 : 1-D array-likes of the same length.

    Returns
    -------
    ``dot(instance1, instance2) / (||instance1|| * ||instance2||)``.
    NaN when either vector has zero length, because the similarity is
    undefined there.
    """
    first = np.asarray(instance1)
    second = np.asarray(instance2)
    # np.dot keeps the input dtype, so small integer dtypes can overflow in the
    # dot product; promote integers to float first (the result is float anyway).
    if np.issubdtype(first.dtype, np.integer):
        first = first.astype(np.float64)
    if np.issubdtype(second.dtype, np.integer):
        second = second.astype(np.float64)

    dot_product = np.dot(first, second)
    norm1 = np.linalg.norm(first)
    norm2 = np.linalg.norm(second)
    if norm1 == 0 or norm2 == 0:
        # Return NaN explicitly rather than dividing by zero, which yields the
        # same value but emits a RuntimeWarning.
        return np.float64(np.nan)
    return dot_product / (norm1 * norm2)

# Load CSV file
file_path = 'homeprices multiple.csv' #input("Enter the path to the CSV file: ")
df = pd.read_csv(file_path)

# Print loaded dataset
print("Dataset loaded successfully:")
print(df)

# Mapping of menu keys to (display name, distance function)
distance_measures = {
    "1": ("Euclidean", euclidean_distance),
    "2": ("Manhattan", manhattan_distance),
    "3": ("Cosine Similarity", cosine_similarity)
}

# Print distance measure options
print("\nSelect a distance measure:")
for key, (measure_name, _) in distance_measures.items():
    print(f"{key}. {measure_name}")

# Accept user input for selecting distance measure
selected_measure_name = input("Enter the index or name of the distance measure: ")

# The prompt offers "index or name", so a typed name (case-insensitive) is
# resolved to its menu key in addition to accepting the key itself.
selection = selected_measure_name.strip()
for key, (measure_name, _) in distance_measures.items():
    if selection.lower() == measure_name.lower():
        selection = key
        break

# The remaining steps are nested under the valid-selection branch instead of
# calling exit(), which in a notebook acts on the kernel rather than just
# ending the cell.
if selection not in distance_measures:
    print("Invalid distance measure selection. Please choose from the available options.")
else:
    # Unpack the display name and the function for the chosen measure
    selected_measure_name, selected_measure = distance_measures[selection]

    try:
        # Input indices of two instances
        index1 = int(input(f"Enter index of the first instance (0 to {len(df)-1}): "))
        index2 = int(input(f"Enter index of the second instance (0 to {len(df)-1}): "))
    except ValueError:
        # Non-integer input is reported instead of raising a traceback
        print("Invalid input. Please enter valid integers for the instance indices.")
    else:
        if not (0 <= index1 < len(df) and 0 <= index2 < len(df)):
            print(f"Invalid indices. Please enter indices between 0 and {len(df)-1}.")
        else:
            # Exclude last column (assuming it's the target variable)
            instance1 = df.iloc[index1, :-1].values
            instance2 = df.iloc[index2, :-1].values

            if pd.isna(instance1).any() or pd.isna(instance2).any():
                # A missing feature would make every measure evaluate to NaN;
                # say so explicitly instead of printing "nan".
                print("Cannot compute the distance: at least one selected instance has missing values.")
            else:
                # Calculate distance based on user's choice
                distance = selected_measure(instance1, instance2)
                print(f"{selected_measure_name} distance between instance {index1} and instance {index2}: {distance}")


Dataset loaded successfully:
   AREA  BEDROOMS  AGE   PRICE
0  2600       3.0   20  550000
1  3000       4.0   15  565000
2  3200       NaN   18  610000
3  3600       3.0   30  595000
4  4000       5.0    8  760000
5  4100       6.0    8  810000

Select a distance measure:
1. Euclidean
2. Manhattan
3. Cosine Similarity
Enter the index or name of the distance measure: 1
Enter index of the first instance (0 to 5): 1
Enter index of the second instance (0 to 5): 5
Euclidean distance between instance 1 and instance 5: 1100.0240906452914
