In [1]:
import pandas as pd
import numpy as np
import math
import random
import warnings
warnings.filterwarnings("ignore")

In [2]:
train_df = pd.read_csv('training.data', header = None)
test_df = pd.read_csv('test.data', header = None)

In [3]:
train_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,16.92,0.335,y,p,k,v,0.29,f,f,0,f,s,200,0,-
1,b,48.5,4.25,u,g,m,v,0.125,t,f,0,t,g,225,0,+
2,a,68.67,15.0,u,g,e,z,0.0,t,t,14,f,g,0,3376,+
3,a,25.08,2.54,y,p,aa,v,0.25,t,f,0,t,g,370,0,+
4,b,33.75,2.75,u,g,i,bb,0.0,f,f,0,f,g,180,0,-


In [4]:
columns_with_question_mark = train_df.eq('?').any()
print(columns_with_question_mark[columns_with_question_mark].index)

numeric = [1,13]
categorical = [0, 3, 4, 5, 6]

Index([0, 1, 3, 4, 5, 6, 13], dtype='int64')


In [5]:
columns_with_question_mark = test_df.eq('?').any()
print(columns_with_question_mark[columns_with_question_mark].index)

Index([0, 1, 3, 4, 5, 6, 13], dtype='int64')


In [6]:
train_df[numeric] = train_df[numeric].replace('?', np.nan)
test_df[numeric] = test_df[numeric].replace('?', np.nan)

# Step 2: Calculate median of each numeric column
medians = train_df[numeric].median()

# Step 3: Replace NaN values with median in each column
train_df[1].fillna(29.17, inplace = True)
train_df[13].fillna(160.0, inplace = True)

test_df[1].fillna(29.17, inplace = True)
test_df[13].fillna(160.0, inplace = True)

print("Medians:")
print(medians)

Medians:
1     29.17
13    160.0
dtype: object


In [7]:
print(train_df.isnull().any())

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15    False
dtype: bool


In [8]:

# Assuming 'categorical' is defined somewhere in your code.
# categorical = [...]

# Step 2: Replace '?' values with NaN
train_df.replace('?', np.nan, inplace=True)
test_df.replace('?', np.nan, inplace=True)

# Prepare a list to store median values temporarily
temp_median_values = []

# Step 2: For each categorical column, find the "median" value
for col in categorical:
    # Drop NaN values, sort the remaining values, and then attempt to find the middle value
    sorted_col = train_df[col].dropna().sort_values().reset_index(drop=True)

    if len(sorted_col) % 2 == 1:
        # If odd number of entries, take the middle value
        temp_median_values.append(sorted_col[len(sorted_col) // 2])
    else:
        # If even, take the lower of the two middle values (as one of the possible interpretations)
        temp_median_values.append(sorted_col[(len(sorted_col) // 2) - 1])

# Convert the list of median values into a pandas Series
medians = pd.Series(temp_median_values, index=categorical)

# Fill NaN values with medians
train_df[categorical] = train_df[categorical].fillna(medians)
test_df[categorical] = test_df[categorical].fillna(medians)

# Display the Series object with median values
print("Medians:")
print(medians)


Medians:
0     b
3     u
4     g
5    ff
6     v
dtype: object


In [9]:
print(test_df.isnull().any())

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15    False
dtype: bool


In [10]:
for col in train_df.columns:
    try:
        train_df[col] = pd.to_numeric(train_df[col])
        test_df[col] =  pd.to_numeric(test_df[col])
    except ValueError:
        pass  # If conversion is not possible, leave the column as it is

In [11]:
def Entropy(df, attribute):
    value_counts = df[attribute].value_counts()
    total = df.shape[0]

    gain = 0
    for i in range(value_counts.shape[0]):
        #print(value_counts[i])
        gain = gain + (value_counts[i]/total * math.log(value_counts[i]/total, 2))

    gain = -gain
    return gain


In [12]:
def Information_GainC(df, attribute, target_attribute):

    total = df.shape[0]
    gain = 0
    if(df[attribute].dtype == object):
        value_counts = df[attribute].value_counts()
        for index, count in enumerate(value_counts.items()):
            unique_value, frequency = count
            df_subset = df[df[attribute] == unique_value]

            gain = gain + (frequency/total) * Entropy(df_subset,target_attribute)

        return Entropy(df,target_attribute) - gain, None



    else:
        df = df.sort_values(by=attribute, ascending=True)
        df = df.reset_index(drop=True)
        total = df.shape[0]
        splits = []
        for index in range(total - 1):
            if(index==0):
                splits.append((1-df.loc[index, attribute])/2)

            splits.append((df.loc[index, attribute]+df.loc[index + 1, attribute])/2)


        max_split = None
        for split_value in splits:
            
            temp_gain = 0

            
            df_below = df[df[attribute] <= split_value]
            temp_gain = temp_gain + (df_below.shape[0]/total) * Entropy(df_below,target_attribute)




            df_above = df[df[attribute] > split_value]
            temp_gain = temp_gain + (df_above.shape[0]/total) * Entropy(df_above,target_attribute)


            
            temp_gain = Entropy(df,target_attribute) - temp_gain
            if (temp_gain > gain):
                gain = temp_gain
                max_split = split_value
        
        

                

        
        return gain, max_split



In [13]:
def IV(df, attribute,split_value):
    gain = 0
    total = df.shape[0]
    if (df[attribute].dtype == object):
        value_counts = df[attribute].value_counts()
        

        
        for i in range(value_counts.shape[0]):
            gain = gain + (value_counts[i]/total * math.log(value_counts[i]/total, 2))


    else:
        df_below = df[df[attribute] <= split_value]
        if(df_below.shape[0] != 0):
            gain = gain + (df_below.shape[0]/total * math.log(df_below.shape[0]/total, 2))

        
        df_above = df[df[attribute] > split_value]
        if(df_above.shape[0] != 0):
            gain = gain + (df_above.shape[0]/total * math.log(df_above.shape[0]/total, 2))


    
    return -gain

In [14]:
class C45Class:
    def __init__(self, data,target,subset_unique_categorical_value = None):
        self.target = target
        self.df = data
        self.branches = []
        self.gain_ratio_max = float('-inf')
        self.max_attribute = None
        self.leaf_value = None
        self.df_subset = None
        self.split_value = None
        self.max_split = None
        self.subset_unique_categorical_value = subset_unique_categorical_value


        if (self.df[self.target].nunique() == 1):
            self.leaf_value = self.df[self.target].unique()[0]
            #print(self.leaf_value)
            #print(self.df[self.target].nunique())
            #print("_______________________________________________________________________________________")
        else:

            columns_except_one = [col for col in self.df.columns if col != self.target]
            for col in columns_except_one:
                self.gain, self.split_value = Information_GainC(self.df,col,self.target)



                iv_value = IV(self.df, col, self.split_value)
                if iv_value == 0:
                    self.gain_ratio = float('-inf')  # Or set to a default value that indicates an invalid or undefined gain ratio
                else:
                    self.gain_ratio = self.gain / iv_value


                #self.gain_ratio = self.gain/IV(self.df,col,self.split_value)
                #print(f"col = {col}    split {self.split_value}     {self.gain}            {self.gain_ratio}")
                if(self.gain_ratio>self.gain_ratio_max):
                    self.gain_ratio_max = self.gain_ratio
                    self.max_attribute = col
                    self.max_split = self.split_value

                

            #print('____________________________________________________________________________________________________________________________________________________________________________________________________________________')

            #print(f"df Shape = {self.df.shape}       Max attribute = {self.max_attribute}       Max split = {self.max_split}")
            if (self.df[self.max_attribute].dtype != object):
                self.df_below = self.df[self.df[self.max_attribute] <= self.max_split]
                self.df_above = self.df[self.df[self.max_attribute] > self.max_split]

                #print(f'Below shape = {self.df_below.shape}         Above Shape= {self.df_above.shape}  {self.max_split}')
                #df_below.drop(self.max_attribute, axis=1, inplace=True)       
                #branches.append(C45Class(df_below))
                subset_class = C45Class(self.df_below, self.target)
                self.branches.append(subset_class)





                

                subset_class = C45Class(self.df_above, self.target)
                self.branches.append(subset_class)
                #df_above.drop(self.max_attribute, axis=1, inplace=True)

                #branches.append(C45Class(df_above))




                



            else:
                value_counts = self.df[self.max_attribute].value_counts()
                #print(value_counts)
                for index, count in enumerate(value_counts.items()):
                    unique_value, frequency = count
                    
                    self.df_subset = self.df[self.df[self.max_attribute] == unique_value]
                    #self.df_subset.drop(self.max_attribute, axis=1, inplace=True)
                    #print(self.max_attribute)
                    #print(self.df_subset)
                    #if (max_attribute == 'CGPA'):
                    #print(f"Subset = {self.df_subset.shape}")
                    subset_class = C45Class(self.df_subset, self.target, unique_value)
                    self.branches.append(subset_class)




#instance = C45Class(df1,'Target Class')
instance_C45 = C45Class(train_df,15)
print('____________________________________________________________________________________________________________________________________________________________________________________________________________________')


____________________________________________________________________________________________________________________________________________________________________________________________________________________


In [15]:

def make_prediction(row, tree):
    #print(f"{row.size}            {tree.max_attribute}")
    #print(tree.df)
    #print(f"leaf = {tree.leaf_value}        tree_branches = {len(tree.branches)}        Max_attribute = {tree.max_attribute}       row_attribute = {row[tree.max_attribute]}")
    #print("________________________________________________________________________________________")
    if (tree.leaf_value != None):
        return tree.leaf_value
    elif not isinstance(row[tree.max_attribute], str):
        #print(type(row[tree.max_attribute]))
        if(row[tree.max_attribute] <= tree.max_split):
            return make_prediction(row, tree.branches[0])
        else:
            return make_prediction(row, tree.branches[1])
    else:
        for branch in tree.branches:
            if (row[tree.max_attribute] == branch.subset_unique_categorical_value):
                return make_prediction(row, branch)





In [16]:

# for index, row in train_df.iterrows():
#     print(f"predicted = {make_prediction(row, instance)}        Actual = {row[15]}")
print("_______________________________________________________________________________________________________")
for index, row in test_df.iterrows():
    #print(f"predicted = {make_prediction(row, instance)}        Actual = {row[15]}")
    if (index == 97):
        prediction = make_prediction(row, instance_C45)
        if (prediction == None):
            prediction = random.choice(['+', '-'])
        print(f"predicted = {prediction}        Actual = {row[15]}")

_______________________________________________________________________________________________________
predicted = +        Actual = +


In [17]:
def Gini(df, attribute):
    value_counts = df[attribute].value_counts()
    total = df.shape[0]

    gain = 1
    for i in range(value_counts.shape[0]):
        #print(value_counts[i])
        gain = gain - ((value_counts[i]/total)**2)

    return gain

In [18]:
def GiniIndex(df, attribute, target_attribute):

    total = df.shape[0]
    gain = float('inf')
    if(df[attribute].dtype == object):
        value_counts = df[attribute].value_counts()
        for index, count in enumerate(value_counts.items()):
            unique_value, frequency = count
            df_subset = df[df[attribute] == unique_value]

            gain = gain + (frequency/total) * Gini(df_subset,target_attribute)

        return gain, None



    else:
        df = df.sort_values(by=attribute, ascending=True)
        df = df.reset_index(drop=True)
        total = df.shape[0]
        splits = []
        for index in range(total - 1):
            if(index==0):
                splits.append((1-df.loc[index, attribute])/2)

            splits.append((df.loc[index, attribute]+df.loc[index + 1, attribute])/2)


        max_split = None
        for split_value in splits:
            
            temp_gain = 0

            
            df_below = df[df[attribute] <= split_value]
            temp_gain = temp_gain + (df_below.shape[0]/total) * Gini(df_below,target_attribute)




            df_above = df[df[attribute] > split_value]
            temp_gain = temp_gain + (df_above.shape[0]/total) * Gini(df_above,target_attribute)


            
            if (temp_gain < gain):
                gain = temp_gain
                max_split = split_value
        
        

                

        
        return gain, max_split


In [19]:
class GiniClass:
    def __init__(self, data,target,subset_unique_categorical_value = None):
        self.target = target
        self.df = data
        self.branches = []
        self.gain_max = float('inf')
        self.max_attribute = None
        self.leaf_value = None
        self.df_subset = None
        self.split_value = None
        self.max_split = None
        self.subset_unique_categorical_value = subset_unique_categorical_value



                    
        #print(self.df)
        if (self.df[self.target].nunique() == 1):
            self.leaf_value = self.df[self.target].unique()[0]
            #print(self.leaf_value)
            #print(self.df[self.target].nunique())
            #print("_______________________________________________________________________________________")
        else:

            columns_except_one = [col for col in self.df.columns if col != self.target]
            for col in columns_except_one:
                self.gain, self.split_value = GiniIndex(self.df,col,self.target)
                #print(f"Col = {col}   gain = {self.gain}")





                #self.gain_ratio = self.gain/IV(self.df,col,self.split_value)
                #print(f"col = {col}    split {self.split_value}     {self.gain}            {self.gain_ratio}")
                #if(self.gain<self.gain_max or (self.gain_max == 0.5 and self.gain == 0.5 and self.df[self.max_attribute].dtype == object)):
                if(self.gain<self.gain_max):
                    self.gain_max = self.gain
                    self.max_attribute = col
                    self.max_split = self.split_value

                

            #print('____________________________________________________________________________________________________________________________________________________________________________________________________________________')

            #print(f"df Shape = {self.df.shape}       Max attribute = {self.max_attribute}       Max split = {self.max_split}")
            if (self.df[self.max_attribute].dtype != object):
                self.df_below = self.df[self.df[self.max_attribute] <= self.max_split]
                self.df_above = self.df[self.df[self.max_attribute] > self.max_split]

                #print(f'Below shape = {self.df_below.shape}         Above Shape= {self.df_above.shape}  {self.max_split}')
                #df_below.drop(self.max_attribute, axis=1, inplace=True)       
                #branches.append(C45Class(df_below))
                
                subset_class = GiniClass(self.df_below, self.target)
                self.branches.append(subset_class)





                

                subset_class = GiniClass(self.df_above, self.target)
                self.branches.append(subset_class)
                #df_above.drop(self.max_attribute, axis=1, inplace=True)

                #branches.append(C45Class(df_above))




                



            else:
                value_counts = self.df[self.max_attribute].value_counts()
                #print(value_counts)
                for index, count in enumerate(value_counts.items()):
                    unique_value, frequency = count
                    
                    self.df_subset = self.df[self.df[self.max_attribute] == unique_value]
                    #self.df_subset.drop(self.max_attribute, axis=1, inplace=True)
                    #print(self.max_attribute)
                    #if (max_attribute == 'CGPA'):
                    #print(f"Subset = {self.df_subset.shape}")
                    subset_class = GiniClass(self.df_subset, self.target, unique_value)
                    self.branches.append(subset_class)




#instance = C45Class(df1,'Target Class')
#instance1 = GiniClass(train_df,15)
print('____________________________________________________________________________________________________________________________________________________________________________________________________________________')


____________________________________________________________________________________________________________________________________________________________________________________________________________________


In [20]:

# # Creating the DataFrame manually from the provided image
# data = {
#     'a1': ['T', 'T', 'T', 'F', 'F', 'F', 'F', 'T', 'F'],
#     'a2': ['T', 'T', 'F', 'F', 'T', 'T', 'F', 'F', 'T'],
#     'a3': [1.0, 6.0, 5.0, 4.0, 7.0, 3.0, 8.0, 7.0, 5.0],
#     'Target Class': ['+', '+', '-', '+', '-', '-', '-', '+', '-']
# }

# df12 = pd.DataFrame(data)
# GiniClass(df12,'Target Class')


# Define the data for the DataFrame
data = {
    'Annual Income': [60, 70, 75, 85, 90, 95, 100, 120, 125, 220],
    'Label': ['No', 'No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'No', 'No', 'No']
}

# Create the DataFrame
dataframe = pd.DataFrame(data)

# Display the DataFrame
GiniIndex(dataframe,'Annual Income','Label')


(0.3, 97.5)

In [21]:
instanceGini = GiniClass(train_df,15)

(195, 16)

In [42]:
max_f1 = float('-inf')
max_model = None
for i in range(10):
    df_original = train_df.copy()
    #print(f"Iteration i = {i}")
    start = i * (df_original.shape[0] / 10)
    end = i * (df_original.shape[0] / 10) + (df_original.shape[0] / 10) - 1
    subset = df_original.loc[start:end].copy()  # Make sure to copy the subset

    df_original = df_original.drop(df_original.loc[start:end].index)  # Drop the subset and assign back to df_original

    instance_C45 = C45Class(df_original,15)
    
    actual = []
    predicted = []
    for index, row in subset.iterrows():
        prediction = make_prediction(row, instance_C45)
        if (prediction == None):
            prediction = random.choice(['+', '-'])

        predicted.append(prediction)
        actual.append(row[15])
    
    TP = 0
    TN = 0
    FP = 0
    FN = 0
    for val in range(len(predicted)):
        if (actual[val] == '+' and predicted[val] == '+'):
            TP = TP + 1
        elif (actual[val] == '+' and predicted[val] == '-'):
            FN = FN + 1
        elif (actual[val] == '-' and predicted[val] == '-'):
            TN = TN + 1
        elif (actual[val] == '-' and predicted[val] == '+'):
            FP = FP + 1
    precision = TP/(TP+FP)
    recall = TP/(TP+FN)
    f1 = (2*precision*recall)/(precision + recall)
    print(f"Model {i} F1 Score = {f1}")
    if (f1 > max_f1):
        max_f1 = f1
        max_model = instance_C45







    

Model 0 F1 Score = 0.7692307692307692
Model 1 F1 Score = 0.7659574468085107
Model 2 F1 Score = 0.7659574468085107
Model 3 F1 Score = 0.7659574468085107
Model 4 F1 Score = 0.7272727272727272
Model 5 F1 Score = 0.8571428571428572
Model 6 F1 Score = 0.8461538461538461
Model 7 F1 Score = 0.8235294117647058
Model 8 F1 Score = 0.8275862068965517
Model 9 F1 Score = 0.7346938775510203


In [43]:
# print("_______________________________________________________________________________________________________")
# for index, row in test_df.iterrows():
#     #print(f"predicted = {make_prediction(row, instance)}        Actual = {row[15]}")
#         prediction = make_prediction(row, instance_C45)
#         if (prediction == None):
#             prediction = random.choice(['+', '-'])
#         print(f"predicted = {prediction}        Actual = {row[15]}")


actual = []
predicted = []
for index, row in test_df.iterrows():
    prediction = make_prediction(row, max_model)
    if (prediction == None):
        prediction = random.choice(['+', '-'])

    predicted.append(prediction)
    actual.append(row[15])

TP = 0
TN = 0
FP = 0
FN = 0
for val in range(len(predicted)):
    if (actual[val] == '+' and predicted[val] == '+'):
        TP = TP + 1
    elif (actual[val] == '+' and predicted[val] == '-'):
        FN = FN + 1
    elif (actual[val] == '-' and predicted[val] == '-'):
        TN = TN + 1
    elif (actual[val] == '-' and predicted[val] == '+'):
        FP = FP + 1
precision = TP/(TP+FP)
recall = TP/(TP+FN)
f1 = (2*precision*recall)/(precision + recall)
print(f1)

0.7874015748031497


In [44]:
max_f1 = float('-inf')
max_model = None
for i in range(10):
    df_original = train_df.copy()
    #print(f"Iteration i = {i}")
    start = i * (df_original.shape[0] / 10)
    end = i * (df_original.shape[0] / 10) + (df_original.shape[0] / 10) - 1
    subset = df_original.loc[start:end].copy()  # Make sure to copy the subset

    df_original = df_original.drop(df_original.loc[start:end].index)  # Drop the subset and assign back to df_original

    instanceGini = GiniClass(df_original,15)
    
    actual = []
    predicted = []
    for index, row in subset.iterrows():
        prediction = make_prediction(row, instanceGini)
        if (prediction == None):
            prediction = random.choice(['+', '-'])

        predicted.append(prediction)
        actual.append(row[15])
    
    TP = 0
    TN = 0
    FP = 0
    FN = 0
    for val in range(len(predicted)):
        if (actual[val] == '+' and predicted[val] == '+'):
            TP = TP + 1
        elif (actual[val] == '+' and predicted[val] == '-'):
            FN = FN + 1
        elif (actual[val] == '-' and predicted[val] == '-'):
            TN = TN + 1
        elif (actual[val] == '-' and predicted[val] == '+'):
            FP = FP + 1
    precision = TP/(TP+FP)
    recall = TP/(TP+FN)
    f1 = (2*precision*recall)/(precision + recall)
    print(f"Model Gini {i} F1 Score = {f1}")
    if (f1 > max_f1):
        max_f1 = f1
        max_model = instanceGini







    

Model Gini 0 F1 Score = 0.5581395348837209
Model Gini 1 F1 Score = 0.6530612244897959
Model Gini 2 F1 Score = 0.6956521739130435
Model Gini 3 F1 Score = 0.5882352941176471
Model Gini 4 F1 Score = 0.5
Model Gini 5 F1 Score = 0.7636363636363634
Model Gini 6 F1 Score = 0.7241379310344828
Model Gini 7 F1 Score = 0.6250000000000001
Model Gini 8 F1 Score = 0.6274509803921569
Model Gini 9 F1 Score = 0.64


In [45]:
actual = []
predicted = []
for index, row in test_df.iterrows():
    prediction = make_prediction(row, max_model)
    if (prediction == None):
        prediction = random.choice(['+', '-'])

    predicted.append(prediction)
    actual.append(row[15])

TP = 0
TN = 0
FP = 0
FN = 0
for val in range(len(predicted)):
    if (actual[val] == '+' and predicted[val] == '+'):
        TP = TP + 1
    elif (actual[val] == '+' and predicted[val] == '-'):
        FN = FN + 1
    elif (actual[val] == '-' and predicted[val] == '-'):
        TN = TN + 1
    elif (actual[val] == '-' and predicted[val] == '+'):
        FP = FP + 1
precision = TP/(TP+FP)
recall = TP/(TP+FN)
f1 = (2*precision*recall)/(precision + recall)
print(f1)

0.6821705426356589
