In [29]:
import pandas as pd
import math

def entropy(data):
    """Calculates Shannon's entropy for a given distribution"""
    # Compute the frequency of each class label
    value_counts = data.value_counts()
    # Compute the probability of each class label
    probabilities = value_counts / len(data)
    # Compute the entropy
    entropy = sum(-p * math.log2(p) for p in probabilities)
    return entropy

def feature_ranking(features, target):
    """
    Computes the feature ranking vector using Shannon's entropy as the scoring function.

    Args:
        features: A pandas dataframe containing the features.
        target: A pandas series containing the target variable.

    Returns:
        A pandas series containing the feature weights (in descending order).
    """
    feature_weights = {}
    for feature in features.columns:
        #print(list(features.groupby(feature))[2])
              #[target].apply(entropy))
        feature_score = entropy(features.groupby(feature).apply(entropy))
        #feature_score = entropy(features.groupby(feature)[target].apply(entropy))
        feature_weights[feature] = feature_score
    
    total_score = sum(feature_weights.values())
    feature_weights = {k: v/total_score for k, v in feature_weights.items()}
    
    return pd.Series(feature_weights).sort_values(ascending=False)



In [30]:
# Load the dataset
data = pd.read_csv('updated_output.csv')
data = data.rename(columns=lambda x: x.replace('#', 'hash'))
# print(data.columns)
# print(type(data.target))
# print(data.target.head())


# Split the dataset into features and target
features = data.drop('target', axis=1)
target = data['target']

# print(features.columns)

# Compute the feature weights using Shannon's entropy
feature_weights = feature_ranking(features, target)

# Print the feature weights
print(feature_weights)

domain_expiration      0.207128
domain_name            0.189308
page_rank              0.189308
http_status            0.138085
is___present           0.138085
number_of_redirects    0.138085
index                  0.000000
error_fetch            0.000000
num_domains.2          0.000000
num_domains.1          0.000000
num_domains            0.000000
is_abnormal            0.000000
is_redirected          0.000000
user_id                0.000000
is_hash_present        0.000000
is_@_present           0.000000
inner_html             0.000000
expanded_url           0.000000
url                    0.000000
created_at             0.000000
unique_domains         0.000000
dtype: float64


In [8]:
data

Unnamed: 0,index,user_id,created_at,url,expanded_url,inner_html,http_status,is_@_present,is___present,is_hash_present,...,error_fetch,page_rank,is_abnormal,domain_expiration,num_domains,num_domains.1,num_domains.2,domain_name,unique_domains,target
0,0,24858289,Wed Nov 12 20:14:48 +0000 2014,http://t.co/ahvQxUqTws,https://www.youtube.com/watch?v=Y6f6_WiyWjI,"b'<!DOCTYPE html><html style=""font-size: 10px;...",200,False,True,False,...,False,4,False,2024-02-15 05:13:12,1,1,1,www.youtube.com,1,1
1,1,24858289,Wed Nov 12 20:01:32 +0000 2014,http://t.co/HyI5EQKz6Q,https://www.facebook.com/Davideb66/posts/10205...,"b'<!DOCTYPE html>\n<html lang=""ml"" id=""faceboo...",200,False,False,False,...,False,1,False,2031-03-30 04:00:00,1,1,1,www.facebook.com,1,1
2,2,24858289,Wed Nov 12 12:41:32 +0000 2014,http://t.co/aHHbFXJbIS,https://www.mipiaceroma.it/blog,b'<!DOCTYPE html>\n<!--[if IE 9 ]><html class=...,200,False,False,False,...,False,44424115,False,2024-01-10 00:00:00,1,1,1,www.mipiaceroma.it,1,1
3,3,24858289,Tue Nov 11 22:23:43 +0000 2014,http://t.co/NAHQ4l2pUy,https://www.facebook.com/Davideb66/posts/10205...,"b'<!DOCTYPE html>\n<html lang=""ml"" id=""faceboo...",200,False,False,False,...,False,1,False,2031-03-30 04:00:00,1,1,1,www.facebook.com,1,1
4,4,24858289,Tue Nov 11 22:17:01 +0000 2014,http://t.co/o8ZJHt7Neu,http://www.muzu.tv/spandau-ballet/gold-music-v...,"b'<!DOCTYPE html>\r\n<html lang=""en-US"">\r\n<h...",404,False,False,False,...,False,149230,False,,1,1,1,www.muzu.tv,1,1
5,5,24858289,Mon Nov 10 19:33:26 +0000 2014,http://t.co/qTe0I9IzU5,https://www.virgilio.it/,"b'<!DOCTYPE html>\n<html class="""" lang=""it-IT""...",200,False,False,False,...,False,1158775,False,2023-05-10 00:00:00,1,1,1,www.virgilio.it,1,1
6,6,24858289,Mon Nov 10 19:33:21 +0000 2014,http://t.co/5zjKTwsLST,https://www.facebook.com/Davideb66/posts/10205...,"b'<!DOCTYPE html>\n<html lang=""ml"" id=""faceboo...",200,False,False,False,...,False,1,False,2031-03-30 04:00:00,1,1,1,www.facebook.com,1,1
7,7,24858289,Sun Nov 09 21:50:54 +0000 2014,http://t.co/dNkWiNfA16,https://www.facebook.com/Davideb66/posts/10205...,"b'<!DOCTYPE html>\n<html lang=""ml"" id=""faceboo...",200,False,False,False,...,False,1,False,2031-03-30 04:00:00,1,1,1,www.facebook.com,1,1
8,8,24858289,Sat Nov 08 22:02:08 +0000 2014,http://t.co/riutWINwKZ,https://www.youtube.com/watch?v=34Hx2SlUOHE,"b'<!DOCTYPE html><html style=""font-size: 10px;...",200,False,False,False,...,False,4,False,2024-02-15 05:13:12,1,1,1,www.youtube.com,1,1
9,9,24858289,Sat Nov 08 22:01:19 +0000 2014,http://t.co/e6PkjXRdBz,http://www.muzu.tv/musica-degli-anni-80/falco-...,"b'<!DOCTYPE html>\r\n<html lang=""en-US"">\r\n<h...",404,False,False,False,...,False,149230,False,,1,1,1,www.muzu.tv,1,1
