# 2.4数据对象关系的计算方法

In [9]:
"""
    Author: <李昌峻>
    Date:    2024/9/19
    Version: 1.0
    Encoding: UTF-8
    Description: 数据挖掘课程2.4代码实现
    Email: changjunli049@gmail.com
"""

'\n    Author: <李昌峻>\n    Date:    2024/9/19\n    Version: 1.0\n    Encoding: UTF-8\n    Description: 数据挖掘课程2.4代码实现\n    Email: changjunli049@gmail.com\n'

### 2.4.1对象相似性计算方法

In [34]:
# 可能用到的库
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from matplotlib.font_manager import FontProperties # 字体设置
import math
from collections import Counter
from scipy.stats import spearmanr

### 1.标称属性相似性

In [11]:
# 简单匹配距离
def simple_matching_distance(obj1, obj2):
    """
    Simple matching distance between two objects described by nominal attributes.

    The distance is the fraction of attribute positions at which the two
    objects hold different values: ``1 - m / p``, where ``p`` is the number of
    attributes and ``m`` the number of positions whose values are equal.

    :param obj1: sequence, nominal attribute values of the first object
    :param obj2: sequence, nominal attribute values of the second object
    :return: float, simple matching distance in [0, 1]
    :raises ValueError: if the objects differ in length or have no attributes
    """
    if len(obj1) != len(obj2):
        # Fail loudly instead of falling through and returning None.
        raise ValueError("两个对象的属性数量必须相同")

    total_attributes = len(obj1)
    if total_attributes == 0:
        # Without attributes the ratio below would be 0 / 0.
        raise ValueError("对象至少需要包含一个属性")

    # Count the positions at which both objects hold the same value.
    matching_attributes = sum(1 for a, b in zip(obj1, obj2) if a == b)

    return 1 - (matching_attributes / total_attributes)
    
# Example: the two lists are objects i and j from the textbook
obj1 = ["fruit", "red", "yes", "good"]
obj2 = ["fruit", "yellow", "yes", "good"]


distance = simple_matching_distance(obj1, obj2)
print(f"标称属性相似性距离: {distance:.2f}")
    

标称属性相似性距离: 0.25


### 2.二元属性相似性
本书37页（2-10）展示的公式实际为求解相异性系数。\
相似性系数公式为：
$$\mathrm{sim}(i, j)=\frac{q+t}{q+r+s+t}$$

In [12]:
## 相异性（书本示例）：
def simple_matching_coefficient(obj1, obj2, symmetric=False):
    """
    Dissimilarity between two objects described by binary (0/1) attributes.

    Every position is tallied into a contingency table: ``a`` counts (1, 1),
    ``b`` counts (1, 0), ``c`` counts (0, 1) and ``d`` counts (0, 0). Positions
    holding any other pair of values are not counted.

    :param obj1: list, binary attribute values of object 1
    :param obj2: list, binary attribute values of object 2
    :param symmetric: bool, when False (the default) negative matches are left
        out and ``(b + c) / (a + b + c)`` is returned (formula 2-11); when True
        ``(b + c) / (a + b + c + d)`` is returned
    :return: float, dissimilarity; 0.0 when the selected denominator is zero
    :raises ValueError: if the two objects differ in length
    """
    if len(obj1) != len(obj2):
        raise ValueError("两个对象的属性数量必须相同")

    a = b = c = d = 0

    for v1, v2 in zip(obj1, obj2):
        if v1 == 1 and v2 == 1:
            a += 1
        elif v1 == 1 and v2 == 0:
            b += 1
        elif v1 == 0 and v2 == 1:
            c += 1
        elif v1 == 0 and v2 == 0:
            d += 1

    denominator = a + b + c + d if symmetric else a + b + c
    if denominator == 0:
        # No counted position contributes a mismatch, so avoid dividing by zero.
        return 0.0

    return (b + c) / denominator

# Example: two binary attribute vectors of equal length
binary_vector1 = [0, 1, 0, 0, 1]
binary_vector2 = [1, 1, 1, 0, 1]

smc_value = simple_matching_coefficient(binary_vector1, binary_vector2)
print(f"简单匹配系数: {smc_value:.2f}")


简单匹配系数: 0.50


### 3.数值属性相似性

In [15]:
### 1.欧几里得距离
def euclidean_distance(point1, point2):
    """
    Straight-line (L2) distance between two points of equal dimension.

    :param point1: list, coordinates of the first point (n-dimensional)
    :param point2: list, coordinates of the second point (n-dimensional)
    :return: float, Euclidean distance
    :raises ValueError: if the two points differ in dimension
    """
    same_dimension = len(point1) == len(point2)
    if not same_dimension:
        raise ValueError("两个点的维度必须相同")

    # Per-axis gaps first, then the root of their squared total.
    gaps = [a - b for a, b in zip(point1, point2)]
    return math.sqrt(sum(gap ** 2 for gap in gaps))

# Example
point1 = [1, 1, 1, 1, 1]  # a point with 5 coordinates
point2 = [2, 2, 2, 2, 2]

distance_multi = euclidean_distance(point1, point2)
print(point1)
print(point2)
print(f"5维空间中的欧几里得距离: {distance_multi:.2f}")

    

[1, 1, 1, 1, 1]
[2, 2, 2, 2, 2]
5维空间中的欧几里得距离: 2.24


In [18]:
### 2.曼哈顿距离
def manhattan_distance(point1, point2):
    """
    Manhattan (L1) distance: the sum of absolute per-coordinate differences.

    :param point1: list, coordinates of the first point (n-dimensional)
    :param point2: list, coordinates of the second point (n-dimensional)
    :return: Manhattan distance (same numeric type as the coordinate differences)
    :raises ValueError: if the two points differ in dimension
    """
    if len(point1) != len(point2):
        # The message previously read "纬度" (latitude); "维度" (dimension) is meant.
        raise ValueError("两个点的维度必须相同")

    # Use a local name that does not shadow the function itself.
    total = sum(abs(p1 - p2) for p1, p2 in zip(point1, point2))

    return total

# Keep the result under its own name: assigning it to `manhattan_distance`
# would replace the function with its numeric result, so re-running this cell
# or calling the function again later would fail.
manhattan_result = manhattan_distance(point1, point2)
print(point1)
print(point2)
print(f"示例的曼哈顿距离是：: {manhattan_result:.2f}")

[1, 1, 1, 1, 1]
[2, 2, 2, 2, 2]
示例的曼哈顿距离是：: 5.00


In [19]:
### 3.切比雪夫距离
def chebyshev_distance(point1, point2):
    """
    Chebyshev (L-infinity) distance: the largest absolute per-coordinate gap.

    :param point1: list, coordinates of the first point (n-dimensional)
    :param point2: list, coordinates of the second point (n-dimensional)
    :return: Chebyshev distance
    :raises ValueError: if the two points differ in dimension
    """
    dims_match = len(point1) == len(point2)
    if not dims_match:
        raise ValueError("两个点的维度必须相同")

    # Feed the per-axis gaps straight into max() without an intermediate list.
    return max(abs(a - b) for a, b in zip(point1, point2))

# Keep the result under its own name: assigning it to `chebyshev_distance`
# would replace the function with its numeric result, so re-running this cell
# or calling the function again later would fail.
chebyshev_result = chebyshev_distance(point1, point2)
print(point1)
print(point2)
print(f"示例的切比雪夫距离: {chebyshev_result:.2f}")

[1, 1, 1, 1, 1]
[2, 2, 2, 2, 2]
示例的切比雪夫距离: 1.00


In [None]:
### 4.闵可夫斯基距离
def minkowski_distance(point1, point2, k=2):
    """
    Minkowski distance of order ``k`` between two points of equal dimension.

    Computed as ``(sum(|p1 - p2| ** k)) ** (1 / k)``. ``k=1`` reduces to the
    Manhattan distance, ``k=2`` (the default) to the Euclidean distance, and
    ``k=float("inf")`` to the Chebyshev distance.

    :param point1: list, coordinates of the first point (n-dimensional)
    :param point2: list, coordinates of the second point (n-dimensional)
    :param k: int or float, order of the distance; must be greater than 0
    :return: float, Minkowski distance
    :raises ValueError: if the points differ in dimension or ``k`` is not positive
    """
    if len(point1) != len(point2):
        raise ValueError("两个点的维度必须相同")
    if not k > 0:
        # Written as `not k > 0` so that NaN is rejected as well.
        raise ValueError("k必须为正数")

    differences = [abs(p1 - p2) for p1, p2 in zip(point1, point2)]

    if math.isinf(k):
        # Limit k -> infinity: only the largest coordinate gap survives.
        return max(differences, default=0)

    return sum(diff ** k for diff in differences) ** (1 / k)


In [23]:
### 5.标准化欧几里得距离
def standardized_euclidean_distance(point1, point2, variances):
    """
    Euclidean distance with every squared coordinate gap divided by that
    coordinate's variance.

    :param point1: list, coordinates of the first point (n-dimensional)
    :param point2: list, coordinates of the second point (n-dimensional)
    :param variances: list, variance of each dimension
    :return: float, standardized Euclidean distance
    :raises ValueError: if the three inputs do not all have the same length
    """
    dimension = len(point1)
    if dimension != len(point2) or dimension != len(variances):
        raise ValueError("两个点的维度和方差数组的长度必须相同")

    # Dividing by the variance is the same as dividing the gap by the standard
    # deviation before squaring.
    scaled_terms = [
        ((a - b) ** 2) / spread
        for a, b, spread in zip(point1, point2, variances)
    ]
    return math.sqrt(sum(scaled_terms))

# Assume the per-dimension variances have already been computed:
variances = [1, 4, 9, 16, 25]
# Keep the result under its own name: assigning it to
# `standardized_euclidean_distance` would replace the function with its numeric
# result, so re-running this cell would fail.
standardized_result = standardized_euclidean_distance(point1, point2, variances)

print(point1)
print(point2)
print(f"示例的标准化欧几里得距离: {standardized_result:.2f}")

[1, 1, 1, 1, 1]
[2, 2, 2, 2, 2]
示例的标准化欧几里得距离: 1.21


### 4.序值属性相似性

In [25]:
def ordinal_similarity(value1, value2, rank_values):
    """
    Similarity between two values of one ordinal attribute.

    Each value's zero-based position in ``rank_values`` is scaled onto [0, 1]
    by dividing by ``n - 1`` (``n`` = number of levels). The similarity is one
    minus the absolute difference of the two scaled positions.

    :param value1: first ordinal value; must be an element of ``rank_values``
    :param value2: second ordinal value; must be an element of ``rank_values``
    :param rank_values: list, all levels of the attribute in ascending order
    :return: float, similarity in [0, 1]
    :raises ValueError: if either value does not occur in ``rank_values``
    """
    # list.index is zero-based, which is exactly the (rank - 1) the scaling needs.
    position1 = rank_values.index(value1)
    position2 = rank_values.index(value2)
    n = len(rank_values)

    if n == 1:
        # A single level cannot be scaled (n - 1 == 0); both values are that level.
        return 1.0

    normalized_1 = position1 / (n - 1)
    normalized_2 = position2 / (n - 1)

    # In one dimension every distance above reduces to the absolute difference.
    distance = abs(normalized_1 - normalized_2)

    # The smaller the distance, the larger the similarity.
    return 1 - distance

# Example: levels of the ordinal attribute in the order passed to the function
rank_values = ['小学', '初中', '高中', '本科', '研究生']
value1 = '本科'
value2 = '高中'

similarity = ordinal_similarity(value1, value2, rank_values)
print(f"'{value1}' 和 '{value2}' 之间的序值相似性: {similarity:.2f}")


'本科' 和 '高中' 之间的序值相似性: 0.75


### 5.Jaccard相似性

In [28]:
def jaccard_similarity(set1, set2):
    """
    Jaccard similarity: size of the intersection divided by size of the union.

    Both arguments are converted with ``set()``, so any iterable of hashable
    items (set, frozenset, list, tuple, ...) is accepted and duplicates are
    ignored.

    :param set1: iterable of hashable items, first collection
    :param set2: iterable of hashable items, second collection
    :return: float, Jaccard similarity in [0, 1]; 0.0 when both inputs are empty
    """
    first, second = set(set1), set(set2)

    union_size = len(first | second)
    if union_size == 0:
        # Both collections are empty; avoid dividing by zero.
        return 0.0

    return len(first & second) / union_size

# Example: two sets that share the elements 3 and 5
set1 = {3,4,5}
set2 = {1,2,3,5,6,7}

similarity = jaccard_similarity(set1, set2)
print(f"Set1 和 Set2 之间的Jaccard相似性: {similarity:.4f}")


Set1 和 Set2 之间的Jaccard相似性: 0.2857


### 6.余弦相似度

In [30]:
def cosine_similarity(vector1, vector2):
    """
    Cosine similarity of two vectors, returned together with both norms.

    :param vector1: list, first vector
    :param vector2: list, second vector
    :return: tuple ``(norm_vector1, norm_vector2, similarity)``; ``similarity``
        is 0.0 when either vector has zero norm
    :raises ValueError: if the two vectors differ in length
    """
    if len(vector1) != len(vector2):
        # zip() would silently truncate the dot product while the norms below
        # still use the full vectors, producing a meaningless ratio.
        raise ValueError("两个向量的维度必须相同")

    # Dot product
    dot_product = sum(a * b for a, b in zip(vector1, vector2))

    # Euclidean norms
    norm_vector1 = math.sqrt(sum(a ** 2 for a in vector1))
    norm_vector2 = math.sqrt(sum(b ** 2 for b in vector2))

    if norm_vector1 == 0 or norm_vector2 == 0:
        # Return the same 3-tuple shape as the normal path so callers that
        # unpack the result keep working.
        return norm_vector1, norm_vector2, 0.0

    return norm_vector1, norm_vector2, dot_product / (norm_vector1 * norm_vector2)

# Example: the function call below is unpacked into both norms and the similarity
vector1 = [3,0,4,0,1,0,0,6,0]
vector2 = [1,0,3,0,0,2,0,1,0]

norm1,norm2,similarity = cosine_similarity(vector1, vector2)
print(norm1,norm2)
print(f"Vector1 和 Vector2 之间的余弦相似度: {similarity:.2f}")


7.874007874011811 3.872983346207417
Vector1 和 Vector2 之间的余弦相似度: 0.69


### 2.4.2数据相关性计算方法 

### 1.皮尔逊相关系数
$$r_{xy}=\frac{\sum_{i=1}^{n}\left(x_{i}-\bar{x}\right)\left(y_{i}-\bar{y}\right)}{\sqrt{\sum_{i=1}^{n}\left(x_{i}-\bar{x}\right)^{2}} \sqrt{\sum_{i=1}^{n}\left(y_{i}-\bar{y}\right)^{2}}}$$

In [32]:
def pearson_correlation(x, y):
    """
    Pearson correlation coefficient of two equally long variables.

    :param x: list, first variable
    :param y: list, second variable
    :return: Pearson correlation coefficient; 0 when the denominator is zero
    :raises ValueError: if the two variables differ in length
    """
    size = len(x)
    if size != len(y):
        raise ValueError("两个变量的长度必须相等")

    # Means of both variables
    mean_x = sum(x) / size
    mean_y = sum(y) / size

    # Deviations from the mean, reused by numerator and denominator
    dev_x = [xi - mean_x for xi in x]
    dev_y = [yi - mean_y for yi in y]

    # Numerator: sum of the products of paired deviations
    numerator = sum(dx * dy for dx, dy in zip(dev_x, dev_y))

    # Denominator: root of the product of both sums of squared deviations
    spread_x = sum(dx ** 2 for dx in dev_x)
    spread_y = sum(dy ** 2 for dy in dev_y)
    denominator = (spread_x * spread_y) ** 0.5

    return numerator / denominator if denominator != 0 else 0

# Example: two variables of five observations each
x = [10, 20, 30, 40, 50]
y = [12, 24, 36, 48, 60]

pearson_r = pearson_correlation(x, y)
print(f"X和Y的皮尔逊相关系数: {pearson_r:.2f}")


X和Y的皮尔逊相关系数: 1.00


### 2.斯皮尔曼秩相关系数
$$\rho = 1 - \frac{6\sum_{i=1}^{n} d_i^{2}}{n(n^{2}-1)}$$


In [41]:
### Using scipy

# Sample data
x = [10, 20, 30, 40, 50]
y = [12, 24, 36, 48, 60]

# Compute the Spearman rank correlation coefficient; the second element of the
# result is discarded
rho, _ = spearmanr(x, y)
print(f"X和Y的斯皮尔曼秩相关系数: {rho:.2f}")

X和Y的斯皮尔曼秩相关系数: 1.00


In [42]:
### math实现
from scipy.stats import rankdata

def spearman_rank_correlation(x, y):
    """
    Spearman rank correlation via the rank-difference formula
    ``1 - 6 * sum(d_i ** 2) / (n * (n ** 2 - 1))``.

    Ranks are obtained with ``scipy.stats.rankdata``. NOTE: this closed-form
    formula is exact only when neither variable contains tied values; with
    ties the result can differ from ``scipy.stats.spearmanr``.

    :param x: list, first variable
    :param y: list, second variable
    :return: float, Spearman rank correlation coefficient
    :raises ValueError: if the variables differ in length or hold fewer than
        two observations
    """
    n = len(x)
    if n != len(y):
        # Without this check numpy could broadcast a length-1 input silently.
        raise ValueError("两个变量的长度必须相等")
    if n < 2:
        # n * (n ** 2 - 1) is zero for n < 2, so the coefficient is undefined.
        raise ValueError("至少需要两个观测值")

    # Ranks of every observation
    rank_x = rankdata(x)
    rank_y = rankdata(y)

    # Sum of squared rank differences
    d = rank_x - rank_y
    d_squared = np.sum(d ** 2)

    return 1 - (6 * d_squared) / (n * (n ** 2 - 1))

# Sample data
x = [10, 20, 30, 40, 50]
y = [12, 24, 36, 48, 60]

rho = spearman_rank_correlation(x, y)
print(f"X和Y的斯皮尔曼秩相关系数: {rho:.2f}")

X和Y的斯皮尔曼秩相关系数: 1.00


### 3.协方差

In [51]:
def covariance(x, y):
    """
    Population covariance of two variables via ``Cov(X, Y) = E[XY] - E[X]E[Y]``.

    :param x: list, first variable
    :param y: list, second variable
    :return: tuple ``(E_xy, cov)``: the mean of the pairwise products and the
        covariance
    :raises ValueError: if the variables differ in length or are empty
    """
    n = len(x)
    if n != len(y):
        raise ValueError("两个变量的长度必须相等")
    if n == 0:
        # Every mean below divides by n.
        raise ValueError("变量不能为空")

    # Means of both variables
    mean_x = sum(x) / n
    mean_y = sum(y) / n

    # Mean of the pairwise products, E[XY]
    E_xy = sum(a * b for a, b in zip(x, y)) / n

    # Covariance
    cov = E_xy - mean_x * mean_y

    return E_xy, cov

# Example: the call returns two values; the second one is printed below as the
# covariance
x = [6,4,7,10,8]
y = [5,6,1,4,12]

E_xy,conv=covariance(x, y)
print(f"E_xy: {E_xy:.2f}")
print(f"X和Y的协方差: {conv:.2f}")


E_xy: 39.40
X和Y的协方差: 0.20
