# 1 问题引入

# 2 计算模型

# 3 编程实现

## 3.1 K-means

In [3]:
import scipy.io
import scipy
import random
import matplotlib.pyplot as plt
import numpy as np
import sklearn
import importlib
import sys 
import os

In [2]:
# load data
load_data = scipy.io.loadmat('data/news_data.mat')
news_data = load_data['data']
news_labels = load_data['labels']

# shuffle
zipped_data = list(zip(news_data, news_labels))  
random.seed(0)
random.shuffle(zipped_data)
new_zipped_data = list(map(list, zip(*zipped_data)))  
news_data, news_labels = np.array(new_zipped_data[0]), np.array(new_zipped_data[1])  

# split data into training and test sets
training_data = news_data[:1000, 4900:]
training_labels = news_labels[:1000]
test_data = news_data[15000:, :]
test_labels = news_labels[15000:]

In [1]:
# K-means
class KMeans:
    def __init__(self):
        # K
        self.K = 0
        # assignments
        self.C = 0
        # mean vectors
        self.m = 0
        # loss function
        self.loss_fn = 0
        # N data points
        self.N = 0
        # dimension
        self.d = 0
        
    def generate_random_means(self, data):
        m = np.zeros((self.K, self.d))
        for k in range(self.K):
            m[k] = data[int(random.random()*self.N)]
        return m
    
    def squared_euclidean_dist(self, u, v):
        diff = u - v
        return sum(diff*diff)
    
    def fit(self, data, K, max_iter):
        print("Start fitting...")
        self.K = K
        self.N = data.shape[0]
        self.d = data.shape[1]
        self.C = np.zeros((self.N, 1))
        self.loss_fn = np.zeros((self.N, 1))
        
        cnt = 0
        self.m = self.generate_random_means(data)
        
        while cnt < max_iter:
            changed = False
            
            for i in range(self.N):
                min_dissimilarity = float('inf')
                min_k = 0
                for k in range(self.m.shape[0]):
                    dissimilarity = self.squared_euclidean_dist(data[i], self.m[k])
                    if dissimilarity < min_dissimilarity:
                        min_dissimilarity = dissimilarity
                        min_k = k
                self.loss_fn[i] = min_dissimilarity
                if self.C[i] != min_k:
                    self.C[i] =min_k
                    changed = True
                    
            for k in range(self.m.shape[0]):
                data_k = data[self.C.ravel()==k]
                if data_k.shape[0] != 0:
                    self.m[k] = np.sum(data_k, axis=0) / data_k.shape[0]
                
            cnt += 1
            if not changed:
                print("converged!")
                break
        
        print("Finish fitting !")

In [4]:
km = KMeans()
km.fit(training_data, 20, 10)

Start fitting...
Finish fitting !


## 3.2 Gaussian Mixture Model

In [7]:
# load data
load_data = scipy.io.loadmat('data/news_data.mat')
news_data = load_data['data']
news_labels = load_data['labels']

# shuffle
zipped_data = list(zip(news_data, news_labels))  
random.seed(0)
random.shuffle(zipped_data)
new_zipped_data = list(map(list, zip(*zipped_data)))  
news_data, news_labels = np.array(new_zipped_data[0]), np.array(new_zipped_data[1])  

# split data into training and test sets
training_data = news_data[:100, 4990:]
training_labels = news_labels[:100]
test_data = news_data[15000:, :]
test_labels = news_labels[15000:]

In [66]:
# K-means
class GMM:
    def __init__(self):
        # K
        self.K = 0
        # N data points
        self.N = 0
        # dimension
        self.d = 0
        # prior
        self.pi = 0
        # means
        self.u = 0
        # covarience
        self.sigma = 0
        # responsibility
        # self.r = 0
        
    def gaussian_pdf(self, x, u, sigma):
        if np.linalg.det(sigma)==0:
            return 1
        return 1/(np.sqrt(abs(np.linalg.det(sigma))))*np.exp(-(x-u).dot(scipy.linalg.pinv(sigma).dot(x.T-u.T))/2)
        
    # E step
    def expectation_step(self, x, k):
        numerator = self.pi[k]*self.gaussian_pdf(x, self.u[k], self.sigma[k])
        denominator = sum([self.pi[j]*self.gaussian_pdf(x, self.u[j], self.sigma[j]) for j in range(self.K)])
        if denominator==0:
             return 0
        return numerator/denominator
        
    # M step
    def maximization_step(self, data, k):
        N_k = self.r.sum()
        pi_k = N_k/self.N
        u_k = np.zeros(self.d)
        for i in range(self.N):
            u_k += self.r[k][i]*data[i]
        sigma_k = np.zeros((self.d,self.d))
        for i in range(self.N):
            sigma_k += self.r[k][i]*(data[i].T-u_k.T).dot(data[i]-u_k)
        sigma_k = sigma_k/N_k
        return pi_k, u_k, sigma_k
        
    
    def fit(self, data, K, max_iter):
        print("Start fitting...")
        self.K = K
        self.N = data.shape[0]
        self.d = data.shape[1]
        self.pi = np.ones((self.K, 1))/self.K
        self.u = np.random.rand(self.K, self.d)
        self.sigma = np.zeros((self.K, self.d, self.d))
        for k in range(self.K):
            self.sigma[k] = np.random.rand(self.d, self.d)
        self.r = np.zeros((self.K, self.N))
        
        cnt = 0        
        while cnt < max_iter:
            changed = False
            
            # E step
            for k in range(self.K):
                for i in range(self.N):
                    r_ki = self.expectation_step(data[i], k)
                    if r_ki != self.r[k][i]:
                        self.r[k][i] = r_ki
                        changed = True
            
            # M step
            for k in range(self.K):
                pi_k, u_k, sigma_k = self.maximization_step(data, k)
                if pi_k.all()!=self.pi[k].all() or u_k.all()!=self.u[k].all() or sigma_k.all()!=self.sigma[k].all():
                    self.pi[k], self.u[k], self.sigma[k] = pi_k, u_k, sigma_k
                    changed = True
                
            cnt += 1
            if not changed:
                print("converged!")
                break
        
        print("Finish fitting !")

In [75]:
gmm = GMM()
gmm.fit(training_data, 20, 10)

Start fitting...


  return 1/(np.sqrt(abs(np.linalg.det(sigma))))*np.exp(-(x-u).dot(scipy.linalg.pinv(sigma).dot(x.T-u.T))/2)
  return numerator/denominator


Finish fitting !


In [76]:
gmm.u

array([[0.05330126, 0.07202264, 0.66224688, 0.77956611, 0.68166415,
        0.95471968, 0.53383551, 0.29777572, 0.12442023, 0.6070693 ],
       [0.87604563, 0.79627107, 0.10475228, 0.68158897, 0.2694265 ,
        0.69304146, 0.49774665, 0.55433928, 0.26812896, 0.75895035],
       [0.14096206, 0.98021999, 0.21808375, 0.64617968, 0.09054794,
        0.66030935, 0.52141545, 0.47867006, 0.64142794, 0.02108312],
       [0.38332607, 0.7710274 , 0.5109666 , 0.66567261, 0.81701583,
        0.3814373 , 0.00390307, 0.25836569, 0.26700038, 0.32406001],
       [0.128912  , 0.56532943, 0.23095228, 0.75194695, 0.3922915 ,
        0.02546639, 0.99369061, 0.61689255, 0.64011767, 0.68509954],
       [0.68466947, 0.6709501 , 0.62243069, 0.33356903, 0.15546159,
        0.93762104, 0.28403063, 0.98876617, 0.24666508, 0.49314577],
       [0.89733215, 0.17867856, 0.18335707, 0.37335816, 0.13993649,
        0.79587307, 0.62395807, 0.50276732, 0.09764772, 0.93802451],
       [0.47595753, 0.9118602 , 0.3992955

# 4 模型评估