In [3]:
from collections import defaultdict
import math

In [25]:
# 模型


class MaxEntropy(object):
    """Maximum-entropy classifier over binary (label, token) features.

    Training data is a text file with one record per line, whitespace
    separated: the first token is the label and the remaining tokens are the
    features of that record. Weights are fitted iteratively with the update
    ``w[i] += (1 / M) * log(empirical[i] / model[i])`` (a GIS-style update),
    where ``M`` is the largest number of distinct tokens in any record.

    A token counts at most once per record everywhere (counting, scoring and
    expectation), so repeated tokens in a record or a query have no extra
    effect.
    """

    def __init__(self):
        self.feats = defaultdict(int)     # (label, token) -> number of records containing the pair
        self.trainset = []                # accepted records, each [label, token, token, ...]
        self.labels = set()               # every label seen while loading
        self.size = 0                     # number of training records (set by _initparams)
        self.M = 0                        # max distinct tokens in one record (set by _initparams)
        self.ep_ = []                     # empirical expectation of each feature
        self.w = []                       # current weight of each feature
        self.ep = []                      # model expectation of each feature
        self.last_w = []                  # weights from the previous iteration
        self.feats_id = defaultdict(int)  # (label, token) -> index into ep_ / w / ep

    def load_data(self, file, verbose=True):
        """Read training records from ``file`` and accumulate feature counts.

        Args:
            file: path of the training file (one ``label tok tok ...`` record
                per line).
            verbose: when True (the default, matching the historical
                behaviour) print every skipped line and every (label, token)
                pair that is counted.

        Lines with three or fewer tokens (including blank lines) are skipped.
        NOTE(review): the reason for that threshold is not visible here;
        it is kept as-is.
        """
        # Use a context manager so the file handle is always closed.
        with open(file) as handle:
            for line in handle:
                fields = line.strip().split()
                if len(fields) <= 3:
                    if verbose:
                        print(fields)
                    continue
                label = fields[0]
                self.labels.add(label)
                # dict.fromkeys de-duplicates while keeping first-seen order,
                # so feature indices do not depend on string hash ordering.
                for f in dict.fromkeys(fields[1:]):
                    self.feats[(label, f)] += 1
                    if verbose:
                        print(label, f)
                self.trainset.append(fields)

    def train(self, max_iter=1000, verbose=True):
        """Fit the feature weights.

        Args:
            max_iter: maximum number of update sweeps.
            verbose: when True (the default) print the iteration number and,
                after each sweep, the weights and the feature counts.

        Raises:
            ValueError: if no training records have been loaded.
        """
        self._initparams()
        for iteration in range(max_iter):
            if verbose:
                print(f"iter {iteration+1} ...")
            self.ep = self._expected_value()  # model expectation under current weights
            self.last_w = self.w[:]           # snapshot for the convergence test
            for idx in range(len(self.w)):
                # Move each weight in the direction that brings the model
                # expectation closer to the empirical one.
                delta = 1.0 / self.M * math.log(self.ep_[idx] / self.ep[idx])
                self.w[idx] += delta
            if verbose:
                print(self.w, self.feats)
            if self._convergence(self.last_w, self.w):
                break  # every weight moved by less than the tolerance

    def _initparams(self):
        """Derive size, M, empirical expectations, weights and the feature index.

        Raises:
            ValueError: if the training set is empty.
        """
        if not self.trainset:
            raise ValueError("no training records loaded; call load_data() first")
        self.size = len(self.trainset)
        # Distinct tokens per record, consistent with the binary feature counts.
        self.M = max(len(set(record[1:])) for record in self.trainset)
        self.ep_ = [0.0] * len(self.feats)
        self.w = [0.0] * len(self.feats)

        for i, f in enumerate(self.feats):
            # Fraction of training records in which this (label, token) occurs.
            self.ep_[i] = self.feats[f] / self.size
            self.feats_id[f] = i

    def _expected_value(self):
        """Return the model expectation of every feature under the current weights.

        For each training record, P(label | record) is added (scaled by
        1 / size) to every known (label, token) feature whose token occurs in
        the record.
        """
        ep = [0.0] * len(self.feats)
        for record in self.trainset:
            # One contribution per distinct token, mirroring load_data.
            features = list(dict.fromkeys(record[1:]))
            prob = self._calprob(features)
            for f in features:
                for w, label in prob:
                    if (label, f) in self.feats:
                        idx = self.feats_id[(label, f)]
                        ep[idx] += w * (1.0 / self.size)
        return ep

    def _calprob(self, features):
        """Return ``[(P(label | features), label), ...]`` for every known label."""
        wgts = [(self._probwgt(features, label), label) for label in self.labels]
        Z = sum([w for w, label in wgts])  # normalisation constant
        prob = [(w / Z, label) for w, label in wgts]
        return prob

    def _probwgt(self, features, label):
        """Return exp(sum of weights of the known (label, token) features).

        Each distinct token contributes once; tokens never seen with ``label``
        during loading contribute nothing.
        """
        wgt = 0.0
        for f in dict.fromkeys(features):
            if (label, f) in self.feats:
                wgt += self.w[self.feats_id[(label, f)]]
        return math.exp(wgt)

    def _convergence(self, last_w, w, tol=0.01):
        """Return True when no weight changed by ``tol`` or more between sweeps."""
        return not any(abs(w1 - w2) >= tol for w1, w2 in zip(last_w, w))

    def predict(self, x):
        """Score a whitespace-separated feature string.

        Returns:
            A list of ``(probability, label)`` tuples sorted in descending
            order (by probability, then by label).
        """
        features = x.strip().split()
        prob = self._calprob(features)
        prob.sort(reverse=True)
        return prob

In [26]:

# Build the classifier, read the training file, then fit the weights.
model = MaxEntropy()
model.load_data('data.txt') # load the dataset
model.train() # train the model

['Outdoor', 'Sunny', 'Happy']
Outdoor Happy
Outdoor Sunny
Outdoor Dry
Outdoor Happy
Outdoor Sunny
Outdoor Humid
Outdoor Sunny
Outdoor Dry
Outdoor Sad
Outdoor Sunny
Outdoor Humid
Outdoor Sad
Outdoor Happy
Outdoor Humid
Outdoor Cloudy
Outdoor Happy
Outdoor Humid
Outdoor Cloudy
Outdoor Humid
Outdoor Cloudy
Outdoor Sad
Outdoor Humid
Outdoor Cloudy
Outdoor Sad
Indoor Happy
Indoor Rainy
Indoor Dry
Indoor Rainy
Indoor Sad
Indoor Dry
Indoor Humid
Indoor Rainy
Indoor Sad
Indoor Humid
Indoor Cloudy
Indoor Sad
Indoor Humid
Indoor Cloudy
Indoor Sad
[]
[]
[]
[]
[]
[]
[]
iter 1 ...
[0.1566678764152452, 0.23104906018664842, 0.0, 0.09589402415059367, 0.0, 0.09589402415059362, -0.30543024395805163, 0.23104906018664842, 0.0, 0.0, -0.13515503603605475, -0.1351550360360548] defaultdict(<class 'int'>, {('Outdoor', 'Happy'): 4, ('Outdoor', 'Sunny'): 4, ('Outdoor', 'Dry'): 2, ('Outdoor', 'Humid'): 6, ('Outdoor', 'Sad'): 4, ('Outdoor', 'Cloudy'): 4, ('Indoor', 'Happy'): 1, ('Indoor', 'Rainy'): 3, ('Indoor', '

In [27]:
# predict() returns (probability, label) pairs sorted from most to least probable.
print(model.predict('Rainy Happy Dry'))

[(0.9274250787320695, 'Indoor'), (0.07257492126793041, 'Outdoor')]
