1. Decision tree

注释以下每行代码的意思。

In [None]:
def valueOfMaxProb(vals):
    """Return the most frequent value in ``vals`` and its relative frequency.

    ``vals`` must expose ``shape[0]`` (e.g. a NumPy array or pandas Series).
    When several values share the highest count, the first one in
    ``np.unique`` order is returned, because ``np.argmax`` picks the first
    maximum.
    """
    uniques, counts = np.unique(vals, return_counts=True)
    top = np.argmax(counts)  # position of the largest count
    return uniques[top], counts[top]/vals.shape[0]

def value_count(vals):
    """Return ``(unique_values, counts)`` for ``vals`` as computed by ``np.unique``."""
    uniques, counts = np.unique(vals, return_counts=True)
    return uniques, counts

def entropy(cls):
    """Return the Shannon entropy (base 2) of the labels in ``cls``.

    ``cls`` must expose ``shape[0]``. The original author's note says this
    loop form was the fastest of the variants they tried.
    """
    _, counts = np.unique(cls, return_counts=True)
    n_samples = cls.shape[0]
    neg_entropy = 0.0
    for count in counts:
        p = count/n_samples  # relative frequency of one label
        neg_entropy += p*math.log(p, 2)
    return -neg_entropy

def gini(cls):
    """Return the Gini impurity of the labels in ``cls``.

    Computed as one minus the sum of squared label frequencies. ``cls`` must
    expose ``shape[0]``. The original author's note says this loop form was
    the fastest of the variants they tried.
    """
    _, counts = np.unique(cls, return_counts=True)
    n_samples = cls.shape[0]
    squared_sum = 0.0
    for count in counts:
        freq = count/n_samples  # relative frequency of one label
        squared_sum += freq*freq
    return 1.0-squared_sum

class DecisionTree:
    """Decision-tree classifier stored as nested dictionaries.

    Tree layout: ``{(attrib, threshold): {branch: subtree_or_leaf}}``.

    * ``threshold`` is ``None`` for a categorical split; branches are keyed by
      the attribute's values.
    * ``threshold`` is a number for a continuous split; branches are keyed by
      the boolean ``value <= threshold``.
    * A leaf holds whatever ``LeafNodeContent`` returns for the outcome values
      that reach it (by default ``(unique_labels, counts)``).

    Parameters
    ----------
    toPrint : bool
        When true, print tracing information while fitting and predicting.
    measurement : callable
        Impurity function applied to a Series of outcome values
        (lower is better), e.g. ``entropy`` or ``gini``.
    LeafNodeContent : callable
        Builds the payload stored in a leaf from the outcome values.
    """

    def __init__(self, toPrint=False, measurement=entropy, LeafNodeContent=value_count):
        self.theTree = dict()  # the fitted tree (empty until fit() is called)
        self.toPrint = toPrint
        self.LeafNodeContent = LeafNodeContent
        self.measurement = measurement

    def trySplit(self, df, attrib, iscontinuous=False):
        """Score a split of ``df`` on ``attrib``.

        Returns ``(score, threshold)`` where ``score`` is the size-weighted
        impurity of the resulting partitions. ``threshold`` is ``None`` for a
        categorical attribute; for a continuous one it is the value whose
        ``<= value`` partition has the lowest score.
        """
        ntotal = df.shape[0]
        outcome = df[self.outcomeVar]

        if not iscontinuous:
            score = 0.0
            for _, part in outcome.groupby(df[attrib]):
                score += float(part.shape[0]/ntotal)*self.measurement(part)
            return score, None

        b_score, b_value = float('inf'), 0
        # Sorted candidates make tie-breaking deterministic (lowest threshold wins).
        for value in sorted(set(df[attrib])):
            # Group by the mask directly rather than writing a scratch column
            # into df for every candidate threshold.
            mask = df[attrib] <= value
            score = 0.0
            for _, part in outcome.groupby(mask):
                score += float(part.shape[0]/ntotal)*self.measurement(part)
            if score < b_score:
                b_score, b_value = score, value
        return b_score, b_value

    def getMostScoreAttrib(self, df):
        """Return ``(attrib, threshold)`` of the column with the lowest split score.

        The outcome column and the working column are never considered.
        """
        optScore, optValue, optVar = float('inf'), 0, ''
        for v in df.columns:
            if v == self.outcomeVar or v == self.workingVector:
                continue
            score, svalue = self.trySplit(df, v, v in self.continuousVars)
            if score < optScore:
                optScore, optValue, optVar = score, svalue, v
        return optVar, optValue

    def splitNodes(self, df, treeNode):
        """Recursively grow the tree for ``df`` inside the dict ``treeNode``."""
        attrib, svalue = self.getMostScoreAttrib(df)
        tattrib = (attrib, svalue)
        treeNode[tattrib] = {}

        if self.toPrint: print("\n>>>>> split by ",tattrib, '\n', df)

        if svalue is None:
            groupKey = df[attrib]  # categorical: one branch per value
        else:
            # Must be the same comparison trySplit() scored (<=), otherwise the
            # tree uses a different partition from the one that was evaluated.
            groupKey = df[attrib] <= svalue

        for v, subdf in df.groupby(groupKey):
            if self.toPrint: print("\n>>>>> split by ",tattrib, v, '\n', subdf)
            theScore = self.measurement(subdf[self.outcomeVar])
            # Columns are: features + outcome + working column, so ">3" means
            # at least one feature is left after dropping `attrib`.
            if theScore != 0 and subdf.shape[1] > 3:
                if self.toPrint:
                    print(tattrib, '=',v, theScore,'. Calling splitNodes() recursively')
                    print(subdf)
                treeNode[tattrib][v] = {}
                # Recurse on the remaining features only.
                self.splitNodes(subdf.drop(attrib, axis=1), treeNode[tattrib][v])
            else:
                # Pure node, or no feature left to split on: make a leaf.
                if self.toPrint: print(tattrib,'=', v, 'leaf node', [(grp.shape[0],label) for label,grp in subdf.groupby(self.outcomeVar)])
                treeNode[tattrib][v] = self.LeafNodeContent(subdf[self.outcomeVar])

    def fit(self, X_train, y_train, continuousVars=None):
        """Build the tree from features ``X_train`` and the named Series ``y_train``.

        ``continuousVars`` lists the columns to split by threshold; every other
        column is treated as categorical. ``y_train.name`` is used as the
        outcome column name.
        """
        self.outcomeVar = y_train.name
        # None instead of a mutable [] default, which would be shared between calls.
        self.continuousVars = [] if continuousVars is None else continuousVars

        # Pick a working-column name that does not clash with a feature name.
        self.workingVector = '_'
        while self.workingVector in X_train.columns: self.workingVector += '_'

        self.labels = np.unique(y_train)  # all outcome labels seen in training
        XY_train = X_train.merge(y_train, left_index=True, right_index=True)
        # Kept so the column-count test in splitNodes() stays valid.
        XY_train[self.workingVector] = False
        self.theTree = dict()
        self.splitNodes(XY_train, self.theTree)

    def totalValueCount(self, di):
        """Sum the leaf label counts of every branch below the dict ``di``.

        Returns ``(labels, counts)`` in the same shape as a leaf payload.
        """
        outs = pd.Series(dtype=float)  # explicit dtype: a bare Series() defaults to object
        for vals in di.values():
            if isinstance(vals, dict):
                vals = self.totalValueCount(vals)
            ss = pd.Series(vals[1], index=vals[0]).fillna(0)
            # Labels present in both are added; labels in only one are kept as-is.
            outs = outs.add(ss, fill_value=0)
        return list(outs.index), outs.values

    def estimate(self, dic, x_test):
        """Walk the (sub)tree ``dic`` for one sample and return the leaf payload.

        Returns ``None`` when the tree is empty or the sample lacks the split
        feature. When the sample's value has no matching branch, the counts of
        all branches under the current node are pooled and returned.
        """
        if len(dic) == 0: return
        node = next(iter(dic))  # the single (feature, threshold) key at this level
        field = node[0]
        if field not in x_test: return
        if node[1] is None:
            match = dic[node].get(x_test[field])
        else:
            # Same comparison as used when the tree was built.
            match = dic[node].get(bool(x_test[field] <= node[1]))
        if match is None:
            return self.totalValueCount(dic[node])

        if isinstance(match, dict):
            if self.toPrint: print(match)
            return self.estimate(match, x_test)  # descend into the subtree
        else:
            if self.toPrint: print(match, 'is a leaf of type ', type(match))
            return match

    def predict(self, X_test):
        """Return the most frequent leaf label for each row (``None`` if no match)."""
        predicted = []
        for i in range(X_test.shape[0]):
            pred = self.estimate(self.theTree, X_test.iloc[i, :])
            if pred is None:
                predicted.append(None)
            else:
                imax = np.argmax(pred[1])
                predicted.append(pred[0][imax])
        return np.array(predicted)

    def predict_proba(self, X_test):
        """Return an array of label frequencies, one row per sample.

        Columns follow ``self.labels``. Rows with no match are all zeros and
        their positions are printed.
        """
        rows = []
        for i in range(X_test.shape[0]):
            pred = self.estimate(self.theTree, X_test.iloc[i, :])
            if pred is None:
                print(i, end=' ')
                row = pd.Series([0]*len(self.labels), index=self.labels)
            else:
                counts = np.asarray(pred[1], dtype=float)
                row = pd.Series(counts/counts.sum(), index=pred[0])
            rows.append(row)
        # Build the frame once; DataFrame.append was removed in pandas 2.0.
        predicted = pd.DataFrame(rows).reindex(columns=self.labels)
        return np.array(predicted.fillna(0))  # labels absent from a leaf get 0


2. Deep Learning

在https://archive.ics.uci.edu/ml/index.php 选取一个你自己感兴趣的回归问题的数据集，在测试集（test set）上比较用pytorch训练的回归模型和线性回归这两种方法训练出来的模型的MPSE（mean prediction squared error）:
$$MPSE=\frac{1}{m}\sum_{i=1}^{m}(\hat{Y}_i-Y_i)^2,$$
其中$\hat{Y}_i$为测试集上的预测值，$Y_i$为测试集上的响应变量的值。

I chose this dataset: https://archive.ics.uci.edu/dataset/206/relative+location+of+ct+slices+on+axial+axis

In [4]:
import pandas as pd
# Load the CT slice localization CSV from its local path.
csv_path = '/Users/dongwenou/Downloads/Intro to DS/slice_localization_data.csv'
data = pd.read_csv(csv_path)
data  # bare expression so the notebook renders the frame

Unnamed: 0,patientId,value0,value1,value2,value3,value4,value5,value6,value7,value8,...,value375,value376,value377,value378,value379,value380,value381,value382,value383,reference
0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-0.250000,-0.250000,-0.25000,...,-0.25,0.980381,0.0,0.000000,0.000000,0.0,0.0,-0.25,-0.25,21.803851
1,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-0.250000,-0.250000,-0.25000,...,-0.25,0.977008,0.0,0.000000,0.000000,0.0,0.0,-0.25,-0.25,21.745726
2,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-0.250000,-0.250000,-0.25000,...,-0.25,0.977008,0.0,0.000000,0.000000,0.0,0.0,-0.25,-0.25,21.687600
3,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-0.250000,-0.250000,-0.25000,...,-0.25,0.977008,0.0,0.000000,0.000000,0.0,0.0,-0.25,-0.25,21.629474
4,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-0.250000,-0.250000,-0.25000,...,-0.25,0.976833,0.0,0.000000,0.000000,0.0,0.0,-0.25,-0.25,21.571348
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53495,96,0.591906,0.357764,0.000000,0.000000,0.552321,0.795304,0.946697,0.952227,0.84395,...,0.00,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.00,0.00,29.290398
53496,96,0.612313,0.000000,0.000000,0.000000,0.864160,0.820531,0.000000,0.938813,0.94374,...,0.00,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.00,0.00,27.945721
53497,96,0.612313,0.000000,0.000000,0.000000,0.864160,0.820531,0.000000,0.938813,0.94374,...,0.00,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.00,0.00,27.945721
53498,96,0.634921,0.904555,0.956087,0.980208,0.157664,0.000000,-0.250000,-0.250000,-0.25000,...,-0.25,0.000000,0.0,0.994967,0.806688,0.0,0.0,-0.25,-0.25,14.582997


In [6]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import torch
import torch.nn as nn
import torch.optim as optim
# Separate the response from the predictors.
# 'reference' is the regression target; 'patientId' is excluded from the features.
target_col = 'reference'
y = data[target_col]
X = data.drop(columns=[target_col, 'patientId'])


In [8]:
# Hold out 25% of the rows for testing (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=369
)

# Standardize features with statistics learned from the training split only,
# then apply the same transform to the test split.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Targets as plain NumPy arrays
y_train = y_train.values
y_test = y_test.values

In [12]:
# Baseline: ordinary least squares fitted on the training split
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Test-set predictions and their mean squared error (the MPSE)
y_pred_lr = lr_model.predict(X_test)
mse_lr = mean_squared_error(y_test, y_pred_lr)

print(f"Linear Regression MPSE: {mse_lr:.4f}")

Linear Regression MPSE: 68.0326


In [25]:
# Sanity check: container type of the training features and number of training targets.
print(type(X_train))
print(len(y_train))

<class 'numpy.ndarray'>
40125


In [23]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np

# 1. Data preparation: wrap the arrays in DataLoaders
batch_size = 1024  # samples per mini-batch

# Make sure everything is a NumPy array; targets become column vectors so
# they match the (batch, 1) shape of the network output.
X_train = np.array(X_train)
X_test = np.array(X_test)
y_train = np.array(y_train).reshape(-1, 1)
y_test = np.array(y_test).reshape(-1, 1)

# float32 tensors paired as (features, target)
train_dataset = TensorDataset(
    torch.tensor(X_train, dtype=torch.float32),
    torch.tensor(y_train, dtype=torch.float32),
)
test_dataset = TensorDataset(
    torch.tensor(X_test, dtype=torch.float32),
    torch.tensor(y_test, dtype=torch.float32),
)

# Training batches are reshuffled every epoch; test batches keep their order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# 2. 定义神经网络模型
class NeuralNetwork(nn.Module):
    """Fully connected regression network: input_dim -> 128 -> 64 -> 1 with ReLU between layers."""

    def __init__(self, input_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 128)  # input -> hidden layer 1
        self.relu = nn.ReLU()                 # shared ReLU activation
        self.fc2 = nn.Linear(128, 64)         # hidden layer 1 -> hidden layer 2
        self.fc3 = nn.Linear(64, 1)           # hidden layer 2 -> scalar output

    def forward(self, x):
        """Map a ``(batch, input_dim)`` tensor to a ``(batch, 1)`` prediction."""
        hidden1 = self.relu(self.fc1(x))
        hidden2 = self.relu(self.fc2(hidden1))
        return self.fc3(hidden2)

# 3. Build the model, the loss function and the optimizer
input_dim = X_train.shape[1]  # number of input features
model = NeuralNetwork(input_dim)

criterion = nn.MSELoss()  # mean squared error (regression)
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 4. Train the model
num_epochs = 100  # passes over the training set

for epoch in range(num_epochs):
    model.train()  # training mode
    running_loss = 0.0

    for X_batch, y_batch in train_loader:
        # Clear gradients left over from the previous step
        optimizer.zero_grad()

        # Forward pass
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)

        # Backward pass and parameter update
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Report the average per-batch loss for this epoch
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}")

# 5. Evaluate the model on the test set
model.eval()  # evaluation mode
test_loss = 0.0
num_test_batches = len(test_loader)
num_test_samples = len(test_loader.dataset)

with torch.no_grad():  # no gradients needed for evaluation
    for X_batch, y_batch in test_loader:
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        # criterion returns the mean over one batch. A plain average of batch
        # means gives the short final batch the same weight as a full one, so
        # it is not the MPSE (mean over all test samples). Weight each batch by
        # its size, scaled so that `test_loss/len(test_loader)` -- the
        # expression used here and in the comparison cell -- equals the exact
        # per-sample mean squared error.
        test_loss += loss.item() * y_batch.size(0) * num_test_batches / num_test_samples

print(f"Test MSE Loss: {test_loss/len(test_loader):.4f}")

Epoch [1/100], Loss: 2010.9114
Epoch [2/100], Loss: 210.7620
Epoch [3/100], Loss: 80.2516
Epoch [4/100], Loss: 58.4433
Epoch [5/100], Loss: 46.5438
Epoch [6/100], Loss: 38.8346
Epoch [7/100], Loss: 33.1491
Epoch [8/100], Loss: 28.7339
Epoch [9/100], Loss: 25.6306
Epoch [10/100], Loss: 23.0647
Epoch [11/100], Loss: 21.2434
Epoch [12/100], Loss: 19.5533
Epoch [13/100], Loss: 18.3677
Epoch [14/100], Loss: 17.1875
Epoch [15/100], Loss: 16.1351
Epoch [16/100], Loss: 15.2285
Epoch [17/100], Loss: 14.3782
Epoch [18/100], Loss: 13.2729
Epoch [19/100], Loss: 12.0439
Epoch [20/100], Loss: 11.2485
Epoch [21/100], Loss: 9.9805
Epoch [22/100], Loss: 9.0367
Epoch [23/100], Loss: 8.1473
Epoch [24/100], Loss: 7.3828
Epoch [25/100], Loss: 6.6815
Epoch [26/100], Loss: 6.0694
Epoch [27/100], Loss: 5.5393
Epoch [28/100], Loss: 5.1026
Epoch [29/100], Loss: 4.6931
Epoch [30/100], Loss: 4.3528
Epoch [31/100], Loss: 4.0591
Epoch [32/100], Loss: 3.8188
Epoch [33/100], Loss: 3.5706
Epoch [34/100], Loss: 3.3714


In [25]:
# Side-by-side test-set MPSE of both models.
# Uses mse_lr (linear regression cell) and test_loss / test_loader (evaluation cell).
print("Model Comparison:")
print(f"Linear Regression MPSE: {mse_lr:.4f}")
print(f"Deep Learning MPSE: {test_loss/len(test_loader):.4f}")

Model Comparison:
Linear Regression MPSE: 68.0326
Deep Learning MPSE: 0.9268


    On the test set, the neural network reduces the MPSE from 68.03 (linear regression) to 0.93, a substantial improvement over the linear model.