<a href="https://colab.research.google.com/github/Bread806/goldbach_backup_from_colab/blob/main/goldbach_1016_MixModle_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from google.colab import drive
drive.mount("/content/Drive")

Mounted at /content/Drive


In [3]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import StandardScaler


def is_prime(num):
    """檢查一個數字是否為質數"""
    if num <= 1:
        return False
    for i in range(2, int(num ** 0.5) + 1):
        if num % i == 0:
            return False
    return True


def prime_table(x):
    """建立小於x的質數表"""
    primes = [num for num in range(2, x) if is_prime(num)]
    return primes


def convert_base_into_list(number, base, width=10):
    result = []  # init list

    for i in range(width):
        result.append([number % base])
        number = number // base
    result = result[::-1]
    return result


def prime_to_index(primeSize, primes, number):
    for index in range(primeSize):
        if number == primes[index]:
            return index
    return -1

def write_log(s ,filename='1016_Hybird_log.txt'):
  with open(filename, 'a', encoding='utf-8') as file:
    file.write(s)

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

# 數據轉換函數
def convert_base_into_list(number, base, width=10):
    result = []
    for i in range(width):
        result.append(number % base)
        number = number // base
    return result[::-1]  # 返回扁平的列表

print("---loading data---")
# 加載數據
trainData = pd.read_csv('/content/Drive/MyDrive/實驗/goldbach/csv/traindata_min_size_1000.csv')
trainDataNumbers = trainData['Number'].values
trainDataLabel = trainData['Factors'].values
print("---loading data done.---")

# 打亂訓練數據
shuffleIndices = np.random.permutation(len(trainDataNumbers))
shuffledNumbers = trainDataNumbers[shuffleIndices]
shuffledLabel = trainDataLabel[shuffleIndices]

# 準備特徵
mergedTrainNumber = []
for num in shuffledNumbers:
    feature = convert_base_into_list(num, 2) + convert_base_into_list(num, 3) + \
              convert_base_into_list(num, 5) + convert_base_into_list(num, 7)
    mergedTrainNumber.append(feature)

mergedArray = np.array(mergedTrainNumber)
mergedLabel = np.array(shuffledLabel)

# 分割數據
X_train, X_val, y_train, y_val = train_test_split(mergedArray, mergedLabel, test_size=0.2, random_state=42)

# 創建 PyTorch 數據集和數據加載器
class GoldbachDataset(Dataset):
    def __init__(self, features, labels):
        self.features = torch.FloatTensor(features)
        self.labels = torch.FloatTensor(labels).view(-1, 1)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]

train_dataset = GoldbachDataset(X_train, y_train)
val_dataset = GoldbachDataset(X_val, y_val)

train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=128)


---loading data---
---loading data done.---


In [9]:

# 定義模型
#input_size=40
class MLP(nn.Module):
  def __init__(self, input_size, hidden_size):
    super(MLP, self).__init__()
    self.mlp = nn.Sequential(
        nn.Linear(input_size, hidden_size),
        nn.ReLU(),
        nn.Linear(200, hidden_size),
        nn.ReLU(),
        nn.Linear(200, hidden_size),
        nn.ReLU(),
        nn.Linear(200, hidden_size),
        nn.ReLU(),
        nn.Linear(200, 1)
    )

  def forward (self, x):
    return self.mlp(x)


class LN(nn.Module):
  def __init__(self, input_size):
    super(LN, self).__init__()
    self.denseG1 = nn.Linear(input_size, 120) #8:2
    self.denseG2 = nn.Linear(input_size, 80)


  def forward(self, x):
    group1 = x
    group2 = F.relu(x)
    group2 = torch.log(group2+1)

    group1 = F.relu(self.denseG1(group1))
    group2 = self.denseG2(group2)
    group2 = torch.exp(group2)

    merged = torch.cat([group1, group2], dim=-1)
    return merged


class HybridModel(nn.Module):
  def __init__(self,input_size, hidden_size):
    super(HybridModel, self).__init__()
    self.Mmlp = MLP(input_size, hidden_size)
    self.Mln = LN(input_size)
    #self.model_weight = nn.Linear(input_size, 2)#輸出兩個模型的比例
    self.register_buffer('model_weight', torch.tensor([0.6, 0.4]))  # MLP:0.6, LN:0.4

  def forward(self, x):
    mlp_output = self.Mmlp(x)
    ln_output  = self.Mln(x)
    # 動態生成兩模型的權重 -> 靜態
    # model_weight = torch.softmax(self.model_weight(x), dim=1)

    # 以6:4的方式讀取mlp跟ln模型的權重
    combined_output = self.model_weight[0] * mlp_output + self.model_weight[1] * ln_output.mean(dim=-1, keepdim=True)
    #combined_output = model_weight[:, 0].unsqueeze(1) * mlp_output + model_weight[:, 1].unsqueeze(1) * ln_output.mean(dim=-1, keepdim=True)


    return combined_output

# 檢查是否有可用的GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"--- Using device: {device} ---")

# 創建模型實例
input_size = X_train.shape[1]
hiddne_size = 200 #neroal
print(f"Input size: {input_size}")
model = HybridModel(input_size, hiddne_size).to(device)
nn.init.constant_(model.model_weight , 0.5)

# 定義損失函數和優化器
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)



--- Using device: cuda:0 ---
Input size: 40


In [None]:
# 訓練模型
print("---starting training---")
num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch_features, batch_labels in train_loader:
        batch_features, batch_labels = batch_features.to(device), batch_labels.to(device)

        optimizer.zero_grad()
        outputs = model(batch_features)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # 驗證
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch_features, batch_labels in val_loader:
            batch_features, batch_labels = batch_features.to(device), batch_labels.to(device)
            outputs = model(batch_features)
            val_loss += criterion(outputs, batch_labels).item()
    write_log(f"========== \n Epoch [{epoch+1}/{num_epochs}], Train Loss: {total_loss/len(train_loader):.4f}, Val Loss: {val_loss/len(val_loader):.4f} \n ========== \n")
    # print("==========")
    # print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {total_loss/len(train_loader):.4f}, Val Loss: {val_loss/len(val_loader):.4f}")
    # print("==========")

print("--------------------------------------------------------")

# 保存模型
torch.save(model.state_dict(), 'goldbach_model_0725.pth')
print("model saved.")

---starting training---
--------------------------------------------------------
model saved.


## training

In [None]:
print("Shape of X_train:", X_train.shape)
print("Shape of y_train:", y_train.shape)

Shape of X_train: (800, 40)
Shape of y_train: (800,)


# predict

In [None]:
from google.colab import drive
drive.mount("/content/Drive")

Drive already mounted at /content/Drive; to attempt to forcibly remount, call drive.mount("/content/Drive", force_remount=True).


In [6]:
# loadding prime table
primeTable = pd.read_csv("/content/Drive/MyDrive/實驗/goldbach/csv/prime_table_5000000.csv")
primeTableNumber = primeTable["primes"].values
print ("---loading primes table---")

---loading primes table---


In [16]:
## load data
#predict

from tensorflow.keras.models import load_model
import csv
import pandas as pd
import numpy as np

# load torch model
model = HybridModel(input_size,hiddne_size)
model.load_state_dict(torch.load('/content/Drive/MyDrive/實驗/model_save/HybirdModel_1023_6v4.pth'))
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = model.to(device)


# load test_set and prime data
testDataPath = '/content/Drive/MyDrive/實驗/goldbach/csv/test_set_G4_5M.csv'
testData = pd.read_csv(testDataPath)
testDataNumbers = testData['Number'].values
testDataLabel = testData['Factors'].values
testDataGroundTruth = testData['Partition'].values

print("---loading test data done.---")


---loading test data done.---


  model.load_state_dict(torch.load('/content/Drive/MyDrive/實驗/model_save/HybirdModel_1023_6v4.pth'))


In [17]:
# shuffle training data
shuffleIndices = np.random.permutation(len(testDataNumbers))
shuffledNumbers = testDataNumbers[shuffleIndices]
shuffledLabel = testDataLabel[shuffleIndices]

In [18]:
# generate 25 prime table
primeTable25 = primeTableNumber[:25]
print (primeTable25)

[ 2  3  5  7 11 13 17 19 23 29 31 37 41 43 47 53 59 61 67 71 73 79 83 89
 97]


## one number
torch version

In [14]:
# 評估模型
model.eval()
total_loss = 0
correct = 0
total = 0

with torch.no_grad():
    for batch_features, batch_labels in val_loader:
        batch_features, batch_labels = batch_features.to(device), batch_labels.to(device)
        outputs = model(batch_features)
        loss = criterion(outputs, batch_labels)
        total_loss += loss.item()

average_loss = total_loss / len(val_loader)

print(f"Loss: {average_loss:.4f}")


Loss: 155945960.0000


In [19]:
import torch


# 準備數據
test_data = []  # 這裡放你想要預測的數字
new_features = []
for num in shuffledNumbers:
    feature = convert_base_into_list(num, 2) + convert_base_into_list(num, 3) + \
              convert_base_into_list(num, 5) + convert_base_into_list(num, 7)
    new_features.append(feature)



new_features = torch.FloatTensor(new_features).to(device)

# 進行預測
with torch.no_grad():
    predictions = model(new_features)

# # 輸出結果

result = predictions.cpu().numpy()[:,0]
print(result)


[1.9482116  0.72832745 1.6123277  0.6966759  1.4261906  0.7117688
 1.9212632  0.7736532  0.7013575  1.4450363  0.6988166  1.705915
 1.432617   0.9373112  0.7126024  0.8671815  0.71188796 1.4058646
 0.7381256  0.9570596  1.4041355  0.642774   0.65550727 0.7113078
 0.9407033  1.9085459  0.71116936 0.716448   0.7113451  1.6556264
 0.69951475 1.4671983  1.4488307  1.423585   1.437078   1.0365312
 0.7119828  0.7100744  1.6813571  0.68357164 1.4050374  0.736928
 0.94595337 0.7104683  1.4512129  1.4344677  0.71884346 0.7134578
 0.7115309  1.4241629 ]


In [None]:
roundPredictions = [int(i) for i in predictions]

In [None]:
print (len(testDataLabel))
print (len(roundPredictions))

50
50


# G4 normolize

In [20]:
G4_normolize = []
G4 = testData['G4'].values
for i in range(50):
  G4_normolize.append(result[i]*G4[i])

print (G4_normolize)

[49151.79670407263, 16790.971628298597, 43085.65688981386, 16605.933461880508, 37434.510147354515, 16478.759890363614, 50468.13904539849, 19305.85426948937, 18641.799463401137, 38133.10853369731, 18141.916952552718, 44298.448475393816, 37784.31538772856, 23875.533316420897, 17340.14104431132, 20344.67407933388, 16813.24863053085, 35696.107208887144, 19753.96079087368, 24285.846612558056, 33822.38140058022, 15922.481987139889, 15291.147763809364, 17178.21675673728, 22524.968951285744, 46533.874819190874, 17688.596938847542, 18902.424243603717, 18400.74803267745, 42093.237478167684, 18306.412688714256, 37617.67568817913, 36874.88065469374, 36847.42427497466, 36825.53704532495, 27315.97366481348, 17483.104766400524, 16890.533503360548, 40050.92315317608, 17797.08929623633, 33878.21607353953, 17686.764996551516, 21930.276670830386, 16974.766373623846, 34014.86451054372, 38273.61137285582, 18279.468736241455, 16505.41070785683, 16810.248219033296, 34422.63740293013]


In [21]:
for i in range(50):
  print (f"number : {testDataNumbers[i]} | Partitions : {testDataGroundTruth[i]} , prediction : {round(G4_normolize[i])}")

number : 5553744 | Partitions : 41873 , prediction : 49152
number : 5003876 | Partitions : 15970 , prediction : 16791
number : 5935308 | Partitions : 36998 , prediction : 43086
number : 5200696 | Partitions : 16630 , prediction : 16606
number : 5813676 | Partitions : 41419 , prediction : 37435
number : 5028418 | Partitions : 16170 , prediction : 16479
number : 5818872 | Partitions : 36489 , prediction : 50468
number : 5483820 | Partitions : 46016 , prediction : 19306
number : 5898626 | Partitions : 21873 , prediction : 18642
number : 5849802 | Partitions : 46833 , prediction : 38133
number : 5740300 | Partitions : 24297 , prediction : 18142
number : 5741998 | Partitions : 18243 , prediction : 44298
number : 5846038 | Partitions : 20365 , prediction : 37784
number : 5615658 | Partitions : 35177 , prediction : 23876
number : 5326454 | Partitions : 20254 , prediction : 17340
number : 5106104 | Partitions : 16261 , prediction : 20345
number : 5145688 | Partitions : 16516 , prediction : 168

## normolize with partition

In [22]:
def calculate_mse(predictions, targets):
    """
    計算均方誤差 (MSE)

    參數:
    predictions: 預測值的列表或陣列
    targets: 實際值的列表或陣列，與預測值對應

    返回值:
    mse: 均方誤差
    """
    # 確保預測值和實際值的長度相等
    if len(predictions) != len(targets):
        raise ValueError("預測值和實際值的長度不一致")

    # 計算平方誤差
    squared_errors = [(p - t) ** 2 for p, t in zip(predictions, targets)]

    # 計算均方誤差
    mse = sum(squared_errors) / len(predictions)

    return mse

# 使用 LN


# 使用 MLP(old)
#mse_result = calculate_mse(predictions, testDataPartition)


import math
mean = testData['Partition'].mean()
MSE = calculate_mse(G4_normolize, testDataGroundTruth)

RMSE= math.sqrt(MSE)
errorRate = RMSE/mean

print (f"MSE : {MSE} / ({MSE/1000000})\nRMSE : {RMSE}\nError Rate : {errorRate}  ->  {round(errorRate,7)*100}%")

MSE : 198634497.85231605 / (198.63449785231606)
RMSE : 14093.775145514279
Error Rate : 0.5103315413936494  ->  51.033150000000006%


## normolize with G4

In [None]:
def calculate_mse(predictions, targets):
    """
    計算均方誤差 (MSE)

    參數:
    predictions: 預測值的列表或陣列
    targets: 實際值的列表或陣列，與預測值對應

    返回值:
    mse: 均方誤差
    """
    # 確保預測值和實際值的長度相等
    if len(predictions) != len(targets):
        raise ValueError("預測值和實際值的長度不一致")

    # 計算平方誤差
    squared_errors = [(p - t) ** 2 for p, t in zip(predictions, targets)]

    # 計算均方誤差
    mse = sum(squared_errors) / len(predictions)

    return mse

# 使用 LN



import math
mean = testData['Factors'].mean()
MSE = calculate_mse(result, testDataLabel)

RMSE= math.sqrt(MSE)
errorRate = RMSE/mean

print (f"MSE : {MSE}\nRMSE : {RMSE}\nError Rate : {errorRate}  ->  {round(errorRate,7)*100}%")

MSE : 0.40544219032581863
RMSE : 0.6367434258206508
Error Rate : 0.5790622225267348  ->  57.90622%
