## 第1題

In [7]:
import pandas as pd
from sklearn.utils import shuffle

# Load the economic-freedom dataset and shuffle its rows in a single step;
# the fixed random_state makes the shuffled order repeatable.
df = shuffle(pd.read_csv('economic_freedom_data.csv'), random_state=10)

# Preprocessing: count the missing values in every column.
missing_per_column = df.isna().sum()
print("\n每個欄位的遺漏值個數：")
print(missing_per_column)  # the recorded run shows 0 for every listed column

# Preview the first rows of the shuffled frame.
df.head()


每個欄位的遺漏值個數：
government_consumption             0
transfers                          0
gov_enterprises                    0
top_marg_tax_rate                  0
size_government                    0
judicial_independence              0
impartial_courts                   0
protection_property_rights         0
military_interference              0
integrity_legal_system             0
legal_enforcement_contracts        0
restrictions_sale_real_property    0
reliability_police                 0
business_costs_crime               0
gender_adjustment                  0
property_rights                    0
money_growth                       0
std_inflation                      0
inflation                          0
freedom_own_foreign_currency       0
sound_money                        0
tariffs                            0
regulatory_trade_barriers          0
black_market                       0
control_movement_capital_ppl       0
trade                              0
credit_market_reg        

Unnamed: 0,government_consumption,transfers,gov_enterprises,top_marg_tax_rate,size_government,judicial_independence,impartial_courts,protection_property_rights,military_interference,integrity_legal_system,...,tariffs,regulatory_trade_barriers,black_market,control_movement_capital_ppl,trade,credit_market_reg,labor_market_reg,business_reg,regulation,freedom
764,0.0,8.584783,10.0,5.0,5.896196,5.011383,3.525164,4.307329,6.5,6.231,...,7.498356,6.752429,10.0,2.432515,6.670825,7.988667,7.758627,4.855221,6.867505,not-freedom
278,4.844118,5.609294,7.0,5.5,5.738353,4.588303,3.72914,5.38775,10.0,7.5,...,8.3604,7.948301,10.0,5.494242,7.950736,8.843333,7.72797,6.512236,7.694513,freedom
1538,5.552941,9.96049,2.0,6.37522,5.837811,3.256946,3.727413,5.762411,5.0,5.833333,...,6.918692,4.676226,10.0,2.569951,6.041217,7.546693,5.840112,5.49534,6.294049,not-freedom
1780,1.167295,4.002725,8.0,1.5,3.667505,8.108974,8.717949,8.707483,9.166667,10.0,...,8.393422,8.709256,10.0,5.907099,8.252444,10.0,5.061734,8.023171,7.694969,freedom
123,6.908824,9.459158,0.0,6.5,5.716995,7.614717,7.012629,7.454309,6.1,6.231,...,7.049644,5.357441,10.0,5.653024,7.015027,7.739761,8.519242,8.230928,8.16331,freedom


## 第2題

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score

# Split the frame: 'freedom' is the label, every other column is a predictor.
target = df['freedom']
features = df.drop(columns='freedom')

# Random forest fitted on the full dataset; this fitted instance is reused by
# later cells for prediction and for the feature-importance ranking.
rf_classifier = RandomForestClassifier(random_state=10)
rf_classifier.fit(features, target)

# 10-fold cross-validation, scored with the macro-averaged F1.
sl_rf_f1_scores = cross_val_score(
    estimator=rf_classifier,
    X=features,
    y=target,
    scoring='f1_macro',
    cv=10,
)

# Mean of the per-fold scores.
sl_rf_average_f1_score = sl_rf_f1_scores.mean()
print("Average F1-score:", sl_rf_average_f1_score)

Average F1-score: 0.9675983424705306


## 第3題

In [9]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score

# Split the frame: 'freedom' is the label, every other column is a predictor.
target = df['freedom']
features = df.drop(columns='freedom')

# scikit-learn MLP with two hidden layers (64 then 128 units), mini-batches of
# 8 samples and a fixed seed. The instance fitted on the full dataset is
# reused by a later cell for prediction.
mlp_classifier = MLPClassifier(
    hidden_layer_sizes=(64, 128),
    batch_size=8,
    random_state=10,
)
mlp_classifier.fit(features, target)

# 10-fold cross-validation, scored with the macro-averaged F1.
sl_mlp_f1_scores = cross_val_score(
    estimator=mlp_classifier,
    X=features,
    y=target,
    scoring='f1_macro',
    cv=10,
)

# Mean of the per-fold scores.
sl_mlp_average_f1_score = sl_mlp_f1_scores.mean()
print("Average F1-score:", sl_mlp_average_f1_score)

Average F1-score: 0.9546165449005615


In [4]:
pip install tensorflow

Note: you may need to restart the kernel to use updated packages.


In [18]:
pip install scikeras

Collecting scikeras
  Downloading scikeras-0.10.0-py3-none-any.whl (27 kB)
Installing collected packages: scikeras
Successfully installed scikeras-0.10.0
Note: you may need to restart the kernel to use updated packages.


## 第4題

In [10]:
import tensorflow as tf
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
#from scikeras.wrappers import KerasClassifier
import numpy as np

# Split the frame: 'freedom' is the label, every other column is a predictor.
target = df['freedom']
features = df.drop(columns='freedom')

# Factory for the Keras model used by the scikit-learn wrapper.
def create_model():
    """Build and compile the Keras binary classifier.

    The network is Dense(64, relu) -> Dense(128, relu) -> Dense(1, sigmoid).
    The input width is read from the notebook-level ``features`` frame
    (``features.shape[1]``), so ``features`` must exist before this is called.

    Returns:
        A compiled ``Sequential`` model using the Adam optimizer,
        binary cross-entropy loss and accuracy as the tracked metric.
    """
    stack = [
        Dense(64, activation='relu', input_dim=features.shape[1]),
        Dense(128, activation='relu'),
        Dense(1, activation='sigmoid'),
    ]
    network = Sequential(stack)
    network.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return network

# Seed TensorFlow's global random generator before any model is built or
# trained. The scikit-learn models in this notebook use random_state=10; the
# Keras model had no seed, so its scores changed from run to run.
# NOTE(review): exact bit-for-bit repeatability can still depend on the
# hardware / TensorFlow build — confirm on the target machine.
tf.random.set_seed(10)

# Wrap the Keras model factory as a scikit-learn style estimator
# (30 epochs, mini-batches of 8, silent training).
estimator = KerasClassifier(build_fn=create_model, epochs=30, batch_size=8, verbose=0)
# Fit on the full dataset; this fitted instance is reused by a later cell
# for prediction.
estimator.fit(features, target)

# Stratified 10-fold splitter, shuffled with a fixed seed.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=10)

# Cross-validate and score each fold with the macro-averaged F1.
kr_mlp_f1_scores = cross_val_score(estimator, features, target, cv=kfold, scoring='f1_macro')

# Mean of the per-fold scores.
kr_mlp_average_f1_score = np.mean(kr_mlp_f1_scores)
print("Average F1-score:", kr_mlp_average_f1_score)

  estimator = KerasClassifier(build_fn=create_model, epochs=30, batch_size=8, verbose=0)


Average F1-score: 0.9603483631017424


## 第5題

以上三題（第2～4題）的 F1-score 皆為 macro 平均：先分別計算 freedom 與 not-freedom 兩個類別各自的 F1，再取兩者的算術平均，因為三題的交叉驗證都設定了 scoring='f1_macro'。

## 第6題

In [12]:
import numpy as np
from scipy import stats
from scipy.stats import ttest_ind

# Two-sample t-tests (scipy's ttest_ind) on the per-fold F1 scores of each
# pair of models.
t_value1, p_value1 = ttest_ind(sl_rf_f1_scores, sl_mlp_f1_scores)
t_value2, p_value2 = ttest_ind(sl_rf_f1_scores, kr_mlp_f1_scores)
t_value3, p_value3 = ttest_ind(sl_mlp_f1_scores, kr_mlp_f1_scores)

# One (title, t, p) entry per comparison, in reporting order.
comparisons = [
    ("scikit-learn的RandomForest v.s. scikit-learn的MLP", t_value1, p_value1),
    ("scikit-learn的RandomForest v.s. keras的MLP", t_value2, p_value2),
    ("scikit-learn的MLP v.s. keras的MLP", t_value3, p_value3),
]

for position, (title, t_stat, p_val) in enumerate(comparisons):
    # Blank line between reports, but not before the first one.
    if position > 0:
        print()
    print(title)
    print("t-value:", t_stat)
    print("p-value:", p_val)

scikit-learn的RandomForest v.s. scikit-learn的MLP
t-value: 2.00312462194545
p-value: 0.06045703889742101

scikit-learn的RandomForest v.s. keras的MLP
t-value: 1.3580019610829492
p-value: 0.1912432035491307

scikit-learn的MLP v.s. keras的MLP
t-value: -0.9897027438982892
p-value: 0.3354378979117034


### 結論

將顯著水準設為 0.05：
三組兩兩比較的 p-value（約 0.060、0.191、0.335）皆大於 0.05，無法拒絕「兩模型平均 F1-score 相同」的虛無假設；因此在本次實驗中，三個模型彼此之間的表現沒有統計上顯著的差異。

## 第7題

In [13]:
# Load the new observation(s) to classify.
new_data = pd.read_csv("newdata.csv")

# Put the columns in exactly the order used for training. The Keras model
# consumes the values positionally, so a differently ordered CSV would
# otherwise be scored against the wrong features. A missing column raises a
# KeyError here instead of producing a silent mis-prediction.
new_data = new_data[features.columns]

rf_predict = rf_classifier.predict(new_data)
sl_mlp_predict = mlp_classifier.predict(new_data)
# The recorded run shows the Keras wrapper returning a nested [[label]] array;
# flatten it so all three predictions are printed in the same 1-D shape.
kr_mlp_predict = np.ravel(estimator.predict(new_data))

print("scikit-learn的RandomForest預測: ",rf_predict)
print("scikit-learn的MLP預測: ",sl_mlp_predict)
print("keras的MLP預測: ",kr_mlp_predict)

scikit-learn的RandomForest預測:  ['not-freedom']
scikit-learn的MLP預測:  ['not-freedom']
keras的MLP預測:  [['not-freedom']]


結果：三個模型對 newdata.csv 的預測皆為 not-freedom。

## 第8題

In [15]:
# Importance score of every feature, taken from the fitted random forest.
importances = rf_classifier.feature_importances_

# Map each feature name to its importance score.
feature_importance_dict = {
    column: score for column, score in zip(features.columns, importances)
}

# (name, score) pairs ordered from the highest score to the lowest.
sorted_feature_importances = sorted(
    feature_importance_dict.items(),
    key=lambda entry: entry[1],
    reverse=True,
)

# The first pair is the most important feature.
top_entry = sorted_feature_importances[0]
most_important_feature, highest_importance = top_entry

print(f"最重要的屬性: {most_important_feature}, Importance:{highest_importance}")

print()

# Full ranking, one feature per line, names padded to 35 characters.
print("屬性重要性排序:")
for name, score in sorted_feature_importances:
    print(f" {name:35} {score}")

最重要的屬性: trade, Importance:0.15858059977664826

屬性重要性排序:
 trade                               0.15858059977664826
 sound_money                         0.15730887387114237
 freedom_own_foreign_currency        0.10042074184640819
 control_movement_capital_ppl        0.07065168020085259
 regulation                          0.05575997659138414
 property_rights                     0.05148322972661163
 business_reg                        0.04795694391491417
 regulatory_trade_barriers           0.04090180263937907
 size_government                     0.03733497260839916
 tariffs                             0.03531461686812836
 military_interference               0.0348529231297113
 protection_property_rights          0.0202452657028774
 labor_market_reg                    0.017625051682671207
 reliability_police                  0.0160885320291397
 impartial_courts                    0.015305872741631777
 std_inflation                       0.01491184201103592
 gov_enterprises                 