PCA NUM

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Load the first results file and discard every row containing a missing value.
df = pd.read_csv('CSV/CSV_Caculate/CSV_Caculate_results0.csv')
df.dropna(inplace=True)

# Columns that are fed into the PCA.
features = [
    '總騎乘長度',
    '最高海拔',
    '平均海拔',
    '爬升坡率平均(趨勢)',
    '最大爬升坡率(趨勢)',
    '下降坡率平均(趨勢)',
    '最大下降坡率(趨勢)',
    '總爬升海拔',
    '最大爬升海拔',
    '平均爬升海拔',
    '總下降海拔',
    '最大下降海拔',
    '平均下降海拔',
    '爬升路段比例',
    '下降路段比例',
    '平均路徑變化率',
]

# Force each feature column to a numeric dtype; values that cannot be parsed
# become NaN and the affected rows are removed right after.
for column in features:
    df[column] = pd.to_numeric(df[column], errors='coerce')
df.dropna(subset=features, inplace=True)

# Standardise the features before the decomposition.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df[features])

# Fit a PCA that keeps every component.
pca = PCA()
pca.fit(scaled_features)

# One row per principal component, one column per feature (the loadings).
components = pd.DataFrame(pca.components_, columns=features)

# Score each feature by summing the absolute value of its loading over all
# components (every component counts equally), then rank from high to low.
feature_importance = components.abs().sum(axis=0)
sorted_features = feature_importance.sort_values(ascending=False)

print("Features sorted by overall importance:")
print(sorted_features)


RandomForest

In [None]:
from sklearn.ensemble import RandomForestClassifier
import numpy as np

# NOTE(review): the labels are drawn at random (integers 0, 1, 2), so the
# importances printed below do not describe any real relationship between the
# features and a target. Replace this column with real labels before drawing
# conclusions from the ranking.
# A fixed seed keeps the placeholder labels identical between runs.
rng = np.random.default_rng(0)
df['label'] = rng.integers(0, 3, size=df.shape[0])

X = df[features]
y = df['label']

# random_state makes the fitted forest, and therefore the printed ranking,
# reproducible from run to run.
rf = RandomForestClassifier(n_estimators=100, random_state=0)
rf.fit(X, y)

# Pair every importance with its feature name and rank from high to low.
importances = rf.feature_importances_
feature_importance = pd.Series(importances, index=features).sort_values(ascending=False)
print("Feature Importances:")
print(feature_importance)

# Optional bar chart of the ranking (uncomment to draw it):
# plt.figure(figsize=(10, 6))
# feature_importance.plot(kind='bar')
# plt.xlabel('Features')
# plt.ylabel('Importance')
# plt.title('Feature Importance from RandomForest')
# plt.show()

correlation

In [None]:
import pandas as pd

# Read both result files and stack them into one frame with a fresh index.
df1 = pd.read_csv('CSV/CSV_Caculate/CSV_Caculate_results0.csv')
df2 = pd.read_csv('CSV/CSV_Caculate/CSV_Caculate_results1.csv')
df = pd.concat((df1, df2), ignore_index=True)

# Discard every row containing a missing value in any column.
df.dropna(inplace=True)

# Columns included in the correlation analysis.
features = [
    '總騎乘長度',
    '最高海拔',
    '平均海拔',
    '爬升坡率平均(趨勢)',
    '最大爬升坡率(趨勢)',
    '下降坡率平均(趨勢)',
    '最大下降坡率(趨勢)',
    '總爬升海拔',
    '最大爬升海拔',
    '平均爬升海拔',
    '總下降海拔',
    '最大下降海拔',
    '平均下降海拔',
    '爬升路段比例',
    '下降路段比例',
    '平均路徑變化率',
]

# Force each feature column to a numeric dtype; values that cannot be parsed
# become NaN and the affected rows are removed right after.
for column in features:
    df[column] = pd.to_numeric(df[column], errors='coerce')
df.dropna(subset=features, inplace=True)

# Pairwise correlation between all features.
correlation_matrix = df[features].corr()

# Score each feature by the row sum of absolute correlations (the row includes
# the feature's correlation with itself), then rank from high to low.
correlation_sums = correlation_matrix.abs().sum(axis=1)
sorted_correlation = correlation_sums.sort_values(ascending=False)

print("Features sorted by overall correlation:")
print(sorted_correlation)


Kmeans

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from scipy.spatial.distance import cdist
# Read both result files and stack them into one frame with a fresh index.
df1 = pd.read_csv('CSV/CSV_Caculate/CSV_Caculate_results0.csv')
df2 = pd.read_csv('CSV/CSV_Caculate/CSV_Caculate_results1.csv')
df = pd.concat((df1, df2), ignore_index=True)

# Numeric columns used as clustering input.
numeric_features = [
    '最大爬升海拔',
    '最高海拔',
    '平均海拔',
    '平均爬升海拔',
    '最大下降海拔',
    '平均下降海拔',
    "爬升坡率平均(趨勢)",
    '下降坡率平均(趨勢)',
    "總爬升海拔",
    '爬升路段比例',
    '下降路段比例',
]

# Keep the clustering columns together with the file name of each row.
df_selected = df[[*numeric_features, 'File_Name']]

# Coerce the clustering columns to numbers (unparseable values become NaN)
# and drop every row that ends up with a missing value.
df_numeric = df_selected[numeric_features].apply(pd.to_numeric, errors='coerce').dropna()

# Restrict the selection to the rows that survived the cleaning step so both
# frames stay aligned row for row.
df_selected = df_selected.loc[df_numeric.index]

# Standardise the numeric features.
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_numeric)
def _elbow_index(cohesions):
    """Return the position of the elbow in a sequence of cohesion values.

    Both axes are rescaled to [0, 1] so the curve runs from (0, 1) to (1, 0);
    the elbow is the point lying furthest below the straight line joining
    those two end points. With fewer than three values, or when the values do
    not decrease from first to last, position 0 is returned.
    """
    values = np.asarray(cohesions, dtype=float)
    if values.size < 3:
        return 0
    drop = values[0] - values[-1]
    if drop <= 0:
        return 0
    x = np.linspace(0.0, 1.0, values.size)
    y = (values - values[-1]) / drop
    # Distance below the chord x + y = 1 is proportional to 1 - x - y.
    return int(np.argmax(1.0 - x - y))


def optimise_k_means(data, max_k, random_state=None):
    """Suggest a number of clusters for ``data`` using an elbow heuristic.

    K-means is fitted for k = 1 .. max_k - 1 (additionally capped at the
    number of rows in ``data``). For every k the mean Euclidean distance from
    each row to its nearest centroid is recorded, and the k at the elbow of
    that curve is returned.

    Picking the k with the smallest mean distance is not useful: the distance
    keeps shrinking as clusters are added, so that rule almost always returns
    the largest candidate.

    Args:
        data: 2-D array of samples (rows) by features (columns).
        max_k: exclusive upper bound of the candidate k values.
        random_state: forwarded to ``KMeans``; pass an int for repeatable
            results.

    Returns:
        The suggested number of clusters.

    Raises:
        ValueError: if there is no candidate k (``max_k`` <= 1 or no rows).
    """
    n_samples = data.shape[0]
    # KMeans cannot fit more clusters than there are samples.
    means = list(range(1, min(max_k, n_samples + 1)))
    if not means:
        raise ValueError("no candidate k: need max_k > 1 and at least one sample")
    cohesions = []
    for k in means:
        kmeans = KMeans(n_clusters=k, n_init=10, random_state=random_state)
        kmeans.fit(data)
        nearest = np.min(cdist(data, kmeans.cluster_centers_, 'euclidean'), axis=1)
        cohesions.append(np.sum(nearest) / n_samples)
    return means[_elbow_index(cohesions)]


# Heuristic suggestion for k, reported for information only.
optimal_k = optimise_k_means(df_scaled, 10, random_state=0)

# Number of clusters actually used. It is a manual choice: the cells that
# follow address the clusters by the ids 0, 1 and 2.
n_clusters = 3
print(f"Elbow heuristic suggests k={optimal_k}; clustering with k={n_clusters}")

# A fixed seed keeps the cluster ids stable between runs, which matters
# because clusters are later selected by id.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=0)
kmeans.fit(df_scaled)

# Attach the cluster label of every row. assign() builds a new frame, which
# avoids pandas' chained-assignment warning on the sliced selection.
df_selected = df_selected.assign(cluster=kmeans.labels_)

# Project onto the first two principal components for plotting.
pca = PCA(n_components=2)
df_pca = pca.fit_transform(df_scaled)

# Scatter plot of the projected rows, coloured by cluster.
plt.figure(figsize=(6, 4))
scatter = plt.scatter(x=df_pca[:, 0], y=df_pca[:, 1], c=df_selected['cluster'], cmap='viridis')
plt.xlabel('PCA 第一主成分')
plt.ylabel('PCA 第二主成分')
plt.title(f'K-means 聚類 (k={n_clusters}) - PCA 降維')
plt.colorbar(scatter, label='群集')
plt.grid(True)
plt.show()

# Map every cluster id that was actually fitted to the file names it contains.
cluster_filenames = {
    k: df_selected.loc[df_selected['cluster'] == k, 'File_Name'].tolist()
    for k in range(n_clusters)
}

看各群數量

In [None]:
# Print how many files were assigned to clusters 0, 1 and 2, one count per line.
for cluster_id in (0, 1, 2):
    print(len(cluster_filenames[cluster_id]))

存檔(手動...)要自己調參數
1. cluster_filenames[2] 的索引(此處為 2):調整要存檔的群集(類型)
2. new_folder = 'CSV/CSV_Type/TC':設定該類別的資料夾

In [None]:
import pandas as pd
import os
# Re-save every raw CSV that belongs to the chosen cluster into that cluster's
# folder. Two values are adjusted by hand: the cluster index in
# cluster_filenames[...] and new_folder.
# NOTE(review): CSV must already hold the raw-data file names when this cell
# runs; the only definition visible in this notebook lists the destination
# folder instead - confirm where it is populated.

# Destination folder for the chosen cluster.
new_folder = 'CSV/CSV_Type/TC'

# File names of the chosen cluster; a set keeps each membership test cheap.
selected_files = set(cluster_filenames[2])

# Create the destination once, up front; exist_ok makes the call a no-op when
# the folder is already there.
os.makedirs(new_folder, exist_ok=True)

for i in CSV:
    if i not in selected_files:
        continue
    # Read the original CSV and write it under the same name in the new folder.
    df1 = pd.read_csv(f'CSV/CSV_Row_Data2/{i}')
    new_file_path = os.path.join(new_folder, i)
    df1.to_csv(new_file_path, index=False)
    print(f"File saved to {new_file_path}")

檢查

In [None]:
import os

# Folder whose contents are checked.
folder_path = 'CSV/CSV_Type/TC/'

# Every entry in the folder, then only those with a .csv extension
# (compared case-insensitively).
file_names = os.listdir(folder_path)
csv_file_names = [file for file in file_names if file.lower().endswith('.csv')]

# CSV holds its own copy of the matching file names.
CSV = list(csv_file_names)

# Report the count; an f-string keeps the braces and extra spaces of the
# comma-separated form out of the printed text.
print(f"共{len(CSV)}個檔案")

# Optional listing of every file name (uncomment to print it):
# for name in CSV:
#     print(f"{name:<20}")