# 薪資預測與職涯規劃

## 目錄
- 資料預處理
- 資料庫建立
- 模型建構
- 介面開發

### 資料預處理

### 資料庫建立

In [2]:
from sqlalchemy import create_engine, Column, Integer, String, Float, MetaData, Table, desc
from sqlalchemy.orm import sessionmaker
import pandas as pd
import os

# Path of the SQLite file that stores the salary table
db_file = 'salary_database.db'

# Start from an empty database: remove any file left over from a previous run
if os.path.exists(db_file):
    os.remove(db_file)

# Container that collects the table definitions
metadata = MetaData()

# ``salary_data`` table: an integer primary key followed by one column for
# each field that add_row fills from the CSV file
data_table = Table(
    'salary_data',
    metadata,
    Column('id', Integer, primary_key=True),
    *(
        Column(column_name, column_type)
        for column_name, column_type in (
            ('company_name', String),
            ('position', String),
            ('related_experience_years', Float),
            ('current_job_experience_years', Float),
            ('monthly_salary', Float),
            ('monthly_bonus', Float),
            ('average_monthly_working_hours', Integer),
            ('overtime_frequency', Integer),
            ('comfort_level', Integer),
            ('workload', Integer),
        )
    ),
)

# File-based SQLite engine; create_all emits the table defined above
engine = create_engine('sqlite:///' + db_file)
metadata.create_all(engine)

# Session bound to the engine, shared by the import and query code
Session = sessionmaker(bind=engine)
session = Session()

# Source data (adjust the path to point at your own CSV file)
df = pd.read_csv('./rewrite.csv')

# Insert a single CSV record into the salary_data table
def add_row(row):
    """Queue an INSERT of one CSV row on the module-level ``session``.

    Args:
        row: Mapping-like record (e.g. a pandas Series) indexed by the CSV
            header names listed below.

    Returns:
        None. The statement is executed on ``session`` but not committed here.
    """
    # Database column -> CSV header it is read from
    column_sources = {
        'company_name': '公司名稱',
        'position': '職務',
        'related_experience_years': '相關年資(Y)',
        'current_job_experience_years': '現職年資(Y)',
        'monthly_salary': '月底薪(萬)',
        'monthly_bonus': 'Bonus (月)',
        'average_monthly_working_hours': '每月平均工時',
        'overtime_frequency': '加班頻率',
        'comfort_level': '爽度(1~5) 5最爽',
        'workload': 'Loading(5最重)',
    }
    record = {column: row[header] for column, header in column_sources.items()}
    session.execute(data_table.insert().values(**record))

# Import the CSV rows, print the newest records, and always release the session
try:
    # add_row is called only for its side effect (the INSERT), so iterate
    # explicitly instead of using DataFrame.apply, which would also build a
    # throw-away Series of None results.
    for _index, row in df.iterrows():
        add_row(row)
    session.commit()

    # Fetch the five rows with the highest ids, i.e. the last ones inserted
    query_result = session.query(data_table).order_by(desc(data_table.c.id)).limit(5).all()
    for row in query_result:
        print(row)
finally:
    # Close the session even if the import or the query raised
    session.close()

0      None
1      None
2      None
3      None
4      None
       ... 
363    None
364    None
365    None
366    None
367    None
Length: 368, dtype: object

(368, 'MIC', 'Sales', 6.0, 2.0, 4.2, 2.0, 180, 2, 2, 4)
(367, 'ME', 'Sustomer service', 0.0, 1.0, 3.6, 1.0, 230, 0, 5, 0)
(366, 'QUANTA', 'Software Engineer', 5.0, 1.0, 4.5, 5.0, 200, 3, 3, 5)
(365, 'ME', 'VTuber project production', 0.0, 1.0, 3.9, 0.0, 104, 0, 5, 1)
(364, 'Yang Ming Marine Transport Corp', 'Legal', 0.0, 1.0, 4.3, 0.0, 160, 1, 4, 2)


### 模型建構

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load the data (df is used for training; df_2 is a second, untouched copy)
file_path = './rewrite.csv'
df = pd.read_csv(file_path)
df_2 = pd.read_csv(file_path)

# Columns that must be numeric; values that cannot be parsed become NaN
cols_to_float = ['相關年資(Y)', '現職年資(Y)', '月底薪(萬)', 'Bonus (月)', '每月平均工時', '加班頻率',
                 '爽度(1~5) 5最爽', 'Loading(5最重)']
df = df.assign(**{col: pd.to_numeric(df[col], errors='coerce') for col in cols_to_float})

# Columns still of object dtype after the conversion need label encoding
non_numeric_cols = df.select_dtypes(include=['object']).columns

# Discard every row that has a missing value
df = df.dropna()

# Label-encode each non-numeric column (values are compared as strings)
df = df.assign(**{col: LabelEncoder().fit_transform(df[col].astype(str)) for col in non_numeric_cols})

# Target is the monthly base salary; every other column is a feature
y = df['月底薪(萬)']
X = df.drop(columns=[y.name])

# Hold out 20% of the rows for evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=45)

# Build and fit the random-forest regressor (fit returns the estimator itself)
rf_model = RandomForestRegressor(random_state=19).fit(X_train, y_train)

# Evaluate on the held-out rows
y_pred = rf_model.predict(X_test)
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred))
r2_rf = r2_score(y_test, y_pred)

print("RMSE:", rmse_rf)
print("R^2:", r2_rf)

### 介面開發

In [4]:
import json
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import tkinter as tk
from tkinter import ttk

# Load the raw data
file_path = './rewrite.csv'
df_2 = pd.read_csv(file_path)

# Load the company-name -> code mapping; the file is read as UTF-8
with open('company.txt', 'r', encoding='utf-8') as file:
    try:
        company = json.load(file)
    except json.JSONDecodeError as e:
        print("解碼 JSON 時發生錯誤:", e)
        # Fall back to an empty mapping so code that reads ``company`` later
        # does not fail with a NameError
        company = {}
    else:
        print("JSON 文件內容:")
        print(company)

# Load the job-title -> code mapping; read as UTF-8 like the company file so
# the result does not depend on the platform's default encoding
with open('job_title_to_number_mapping.txt', 'r', encoding='utf-8') as file:
    try:
        job_title_to_number = json.load(file)
    except json.JSONDecodeError as e:
        print(f"JSON 解碼錯誤: {e}")
        # Same fallback as above: keep the name defined for later code
        job_title_to_number = {}
    else:
        print("JSON 文件內容:")
        print(job_title_to_number)

def center_window(window):
    """Move ``window`` to the middle of the screen, keeping its current size.

    Args:
        window: Object exposing ``update_idletasks``, the ``winfo_*`` size
            queries and ``geometry`` (e.g. a ``tk.Tk`` instance).
    """
    # Process pending idle tasks before reading the window size
    window.update_idletasks()
    win_w = window.winfo_width()
    win_h = window.winfo_height()
    # Offset of the top-left corner: half of the unused screen space per axis
    left = (window.winfo_screenwidth() - win_w) // 2
    top = (window.winfo_screenheight() - win_h) // 2
    window.geometry(f"{win_w}x{win_h}+{left}+{top}")

# Create the application window
app = tk.Tk()
app.title("薪資預測應用")

# Company-name drop-down, filled with the keys of the company mapping
company_label = tk.Label(app, text="公司名稱:")
company_label.grid(row=0, column=0, padx=10, pady=5)
company_var = tk.StringVar()
company_dropdown = ttk.Combobox(
    app,
    textvariable=company_var,
    values=list(company.keys()),
    width=30,
)
company_dropdown.grid(row=0, column=1, padx=10, pady=5)

# Position drop-down, filled with the keys of the job-title mapping
position_label = tk.Label(app, text="職務:")
position_label.grid(row=1, column=0, padx=10, pady=5)
position_var = tk.StringVar()
position_dropdown = ttk.Combobox(
    app,
    textvariable=position_var,
    values=list(job_title_to_number.keys()),
    width=30,
)
position_dropdown.grid(row=1, column=1, padx=10, pady=5)

# Remaining fields: one DoubleVar-backed entry per label
labels = ["相關年資(Y)", "現職年資(Y)", "Bonus (月)", "每月平均工時", "加班頻率", "爽度(1~5) 5最爽", "Loading(5最重)"]
variables = [tk.DoubleVar(value=None) for _ in labels]
entries = [ttk.Entry(app, textvariable=var, width=30) for var in variables]

# Rows 0 and 1 hold the drop-downs, so the entry rows start at grid row 2
for grid_row, (caption, entry_widget) in enumerate(zip(labels, entries), start=2):
    tk.Label(app, text=caption + ":").grid(row=grid_row, column=0, padx=10, pady=5)
    entry_widget.grid(row=grid_row, column=1, padx=10, pady=5)

# Prediction callback and its input-parsing helper
def _parse_entry_number(text):
    """Return ``text`` as a float, or ``None`` if it is not a plain decimal.

    Accepts digits with at most one decimal point (e.g. ``"3"``, ``"2.5"``).
    """
    # Only one '.' is removed, so an input such as "1.2.3" keeps a dot and is
    # rejected by isdigit().
    if not text.replace('.', '', 1).isdigit():
        return None
    try:
        return float(text)
    except ValueError:
        # isdigit() accepts some characters (e.g. superscript digits) that
        # float() cannot parse.
        return None


def predict_salary(df):
    """Predict the monthly base salary from the form and show it in ``result_label``.

    Reads the company/position selections and the numeric entry widgets,
    label-encodes the categorical fields, calls ``rf_model.predict`` and
    writes either the prediction or an error message to ``result_label``.

    Args:
        df: Kept for compatibility with the existing button callback. It is
            neither read nor modified: it may already have been label-encoded
            by the model-training code, so the encoders are rebuilt from the
            raw CSV frame ``df_2`` instead.
    """
    # Parse the entry text directly so malformed input is reported to the
    # user instead of raising when the Tk variable is read.
    numbers = [_parse_entry_number(entry.get()) for entry in entries]
    if any(number is None for number in numbers):
        result_label.config(text="請填寫所有輸入框，且輸入有效數字", fg="red")  # shown in red
        return

    # Single-row frame holding the form input
    data = pd.DataFrame({
        '公司名稱': [company_var.get()],
        '職務': [position_var.get()],
        '相關年資(Y)': [numbers[0]],
        '現職年資(Y)': [numbers[1]],
        'Bonus (月)': [numbers[2]],
        '每月平均工時': [numbers[3]],
        '加班頻率': [numbers[4]],
        '爽度(1~5) 5最爽': [numbers[5]],
        'Loading(5最重)': [numbers[6]]
    })

    # Rebuild the reference data with the same steps the model-training code
    # applies to the CSV (numeric coercion, then dropna), so that the label
    # codes produced here match the codes the model was fitted on.
    reference = df_2.copy()
    numeric_cols = ('相關年資(Y)', '現職年資(Y)', '月底薪(萬)', 'Bonus (月)',
                    '每月平均工時', '加班頻率', '爽度(1~5) 5最爽', 'Loading(5最重)')
    for col in numeric_cols:
        if col in reference.columns:
            reference[col] = pd.to_numeric(reference[col], errors='coerce')
    reference = reference.dropna()

    # Only columns that are both categorical in the reference data and present
    # in the form input are encoded.
    categorical_cols = [
        col for col in reference.select_dtypes(include=['object']).columns
        if col in data.columns
    ]

    for col in categorical_cols:
        encoder = LabelEncoder().fit(reference[col].astype(str))
        chosen = str(data.at[0, col])
        # LabelEncoder.transform raises on labels it was not fitted on; report
        # that case in the UI instead of predicting with an arbitrary code.
        if chosen not in encoder.classes_:
            result_label.config(text=f"{col}「{chosen}」不在訓練資料中，無法預測", fg="red")
            return
        data[col] = encoder.transform(data[col].astype(str))

    predicted_salary = rf_model.predict(data)
    print("Predicted Salary:", predicted_salary)
    result_label.config(text=f"預測薪資: {round(predicted_salary[0], 1)} 萬", fg="black")

# Button that runs a prediction; ``df`` is looked up when the button is clicked
predict_button = tk.Button(app, text="預測薪資", command=lambda: predict_salary(df))
predict_button.grid(row=len(labels) + 2, column=0, columnspan=2, pady=10)

# Label that displays the prediction or a validation message
result_label = tk.Label(app, text="")
result_label.grid(row=len(labels) + 3, column=0, columnspan=2, pady=5)

# Give every row and both columns equal weight so the layout resizes with the window
for i in range(len(labels) + 4):
    app.grid_rowconfigure(i, weight=1)
for column_index in (0, 1):
    app.grid_columnconfigure(column_index, weight=1)

# Center the window on the screen
center_window(app)

# Start the Tk main loop
app.mainloop()

RandomForestRegressor(random_state=19)

RMSE: 2.3036399428917895
R^2: 0.5076826218908912
JSON 文件內容:
{'Acer': 63, 'Advantech': 64, 'AICS': 65, 'AMD': 66, 'Amway': 67, 'Andes': 68, 'Anpec': 69, 'ASA tools': 70, 'ASUS': 71, 'AUO': 72, 'Billion Technology Enginerring': 73, 'Buddist Tzu Chi Medical Foundation': 74, 'cacaFly': 75, 'Canon': 76, 'Chailease Finance': 77, 'CHT': 78, 'Compal': 79, 'cp': 80, 'Deloitte & Touche': 81, 'DELTA': 82, 'Digit Spark': 83, 'DIGITIMES': 84, 'eCloudvalley': 85, 'Edimax': 86, 'Elan': 87, 'Ememory': 88, 'Eternal Materials': 89, 'Etron': 90, 'Faraday': 91, 'FET': 92, 'Fiti': 93, 'FocalTech': 94, 'FWD': 95, 'GEMTEK': 96, 'Gf': 97, 'GREEN': 98, 'GS Marketing': 99, 'GUC': 100, 'Himax': 101, 'HOBOT Technology': 102, 'Holtek': 103, 'HTC': 104, 'Huawei': 105, 'iCatch': 106, 'Ilitek': 107, 'Infortrend': 108, 'InnoFusion': 109, 'Innolux': 110, 'Intel': 111, 'ITRI': 112, 'Jetly': 113, 'Kdan Mobile': 114, 'Keysight Technologies': 115, 'KGI Bank': 116, "King's Town Bank": 117, 'Kinmax': 118, 'KYEC': 119, 'Landb

''

KeyboardInterrupt: 

: 