# 第二部分-實際應用

#### 今日目標: <br>針對此網站https://www.u-car.com.tw/ 進行一些資料分析

### 步驟1-資料收集(爬蟲應用)

In [None]:
import requests
from bs4 import BeautifulSoup
import csv

# Browser-like User-Agent header sent with the request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}

# Search page of the new-car listing that is scraped below.
url = 'https://newcar.u-car.com.tw/newcar/search?bodytype=0&minprice=&maxprice=&size=0&displacement=0&fueltype=0'

# A timeout keeps the cell from hanging indefinitely when the site does not answer.
response = requests.get(url, headers=headers, timeout=30)


def _text_or_empty(parent, selector):
    """Return the stripped text of the first element matching *selector*.

    Returns '' when *parent* has no such element, so one incomplete listing
    does not abort the whole export with an AttributeError.
    """
    node = parent.select_one(selector)
    return node.text.strip() if node is not None else ''


if response.status_code == 200:

    soup = BeautifulSoup(response.text, 'html.parser')

    # Narrow the scope to the listing elements.
    car_info_elements = soup.select('.newcar_range_ideal')

    with open('ucar_car_models.csv', 'w', newline='', encoding='utf-8') as csv_file:

        csv_writer = csv.writer(csv_file)

        # Header row; the later cells refer to these column names.
        csv_writer.writerow(['brand', 'product', 'cc', 'price'])

        for car_info_element in car_info_elements:
            # A missing element is written as an empty cell, which
            # pandas.read_csv reads back as a missing value (NaN).
            car_model = _text_or_empty(car_info_element, '.title_brand')
            car_style = _text_or_empty(car_info_element, '.title')
            cc = _text_or_empty(car_info_element, '.cc_text')
            car_price = _text_or_empty(car_info_element, '.price_number strong')

            csv_writer.writerow([car_model, car_style, cc, car_price])

    print("存於ucar_car_models.csv檔案。")

else:
    print("失敗:", response.status_code)

### 步驟2-資料處理

In [None]:
import pandas as pd #通用習慣pd
# If pandas is not installed, run: pip install pandas

In [None]:
# pd.read_csv()


In [None]:
#顯示出前10筆資料，預設值為5筆資料


### 技巧1 缺失值處理

In [None]:
#isnull().sum()

# Count the missing entries in each column of df and print the per-column totals.
missing_values = df.isnull().sum()
print("\n缺失值統計:")
print(missing_values)

#### (參考)若有缺失值

In [None]:
# (Reference) In case there are missing values: load a second CSV into df2.
# NOTE(review): 'unicode_escape' decodes bytes as Latin-1 plus backslash escapes, so
# non-ASCII text stored as UTF-8 would come out garbled — confirm the file's real encoding.
df2 = pd.read_csv('./ucar_car_models2.csv',encoding='unicode_escape') 

# Display the loaded table.
df2

In [None]:
# Count the missing entries in each column of df2 and print the per-column totals.
missing_values = df2.isnull().sum()
print("\n缺失值統計:")
print(missing_values)

#### 直接刪除有缺的部分 or 想辦法填補該值e.g. 平均值/中位數

In [None]:
# Remove outright every row that has no 'product' value or no 'price' value,
# then display what is left.
df2 = df2.dropna(subset=['product', 'price'])
df2

In [None]:
# Recount the missing entries per column of df2 after the rows above were dropped.
missing_values = df2.isnull().sum()
print("\n缺失值統計:")
print(missing_values)

### 技巧2 欄位統一格式
將原本以範圍表示的車價，統一取範圍中較高的價格數字

In [None]:
import re
def process_price(value):
    """Return the part of a price string that follows its first "-".

    For a range written as "low-high" this yields the "high" part as a
    string. A string without "-" is returned unchanged.

    Non-string values (for example NaN or None from a missing cell) are
    returned as-is instead of raising TypeError inside ``re.search``, so a
    later numeric conversion can decide how to treat them.
    """
    if not isinstance(value, str):
        return value

    # Capture everything after the first "-".
    match = re.search(r'-(.*)', value)

    # Fall back to the original text when there is no "-".
    return match.group(1) if match else value

# 將函數應用於 "price" 欄位
# Run process_price over every "price" entry, then convert the column to numbers;
# entries that cannot be parsed become NaN (errors='coerce').
df['price'] = pd.to_numeric(df['price'].apply(process_price), errors='coerce')

In [None]:
# Display df (as the last expression of the cell) to inspect the 'price' column.
df

### 步驟3-資料分析

In [None]:
#查看欄位描述性統計


In [None]:
#查看各廠牌的數量


In [None]:
#查看各品牌特定cc數量

# 顯示結果


In [None]:
#查看有多少 cc=0 電動車


In [None]:
#將結果儲存
# Save the result to a CSV file.
outputpath = 'ele_car.csv'
# outputpath is the path the file is written to.

# NOTE(review): ele_car is not defined in the visible cells; presumably it is created in
# the "cc=0" exercise cell above — confirm before running.
# Written comma-separated, with a header row and without the index column.
ele_car.to_csv(outputpath,sep=',',index=False,header=True) 

#### 進階挑戰-機器學習入門
KNN演算法

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Keep only the rows whose "brand" is one of the two brands to classify.
selected_brands = ['Mercedes-Benz', 'Toyota']
# .copy() gives an independent DataFrame, so the column assignment below does not
# write into a slice of df (avoids pandas' SettingWithCopyWarning).
selected_data = df[df['brand'].isin(selected_brands)].copy()

# KNN cannot be fitted on NaN features (e.g. prices that could not be converted
# to numbers), so rows lacking cc or price are dropped first.
selected_data = selected_data.dropna(subset=['cc', 'price'])

# Encode "brand" as integers for the classifier; `le` is kept so the numeric
# labels can be mapped back to brand names later.
le = LabelEncoder()
selected_data['brand'] = le.fit_transform(selected_data['brand'])

# Split into features and target.
X = selected_data[['cc', 'price']]
y = selected_data['brand']

# Hold out 20% of the rows for testing; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# KNN classifier with K=3.
knn = KNeighborsClassifier(n_neighbors=3)

# Train the model.
knn.fit(X_train, y_train)

# Predict the held-out rows.
y_pred = knn.predict(X_test)

# Evaluate accuracy on the held-out rows.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# Visualise the selected rows; hue uses the encoded brand labels.
sns.scatterplot(x='cc', y='price', hue='brand', data=selected_data, palette='Set1')
plt.title('Scatter Plot of cc vs. price')
plt.show()

In [None]:
import numpy as np

# A new data point to classify.
new_cc = 3000  # cc value of the new point
new_price = 550.0  # price value of the new point
new_point = np.array([[new_cc, new_price]])

# Wrap the point in a DataFrame with the same feature columns used for training,
# then store the label predicted by knn in a 'brand' column.
new_point_df = pd.DataFrame(new_point, columns=['cc', 'price'])
new_point_df['brand'] = knn.predict(new_point_df[['cc', 'price']])

# Plot the selected rows again and overlay the new point as a red "X".
sns.scatterplot(x='cc', y='price', hue='brand', data=selected_data, palette='Set1')
plt.scatter(new_point_df['cc'], new_point_df['price'], marker='X', s=100, color='red', label='New Point')
plt.title('Scatter Plot of cc vs. price with New Point')
plt.legend()
plt.show()

# Map the predicted numeric label back to its brand name with the fitted encoder and print it.
predicted_brand = le.inverse_transform(new_point_df['brand'])[0]
print(f'The new point is predicted to belong to the brand: {predicted_brand}')


### 步驟4-視覺化呈現

#### 練習1-cc數與價格的散布圖

In [None]:
# 視覺化散點圖

# 添加軸標籤和標題

# 顯示視覺化圖


#### 練習2-對'BMW', 'Porsche','Toyota' 的價格製作boxplot

In [None]:
# Brands to compare; this list also fixes the order of the boxes.
selected_brands = ['BMW', 'Porsche', 'Toyota']
selected_data = df[df['brand'].isin(selected_brands)]

# One price series per brand, in the same order as selected_brands.
# NaN prices are dropped because boxplot statistics cannot be computed on them.
prices_by_brand = [
    selected_data.loc[selected_data['brand'] == brand, 'price'].dropna()
    for brand in selected_brands
]

# Draw the box plot.
# NOTE(review): Matplotlib 3.9 renames `labels` to `tick_labels`; `labels` is kept
# here so the cell also runs on older versions.
plt.figure(figsize=(10, 6))
plt.boxplot(prices_by_brand, labels=selected_brands, vert=True)
# With vertical boxes the x-axis holds the brands and the y-axis the prices.
plt.xlabel('Brand')
plt.ylabel('Price')
# Build the title from the list so it always names the brands actually plotted.
plt.title('Box Plot of Car Prices for ' + ', '.join(selected_brands))
plt.show()

#### 練習3-各家廠牌電動車的平均價格

In [None]:
#電動車

In [None]:
#計算平均

# 顯示結果


In [None]:
# Use seaborn's "whitegrid" style for the plot.
sns.set(style="whitegrid")

# Bar chart of 'price' per 'brand'.
# NOTE(review): average_price_by_brand is not defined in the visible cells; presumably it is
# the per-brand mean computed in the exercise cell above, with 'brand' and 'price' columns — confirm.
plt.figure(figsize=(10, 6))
sns.barplot(x='brand', y='price', data=average_price_by_brand, palette='viridis')
plt.xlabel('Brand')
plt.ylabel('Price')
plt.title('Bar Chart of ele_Car Prices by Brand')
# Rotate the x tick labels 45 degrees and right-align them.
plt.xticks(rotation=45, ha='right')
plt.show()