# Smart Laptop Advisor - ML Pipeline

## Dự đoán giá Laptop bằng Machine Learning

**Dataset:** Kaggle - Laptop Price Estimation (INR)  
**Thuật toán:** Linear Regression, Random Forest, Gradient Boosting

---

### Mục lục
1. [Import thư viện](#1-import-thu-vien)
2. [Tải dữ liệu](#2-tai-du-lieu)
3. [Phân tích khám phá dữ liệu (EDA)](#3-phan-tich-kham-pha-du-lieu)
4. [Tiền xử lý dữ liệu](#4-tien-xu-ly-du-lieu)
5. [Kỹ thuật đặc trưng](#5-ky-thuat-dac-trung)
6. [Huấn luyện mô hình](#6-huan-luyen-mo-hinh)
7. [Đánh giá mô hình](#7-danh-gia-mo-hinh)
8. [Độ quan trọng đặc trưng](#8-do-quan-trong-dac-trung)
9. [Kết luận](#9-ket-luan)

---
## 1. Import thư viện (Import Libraries)

In [None]:
# Thu vien xu ly du lieu
import pandas as pd
import numpy as np

# Thu vien truc quan hoa
import matplotlib.pyplot as plt
import seaborn as sns

# Thu vien Machine Learning
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Tat canh bao
import warnings
warnings.filterwarnings('ignore')

# Configure the plot style.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8-*';
# older releases only ship 'seaborn-whitegrid'. Use whichever name this
# installation provides so the notebook runs on both; if neither exists the
# Matplotlib default style is kept instead of raising.
for _style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
sns.set_palette('husl')

print('Da import thanh cong tat ca thu vien!')

---
## 2. Tải dữ liệu (Load Dataset)

In [None]:
# Load the dataset
# Adjust the path to match where the CSV file lives on your machine
df = pd.read_csv('data/laptop_prices.csv')

# Quick sanity check: shape, column names and a preview of the first rows
print(f'Kich thuoc dataset: {df.shape}')
print(f'Cac cot: {df.columns.tolist()}')
print(f'\n5 dong dau tien:')
df.head()

In [None]:
# Dataset information: column dtypes and non-null counts via DataFrame.info()
print('Thong tin Dataset:')
print('=' * 50)
df.info()

In [None]:
# Descriptive statistics (shown as the cell output by DataFrame.describe())
print('Thong ke mo ta:')
df.describe()

---
## 3. Phân tích khám phá dữ liệu (Exploratory Data Analysis)

### 3.1 Phân tích giá trị thiếu (Missing Values Analysis)

In [None]:
# Check for missing values: count per column and share of all rows (percent)
missing = df.isnull().sum()
missing_pct = (missing / len(df)) * 100

# Keys are the (Vietnamese) column headers of the printed report:
# 'So luong thieu' = missing count, 'Ty le (%)' = percentage
missing_df = pd.DataFrame({
    'So luong thieu': missing,
    'Ty le (%)': missing_pct
})

# Only list the columns that actually contain missing values
print('Phan tich gia tri thieu:')
print('=' * 50)
print(missing_df[missing_df['So luong thieu'] > 0])

# Explicit confirmation when the whole dataset is complete
if missing.sum() == 0:
    print('\nKhong co gia tri thieu!')

### 3.2 Phân tích biến mục tiêu - Giá (Target Variable Analysis - Price)

In [None]:
# Price distribution: histogram, box plot and log-transformed histogram
fig, axes = plt.subplots(1, 3, figsize=(16, 5))

# Histogram with the mean (red) and median (green) drawn as dashed lines
axes[0].hist(df['Price'], bins=50, edgecolor='black', alpha=0.7, color='#1E88E5')
axes[0].axvline(df['Price'].mean(), color='red', linestyle='--', linewidth=2, label=f'Trung binh: {df["Price"].mean():,.0f}')
axes[0].axvline(df['Price'].median(), color='green', linestyle='--', linewidth=2, label=f'Trung vi: {df["Price"].median():,.0f}')
axes[0].set_xlabel('Gia (INR)', fontsize=12)
axes[0].set_ylabel('Tan suat', fontsize=12)
axes[0].set_title('Phan phoi Gia', fontsize=14, fontweight='bold')
axes[0].legend()

# Box plot (patch_artist=True so the box face can be filled with a color)
bp = axes[1].boxplot(df['Price'], patch_artist=True)
bp['boxes'][0].set_facecolor('#1E88E5')
axes[1].set_ylabel('Gia (INR)', fontsize=12)
axes[1].set_title('Bieu do hop Gia', fontsize=14, fontweight='bold')

# Distribution of log1p(price), i.e. log(1 + price)
axes[2].hist(np.log1p(df['Price']), bins=50, edgecolor='black', alpha=0.7, color='#4CAF50')
axes[2].set_xlabel('Log(Gia)', fontsize=12)
axes[2].set_ylabel('Tan suat', fontsize=12)
axes[2].set_title('Phan phoi Gia sau Log', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

# Price summary statistics: mean, median, standard deviation, min and max
print('\nThong ke Gia (INR):')
print('=' * 40)
print(f'   Trung binh: {df["Price"].mean():>12,.0f}')
print(f'   Trung vi:   {df["Price"].median():>12,.0f}')
print(f'   Do lech:    {df["Price"].std():>12,.0f}')
print(f'   Nho nhat:   {df["Price"].min():>12,.0f}')
print(f'   Lon nhat:   {df["Price"].max():>12,.0f}')

### 3.3 Phân tích đặc trưng phân loại (Categorical Features Analysis)

In [None]:
# Brand distribution: listing count and mean price per brand, side by side
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Number of laptops per brand (value_counts sorts by count, descending)
brand_counts = df['Company'].value_counts()
colors = plt.cm.Blues(np.linspace(0.3, 0.9, len(brand_counts)))
axes[0].barh(brand_counts.index, brand_counts.values, color=colors)
axes[0].set_xlabel('So luong', fontsize=12)
axes[0].set_title('So luong Laptop theo Thuong hieu', fontsize=14, fontweight='bold')
# Flip the y axis so the first (most frequent) brand is drawn at the top
axes[0].invert_yaxis()

# Mean price per brand, sorted ascending
brand_price = df.groupby('Company')['Price'].mean().sort_values(ascending=True)
colors = plt.cm.Greens(np.linspace(0.3, 0.9, len(brand_price)))
axes[1].barh(brand_price.index, brand_price.values, color=colors)
axes[1].set_xlabel('Gia trung binh (INR)', fontsize=12)
axes[1].set_title('Gia trung binh theo Thuong hieu', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

In [None]:
# Operating-system distribution: share of listings and mean price per OS
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Pie chart of listing counts per operating system
os_counts = df['OpSys'].value_counts()
colors = plt.cm.Set3(np.linspace(0, 1, len(os_counts)))
axes[0].pie(os_counts.values, labels=os_counts.index, autopct='%1.1f%%', colors=colors, startangle=90)
axes[0].set_title('Phan phoi He dieu hanh', fontsize=14, fontweight='bold')

# Mean price per operating system, sorted ascending
os_price = df.groupby('OpSys')['Price'].mean().sort_values(ascending=True)
axes[1].barh(os_price.index, os_price.values, color='#FF7043')
axes[1].set_xlabel('Gia trung binh (INR)', fontsize=12)
axes[1].set_title('Gia trung binh theo He dieu hanh', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

### 3.4 Phân tích đặc trưng số (Numerical Features Analysis)

In [None]:
# RAM vs price: count of laptops and mean price for each RAM value
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# RAM distribution; the index is cast to str so each RAM value becomes a
# categorical bar label instead of a position on a numeric axis
ram_counts = df['Ram'].value_counts().sort_index()
axes[0].bar(ram_counts.index.astype(str), ram_counts.values, color='#2196F3', edgecolor='black')
axes[0].set_xlabel('RAM (GB)', fontsize=12)
axes[0].set_ylabel('So luong', fontsize=12)
axes[0].set_title('Phan phoi RAM', fontsize=14, fontweight='bold')

# Mean price per RAM value, in ascending RAM order
ram_price = df.groupby('Ram')['Price'].mean().sort_index()
axes[1].bar(ram_price.index.astype(str), ram_price.values, color='#4CAF50', edgecolor='black')
axes[1].set_xlabel('RAM (GB)', fontsize=12)
axes[1].set_ylabel('Gia trung binh (INR)', fontsize=12)
axes[1].set_title('Gia trung binh theo RAM', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

### 3.5 Phân tích tương quan (Correlation Analysis)

In [None]:
# Select the numeric columns
numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
print(f'Cac cot so: {numerical_cols}')

# Correlation matrix (DataFrame.corr() with its default method)
corr_matrix = df[numerical_cols].corr()

# Heatmap; the boolean upper-triangle mask (diagonal included) hides the
# redundant half of the symmetric matrix
fig, ax = plt.subplots(figsize=(10, 8))
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
sns.heatmap(corr_matrix, mask=mask, annot=True, cmap='RdBu_r', center=0, 
            fmt='.2f', linewidths=0.5, ax=ax)
ax.set_title('Ma tran tuong quan', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

In [None]:
# Correlation of every numeric feature with Price (Price itself removed),
# sorted from the most positive to the most negative
price_corr = corr_matrix['Price'].drop('Price').sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(10, 6))
# Green bars for positive correlations, red for zero or negative ones
colors = ['#4CAF50' if x > 0 else '#F44336' for x in price_corr.values]
ax.barh(price_corr.index, price_corr.values, color=colors, edgecolor='black')
ax.set_xlabel('Tuong quan voi Gia', fontsize=12)
ax.set_title('Tuong quan cac dac trung voi Gia', fontsize=14, fontweight='bold')
# Thin vertical reference line at zero correlation
ax.axvline(x=0, color='black', linestyle='-', linewidth=0.5)
plt.tight_layout()
plt.show()

# Same values as a text table
print('\nTuong quan voi Gia:')
print('=' * 40)
for idx, val in price_corr.items():
    print(f'   {idx:<15}: {val:>8.3f}')

---
## 4. Tiền xử lý dữ liệu (Data Preprocessing)

In [None]:
# Work on a copy so the raw DataFrame `df` stays untouched
df_processed = df.copy()

# Remove duplicate records and report how many rows were dropped
initial_len = len(df_processed)
df_processed = df_processed.drop_duplicates()
print(f'Da xoa {initial_len - len(df_processed)} ban ghi trung lap')
print(f'Kich thuoc dataset: {len(df_processed)} ban ghi')

In [None]:
# Create the storage_gb column as SSD + HDD
df_processed['storage_gb'] = df_processed['SSD'] + df_processed['HDD']
# Rows whose combined storage is 0 get a default value of 256
# NOTE(review): a NaN in SSD or HDD yields a NaN sum, which this `== 0`
# check does not replace - confirm those columns have no missing values
df_processed.loc[df_processed['storage_gb'] == 0, 'storage_gb'] = 256  # Default value

# Rename columns to consistent snake_case names used by the later cells
df_processed = df_processed.rename(columns={
    'Company': 'brand',
    'Ram': 'ram_gb',
    'Inches': 'screen_size',
    'Price': 'price',
    'OpSys': 'os',
    'Cpu': 'processor',
    'Gpu': 'gpu'
})

print('Da doi ten cot va tao storage_gb')
df_processed.head()

---
## 5. Kỹ thuật đặc trưng (Feature Engineering)

In [None]:
# Processor scoring function
def get_processor_score(cpu_str):
    """Map a CPU description to a heuristic score based on its core count.

    The text is lower-cased and searched for core-count phrases; the first
    matching rule (checked from the highest core count down) decides the
    score. Missing values and descriptions without a known phrase get 5.

    Args:
        cpu_str: CPU description; any value accepted by ``str()`` or a
            missing value (``None`` / ``NaN``).

    Returns:
        int: one of 9, 8, 7, 6, 5 or 3.
    """
    fallback = 5
    if pd.isna(cpu_str):
        return fallback

    text = str(cpu_str).lower()

    # Ordered rules: (phrases, score). Order matters when several phrases
    # occur in the same description.
    # NOTE(review): 'octa core' (8) outranks '10 core' (7) - confirm this
    # ordering is intended.
    rules = (
        (('14 core', '16 core'), 9),
        (('12 core', 'octa core'), 8),
        (('10 core',), 7),
        (('hexa core',), 6),
        (('quad core',), 5),
        (('dual core',), 3),
    )
    for phrases, score in rules:
        if any(phrase in text for phrase in phrases):
            return score
    return fallback

# GPU scoring function
def get_gpu_score(gpu_str):
    """Map a GPU description to a heuristic score via substring matching.

    The text is lower-cased and checked against an ordered list of model
    substrings; the first rule with any matching substring decides the
    score. Missing values and unrecognized descriptions get 3.

    Args:
        gpu_str: GPU description; any value accepted by ``str()`` or a
            missing value (``None`` / ``NaN``).

    Returns:
        int: a score between 2 and 10.
    """
    fallback = 3
    if pd.isna(gpu_str):
        return fallback

    text = str(gpu_str).lower()

    # Ordered rules: (substrings, score). Earlier rules win when a
    # description matches more than one.
    rules = (
        (('4090', '4080'), 10),
        (('4070', '4060'), 8),
        (('4050',), 7),
        (('3080', '3070'), 8),
        (('3060',), 7),
        (('3050',), 6),
        (('2050', '1660'), 5),
        (('1650', 'mx'), 4),
        (('iris', 'xe'), 3),
        (('intel', 'uhd', 'integrated'), 2),
    )
    for needles, score in rules:
        for needle in needles:
            if needle in text:
                return score
    return fallback

# Apply feature engineering: score each CPU and GPU description
df_processed['processor_score'] = df_processed['processor'].apply(get_processor_score)
df_processed['gpu_score'] = df_processed['gpu'].apply(get_gpu_score)

# Composite performance score: weighted sum with weights
# 0.4 (processor) / 0.3 (GPU) / 0.2 (RAM) / 0.1 (storage). RAM and storage
# are first rescaled by dividing by 64 and 2048 and multiplying by 10.
df_processed['performance_score'] = (
    df_processed['processor_score'] * 0.4 +
    df_processed['gpu_score'] * 0.3 +
    (df_processed['ram_gb'] / 64) * 10 * 0.2 +
    (df_processed['storage_gb'] / 2048) * 10 * 0.1
)

# Gaming flag: 1 when the product name contains 'gaming' (case-insensitive),
# 0 otherwise; missing product names count as 0 (na=False)
df_processed['is_gaming'] = df_processed['Product'].str.lower().str.contains('gaming', na=False).astype(int)

print('Da hoan thanh Feature Engineering!')
print('\nCac dac trung moi:')
print('   - processor_score')
print('   - gpu_score')
print('   - performance_score')
print('   - is_gaming')

In [None]:
# Visualize the engineered features on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Distribution of the processor score
axes[0, 0].hist(df_processed['processor_score'], bins=10, edgecolor='black', color='#2196F3')
axes[0, 0].set_xlabel('Diem Processor')
axes[0, 0].set_title('Phan phoi diem Processor', fontweight='bold')

# Distribution of the GPU score
axes[0, 1].hist(df_processed['gpu_score'], bins=10, edgecolor='black', color='#4CAF50')
axes[0, 1].set_xlabel('Diem GPU')
axes[0, 1].set_title('Phan phoi diem GPU', fontweight='bold')

# Performance score vs price
axes[1, 0].scatter(df_processed['performance_score'], df_processed['price'], alpha=0.5, c='#FF7043')
axes[1, 0].set_xlabel('Diem hieu nang')
axes[1, 0].set_ylabel('Gia (INR)')
axes[1, 0].set_title('Diem hieu nang vs Gia', fontweight='bold')

# Gaming vs non-gaming mean price.
# groupby() only returns the groups present in the data, so a dataset with no
# (or only) gaming laptops would yield a single value for the two fixed bar
# labels. Reindexing to [0, 1] guarantees one value per label, in label
# order; an absent group is drawn as a zero-height bar.
gaming_price = (
    df_processed.groupby('is_gaming')['price'].mean().reindex([0, 1]).fillna(0)
)
axes[1, 1].bar(['Thuong', 'Gaming'], gaming_price.values, color=['#2196F3', '#F44336'], edgecolor='black')
axes[1, 1].set_ylabel('Gia trung binh (INR)')
axes[1, 1].set_title('Gia trung binh: Gaming vs Thuong', fontweight='bold')

plt.tight_layout()
plt.show()

### 5.1 Mã hóa đặc trưng phân loại (Encode Categorical Features)

In [None]:
# Label-encode the categorical columns.
# Each fitted encoder is kept in `label_encoders` (keyed by column name) and
# the integer codes are written to a new '<column>_encoded' column.
label_encoders = {}
categorical_cols = ['brand', 'os', 'processor', 'gpu']

for col in categorical_cols:
    # Skip columns that are absent from this dataset
    if col not in df_processed.columns:
        continue

    le = LabelEncoder()
    # Values are cast to str before fitting
    as_text = df_processed[col].astype(str)
    df_processed[f'{col}_encoded'] = le.fit_transform(as_text)
    label_encoders[col] = le

    n_classes = len(le.classes_)
    print(f'Da ma hoa {col}: {n_classes} gia tri duy nhat')

print(f'\nTong so dac trung da ma hoa: {len(label_encoders)}')

---
## 6. Huấn luyện mô hình (Model Training)

### 6.1 Chuẩn bị dữ liệu (Prepare Data)

In [None]:
# Choose the features used for training
feature_cols = [
    'ram_gb', 'storage_gb', 'screen_size',
    'processor_score', 'gpu_score', 'performance_score', 'is_gaming',
    'brand_encoded', 'os_encoded'
]

# Keep only the features that exist in df_processed; missing ones are
# dropped silently, so check the printed list
available_features = [col for col in feature_cols if col in df_processed.columns]
print(f'Cac dac trung su dung: {available_features}')

# Build the feature matrix X and the target vector y (price)
X = df_processed[available_features]
y = df_processed['price']

print(f'\nKich thuoc X: {X.shape}')
print(f'Kich thuoc y: {y.shape}')

In [None]:
# Split the data into train/test sets: 20% held out, fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f'Tap huan luyen: {X_train.shape}')
print(f'Tap kiem tra: {X_test.shape}')

# Standardize the features. The scaler is fitted on the training split only;
# the test split is transformed with those same fitted statistics.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print('\nDa chia va chuan hoa du lieu thanh cong!')

### 6.2 Huấn luyện nhiều mô hình (Train Multiple Models)

In [None]:
# Initialize the models to compare (fixed seeds for the two ensemble models)
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(n_estimators=100, max_depth=15, random_state=42, n_jobs=-1),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, max_depth=5, learning_rate=0.1, random_state=42)
}

# Train and evaluate; `results` maps model name -> fitted model, metrics and
# test-set predictions, and is read by the evaluation cells below
results = {}

print('=' * 70)
print('HUAN LUYEN VA DANH GIA MO HINH')
print('=' * 70)

for name, model in models.items():
    print(f'\nDang huan luyen {name}...')
    
    # Fit on the standardized training data
    model.fit(X_train_scaled, y_train)
    
    # Predict on both splits so train and test metrics can be compared
    y_train_pred = model.predict(X_train_scaled)
    y_test_pred = model.predict(X_test_scaled)
    
    # Compute the metrics: RMSE, MAE and R2 on each split
    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
    train_mae = mean_absolute_error(y_train, y_train_pred)
    test_mae = mean_absolute_error(y_test, y_test_pred)
    train_r2 = r2_score(y_train, y_train_pred)
    test_r2 = r2_score(y_test, y_test_pred)
    
    # 5-fold cross-validation on the training split. The scorer returns
    # negated RMSE, hence the sign flip below.
    # NOTE(review): the folds reuse X_train_scaled, which was standardized
    # with statistics from the whole training split, so each validation fold
    # has influenced its own scaling.
    cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='neg_root_mean_squared_error')
    cv_rmse = -cv_scores.mean()
    
    # Store the results
    results[name] = {
        'model': model,
        'train_rmse': train_rmse,
        'test_rmse': test_rmse,
        'train_mae': train_mae,
        'test_mae': test_mae,
        'train_r2': train_r2,
        'test_r2': test_r2,
        'cv_rmse': cv_rmse,
        'predictions': y_test_pred
    }
    
    print(f'   Test RMSE: {test_rmse:,.0f} INR')
    print(f'   Test MAE:  {test_mae:,.0f} INR')
    print(f'   Test R2:   {test_r2:.4f}')
    print(f'   CV RMSE:   {cv_rmse:,.0f} INR')

---
## 7. Đánh giá mô hình (Model Evaluation)

### 7.1 So sánh mô hình (Model Comparison)

In [None]:
# Build the comparison table: one row per trained model, pairing each report
# column header with the key under which `results` stores that metric
metric_columns = [
    ('Test RMSE (INR)', 'test_rmse'),
    ('Test MAE (INR)', 'test_mae'),
    ('Test R2', 'test_r2'),
    ('CV RMSE (INR)', 'cv_rmse'),
]
table_rows = []
for model_name, model_result in results.items():
    row = {'Mo hinh': model_name}
    for header, key in metric_columns:
        row[header] = model_result[key]
    table_rows.append(row)

# Passing the column list explicitly keeps the column order and headers
# fixed regardless of the number of rows
column_order = ['Mo hinh'] + [header for header, _ in metric_columns]
comparison_df = pd.DataFrame(table_rows, columns=column_order)

comparison_df = comparison_df.round(2)
print('Bang so sanh cac mo hinh:')
print('=' * 80)
print(comparison_df.to_string(index=False))

In [None]:
# Visual model comparison: one bar chart per test metric (RMSE, MAE, R2)
fig, axes = plt.subplots(1, 3, figsize=(16, 5))

models_names = list(results.keys())
# One color per model; also reused by the plotting cells that follow
colors = ['#2196F3', '#4CAF50', '#FF7043']

rmse_values = [results[m]['test_rmse'] for m in models_names]
mae_values = [results[m]['test_mae'] for m in models_names]
r2_values = [results[m]['test_r2'] for m in models_names]

# Panel specs: (values, y label, title, label offset above the bar,
# number format, optional fixed y limits)
panel_specs = [
    (rmse_values, 'RMSE (INR)', 'So sanh Test RMSE', 500, ',.0f', None),
    (mae_values, 'MAE (INR)', 'So sanh Test MAE', 500, ',.0f', None),
    (r2_values, 'R2 Score', 'So sanh Test R2', 0.02, '.3f', (0, 1)),
]

for panel, (values, y_label, title, offset, number_format, y_limits) in zip(axes, panel_specs):
    panel.bar(models_names, values, color=colors, edgecolor='black')
    panel.set_ylabel(y_label, fontsize=12)
    panel.set_title(title, fontsize=14, fontweight='bold')
    panel.tick_params(axis='x', rotation=15)
    if y_limits is not None:
        panel.set_ylim(*y_limits)
    # Print each value just above its bar
    for position, value in enumerate(values):
        panel.text(position, value + offset, format(value, number_format), ha='center', fontsize=10)

plt.tight_layout()
plt.show()

### 7.2 Biểu đồ Thực tế vs Dự đoán (Actual vs Predicted)

In [None]:
# Actual vs predicted prices for every model, one panel per model.
# `colors` is the per-model color list defined in the comparison-plot cell,
# so that cell must have been run first.
fig, axes = plt.subplots(1, 3, figsize=(16, 5))

for idx, (name, res) in enumerate(results.items()):
    ax = axes[idx]
    
    ax.scatter(y_test, res['predictions'], alpha=0.5, c=colors[idx], edgecolors='black', linewidth=0.5)
    
    # Perfect-prediction line (y = x) spanning the combined range of actual
    # and predicted values
    min_val = min(y_test.min(), res['predictions'].min())
    max_val = max(y_test.max(), res['predictions'].max())
    ax.plot([min_val, max_val], [min_val, max_val], 'r--', linewidth=2, label='Du doan hoan hao')
    
    ax.set_xlabel('Gia thuc te (INR)', fontsize=12)
    ax.set_ylabel('Gia du doan (INR)', fontsize=12)
    ax.set_title(f'{name}\nR2 = {res["test_r2"]:.3f}', fontsize=12, fontweight='bold')
    ax.legend()

plt.tight_layout()
plt.show()

### 7.3 Phân tích phần dư (Residual Analysis)

In [None]:
# Residual plots, one panel per model.
# `colors` is the per-model color list defined in the comparison-plot cell.
fig, axes = plt.subplots(1, 3, figsize=(16, 5))

for idx, (name, res) in enumerate(results.items()):
    ax = axes[idx]
    # Residual = actual - predicted, so positive values are under-predictions
    residuals = y_test.values - res['predictions']
    
    ax.scatter(res['predictions'], residuals, alpha=0.5, c=colors[idx], edgecolors='black', linewidth=0.5)
    # Zero-error reference line
    ax.axhline(y=0, color='red', linestyle='--', linewidth=2)
    
    ax.set_xlabel('Gia du doan (INR)', fontsize=12)
    ax.set_ylabel('Phan du (INR)', fontsize=12)
    ax.set_title(f'{name} - Bieu do phan du', fontsize=12, fontweight='bold')

plt.tight_layout()
plt.show()

---
## 8. Độ quan trọng đặc trưng (Feature Importance)

In [None]:
# Pick the best model and report its feature importances.
# The model is chosen by its cross-validated RMSE on the training split
# (lower is better) rather than by a test-set metric: choosing on the test
# set would make the reported test scores optimistic for the winner.
best_model_name = min(results, key=lambda name: results[name]['cv_rmse'])
best_model = results[best_model_name]['model']

print(f'Mo hinh tot nhat: {best_model_name}')
print(f'   CV RMSE: {results[best_model_name]["cv_rmse"]:,.0f} INR')
print(f'   R2 Score: {results[best_model_name]["test_r2"]:.4f}')

# Only tree-based estimators expose feature_importances_; for other models
# (e.g. LinearRegression) `importance_df` is not created, and the later
# cells guard on the same condition.
if hasattr(best_model, 'feature_importances_'):
    importance_df = pd.DataFrame({
        'Dac trung': available_features,
        'Do quan trong': best_model.feature_importances_
    }).sort_values('Do quan trong', ascending=False)

    print('\nDo quan trong dac trung:')
    print('=' * 40)
    # zip over the two columns instead of the slower DataFrame.iterrows()
    for feature, importance in zip(importance_df['Dac trung'], importance_df['Do quan trong']):
        print(f'   {feature:<20}: {importance:.4f}')

In [None]:
# Visualize the feature importances (only when the best model exposes them;
# `importance_df` exists under the same condition)
if hasattr(best_model, 'feature_importances_'):
    fig, ax = plt.subplots(figsize=(10, 6))
    
    colors_imp = plt.cm.Blues(np.linspace(0.4, 0.9, len(importance_df)))
    bars = ax.barh(importance_df['Dac trung'], importance_df['Do quan trong'], color=colors_imp, edgecolor='black')
    ax.set_xlabel('Do quan trong', fontsize=12)
    ax.set_title(f'Do quan trong dac trung ({best_model_name})', fontsize=14, fontweight='bold')
    # importance_df is sorted descending; flip the axis so the most important
    # feature is drawn at the top
    ax.invert_yaxis()
    
    # Add value labels just right of each bar, vertically centered
    for bar, val in zip(bars, importance_df['Do quan trong']):
        ax.text(val + 0.01, bar.get_y() + bar.get_height()/2, f'{val:.3f}', va='center', fontsize=10)
    
    plt.tight_layout()
    plt.show()

---
## 9. Kết luận (Conclusion)

In [None]:
# Print the project summary: dataset facts, preprocessing steps, engineered
# features, per-model test metrics and the best model
print('=' * 70)
print('TOM TAT DU AN')
print('=' * 70)

# Only the record count and the number of features are computed here; the
# other lines are fixed descriptive text. The count uses the raw `df`, i.e.
# before duplicates were removed.
# NOTE(review): 'Xu ly gia tri thieu' (missing-value handling) is listed as a
# preprocessing step - confirm it matches what the notebook actually does.
print(f'''
Muc tieu: Du doan gia laptop dua tren cau hinh

Dataset:
   - Nguon: Kaggle - Laptop Price Estimation
   - Tong so ban ghi: {len(df):,}
   - So dac trung su dung: {len(available_features)}
   - Bien muc tieu: Gia (INR - Rupee An Do)

Tien xu ly:
   - Xu ly gia tri thieu
   - Xoa ban ghi trung lap
   - Tao storage_gb tu SSD + HDD
   - Ma hoa cac dac trung phan loai
   - Chuan hoa dac trung bang StandardScaler

Feature Engineering:
   - processor_score (dua tren so nhan)
   - gpu_score (dua tren model GPU)
   - performance_score (ket hop co trong so)
   - is_gaming (tu ten san pham)

Cac mo hinh da huan luyen:
''')

# Test metrics per model; the best model gets a '(Tot nhat)' marker
for name, res in results.items():
    marker = '(Tot nhat)' if name == best_model_name else ''
    print(f'   {name} {marker}:')
    print(f'      - Test R2: {res["test_r2"]:.4f}')
    print(f'      - Test RMSE: {res["test_rmse"]:,.0f} INR')
    print(f'      - Test MAE: {res["test_mae"]:,.0f} INR')

# NOTE(review): the five 'Nhan xet chinh' (key findings) below are hard-coded
# text, not derived from `results` or `importance_df` - verify them against
# the actual outputs whenever the data or the models change.
print(f'''
Mo hinh tot nhat: {best_model_name}
   - R2 Score: {results[best_model_name]["test_r2"]:.4f}
   - RMSE: {results[best_model_name]["test_rmse"]:,.0f} INR
   - MAE: {results[best_model_name]["test_mae"]:,.0f} INR

Nhan xet chinh:
   1. RAM va performance_score la cac yeu to du doan quan trong nhat
   2. Laptop gaming thuong co gia cao hon
   3. Laptop Apple co gia trung binh cao nhat
   4. Mo hinh tree-based vuot troi hon Linear Regression
   5. Feature engineering cai thien hieu suat mo hinh
''')

print('=' * 70)
print('Hoan thanh du an!')
print('=' * 70)

In [None]:
# Final summary figure: a 2x2 dashboard for the best model
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# 1. Model performance: test R2 per model, best model highlighted in green
ax = axes[0, 0]
models_names = list(results.keys())
r2_scores = [results[m]['test_r2'] for m in models_names]
colors_bar = ['#4CAF50' if m == best_model_name else '#2196F3' for m in models_names]
bars = ax.bar(models_names, r2_scores, color=colors_bar, edgecolor='black')
ax.set_ylabel('R2 Score', fontsize=12)
ax.set_title('Hieu suat mo hinh (R2 Score)', fontsize=14, fontweight='bold')
# The y axis is fixed to [0, 1], so a negative R2 would not show as a bar
ax.set_ylim(0, 1)
for bar, score in zip(bars, r2_scores):
    ax.text(bar.get_x() + bar.get_width()/2, score + 0.02, f'{score:.3f}', ha='center', fontsize=11)

# 2. Feature importance: top 5 features; the panel stays empty when the best
#    model has no feature_importances_
ax = axes[0, 1]
if hasattr(best_model, 'feature_importances_'):
    top_features = importance_df.head(5)
    ax.barh(top_features['Dac trung'], top_features['Do quan trong'], color='#FF7043', edgecolor='black')
    ax.set_xlabel('Do quan trong', fontsize=12)
    ax.set_title('Top 5 dac trung quan trong', fontsize=14, fontweight='bold')
    ax.invert_yaxis()

# 3. Prediction error: absolute error of the best model vs the actual price
ax = axes[1, 0]
best_preds = results[best_model_name]['predictions']
errors = np.abs(y_test.values - best_preds)
ax.scatter(y_test, errors, alpha=0.5, c='#9C27B0', edgecolors='black', linewidth=0.5)
ax.set_xlabel('Gia thuc te (INR)', fontsize=12)
ax.set_ylabel('Sai so tuyet doi (INR)', fontsize=12)
ax.set_title('Sai so du doan vs Gia thuc te', fontsize=14, fontweight='bold')

# 4. Distribution of actual vs predicted prices (overlaid histograms)
ax = axes[1, 1]
ax.hist(y_test, bins=30, alpha=0.5, label='Thuc te', color='#2196F3', edgecolor='black')
ax.hist(best_preds, bins=30, alpha=0.5, label='Du doan', color='#4CAF50', edgecolor='black')
ax.set_xlabel('Gia (INR)', fontsize=12)
ax.set_ylabel('Tan suat', fontsize=12)
ax.set_title('Phan phoi Thuc te vs Du doan', fontsize=14, fontweight='bold')
ax.legend()

plt.tight_layout()
plt.show()

print('\nDa hoan thanh notebook!')