In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import re
import os
import chardet
import joblib

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.cluster import KMeans
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import xgboost as xgb

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))
#         path = os.path.join(dirname, filename)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import kagglehub

# Download latest version
dataset_dir = kagglehub.dataset_download("abdulmalik1518/mobiles-dataset-2025")
print("Path to dataset files:", dataset_dir)

# Find the CSV file within the directory.
# The listing is sorted so that "the first CSV" is deterministic (os.listdir
# returns entries in arbitrary order), and a missing CSV fails loudly here
# instead of surfacing later as a NameError on `csv_path`.
csv_path = next(
    (
        os.path.join(dataset_dir, filename)
        for filename in sorted(os.listdir(dataset_dir))
        if filename.endswith(".csv")
    ),
    None,
)
if csv_path is None:
    raise FileNotFoundError(f"No CSV file found in {dataset_dir}")
print("CSV file found at:", csv_path)

with open(csv_path, 'rb') as file:
    Chardet = chardet.detect(file.read())
print(Chardet['encoding'])

In [None]:
df = pd.read_csv(csv_path, encoding=Chardet['encoding'])
df.head()

In [None]:
df.tail()

#**Data cleaning and preparations**

#1: Check missing data

In [None]:
df.info()

In [None]:
df.isnull().sum()


 No missing data

#2: Check and remove duplications

In [None]:
duplication = df.duplicated(keep='first').sum()
print(f'Duplicated rows: {duplication}')
df = df.drop_duplicates(keep='first')

#3: Review each column in dataframe

In [None]:
for col in df.columns:
  print(f'Unique values in {col}:\n', sorted(df[col].dropna().unique()), '\n')

In [None]:
# Convert ['Company Name', 'Model Name', 'Processor'] columns to categorical type
categorical_cols = ['Company Name', 'Model Name', 'Processor']
df[categorical_cols] = df[categorical_cols].astype('category')

# Convert 'Launched Year' to datetime, extract the year, and convert to integer type
df['Launched Year'] = pd.to_datetime(df['Launched Year'], format='%Y').dt.year.astype('int')

In [None]:
df.info()

In [None]:
# Mobile Weight column: Remove 'g' and convert to float
df['Mobile Weight (g)'] = df['Mobile Weight'].str.replace('g', '').astype(float)
df.drop('Mobile Weight', axis=1, inplace=True)

In [None]:
# RAM column: Remove 'GB' and convert to float
def clean_ram(item):
  """Parse a RAM description such as '8GB' or '8GB/12GB' into gigabytes.

  When several sizes are listed the largest one is returned, regardless of
  the order in which they appear (the previous implementation always took
  the second entry of an 'a/b' spec).

  Args:
    item: Raw RAM text from the dataset.

  Returns:
    float: The largest RAM size found in the text.

  Raises:
    ValueError: If the text contains no number at all.
  """
  sizes = [float(number) for number in re.findall(r'\d+(?:\.\d+)?', str(item))]
  if not sizes:
    raise ValueError(f'No RAM size found in {item!r}')

  return max(sizes)

df['RAM (GB)'] = df['RAM'].apply(clean_ram).astype(float)
df.drop('RAM', axis=1, inplace=True)

In [None]:
# Front Camera column: Remove 'MP' and convert to float
def clean_front_camera(item):
  """Return the largest front-camera resolution (in MP) found in the text.

  Numbers directly followed by 'MP' are preferred, so that unrelated figures
  such as '4K' or '1080p' are not mistaken for megapixels. Decimal values
  ('10.5MP') are read as one number. If no 'MP' figure is present, any
  number in the text is used, as before.

  Args:
    item: Raw front-camera description from the dataset.

  Returns:
    float: The largest resolution found, or 0.0 when the text has no number.
  """
  text = str(item)
  number = r'\d+(?:\.\d+)?'

  found = re.findall(rf'({number})\s*MP', text, flags=re.IGNORECASE)
  if not found:
    # No explicit megapixel figure: fall back to any number in the text.
    found = re.findall(number, text)

  return max(float(value) for value in found) if found else 0.

df['Front Camera (MP)'] = df['Front Camera'].apply(clean_front_camera)
df.drop('Front Camera', axis=1, inplace=True)

In [None]:
# Back Camera column: Remove 'MP', convert to float and split into 4 columns [Main Camera, Ultra Camera, Telephoto Camera, Macro Camera]
def clean_back_camera(item):
  """Split a '+'-separated back-camera spec into four megapixel values.

  The sensors are read in order. Positions 0 and 1 fill the main and ultra
  slots. The third sensor fills the telephoto slot unless its text mentions
  'macro', in which case it fills the macro slot. A fourth sensor is stored
  in the macro slot. Any sensor beyond the fourth is ignored, because there
  is no column to hold it (previously this raised IndexError).

  Args:
    item: Raw back-camera description, e.g. '50MP + 12MP + 2MP (macro)'.

  Returns:
    list[float]: [main, ultra, telephoto, macro]; missing sensors are 0.0.

  Raises:
    ValueError: If the text before 'MP' in a sensor is not a number.
  """
  list_camera = [0., 0., 0., 0.]

  for idx, sensor in enumerate(item.split('+')):
    if idx >= len(list_camera):
      break

    megapixels = float(sensor.split('MP')[0])
    if idx == 2 and 'macro' in sensor.lower():
      list_camera[3] = megapixels
    else:
      list_camera[idx] = megapixels

  return list_camera

df[['Main Camera (MP)', 'Ultra Camera (MP)', 'Telephoto Camera (MP)', 'Macro Camera (MP)']] = df['Back Camera'].apply(lambda x: pd.Series(clean_back_camera(x)))
df.drop('Back Camera', axis=1, inplace=True)

In [None]:
# Battery Capacity column: Remove 'mAh' and convert to int
df['Battery Capacity (mAh)'] = df['Battery Capacity'].str.replace('mAh', '')
df['Battery Capacity (mAh)'] = df['Battery Capacity (mAh)'].str.replace(',', '').astype(int)
df.drop('Battery Capacity', axis=1, inplace=True)

In [None]:
# Screen Size column: Remove 'inches' and convert to float
df['Screen Size (inches)'] = df['Screen Size'].apply(lambda x: x.split('inches')[0]).astype(float)
df.drop('Screen Size', axis=1, inplace=True)

In [None]:
# Model Name column: Extract storage size column
def extract_storage(item):
  """Read the storage size, in GB, from the last word of a model name.

  Only the final space-separated token is inspected. A token containing
  'GB' is converted to an int; a token containing 'TB' is converted to an
  int and multiplied by 1024. Anything else yields pd.NA.

  Args:
    item: Model name text, e.g. 'iPhone 16 128GB'.

  Returns:
    int | pd.NA: Storage in gigabytes, or pd.NA when the last token carries
    no 'GB'/'TB' marker.
  """
  last_token = item.split(' ')[-1]

  # 'GB' is tested before 'TB', matching the original branch order.
  for unit, gb_per_unit in (('GB', 1), ('TB', 1024)):
    if unit in last_token:
      return int(last_token.replace(unit, '')) * gb_per_unit

  return pd.NA

df['Storage (GB)'] = df['Model Name'].apply(lambda x: extract_storage(x))

In [None]:
for col in df.columns:
  print(f'Unique values in {col}:\n', sorted(df[col].dropna().unique()), '\n')

In [None]:
# Launched Price columns: we will create new fields for the country and currency.
# The China column is pre-cleaned first: the currency symbol is replaced by 'CNY ',
# and thousands separators and non-breaking spaces ('\xa0') are removed.
# NOTE(review): the first literal below looks like a mis-decoded yen/yuan sign;
# confirm it matches the characters actually present in the loaded data.
df['Launched Price (China)'] = df['Launched Price (China)'].str.replace('¬•', 'CNY ').str.replace(',', '').str.replace('\xa0', '')

# The five per-country price columns that are folded into a single column below
price_columns = [
    'Launched Price (Pakistan)', 'Launched Price (India)',
    'Launched Price (China)', 'Launched Price (USA)', 'Launched Price (Dubai)'
]

# Melt the DataFrame: Convert wide format into long format.
# Every non-price column is kept as an identifier, so each input row becomes
# one output row per entry in price_columns.
df_melted = df.melt(id_vars=[col for col in df.columns if col not in price_columns],
                     value_vars=price_columns,
                     var_name="Country", value_name="Price")

# Extract Country from column names (the text between the parentheses)
df_melted["Country"] = df_melted["Country"].str.extract(r'Launched Price \((.*?)\)')
df_melted['Country'] = df_melted['Country'].astype("category")

# Now, df_melted has a single "Price" column and a "Country" column
print(df_melted[['Country', 'Price']].head())

# Replace the original df with the transformed version
df = df_melted.copy()

In [None]:
# Function to extract currency and numeric price
def split_currency_price(value):
    """Split a price string such as 'USD 1,299' into (currency, amount).

    The value is converted to text before any string method is called, so a
    numeric cell no longer raises AttributeError. Surrounding whitespace is
    stripped before matching, because the pattern is anchored at the start
    of the text.

    Note: the amount pattern accepts digits and commas only, so a fractional
    part ('1,299.99') is dropped and only '1,299' is parsed.

    Args:
        value: Raw price cell; may be missing, blank, or a non-string.

    Returns:
        tuple: (currency code, price as float), or (pd.NA, pd.NA) when the
        value is missing, blank, or does not start with a currency code
        followed by a number.
    """
    if pd.isna(value):
        return pd.NA, pd.NA  # Handle missing values

    text = str(value).strip()
    if not text:
        return pd.NA, pd.NA  # Handle blank values

    match = re.match(r'([A-Za-z]+)\s*([\d,]+)', text)  # Extract currency and price
    if match is None:
        return pd.NA, pd.NA  # No currency code in front of the number

    currency, digits = match.groups()
    return currency, float(digits.replace(',', ''))  # Remove commas, convert to float

# Apply function to split into two new columns
df[['Currency', 'Price (Numeric)']] = df['Price'].apply(lambda x: pd.Series(split_currency_price(x)))

# Now, df has separate 'Currency' and 'Price (Numeric)' columns
print(df[['Country', 'Price', 'Currency', 'Price (Numeric)']].head())

In [None]:
# Convert 'Currency' to categorical
df['Currency'] = df['Currency'].astype('category')

# Convert 'Price (Numeric)' to integer
df['Price (Numeric)'] = df['Price (Numeric)'].astype('Int64')  # Supports NaN values

# Drop the original 'Price' column
df.drop(columns=['Price'], inplace=True)

In [None]:
# Display rows where Currency is missing
missing_currency_rows = df[df['Currency'].isna()]
print(missing_currency_rows['Country'])

In [None]:
df.loc[df['Country'] == 'Pakistan', 'Currency'] = 'PKR'
df.loc[df['Country'] == 'China', 'Currency'] = 'CNY'

In [None]:
# Display rows where Currency is missing
missing_currency_rows = df[df['Currency'].isna()]
print(missing_currency_rows['Country'])

In [None]:
# Conversion factors to USD, keyed by the currency code found in the 'Currency' column
exchange_rates = dict(PKR=0.0036, INR=0.012, CNY=0.14, USD=1.0, AED=0.27)

# Row-wise price normalisation
def normalize_price(row):
    """Convert one row's 'Price (Numeric)' to USD using exchange_rates.

    Args:
        row: Mapping-like object with 'Price (Numeric)' and 'Currency' entries.

    Returns:
        The price multiplied by the currency's rate. NaN when the price or
        the currency is missing, or when the currency has no entry in
        exchange_rates.
    """
    amount, currency = row['Price (Numeric)'], row['Currency']
    if pd.isna(amount) or pd.isna(currency):
        return np.nan  # nothing to convert

    rate = exchange_rates.get(currency, np.nan)
    return amount * rate

# Apply normalization
df['Normalized Price (USD)'] = df.apply(normalize_price, axis=1)

# Now the prices are normalized
print(df[['Price (Numeric)', 'Currency', 'Normalized Price (USD)']].head())
# Drop the original 'Price Numeric' column
df.drop(columns=['Price (Numeric)'], inplace=True)

# Drop the currency column as it isn't needed any more
df.drop(columns=['Currency'], inplace=True)

In [None]:
df.head(50)

In [None]:
for col in df.columns:
  print(f'Unique values in {col}:\n', sorted(df[col].dropna().unique()), '\n')

In [None]:
# Lets check and see after
df.info()

In [None]:
df.isna().sum()

In [None]:
# The normalized price is missing for two records; let's investigate and fill them in using the average price per country

# Ensure we are working on a copy to avoid SettingWithCopyWarning
df = df.copy()

# Calculate the average Normalized Price per Country
avg_price_per_country = df.groupby('Country', observed=True)['Normalized Price (USD)'].transform('mean')

# Fill missing values safely
df.loc[df['Normalized Price (USD)'].isna(), 'Normalized Price (USD)'] = avg_price_per_country

# Verify if there are still missing values
print(df['Normalized Price (USD)'].isna().sum())  # Should print 0 if all missing values are filled

In [None]:
# Check summary statistics to look for inconsistencies and verify the cleaned values
print(df[['Mobile Weight (g)', 'Screen Size (inches)', 'Battery Capacity (mAh)', 'Normalized Price (USD)']].describe())

In [None]:
df[df['Mobile Weight (g)'] == 732]['Storage (GB)'].unique()

In [None]:
# For Mobile Weight (g), Max = 732g seems very high (most phones weigh 150-250g)
# For Screen Size Max = 14.6 inches and a battery capacity of 11200 mAh is unusually large (could be tablets).
df[(df['Screen Size (inches)'] == 14.6) | (df['Mobile Weight (g)'] == 732)][['Storage (GB)', 'Screen Size (inches)', 'Battery Capacity (mAh)', 'Mobile Weight (g)']].drop_duplicates()

In [None]:
# Lets investigate the max price of 39622, it is most likely to be an input error
df[df['Normalized Price (USD)'] == 39622][['Storage (GB)','Processor', 'Screen Size (inches)', 'Battery Capacity (mAh)', 'Normalized Price (USD)']].drop_duplicates()

In [None]:
# Filter dataset for similar products (same processor, screen size close to 10.4 inches, and battery capacity close to 8200 mAh)
similar_products = df[
    (df['Processor'] == 'Unisoc T612') &
    (df['Screen Size (inches)'] == 10.4) &  # Screen size around 10.4 inches
    (df['Battery Capacity (mAh)'] == 8200)  # Battery capacity around 8200 mAh
]

# Display the filtered dataset with similar products
print(similar_products[['Storage (GB)', 'Processor', 'Screen Size (inches)', 'Battery Capacity (mAh)', 'Normalized Price (USD)']])

# Alternatively, you can check the price statistics of the similar products
print(similar_products['Normalized Price (USD)'].describe())

In [None]:
# It seems like the $39622 is really an input error, lets deal with it

# Step 1: Remove the outlier (rows that are both model 'T21' and priced at 39622)
is_outlier = (df['Model Name'] == 'T21') & (df['Normalized Price (USD)'] == 39622)
df_filtered = df[~is_outlier]

# Step 2: Group by relevant features and calculate the mean price.
# observed=True restricts the result to combinations that actually occur;
# 'Processor' is categorical, so the default would also emit unobserved
# category combinations.
grouped = (
    df_filtered
    .groupby(['Processor', 'Screen Size (inches)', 'Battery Capacity (mAh)'], observed=True)
    ['Normalized Price (USD)']
    .mean()
    .reset_index()
)

# Step 3: Find the correct mean value for the T21 outlier
comparable = grouped[(grouped['Processor'] == 'Unisoc T612') &
                     (grouped['Screen Size (inches)'] == 10.4) &
                     (grouped['Battery Capacity (mAh)'] == 8200)]
if comparable.empty:
    # Fail with a clear message rather than an IndexError on .values[0]
    raise ValueError('No comparable devices found to estimate the T21 price')
mean_price = comparable['Normalized Price (USD)'].values[0]

# Step 4: Update the outlier's price
df.loc[is_outlier, 'Normalized Price (USD)'] = mean_price

# Step 5: Verify the update
updated_price = df[df['Model Name'] == 'T21'][['Model Name', 'Processor', 'Screen Size (inches)', 'Battery Capacity (mAh)', 'Normalized Price (USD)']]
print(updated_price)

In [None]:
# Apply natural log transformation
df['log_normalized_price'] = np.log(df['Normalized Price (USD)'])

In [None]:
df['log_normalized_price'].head()

#4: Data visualization
1: Distribution of variables:

In [None]:
# Histogram: Shows the frequency distribution of a continuous variable. This helps you see the shape of the distribution (e.g., normal, left-skewed, right-skewed, multi-peaked), the central values, and the spread of the data.
numerical_cols = ['Mobile Weight (g)',  'Screen Size (inches)',  'Battery Capacity (mAh)',
                  'Normalized Price (USD)',  'log_normalized_price',  'RAM (GB)',  'Front Camera (MP)',
                  'Main Camera (MP)',  'Ultra Camera (MP)',  'Telephoto Camera (MP)',
                  'Macro Camera (MP)',  'Storage (GB)',  'Launched Year']

for col in numerical_cols:
    plt.figure(figsize=(10, 6))
    sns.histplot(df[col], kde=True) # kde=True to display the Kernel density estimate line
    plt.title(f'Distribution of {col}')
    plt.xlabel(col)
    plt.ylabel('Frequency')
    plt.show()

Comment: Observe the distribution shape of 'Normalized Price (USD)' before and after applying log transformation (log_normalized_price). You will see that log transformation makes the distribution more symmetrical.

In [None]:
# Box Plot: Displays the quartiles, medians, and potential outliers of  'Normalized Price (USD)',  'log_normalized_price'.

for col in [ 'Normalized Price (USD)',  'log_normalized_price']:
    plt.figure(figsize=(8, 6))
    sns.boxplot(x=df[col])
    plt.title(f'Box Plot  {col}')
    plt.xlabel(col)
    plt.show()

2:  Distribution of categorical variables:


In [None]:
# Bar Chart: Shows the frequency or proportion of each category in a categorical variable.

categorical_cols = ['Company Name', 'Country']

for col in categorical_cols:
    plt.figure(figsize=(12, 6))
    sns.countplot(y=df[col], order=df[col].value_counts().index) # order to sort by frequency
    plt.title(f'Distribution of  {col}')
    plt.xlabel('Frequency')
    plt.ylabel(col)
    plt.show()

3. Relationship between variables:

In [None]:
# Correlation Heatmap: Displays the correlation matrix between continuous variables.

for col in numerical_cols:
    if df[col].isnull().any():
        print(f"Column '{col}' has missing values. Filling with mean.")
        df[col] = df[col].fillna(df[col].mean())

correlation_matrix = df[numerical_cols].corr()
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('The correlation matrix')
plt.show()

Both Normalized Price (USD) and log_normalized_price show significant positive correlations with factors such as RAM, storage, year of launch, battery capacity, and screen size. This makes economic and technical sense, as higher-end phones with better features typically cost more.

In [None]:
# Box Plot by Group: Displays the distribution of Normalized Price (USD) across categories of a categorical variable.

plt.figure(figsize=(15, 8))
sns.boxplot(x='Company Name', y='Normalized Price (USD)', data=df)
plt.title('Price by phone brand')
plt.xlabel('Phone brand')
plt.ylabel('Price (USD)')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

In [None]:
# Box Plot by Group: Displays the distribution of log_normalized_price across categories of a categorical variable.

plt.figure(figsize=(15, 8))
sns.boxplot(x='Company Name', y='log_normalized_price', data=df)
plt.title('Price by phone brand')
plt.xlabel('Phone brand')
plt.ylabel('Price')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

4. Trends over time

In [None]:
yearly_avg_price = df.groupby('Launched Year')['Normalized Price (USD)'].mean()
plt.figure(figsize=(10, 6))
plt.plot(yearly_avg_price.index, yearly_avg_price.values, marker='o')
plt.title('Average price by launched year')
plt.xlabel('Launched Year')
plt.ylabel('Average price (USD)')
plt.grid(True)
plt.show()

In [None]:
# For each Company
plt.figure(figsize=(12, 8))

for company in df['Company Name'].unique():
    company_data = df[df['Company Name'] == company]
    yearly_avg_price_company = company_data.groupby('Launched Year')['Normalized Price (USD)'].mean()
    plt.plot(yearly_avg_price_company.index, yearly_avg_price_company.values, marker='o', label=company)

plt.title('Average price by launched year and Company Name')
plt.xlabel('Launched Year')
plt.ylabel('Average price (USD)')
plt.legend(title='Company Name', bbox_to_anchor=(1.05, 1), loc='upper left') # Show legend outside the chart
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Using FacetGrid for compare across countries

g = sns.FacetGrid(df, col='Country', col_wrap=3, height=4, aspect=1.5)
g.map(sns.lineplot, 'Launched Year', 'Normalized Price (USD)', 'Company Name', marker='o', errorbar=None)
g.set_axis_labels('Launched Year', 'Average Price (USD)')
g.set_titles(col_template="{col_name}")
g.add_legend(title='Company Name', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout(rect=[0, 0, 0.9, 1]) # Adjust layout to make room for legend
plt.show()

In [None]:
# Prepare data (convert categorical to numeric if not already)
df_encoded = df.copy()
for col in ['Company Name', 'Model Name', 'Processor', 'Country']:
    if df_encoded[col].dtype == 'category':
        le = LabelEncoder()
        df_encoded[col] = le.fit_transform(df_encoded[col])

X = df_encoded.drop(['Normalized Price (USD)', 'log_normalized_price'], axis=1)
y = df_encoded['Normalized Price (USD)']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

feature_importances = model.feature_importances_
feature_names = X.columns

importances_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
importances_df = importances_df.sort_values(by='Importance', ascending=False)
print(importances_df)

plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=importances_df)
plt.title('Feature Importance from Random Forest')
plt.show()

In [None]:
df['Processor'].value_counts()

# Prepare data for the model:

In [None]:
df['Processor'].value_counts()

1: Based on the value_counts() result of the 'Processor' column, you see that there are many different processors, and most of them appear with very low frequency (only 5 times). This confirms the concern that one-hot encoding this column has created a large number of uninformative features, which can cause dimensionality and overfitting issues.

2: The idea: Instead of coding each processor model individually, you can group them by major manufacturer (e.g. Qualcomm Snapdragon, MediaTek Helio/Dimensity, Apple Bionic, Samsung Exynos, Google Tensor, Unisoc, Kirin).

In [None]:
# Grouping of processors based on brand:

def extract_processor_brand(processor):
    """Map a processor name to its manufacturer family.

    Vendor keywords are checked first. Apple is checked last and only matches
    'Bionic', 'Apple', or a standalone chip code such as 'A17'. The previous
    test `'A' in processor` matched any name containing a capital A and ran
    before the Exynos/Tensor/Unisoc/Kirin checks, so e.g. 'Unisoc SC9863A'
    was labelled as Apple.

    Args:
        processor: Processor name text.

    Returns:
        str: One of 'Qualcomm Snapdragon', 'MediaTek', 'Samsung Exynos',
        'Google Tensor', 'Unisoc', 'Kirin', 'Apple Bionic' or 'Other'.
    """
    name = str(processor)

    vendor_keywords = (
        ('Qualcomm Snapdragon', ('Snapdragon',)),
        ('MediaTek', ('MediaTek', 'Dimensity', 'Helio')),
        ('Samsung Exynos', ('Exynos',)),
        ('Google Tensor', ('Tensor',)),
        ('Unisoc', ('Unisoc',)),
        ('Kirin', ('Kirin',)),
    )
    for brand, keywords in vendor_keywords:
        if any(keyword in name for keyword in keywords):
            return brand

    # 'A' followed by one or two digits, not glued to a preceding word
    # character or hyphen (so 'SC9863A' and 'Cortex-A53' do not count).
    if 'Bionic' in name or 'Apple' in name or re.search(r'(?<![\w-])A\d{1,2}\b', name):
        return 'Apple Bionic'

    return 'Other' # For rare cases

df['Processor_Brand'] = df['Processor'].apply(extract_processor_brand)
df = pd.get_dummies(df, columns=['Processor_Brand'], drop_first=True)
df.drop('Processor', axis=1, inplace=True)

In [None]:
# Encoding categorical variables:

df = pd.get_dummies(df, columns=['Company Name', 'Country'], drop_first=False)
for col in df.columns:
  print(col)

# 6: Split Data: Split the selected dataset into training set and test set.

In [None]:
numerical_features = [
    'Launched Year', 'Mobile Weight (g)', 'RAM (GB)', 'Front Camera (MP)',
    'Main Camera (MP)', 'Ultra Camera (MP)', 'Telephoto Camera (MP)',
    'Macro Camera (MP)', 'Battery Capacity (mAh)', 'Screen Size (inches)',
    'Storage (GB)'
]
X = df[numerical_features].copy()
y = df['log_normalized_price'].copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)

# joblib.dump(X_train_scaled.columns, os.path.join(models_dir, 'feature_columns.joblib'))

#7: Training model

In [None]:
np.random.seed(42)
# NOTE(review): num_samples is not referenced anywhere in the visible part of
# the notebook; kept in case a later cell relies on it.
num_samples = 100

# One single-step pipeline per candidate regressor. The step is named
# 'regressor' so the hyper-parameter grids can address it as 'regressor__<param>'.
pipelines = {
    'LinearRegression': Pipeline([
        ('regressor', LinearRegression())
    ]),
    'RandomForestRegressor': Pipeline([
        ('regressor', RandomForestRegressor(random_state=42))
    ]),
    # use_label_encoder was dropped: it is a deprecated XGBClassifier option
    # and has no effect on a regressor.
    'XGBRegressor': Pipeline([
        ('regressor', XGBRegressor(random_state=42, eval_metric='rmse'))
    ]),
    'LGBMRegressor': Pipeline([
        ('regressor', LGBMRegressor(random_state=42))
    ])
}


In [None]:
param_grids = {
    'LinearRegression': {
        'regressor__fit_intercept': [True, False]
    },
    'RandomForestRegressor': {
        'regressor__n_estimators': [100, 200],
        'regressor__max_depth': [None, 10, 20],
        'regressor__min_samples_split': [2, 5]
    },
    'XGBRegressor': {
        'regressor__n_estimators': [100, 200],
        'regressor__learning_rate': [0.05, 0.1],
        'regressor__max_depth': [3, 5]
    },
    'LGBMRegressor': {
        'regressor__n_estimators': [100, 200],
        'regressor__learning_rate': [0.05, 0.1],
        'regressor__num_leaves': [31, 62]
    }
}


In [None]:
# Run a grid search for every pipeline, keep each best estimator, and collect
# cross-validation and test-set metrics for the summary table.
best_models = {}
results = []

for model_name, pipeline in pipelines.items():
    print(f"\nƒêang hu·∫•n luy·ªán {model_name}...")
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grids[model_name],
        cv=5, # Number of cross-validation folds
        scoring='neg_mean_squared_error', # Select by negated MSE (the square root is taken below to report RMSE)
        n_jobs=-1, # Use all CPU cores
        verbose=1 # Show progress
    )

    # Fit on the scaled training data
    grid_search.fit(X_train_scaled, y_train)

    best_estimator = grid_search.best_estimator_
    best_params = grid_search.best_params_
    best_score = -grid_search.best_score_ # Flip the sign back: this is the positive mean CV MSE, not yet RMSE

    print(f"M√¥ h√¨nh t·ªët nh·∫•t cho {model_name}:")
    print(f"  Tham s·ªë t·ªët nh·∫•t: {best_params}")
    print(f"  RMSE tr√™n t·∫≠p hu·∫•n luy·ªán (CV): {np.sqrt(best_score):.4f}")

    # Evaluate on the (scaled) test set
    y_pred = best_estimator.predict(X_test_scaled)
    mse_test = mean_squared_error(y_test, y_pred)
    rmse_test = np.sqrt(mse_test)
    r2_test = r2_score(y_test, y_pred)

    print(f"  RMSE tr√™n t·∫≠p ki·ªÉm tra: {rmse_test:.4f}")
    print(f"  R-squared tr√™n t·∫≠p ki·ªÉm tra: {r2_test:.4f}")

    best_models[model_name] = best_estimator
    results.append({
        'Model': model_name,
        'Best Parameters': best_params,
        'Train RMSE (CV)': np.sqrt(best_score),
        'Test RMSE': rmse_test,
        'Test R-squared': r2_test
    })

# --- 4. Summary of results ---

print("\n--- T√≥m t·∫Øt k·∫øt qu·∫£ ---")
results_df = pd.DataFrame(results)
print(results_df.sort_values(by='Test RMSE'))

# The best models can be accessed as follows:
# best_linear_regression = best_models['LinearRegression']
# best_random_forest = best_models['RandomForestRegressor']
# best_xgb = best_models['XGBRegressor']
# best_lgbm = best_models['LGBMRegressor']

#Conclusion:

- LinearRegression is not suitable for this problem: it explains less than 50% of the variation in log-normalized prices.

- Ensemble tree-based models (RandomForest, XGBoost, LightGBM) give superior performance, explaining more than 90% of the variation. This confirms that the relationship between features and phone prices is non-linear and complex, which tree models can capture better.

- LGBMRegressor is the best model among the tested models, achieving the lowest RMSE and highest R-squared on the test set.

In [None]:
# Path to the models directory
models_dir = 'models'
os.makedirs(models_dir, exist_ok=True) # Create the directory if it does not exist yet

# Save the best LGBMRegressor pipeline found by the grid search
best_lgbm_model = best_models['LGBMRegressor']
model_path = os.path.join(models_dir, 'best_lgbm_regressor.joblib')
joblib.dump(best_lgbm_model, model_path)
print(f"M√¥ h√¨nh LGBMRegressor ƒë√£ ƒë∆∞·ª£c l∆∞u t·∫°i: {model_path}")

# Save the StandardScaler
scaler_path = os.path.join(models_dir, 'scaler.joblib')
joblib.dump(scaler, scaler_path) # 'scaler' is the StandardScaler fitted on X_train earlier
print(f"StandardScaler ƒë√£ ƒë∆∞·ª£c l∆∞u t·∫°i: {scaler_path}")

# Save the training feature column names alongside the model and scaler
joblib.dump(X_train_scaled.columns, os.path.join(models_dir, 'feature_columns.joblib'))

In [None]:
def predict_new_phone_price(new_phone_data_raw, models_dir='models'):
    """
    Predict the price of a new phone from its raw (uncleaned) specification.

    The raw fields are cleaned into numeric features and one-hot columns,
    aligned to the saved training feature columns, transformed with the saved
    scaler and passed to the saved model. The model output is treated as a
    log-price and converted back with ``np.exp``.

    Args:
        new_phone_data_raw (dict): Raw phone attributes. The keys read here are
            'Launched Year', 'Mobile Weight', 'RAM', 'Front Camera',
            'Back Camera', 'Battery Capacity', 'Screen Size', 'Storage',
            'Processor', 'Company Name' and 'Country'
            (e.g. {'Mobile Weight': '180g', 'RAM': '8GB', ...}).
        models_dir (str): Directory holding 'best_lgbm_regressor.joblib',
            'scaler.joblib' and 'feature_columns.joblib'.

    Returns:
        float: Predicted price on the original (non-log) scale.
        None: If any of the saved artifacts cannot be found in ``models_dir``.

    Raises:
        KeyError: If one of the keys listed above is missing from the input.
        ValueError: If a raw value cannot be parsed into a number.
        (Input errors are not caught; only missing artifact files yield None.)
    """
    # Load every persisted artifact up front so that a missing file is reported
    # before any preprocessing work is done.
    try:
        best_lgbm_model = joblib.load(os.path.join(models_dir, 'best_lgbm_regressor.joblib'))
        scaler = joblib.load(os.path.join(models_dir, 'scaler.joblib'))
    except FileNotFoundError:
        print(f"L·ªói: Kh√¥ng t√¨m th·∫•y m√¥ h√¨nh ho·∫∑c scaler trong th∆∞ m·ª•c '{models_dir}'.")
        return None

    try:
        trained_feature_columns = joblib.load(os.path.join(models_dir, 'feature_columns.joblib'))
    except FileNotFoundError:
        print(f"L·ªói: Kh√¥ng t√¨m th·∫•y danh s√°ch c·ªôt ƒë·∫∑c tr∆∞ng ƒë√£ hu·∫•n luy·ªán. Vui l√≤ng l∆∞u `X_train_scaled.columns`.")
        return None

    # Single-row frame built from the raw input. It is local to this call; the
    # raw columns are simply left in place because only the engineered columns
    # are selected further down.
    df_new = pd.DataFrame([new_phone_data_raw])

    # --- Preprocessing: intended to mirror the training-time cleaning ---
    # NOTE(review): the training code is not visible from here; keep these
    # steps in sync with it.

    # 1. 'Launched Year' -> integer year
    df_new['Launched Year'] = pd.to_datetime(df_new['Launched Year'], format='%Y').dt.year.astype('int')

    # 2. 'Mobile Weight' (e.g. '190g') -> 'Mobile Weight (g)'
    df_new['Mobile Weight (g)'] = df_new['Mobile Weight'].str.replace('g', '').astype(float)

    # 3. 'RAM' (e.g. '12GB' or '8GB/12GB') -> 'RAM (GB)'
    def clean_ram(item):
        """Strip 'GB'; for an 'a/b' value keep the part after the first slash."""
        item = item.replace('GB', '')
        if '/' in item:
            item = item.split('/')[1]
        return float(item)

    df_new['RAM (GB)'] = df_new['RAM'].apply(clean_ram)

    # 4. 'Front Camera' -> largest integer found in the text, 0.0 if none
    def clean_front_camera(item):
        """Return the largest run of digits in the text as a float (0.0 if none)."""
        numbers = [float(token) for token in re.findall(r'\d+', str(item))]
        return max(numbers) if numbers else 0.

    df_new['Front Camera (MP)'] = df_new['Front Camera'].apply(clean_front_camera)

    # 5. 'Back Camera' ('a MP+b MP+...') -> four columns
    def clean_back_camera(item):
        """Split a '+'-separated camera spec into [main, ultra, telephoto, macro].

        The third entry goes to the macro slot when its text mentions 'macro',
        otherwise to the telephoto slot; every other entry is stored by position.
        """
        list_camera = [0., 0., 0., 0.]
        for idx, sub_item in enumerate(item.split('+')):
            megapixels = float(sub_item.split('MP')[0])
            if idx == 2 and 'macro' in sub_item.lower():
                list_camera[3] = megapixels
            else:
                list_camera[idx] = megapixels
        return list_camera

    camera_columns = ['Main Camera (MP)', 'Ultra Camera (MP)', 'Telephoto Camera (MP)', 'Macro Camera (MP)']
    camera_rows = [clean_back_camera(value) for value in df_new['Back Camera']]
    camera_frame = pd.DataFrame(camera_rows, columns=camera_columns, index=df_new.index)
    for column in camera_columns:
        df_new[column] = camera_frame[column]

    # 6. 'Battery Capacity' (e.g. '5,000mAh') -> 'Battery Capacity (mAh)'
    df_new['Battery Capacity (mAh)'] = (
        df_new['Battery Capacity'].str.replace('mAh', '').str.replace(',', '').astype(int)
    )

    # 7. 'Screen Size' (e.g. '6.7 inches') -> 'Screen Size (inches)'
    # The token is stripped so the numeric cast never sees stray whitespace.
    df_new['Screen Size (inches)'] = (
        df_new['Screen Size'].apply(lambda x: x.split('inches')[0].strip()).astype(float)
    )

    # 8. 'Storage' -> 'Storage (GB)', parsed from the last space-separated token
    def extract_storage(item):
        """Parse 'NGB' / 'NTB' into gigabytes; pd.NA when neither unit is present."""
        item = str(item).split(' ')[-1]
        if 'GB' in item:
            return int(item.replace('GB', ''))
        elif 'TB' in item:
            return int(item.replace('TB', '')) * 1024
        else:
            return pd.NA

    df_new['Storage (GB)'] = df_new['Storage'].apply(extract_storage)

    # 9. 'Processor' -> coarse 'Processor_Brand' category
    def extract_processor_brand(processor):
        """Map a processor name to a brand label by substring match, first hit wins."""
        if 'Snapdragon' in processor: return 'Qualcomm Snapdragon'
        elif 'MediaTek' in processor or 'Dimensity' in processor or 'Helio' in processor: return 'MediaTek'
        # NOTE(review): `'A' in processor` matches ANY name containing a capital
        # 'A', not only Apple chips. Kept as-is on the assumption that training
        # used the same rule - confirm before tightening it.
        elif 'Bionic' in processor or 'A' in processor: return 'Apple Bionic'
        elif 'Exynos' in processor: return 'Samsung Exynos'
        elif 'Tensor' in processor: return 'Google Tensor'
        elif 'Unisoc' in processor: return 'Unisoc'
        elif 'Kirin' in processor: return 'Kirin'
        else: return 'Other'

    df_new['Processor_Brand'] = df_new['Processor'].apply(extract_processor_brand)

    # 10. Assemble numeric features plus one-hot encoded categoricals
    numerical_features_list = [
        'Launched Year', 'Mobile Weight (g)', 'RAM (GB)', 'Front Camera (MP)',
        'Main Camera (MP)', 'Ultra Camera (MP)', 'Telephoto Camera (MP)',
        'Macro Camera (MP)', 'Battery Capacity (mAh)', 'Screen Size (inches)',
        'Storage (GB)'
    ]
    categorical_cols_for_ohe = ['Processor_Brand', 'Company Name', 'Country']

    df_ohe_temp = pd.get_dummies(df_new[categorical_cols_for_ohe], drop_first=False)
    df_processed = pd.concat([df_new[numerical_features_list], df_ohe_temp], axis=1)

    # Align to the exact training columns and order in one step: trained columns
    # absent from this row (one-hot levels the phone does not have) become 0,
    # and columns unknown to the model (unseen categories) are dropped.
    # Filling column-by-column into an empty frame is order dependent: a scalar
    # 0 assigned before the first real column is set turns into NaN.
    X_processed = df_processed.reindex(columns=trained_feature_columns, fill_value=0)

    # Missing numeric values (e.g. storage that could not be parsed) are filled
    # with 0 because no training-set statistics are available at this point.
    for col in numerical_features_list:
        if col in X_processed.columns and X_processed[col].isna().any():
            print(f"C·∫£nh b√°o: C·ªôt '{col}' c√≥ gi√° tr·ªã thi·∫øu. ƒêang ƒëi·ªÅn 0.")
            X_processed[col] = X_processed[col].fillna(0)

    # Scale with the persisted scaler
    X_processed_scaled = scaler.transform(X_processed)

    # Predict on the log scale, then map back to the original price scale
    log_predicted_price = best_lgbm_model.predict(X_processed_scaled)[0]
    predicted_price = np.exp(log_predicted_price)

    return predicted_price

# --- Example usage of the prediction function ---
# Sample data (replace with the specification of a real phone)
# VERY IMPORTANT: the keys of this dictionary must be the ORIGINAL column names,
# i.e. the names before any cleaning or preprocessing step is applied.
new_phone_data = {
    'Launched Year': 2024,
    'Mobile Weight': '190g',
    'RAM': '12GB',
    'Front Camera': '32MP',
    'Back Camera': '108MP+12MP+5MP', # Third entry has no 'macro' text, so clean_back_camera stores it in the Telephoto slot (Macro stays 0)
    'Battery Capacity': '5000mAh',
    'Screen Size': '6.7 inches',
    'Storage': '256GB',
    'Processor': 'Qualcomm Snapdragon 8 Gen 3',
    'Company Name': 'Samsung',
    'Country': 'USA' # Should be one of the countries seen during training
}

# Before running this function, make sure `X_train_scaled.columns` was saved
# during training so that the function can load it.
# For example, after the line `X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)`
# add the following line:
# joblib.dump(X_train_scaled.columns, os.path.join(models_dir, 'feature_columns.joblib'))


# Once the code above has been run to save the model and the scaler:
predicted_price = predict_new_phone_price(new_phone_data)

# The function returns None when a saved artifact is missing; print only on success.
if predicted_price is not None:
    print(f"\nGi√° d·ª± ƒëo√°n c·ªßa ƒëi·ªán tho·∫°i m·ªõi l√†: ${predicted_price:.2f}")