In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import joblib

In [None]:
# CSV file under the Kaggle input directory. Later cells rely on it having a
# 'Brand' column and a 'Price (USD)' column.
data_path = '/kaggle/input/phones/dataset.csv'

# Load the file into a DataFrame; the bare name on the last line makes the
# notebook render it as the cell output.
data = pd.read_csv(filepath_or_buffer=data_path)
data

In [None]:
# Collapse every brand label to its first whitespace-separated word, written in
# title case (e.g. 'samsung galaxy' -> 'Samsung'). The column is overwritten in
# place; the bare `data` on the last line displays the updated frame.
data['Brand'] = (
    data['Brand']
    .str.split()
    .str.get(0)
    .str.title()
)
data

In [None]:
# Distinct brand labels that the training loop iterates over. Missing labels are
# dropped first: a NaN entry never compares equal to any row, so it would select
# an empty subset and make the per-brand fit fail.
brands = data['Brand'].dropna().unique()

# Empty container intended for per-brand models; nothing in the visible cells
# fills it yet.
models = {}

In [None]:
# Train, evaluate, persist and plot one random-forest price model per brand.
#
# For every brand the loop:
#   1. selects that brand's rows and separates features from 'Price (USD)',
#   2. splits 80/20 into train and test rows,
#   3. fits a StandardScaler on the training rows only and applies it to both parts,
#   4. grid-searches a RandomForestRegressor with cross-validation,
#   5. prints train/test MSE and R², saves model and scaler with joblib,
#   6. draws an actual-vs-predicted scatter plot.
# Brands with too few rows to split or cross-validate are skipped with a message
# instead of raising and aborting the remaining brands.

# Hyper-parameter candidates; identical for every brand, so built once.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt', 'log2']
}
# Number of cross-validation folds used by the grid search.
cv_folds = 5

for brand in brands:
    print(f"Training model for brand: {brand}")

    brand_data = data[data['Brand'] == brand]
    X_brand = brand_data.drop(columns=['Brand', 'Price (USD)'])
    y_brand = brand_data['Price (USD)']

    # train_test_split needs at least one row on each side of the split.
    if len(brand_data) < 2:
        print(f"  Skipping {brand}: only {len(brand_data)} sample(s), cannot split into train/test.")
        continue

    # Split BEFORE scaling so that the scaler never sees the held-out rows.
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X_brand, y_brand, test_size=0.2, random_state=42
    )

    # K-fold cross-validation needs at least as many training rows as folds.
    if len(X_train_raw) < cv_folds:
        print(f"  Skipping {brand}: {len(X_train_raw)} training sample(s) is fewer than the {cv_folds} CV folds.")
        continue

    # Fit the scaler on the training rows only, then reuse its statistics for the
    # test rows; fitting on all rows would leak test information into training.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train_raw)
    X_test = scaler.transform(X_test_raw)

    rf = RandomForestRegressor(random_state=42)
    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=cv_folds, n_jobs=-1, scoring='neg_mean_squared_error', return_train_score=True)
    grid_search.fit(X_train, y_train)
    # Best parameter combination, refitted on the whole training part.
    best_rf = grid_search.best_estimator_

    y_train_pred = best_rf.predict(X_train)
    y_test_pred = best_rf.predict(X_test)
    train_mse = mean_squared_error(y_train, y_train_pred)
    test_mse = mean_squared_error(y_test, y_test_pred)
    train_r2 = r2_score(y_train, y_train_pred)
    test_r2 = r2_score(y_test, y_test_pred)

    print(f"Evaluation for brand {brand}:")
    print(f"  Train MSE: {train_mse:.2f}, Train R²: {train_r2:.2f}")
    print(f"  Test MSE: {test_mse:.2f}, Test R²: {test_r2:.2f}")

    # Persist the model together with the scaler it was trained behind, so both
    # can be reloaded as a pair for inference.
    model_filename = f'{brand}_rf_model.pkl'
    scaler_filename = f'{brand}_scaler.pkl'
    joblib.dump(best_rf, model_filename)
    joblib.dump(scaler, scaler_filename)
    print(f"Model and scaler for {brand} saved.")

    # Actual vs. predicted prices; points on the dashed diagonal are exact.
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.scatter(y_train, y_train_pred, alpha=0.7, color='blue', label='Training Data')
    ax.scatter(y_test, y_test_pred, alpha=0.7, color='orange', label='Testing Data')
    price_range = [y_brand.min(), y_brand.max()]
    ax.plot(price_range, price_range, color='red', linestyle='--', label='Perfect Prediction')
    ax.set_xlabel('Actual Prices')
    ax.set_ylabel('Predicted Prices')
    ax.set_title(f'Actual vs Predicted Prices for {brand}')
    ax.legend()
    plt.show()
    # Release the figure so that figures do not pile up across brands.
    plt.close(fig)

In [None]:
# NOTE(review): disabled inference helper, kept commented out. As written it
# would load the per-brand files saved by the training loop
# ('{brand}_rf_model.pkl' and '{brand}_scaler.pkl'), scale a single row of
# feature values with the loaded scaler, and return the one predicted price.
# When either file is missing it prints a message and returns None.
# Presumably `input_specs` must list the feature values in the same column order
# the scaler was fitted on — TODO confirm before re-enabling.
# joblib.load unpickles arbitrary objects, so only load files this notebook wrote.
# def predict_price(brand, input_specs):
#     model_filename = f'{brand}_rf_model.pkl'
#     scaler_filename = f'{brand}_scaler.pkl'
#     try:
#         rf_model = joblib.load(model_filename)
#         scaler = joblib.load(scaler_filename)
#         input_scaled = scaler.transform([input_specs])
#         predicted_price = rf_model.predict(input_scaled)
#         return predicted_price[0]
#     except FileNotFoundError:
#         print(f"No model available for the brand: {brand}")
#         return None