In [4]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from datetime import datetime

In [5]:
df = pd.read_csv('cleaned_all_phones.csv')

In [6]:
usd_to_inr = 88.72

In [7]:
df['price(INR)'] = df['price(USD)'] * usd_to_inr
df["price(INR)"] = df["price(INR)"].round(2)

In [11]:
df.drop(columns=['price(USD)', 'battery_type', 'phone_name'], inplace=True)
df = df.drop_duplicates().copy()

In [12]:
df['os'] = df['os'].astype(str).str.lower()

In [13]:
def simplify_os(value):
    if 'android' in value:
        return 'Android'
    elif 'ios' in value or 'iphone' in value or 'apple' in value:
        return 'iOS'
    else:
        return 'Other'

In [14]:
df['os'] = df['os'].apply(simplify_os)

In [15]:
df[["Resolution_Width", "Resolution_Height"]] = df['resolution'].str.lower().str.split("x", expand=True)
df['Resolution_Width'] = pd.to_numeric(df['Resolution_Width'], errors='coerce')

In [16]:
df['Resolution_Height'] = pd.to_numeric(df['Resolution_Height'], errors='coerce')
df['Total_px'] = df['Resolution_Width'] * df['Resolution_Height']

In [17]:
df.drop(columns=['Resolution_Width', 'Resolution_Height', 'resolution'], inplace=True)

In [18]:
df["Pixel_Density"] = np.sqrt(df["Total_px"]) / df["inches"]

In [19]:
camera_cols = ['video_720p', 'video_1080p', 'video_4K', 'video_8K',
               'video_30fps', 'video_60fps', 'video_120fps',
               'video_240fps', 'video_480fps', 'video_960fps']

In [21]:
df[camera_cols] = df[camera_cols].astype(str).apply(lambda x: x.str.lower())


In [22]:
for col in camera_cols:
    df[col] = df[col].apply(lambda x: 1 if any(keyword in x for keyword in ['yes', 'true', '1', 'supported', 'available', 'ok', 'present']) else 0)

In [23]:
df['announcement_date'] = pd.to_datetime(df['announcement_date'], errors='coerce')
median_date = df['announcement_date'].dropna().median()

In [24]:
df['announcement_date'].fillna(median_date, inplace=True)
current_date = pd.Timestamp.today()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['announcement_date'].fillna(median_date, inplace=True)


In [25]:
df['phone_age'] = (current_date - df['announcement_date']).dt.days / 365.25

In [26]:
# Derived/Composite Features
df['ppi'] = np.sqrt(df['Total_px']) / df['inches']
df['video_capabilities'] = df[['video_720p', 'video_1080p', 'video_4K', 'video_8K']].sum(axis=1) # Max Resolution support score
df['high_fps_support'] = df[['video_30fps', 'video_60fps', 'video_120fps', 'video_240fps', 'video_480fps', 'video_960fps']].sum(axis=1) # High FPS support count
df['performance_score'] = df['ram(GB)'] * 0.5 + df['storage(GB)'] * 0.3 + df['battery'] * 0.2
df['years_since_launch'] = 2025 - df['announcement_date'].dt.year

In [27]:
def age_category(age):
    if age <= 1:
        return 'New'
    elif age <= 3:
        return 'Mid'
    else:
        return 'Old'

In [28]:
df['phone_age_category'] = df['phone_age'].apply(age_category)

In [29]:
old_features_to_drop = [
    'inches', 'battery', 'ram(GB)', 'storage(GB)', 'Total_px', 
    'Pixel_Density', 'announcement_date', 'announcement_year', 'phone_age'
] + camera_cols

In [30]:
df = df.drop(columns=[col for col in old_features_to_drop if col in df.columns])

In [31]:
cols_for_outlier_cap = ['price(INR)', 'weight(g)', 'ppi', 'video_capabilities', 'high_fps_support', 'performance_score', 'years_since_launch']

In [32]:
def cap_outliers_iqr(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df[column] = df[column].clip(lower=lower_bound, upper=upper_bound)
    return df

In [33]:
for col in cols_for_outlier_cap:
    df = cap_outliers_iqr(df, col)

In [34]:
top_brands = ['Samsung', 'Xiaomi', 'Apple', 'Realme', 'OnePlus']
df['brand'] = df['brand'].apply(lambda x: x if x in top_brands else 'Others')

In [35]:
# --- Final Model Preparation (One-Hot Encoding and Scaling) ---
df_model = df.copy()

In [36]:
# Define X and y BEFORE scaling and final encoding
y = df_model['price(INR)']
X = df_model.drop('price(INR)', axis=1)

In [37]:
# One-hot encode using pandas (drop_first=True to avoid multicollinearity)
categorical_cols = ['brand', 'os', 'phone_age_category']
X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

In [38]:
# Align columns to ensure the model training features match Streamlit app expectation
# All possible dummy columns based on Streamlit dropdowns (excluding dropped first level)
expected_dummy_cols = [
    'brand_OnePlus', 'brand_Others', 'brand_Realme', 'brand_Samsung', 'brand_Xiaomi', 
    'os_iOS', 
    'phone_age_category_Mid', 'phone_age_category_Old'
]

In [39]:
# Add missing columns with 0 and remove extra columns
for col in expected_dummy_cols:
    if col not in X.columns:
        X[col] = 0

In [40]:
X = X[['weight(g)', 'ppi', 'video_capabilities', 'high_fps_support', 
       'performance_score', 'years_since_launch'] + expected_dummy_cols].copy()

In [41]:
numeric_cols = ['weight(g)', 'ppi', 'video_capabilities', 'high_fps_support', 'performance_score', 'years_since_launch']
scaler = StandardScaler()
X[numeric_cols] = scaler.fit_transform(X[numeric_cols])

In [42]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [43]:
rf = RandomForestRegressor(random_state=42)
param_grid = {
    'n_estimators': [200, 400],
    'max_depth': [10, 20],
    'min_samples_split': [5, 10],
    'min_samples_leaf': [2, 4],
    'max_features': ['sqrt'] # 'auto' is deprecated, use 'sqrt'
}

In [44]:
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_grid,
    n_iter=10, 
    cv=5,
    verbose=0,
    random_state=42,
    n_jobs=-1
)

In [45]:
rf_random.fit(X_train, y_train)

In [46]:
# Use the best estimator
best_rf_model = rf_random.best_estimator_

In [47]:
# --- Final Evaluation ---
y_pred = best_rf_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

In [48]:
print("✅ Final Model Evaluation:")
print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R² Score: {r2:.4f}")

✅ Final Model Evaluation:
MAE: 10003.57
RMSE: 13069.39
R² Score: 0.3644


In [50]:
# --- Save Model and Scaler ---
# Save the BEST trained Random Forest model
with open("rf_model.pkl", "wb") as file:
    pickle.dump(best_rf_model, file)

In [49]:
# Save the scaler used on the final feature set
with open("scaler.pkl", "wb") as file:
    pickle.dump(scaler, file)

In [51]:
print("✅ Model and Scaler saved successfully")

✅ Model and Scaler saved successfully
