# 📱 Mobile Phone Price Prediction — Clean Notebook (No caas_jupyter_tools)

In [1]:
import pandas as pd, os
path_csv='dataset.csv'
path_xlsx='Book1.csv.xlsx'
df = pd.read_csv(path_csv) if os.path.exists(path_csv) else pd.read_excel(path_xlsx)
df.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [2]:
df['price_range'].value_counts().sort_index()

price_range
0    500
1    500
2    500
3    500
Name: count, dtype: int64

In [3]:
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

df.columns = [str(c).strip().lower() for c in df.columns]
target_col = 'price_range'
X = df.drop(columns=[target_col])
y = df[target_col].astype(int)
numeric_cols = [c for c in X.columns if pd.api.types.is_numeric_dtype(X[c])]
preprocessor = ColumnTransformer([
    ('num', Pipeline([('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler())]), numeric_cols)
], remainder='drop')
models = {
    'LogReg': LogisticRegression(max_iter=500),
    'RandomForest': RandomForestClassifier(n_estimators=200, random_state=42),
    'GradientBoosting': GradientBoostingClassifier(random_state=42)
}
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
rows = []
for name, clf in models.items():
    pipe = Pipeline([('pre', preprocessor), ('clf', clf)])
    acc = cross_val_score(pipe, X, y, cv=cv, scoring='accuracy').mean()
    rows.append((name, acc))
pd.DataFrame(rows, columns=['Model','CV Accuracy'])

Unnamed: 0,Model,CV Accuracy
0,LogReg,0.961498
1,RandomForest,0.882496
2,GradientBoosting,0.889502


In [4]:
from sklearn.model_selection import train_test_split
best = 'RandomForest'  # change to whichever performed best
pipe = Pipeline([('pre', preprocessor), ('clf', models[best])])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
pipe.fit(X_train, y_train)
print('Accuracy:', accuracy_score(y_test, pipe.predict(X_test)))
print(classification_report(y_test, pipe.predict(X_test)))

Accuracy: 0.8775
              precision    recall  f1-score   support

           0       0.94      0.95      0.95       100
           1       0.81      0.84      0.82       100
           2       0.82      0.79      0.81       100
           3       0.94      0.93      0.93       100

    accuracy                           0.88       400
   macro avg       0.88      0.88      0.88       400
weighted avg       0.88      0.88      0.88       400



In [5]:
import joblib
joblib.dump({
    'pipeline': pipe,
    'best_model_name': best,
    'columns': list(X.columns),
    'target': target_col
}, 'mobile_price_model.joblib')
print('Model saved as mobile_price_model.joblib')

Model saved as mobile_price_model.joblib
