In [13]:
!pip install --user pandas



In [1]:
import pandas as pd
import numpy as np

# Load dataset
df = pd.read_csv('mobile_price.csv')
print("Dataset shape:", df.shape)
print("\nFirst 5 rows:")
print(df.head())
print("\nTarget distribution (price_range):")
print(df['price_range'].value_counts())

Dataset shape: (2000, 21)

First 5 rows:
   battery_power  blue  clock_speed  dual_sim  fc  four_g  int_memory  m_dep  \
0            842     0          2.2         0   1       0           7    0.6   
1           1021     1          0.5         1   0       1          53    0.7   
2            563     1          0.5         1   2       1          41    0.9   
3            615     1          2.5         0   0       0          10    0.8   
4           1821     1          1.2         0  13       1          44    0.6   

   mobile_wt  n_cores  ...  px_height  px_width   ram  sc_h  sc_w  talk_time  \
0        188        2  ...         20       756  2549     9     7         19   
1        136        3  ...        905      1988  2631    17     3          7   
2        145        5  ...       1263      1716  2603    11     2          9   
3        131        6  ...       1216      1786  2769    16     8         11   
4        141        2  ...       1208      1212  1411     8     2         15  

In [3]:
# 1. Check basic info
print("=== Dataset Info ===")
print(df.info())
print("\n=== Missing Values ===")
print(df.isnull().sum())

# 2. Check data types
print("\n=== Data Types ===")
print(df.dtypes)

# 3. Check target distribution
print("\n=== Target Distribution (price_range) ===")
print(df['price_range'].value_counts())
print("\nPercentage distribution:")
print(df['price_range'].value_counts(normalize=True) * 100)

# 4. Separate features and target
X = df.drop('price_range', axis=1)
y = df['price_range']

print("\n=== Features shape ===")
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

# 5. Split the data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("\n=== Train-Test Split ===")
print(f"Training set: {X_train.shape}")
print(f"Testing set: {X_test.shape}")
print(f"Train target distribution:\n{y_train.value_counts()}")
print(f"\nTest target distribution:\n{y_test.value_counts()}")

=== Dataset Info ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   battery_power  2000 non-null   int64  
 1   blue           2000 non-null   int64  
 2   clock_speed    2000 non-null   float64
 3   dual_sim       2000 non-null   int64  
 4   fc             2000 non-null   int64  
 5   four_g         2000 non-null   int64  
 6   int_memory     2000 non-null   int64  
 7   m_dep          2000 non-null   float64
 8   mobile_wt      2000 non-null   int64  
 9   n_cores        2000 non-null   int64  
 10  pc             2000 non-null   int64  
 11  px_height      2000 non-null   int64  
 12  px_width       2000 non-null   int64  
 13  ram            2000 non-null   int64  
 14  sc_h           2000 non-null   int64  
 15  sc_w           2000 non-null   int64  
 16  talk_time      2000 non-null   int64  
 17  three_g        2000 non-null   

In [4]:
# Import necessary libraries
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, matthews_corrcoef
from sklearn.preprocessing import LabelBinarizer
import warnings
warnings.filterwarnings('ignore')

# 1. Initialize and train model
log_reg = LogisticRegression(max_iter=1000, random_state=42)
log_reg.fit(X_train, y_train)

# 2. Make predictions
y_pred = log_reg.predict(X_test)
y_pred_proba = log_reg.predict_proba(X_test)

# 3. Calculate metrics
# For multi-class AUC, we need to binarize the labels
lb = LabelBinarizer()
lb.fit(y_test)
y_test_bin = lb.transform(y_test)

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test_bin, y_pred_proba, multi_class='ovr', average='weighted')
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
mcc = matthews_corrcoef(y_test, y_pred)

# 4. Print results
print("=== Logistic Regression Results ===")
print(f"Accuracy: {accuracy:.4f}")
print(f"AUC Score: {auc:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"MCC Score: {mcc:.4f}")

# 5. Store results for later comparison
log_reg_results = {
    'Model': 'Logistic Regression',
    'Accuracy': accuracy,
    'AUC': auc,
    'Precision': precision,
    'Recall': recall,
    'F1': f1,
    'MCC': mcc
}

# Create a results DataFrame to store all model results
results_df = pd.DataFrame([log_reg_results])
print("\nResults stored for comparison table.")

=== Logistic Regression Results ===
Accuracy: 0.6700
AUC Score: 0.8968
Precision: 0.6809
Recall: 0.6700
F1 Score: 0.6746
MCC Score: 0.5605

Results stored for comparison table.


In [5]:
# Import Decision Tree
from sklearn.tree import DecisionTreeClassifier

# 1. Initialize and train Decision Tree
dt_clf = DecisionTreeClassifier(random_state=42)
dt_clf.fit(X_train, y_train)

# 2. Make predictions
y_pred_dt = dt_clf.predict(X_test)
y_pred_proba_dt = dt_clf.predict_proba(X_test)

# 3. Calculate metrics
accuracy_dt = accuracy_score(y_test, y_pred_dt)
auc_dt = roc_auc_score(y_test_bin, y_pred_proba_dt, multi_class='ovr', average='weighted')
precision_dt = precision_score(y_test, y_pred_dt, average='weighted')
recall_dt = recall_score(y_test, y_pred_dt, average='weighted')
f1_dt = f1_score(y_test, y_pred_dt, average='weighted')
mcc_dt = matthews_corrcoef(y_test, y_pred_dt)

# 4. Print results
print("=== Decision Tree Classifier Results ===")
print(f"Accuracy: {accuracy_dt:.4f}")
print(f"AUC Score: {auc_dt:.4f}")
print(f"Precision: {precision_dt:.4f}")
print(f"Recall: {recall_dt:.4f}")
print(f"F1 Score: {f1_dt:.4f}")
print(f"MCC Score: {mcc_dt:.4f}")

# 5. Store results
dt_results = {
    'Model': 'Decision Tree',
    'Accuracy': accuracy_dt,
    'AUC': auc_dt,
    'Precision': precision_dt,
    'Recall': recall_dt,
    'F1': f1_dt,
    'MCC': mcc_dt
}

# Add to results DataFrame
results_df = pd.concat([results_df, pd.DataFrame([dt_results])], ignore_index=True)
print("\nDecision Tree results added to comparison table.")
print("\nCurrent Results Table:")
print(results_df)

=== Decision Tree Classifier Results ===
Accuracy: 0.8300
AUC Score: 0.8867
Precision: 0.8319
Recall: 0.8300
F1 Score: 0.8302
MCC Score: 0.7738

Decision Tree results added to comparison table.

Current Results Table:
                 Model  Accuracy       AUC  Precision  Recall        F1  \
0  Logistic Regression      0.67  0.896800   0.680872    0.67  0.674565   
1        Decision Tree      0.83  0.886667   0.831883    0.83  0.830168   

        MCC  
0  0.560537  
1  0.773811  


In [6]:
# Import KNN
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# 1. Scale the features (important for KNN)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 2. Initialize and train KNN (using k=5 as default)
knn_clf = KNeighborsClassifier(n_neighbors=5)
knn_clf.fit(X_train_scaled, y_train)

# 3. Make predictions
y_pred_knn = knn_clf.predict(X_test_scaled)
y_pred_proba_knn = knn_clf.predict_proba(X_test_scaled)

# 4. Calculate metrics
accuracy_knn = accuracy_score(y_test, y_pred_knn)
auc_knn = roc_auc_score(y_test_bin, y_pred_proba_knn, multi_class='ovr', average='weighted')
precision_knn = precision_score(y_test, y_pred_knn, average='weighted')
recall_knn = recall_score(y_test, y_pred_knn, average='weighted')
f1_knn = f1_score(y_test, y_pred_knn, average='weighted')
mcc_knn = matthews_corrcoef(y_test, y_pred_knn)

# 5. Print results
print("=== K-Nearest Neighbors Classifier Results ===")
print(f"Accuracy: {accuracy_knn:.4f}")
print(f"AUC Score: {auc_knn:.4f}")
print(f"Precision: {precision_knn:.4f}")
print(f"Recall: {recall_knn:.4f}")
print(f"F1 Score: {f1_knn:.4f}")
print(f"MCC Score: {mcc_knn:.4f}")

# 6. Store results
knn_results = {
    'Model': 'K-Nearest Neighbors',
    'Accuracy': accuracy_knn,
    'AUC': auc_knn,
    'Precision': precision_knn,
    'Recall': recall_knn,
    'F1': f1_knn,
    'MCC': mcc_knn
}

# Add to results DataFrame
results_df = pd.concat([results_df, pd.DataFrame([knn_results])], ignore_index=True)
print("\nKNN results added to comparison table.")
print("\nCurrent Results Table:")
print(results_df)

=== K-Nearest Neighbors Classifier Results ===
Accuracy: 0.5000
AUC Score: 0.7697
Precision: 0.5211
Recall: 0.5000
F1 Score: 0.5054
MCC Score: 0.3350

KNN results added to comparison table.

Current Results Table:
                 Model  Accuracy       AUC  Precision  Recall        F1  \
0  Logistic Regression      0.67  0.896800   0.680872    0.67  0.674565   
1        Decision Tree      0.83  0.886667   0.831883    0.83  0.830168   
2  K-Nearest Neighbors      0.50  0.769750   0.521130    0.50  0.505355   

        MCC  
0  0.560537  
1  0.773811  
2  0.334993  


In [7]:
# Import Naive Bayes
from sklearn.naive_bayes import GaussianNB

# 1. Initialize and train Gaussian Naive Bayes
nb_clf = GaussianNB()
nb_clf.fit(X_train, y_train)

# 2. Make predictions
y_pred_nb = nb_clf.predict(X_test)
y_pred_proba_nb = nb_clf.predict_proba(X_test)

# 3. Calculate metrics
accuracy_nb = accuracy_score(y_test, y_pred_nb)
auc_nb = roc_auc_score(y_test_bin, y_pred_proba_nb, multi_class='ovr', average='weighted')
precision_nb = precision_score(y_test, y_pred_nb, average='weighted')
recall_nb = recall_score(y_test, y_pred_nb, average='weighted')
f1_nb = f1_score(y_test, y_pred_nb, average='weighted')
mcc_nb = matthews_corrcoef(y_test, y_pred_nb)

# 4. Print results
print("=== Gaussian Naive Bayes Classifier Results ===")
print(f"Accuracy: {accuracy_nb:.4f}")
print(f"AUC Score: {auc_nb:.4f}")
print(f"Precision: {precision_nb:.4f}")
print(f"Recall: {recall_nb:.4f}")
print(f"F1 Score: {f1_nb:.4f}")
print(f"MCC Score: {mcc_nb:.4f}")

# 5. Store results
nb_results = {
    'Model': 'Naive Bayes',
    'Accuracy': accuracy_nb,
    'AUC': auc_nb,
    'Precision': precision_nb,
    'Recall': recall_nb,
    'F1': f1_nb,
    'MCC': mcc_nb
}

# Add to results DataFrame
results_df = pd.concat([results_df, pd.DataFrame([nb_results])], ignore_index=True)
print("\nNaive Bayes results added to comparison table.")
print("\nCurrent Results Table:")
print(results_df)

=== Gaussian Naive Bayes Classifier Results ===
Accuracy: 0.8100
AUC Score: 0.9506
Precision: 0.8113
Recall: 0.8100
F1 Score: 0.8105
MCC Score: 0.7468

Naive Bayes results added to comparison table.

Current Results Table:
                 Model  Accuracy       AUC  Precision  Recall        F1  \
0  Logistic Regression      0.67  0.896800   0.680872    0.67  0.674565   
1        Decision Tree      0.83  0.886667   0.831883    0.83  0.830168   
2  K-Nearest Neighbors      0.50  0.769750   0.521130    0.50  0.505355   
3          Naive Bayes      0.81  0.950567   0.811326    0.81  0.810458   

        MCC  
0  0.560537  
1  0.773811  
2  0.334993  
3  0.746804  


In [8]:
# Import Random Forest
from sklearn.ensemble import RandomForestClassifier

# 1. Initialize and train Random Forest
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_clf.fit(X_train, y_train)

# 2. Make predictions
y_pred_rf = rf_clf.predict(X_test)
y_pred_proba_rf = rf_clf.predict_proba(X_test)

# 3. Calculate metrics
accuracy_rf = accuracy_score(y_test, y_pred_rf)
auc_rf = roc_auc_score(y_test_bin, y_pred_proba_rf, multi_class='ovr', average='weighted')
precision_rf = precision_score(y_test, y_pred_rf, average='weighted')
recall_rf = recall_score(y_test, y_pred_rf, average='weighted')
f1_rf = f1_score(y_test, y_pred_rf, average='weighted')
mcc_rf = matthews_corrcoef(y_test, y_pred_rf)

# 4. Print results
print("=== Random Forest Classifier Results ===")
print(f"Accuracy: {accuracy_rf:.4f}")
print(f"AUC Score: {auc_rf:.4f}")
print(f"Precision: {precision_rf:.4f}")
print(f"Recall: {recall_rf:.4f}")
print(f"F1 Score: {f1_rf:.4f}")
print(f"MCC Score: {mcc_rf:.4f}")

# 5. Store results
rf_results = {
    'Model': 'Random Forest',
    'Accuracy': accuracy_rf,
    'AUC': auc_rf,
    'Precision': precision_rf,
    'Recall': recall_rf,
    'F1': f1_rf,
    'MCC': mcc_rf
}

# Add to results DataFrame
results_df = pd.concat([results_df, pd.DataFrame([rf_results])], ignore_index=True)
print("\nRandom Forest results added to comparison table.")
print("\nCurrent Results Table:")
print(results_df)

=== Random Forest Classifier Results ===
Accuracy: 0.8800
AUC Score: 0.9769
Precision: 0.8796
Recall: 0.8800
F1 Score: 0.8797
MCC Score: 0.8400

Random Forest results added to comparison table.

Current Results Table:
                 Model  Accuracy       AUC  Precision  Recall        F1  \
0  Logistic Regression      0.67  0.896800   0.680872    0.67  0.674565   
1        Decision Tree      0.83  0.886667   0.831883    0.83  0.830168   
2  K-Nearest Neighbors      0.50  0.769750   0.521130    0.50  0.505355   
3          Naive Bayes      0.81  0.950567   0.811326    0.81  0.810458   
4        Random Forest      0.88  0.976929   0.879614    0.88  0.879734   

        MCC  
0  0.560537  
1  0.773811  
2  0.334993  
3  0.746804  
4  0.840049  


In [10]:
# Import XGBoost
import xgboost as xgb

# 1. Initialize and train XGBoost
xgb_clf = xgb.XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    random_state=42,
    use_label_encoder=False,
    eval_metric='mlogloss'
)
xgb_clf.fit(X_train, y_train)

# 2. Make predictions
y_pred_xgb = xgb_clf.predict(X_test)
y_pred_proba_xgb = xgb_clf.predict_proba(X_test)

# 3. Calculate metrics
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
auc_xgb = roc_auc_score(y_test_bin, y_pred_proba_xgb, multi_class='ovr', average='weighted')
precision_xgb = precision_score(y_test, y_pred_xgb, average='weighted')
recall_xgb = recall_score(y_test, y_pred_xgb, average='weighted')
f1_xgb = f1_score(y_test, y_pred_xgb, average='weighted')
mcc_xgb = matthews_corrcoef(y_test, y_pred_xgb)

# 4. Print results
print("=== XGBoost Classifier Results ===")
print(f"Accuracy: {accuracy_xgb:.4f}")
print(f"AUC Score: {auc_xgb:.4f}")
print(f"Precision: {precision_xgb:.4f}")
print(f"Recall: {recall_xgb:.4f}")
print(f"F1 Score: {f1_xgb:.4f}")
print(f"MCC Score: {mcc_xgb:.4f}")

# 5. Store results
xgb_results = {
    'Model': 'XGBoost',
    'Accuracy': accuracy_xgb,
    'AUC': auc_xgb,
    'Precision': precision_xgb,
    'Recall': recall_xgb,
    'F1': f1_xgb,
    'MCC': mcc_xgb
}

# Add to results DataFrame
results_df = pd.concat([results_df, pd.DataFrame([xgb_results])], ignore_index=True)
print("\nXGBoost results added to comparison table.")
print("\n=== FINAL COMPARISON TABLE ===")
print(results_df.to_string(index=False))

=== XGBoost Classifier Results ===
Accuracy: 0.9225
AUC Score: 0.9937
Precision: 0.9226
Recall: 0.9225
F1 Score: 0.9225
MCC Score: 0.8967

XGBoost results added to comparison table.

=== FINAL COMPARISON TABLE ===
              Model  Accuracy      AUC  Precision  Recall       F1      MCC
Logistic Regression    0.6700 0.896800   0.680872  0.6700 0.674565 0.560537
      Decision Tree    0.8300 0.886667   0.831883  0.8300 0.830168 0.773811
K-Nearest Neighbors    0.5000 0.769750   0.521130  0.5000 0.505355 0.334993
        Naive Bayes    0.8100 0.950567   0.811326  0.8100 0.810458 0.746804
      Random Forest    0.8800 0.976929   0.879614  0.8800 0.879734 0.840049
            XGBoost    0.9225 0.993700   0.922631  0.9225 0.922482 0.896719


In [11]:
# Import joblib for saving models
import joblib
import os

# 1. Create a directory to save models
model_dir = 'saved_models'
os.makedirs(model_dir, exist_ok=True)

# 2. Save all trained models
joblib.dump(log_reg, f'{model_dir}/logistic_regression.pkl')
joblib.dump(dt_clf, f'{model_dir}/decision_tree.pkl')
joblib.dump(knn_clf, f'{model_dir}/knn.pkl')
joblib.dump(nb_clf, f'{model_dir}/naive_bayes.pkl')
joblib.dump(rf_clf, f'{model_dir}/random_forest.pkl')
joblib.dump(xgb_clf, f'{model_dir}/xgboost.pkl')
joblib.dump(scaler, f'{model_dir}/scaler.pkl')  # Save the scaler for KNN

print("All models saved successfully in 'saved_models' directory!")
print(f"Models saved: {os.listdir(model_dir)}")

# 3. Create a well-formatted comparison table
print("\n" + "="*100)
print("FINAL MODEL COMPARISON TABLE")
print("="*100)

# Format the results for better display
formatted_results = results_df.copy()

# Round all metric values to 4 decimal places
metric_columns = ['Accuracy', 'AUC', 'Precision', 'Recall', 'F1', 'MCC']
for col in metric_columns:
    formatted_results[col] = formatted_results[col].apply(lambda x: f"{x:.4f}")

# Display the table
print(formatted_results.to_string(index=False))

# 4. Save results to CSV for reference
results_df.to_csv('model_comparison_results.csv', index=False)
print("\nResults saved to 'model_comparison_results.csv'")

# 5. Identify best model for each metric
print("\n" + "="*100)
print("BEST PERFORMING MODELS FOR EACH METRIC")
print("="*100)

for metric in metric_columns:
    if metric in ['Accuracy', 'AUC', 'Precision', 'Recall', 'F1', 'MCC']:
        # Find the model with highest value for this metric
        best_idx = results_df[metric].idxmax()
        best_model = results_df.loc[best_idx, 'Model']
        best_value = results_df.loc[best_idx, metric]
        print(f"{metric}: {best_model} ({best_value:.4f})")

All models saved successfully in 'saved_models' directory!
Models saved: ['logistic_regression.pkl', 'decision_tree.pkl', 'knn.pkl', 'naive_bayes.pkl', 'random_forest.pkl', 'xgboost.pkl', 'scaler.pkl']

FINAL MODEL COMPARISON TABLE
              Model Accuracy    AUC Precision Recall     F1    MCC
Logistic Regression   0.6700 0.8968    0.6809 0.6700 0.6746 0.5605
      Decision Tree   0.8300 0.8867    0.8319 0.8300 0.8302 0.7738
K-Nearest Neighbors   0.5000 0.7697    0.5211 0.5000 0.5054 0.3350
        Naive Bayes   0.8100 0.9506    0.8113 0.8100 0.8105 0.7468
      Random Forest   0.8800 0.9769    0.8796 0.8800 0.8797 0.8400
            XGBoost   0.9225 0.9937    0.9226 0.9225 0.9225 0.8967

Results saved to 'model_comparison_results.csv'

BEST PERFORMING MODELS FOR EACH METRIC
Accuracy: XGBoost (0.9225)
AUC: XGBoost (0.9937)
Precision: XGBoost (0.9226)
Recall: XGBoost (0.9225)
F1: XGBoost (0.9225)
MCC: XGBoost (0.8967)


In [12]:
import streamlit as st
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, matthews_corrcoef, confusion_matrix, classification_report
import warnings
warnings.filterwarnings('ignore')

# Set page configuration
st.set_page_config(
    page_title="Mobile Price Classification",
    page_icon="üì±",
    layout="wide"
)

# Title
st.title("üì± Mobile Price Classification App")
st.markdown("""
This app predicts mobile phone price range (0-3) based on various specifications.
Upload your test data (CSV) or use the sample data to see predictions.
""")

# Sidebar for navigation
st.sidebar.title("Navigation")
options = st.sidebar.radio("Select a page:", 
                          ["Home", "Upload & Predict", "Model Comparison", "About"])

# Load saved models
@st.cache_resource
def load_models():
    models = {
        "Logistic Regression": joblib.load('saved_models/logistic_regression.pkl'),
        "Decision Tree": joblib.load('saved_models/decision_tree.pkl'),
        "K-Nearest Neighbors": joblib.load('saved_models/knn.pkl'),
        "Naive Bayes": joblib.load('saved_models/naive_bayes.pkl'),
        "Random Forest": joblib.load('saved_models/random_forest.pkl'),
        "XGBoost": joblib.load('saved_models/xgboost.pkl')
    }
    scaler = joblib.load('saved_models/scaler.pkl')
    return models, scaler

# Home Page
if options == "Home":
    st.header("Welcome to Mobile Price Classifier")
    st.markdown("""
    ### Dataset Description
    This dataset contains information about 2000 mobile phones with the following features:
    
    **Features (20):**
    1. **battery_power** - Total energy a battery can store (mAh)
    2. **blue** - Has Bluetooth or not (0/1)
    3. **clock_speed** - Speed of microprocessor (GHz)
    4. **dual_sim** - Has dual sim support (0/1)
    5. **fc** - Front camera megapixels
    6. **four_g** - Has 4G or not (0/1)
    7. **int_memory** - Internal memory (GB)
    8. **m_dep** - Mobile depth (cm)
    9. **mobile_wt** - Weight of mobile phone
    10. **n_cores** - Number of cores of processor
    11. **pc** - Primary camera megapixels
    12. **px_height** - Pixel resolution height
    13. **px_width** - Pixel resolution width
    14. **ram** - Random Access Memory (MB)
    15. **sc_h** - Screen height (cm)
    16. **sc_w** - Screen width (cm)
    17. **talk_time** - Longest battery life on single charge (hours)
    18. **three_g** - Has 3G or not (0/1)
    19. **touch_screen** - Has touch screen or not (0/1)
    20. **wifi** - Has wifi or not (0/1)
    
    **Target:**
    - **price_range** - Price class (0: low cost, 1: medium cost, 2: high cost, 3: very high cost)
    
    ### Models Implemented
    1. Logistic Regression
    2. Decision Tree Classifier
    3. K-Nearest Neighbor Classifier
    4. Naive Bayes Classifier
    5. Random Forest (Ensemble)
    6. XGBoost (Ensemble)
    """)
    
    # Show sample data
    if st.checkbox("Show sample data"):
        sample_data = pd.read_csv('mobile_price.csv').head(10)
        st.dataframe(sample_data)

# Upload & Predict Page
elif options == "Upload & Predict":
    st.header("üì§ Upload Test Data and Predict")
    
    # Option 1: Upload CSV
    uploaded_file = st.file_uploader("Upload your test CSV file (without price_range column)", type=['csv'])
    
    # Option 2: Use sample test data
    use_sample = st.checkbox("Use sample test data")
    
    if uploaded_file is not None or use_sample:
        try:
            # Load data
            if uploaded_file is not None:
                test_df = pd.read_csv(uploaded_file)
            else:
                # Use last 100 rows as sample test data
                full_data = pd.read_csv('mobile_price.csv')
                test_df = full_data.drop('price_range', axis=1).tail(100)
            
            st.success("Data loaded successfully!")
            st.write(f"**Shape:** {test_df.shape}")
            st.write("**First 5 rows:**")
            st.dataframe(test_df.head())
            
            # Check if all required columns are present
            required_columns = ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
                               'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
                               'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
                               'touch_screen', 'wifi']
            
            missing_cols = [col for col in required_columns if col not in test_df.columns]
            
            if missing_cols:
                st.error(f"Missing columns: {missing_cols}")
            else:
                # Model selection
                st.subheader("Select Model for Prediction")
                model_names = ["Logistic Regression", "Decision Tree", "K-Nearest Neighbors", 
                              "Naive Bayes", "Random Forest", "XGBoost"]
                selected_model = st.selectbox("Choose a model:", model_names)
                
                if st.button("Predict"):
                    # Load models
                    models, scaler = load_models()
                    model = models[selected_model]
                    
                    # Prepare data (scale if KNN)
                    if selected_model == "K-Nearest Neighbors":
                        X_test_scaled = scaler.transform(test_df)
                        predictions = model.predict(X_test_scaled)
                    else:
                        predictions = model.predict(test_df)
                    
                    # Add predictions to dataframe
                    results_df = test_df.copy()
                    results_df['Predicted_Price_Range'] = predictions
                    
                    # Map price range to labels
                    price_labels = {0: "Low Cost", 1: "Medium Cost", 2: "High Cost", 3: "Very High Cost"}
                    results_df['Predicted_Label'] = results_df['Predicted_Price_Range'].map(price_labels)
                    
                    # Display results
                    st.subheader("üìä Prediction Results")
                    st.dataframe(results_df[['Predicted_Price_Range', 'Predicted_Label']].head(20))
                    
                    # Show distribution
                    st.subheader("üìà Prediction Distribution")
                    fig, ax = plt.subplots(figsize=(10, 6))
                    results_df['Predicted_Label'].value_counts().plot(kind='bar', ax=ax, color='skyblue')
                    ax.set_title('Distribution of Predicted Price Ranges')
                    ax.set_xlabel('Price Range')
                    ax.set_ylabel('Count')
                    plt.xticks(rotation=45)
                    st.pyplot(fig)
                    
                    # Download results
                    csv = results_df.to_csv(index=False)
                    st.download_button(
                        label="Download Predictions as CSV",
                        data=csv,
                        file_name="mobile_price_predictions.csv",
                        mime="text/csv"
                    )
                    
        except Exception as e:
            st.error(f"Error: {str(e)}")

# Model Comparison Page
elif options == "Model Comparison":
    st.header("üìä Model Performance Comparison")
    
    # Load pre-calculated results
    try:
        results_df = pd.read_csv('model_comparison_results.csv')
        
        # Display metrics table
        st.subheader("Evaluation Metrics Table")
        st.dataframe(results_df.style.format({
            'Accuracy': '{:.4f}',
            'AUC': '{:.4f}',
            'Precision': '{:.4f}',
            'Recall': '{:.4f}',
            'F1': '{:.4f}',
            'MCC': '{:.4f}'
        }))
        
        # Create comparison chart
        st.subheader("Model Performance Visualization")
        
        # Let user select metric to visualize
        metric_to_plot = st.selectbox("Select metric to visualize:", 
                                     ['Accuracy', 'AUC', 'Precision', 'Recall', 'F1', 'MCC'])
        
        fig, ax = plt.subplots(figsize=(12, 6))
        bars = ax.bar(results_df['Model'], results_df[metric_to_plot], color='lightcoral')
        ax.set_title(f'{metric_to_plot} Comparison Across Models')
        ax.set_xlabel('Model')
        ax.set_ylabel(metric_to_plot)
        ax.set_ylim([0, 1])
        plt.xticks(rotation=45)
        
        # Add value labels on bars
        for bar in bars:
            height = bar.get_height()
            ax.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                   f'{height:.3f}', ha='center', va='bottom')
        
        st.pyplot(fig)
        
        # Show observations
        st.subheader("üîç Model Performance Observations")
        st.markdown("""
        **Logistic Regression**: Simple linear model, decent performance for baseline.
        
        **Decision Tree**: May overfit but provides good interpretability.
        
        **K-Nearest Neighbors**: Distance-based, requires feature scaling.
        
        **Naive Bayes**: Fast but assumes feature independence.
        
        **Random Forest**: Ensemble method, reduces overfitting, generally robust.
        
        **XGBoost**: Advanced ensemble, often best for tabular data.
        """)
        
    except Exception as e:
        st.error(f"Error loading results: {str(e)}")
        st.info("Please run the model training notebook first to generate comparison results.")

# About Page
elif options == "About":
    st.header("‚ÑπÔ∏è About This Project")
    st.markdown("""
    ### Machine Learning Assignment 2
    
    **Objective**: Implement and compare 6 classification models for mobile price prediction.
    
    **Models Implemented**:
    1. Logistic Regression
    2. Decision Tree Classifier
    3. K-Nearest Neighbor Classifier
    4. Naive Bayes Classifier
    5. Random Forest (Ensemble)
    6. XGBoost (Ensemble)
    
    **Evaluation Metrics**:
    - Accuracy
    - AUC Score
    - Precision
    - Recall
    - F1 Score
    - Matthews Correlation Coefficient (MCC)
    
    **Dataset**: Mobile Price Classification Dataset (2000 samples, 20 features)
    
    **Deployment**: Streamlit Web Application
    
    ### How to Use
    1. Go to **Upload & Predict** page to test the models
    2. Upload a CSV file with mobile features (without price_range)
    3. Select a model from dropdown
    4. Click Predict to see results
    5. Check **Model Comparison** page to see performance metrics
    
    ### Technical Details
    - Built with Python, Scikit-learn, XGBoost
    - Web interface: Streamlit
    - Deployment: Streamlit Community Cloud
    """)
    
    st.info("**Note**: For assignment submission, this app is deployed on Streamlit Community Cloud with all required features.")

# Footer
st.sidebar.markdown("---")
st.sidebar.info(
    """
    **Assignment 2 - Machine Learning**  
    M.Tech (AIML/DSE)  
    BITS Pilani WILP  
    """
)

2026-02-06 20:31:41.245 
  command:

    streamlit run /home/cloud/.local/lib/python3.9/site-packages/ipykernel_launcher.py [ARGUMENTS]
2026-02-06 20:31:41.264 Session state does not function when running a script without `streamlit run`


DeltaGenerator(_root_container=1, _parent=DeltaGenerator())

In [16]:
# Get your actual metrics in the correct format
print("=== ACTUAL METRICS FOR README.md ===")
print("\nCopy and paste this table into your README.md:\n")

# Print the markdown table header
print("| ML Model Name | Accuracy | AUC | Precision | Recall | F1 | MCC |")
print("|---------------|----------|-----|-----------|--------|----|-----|")

# Print each row with your actual values
for index, row in results_df.iterrows():
    print(f"| {row['Model']} | {row['Accuracy']:.4f} | {row['AUC']:.4f} | {row['Precision']:.4f} | {row['Recall']:.4f} | {row['F1']:.4f} | {row['MCC']:.4f} |")

print("\n=== OBSERVATIONS TABLE ===")
print("\nCopy this format for your observations:")
print("""
| ML Model Name | Observation about model performance |
|---------------|--------------------------------------|
| Logistic Regression | [Your observation based on your results] |
| Decision Tree | [Your observation based on your results] |
| K-Nearest Neighbors | [Your observation based on your results] |
| Naive Bayes | [Your observation based on your results] |
| Random Forest (Ensemble) | [Your observation based on your results] |
| XGBoost (Ensemble) | [Your observation based on your results] |
""")

=== ACTUAL METRICS FOR README.md ===

Copy and paste this table into your README.md:

| ML Model Name | Accuracy | AUC | Precision | Recall | F1 | MCC |
|---------------|----------|-----|-----------|--------|----|-----|
| Logistic Regression | 0.6700 | 0.8968 | 0.6809 | 0.6700 | 0.6746 | 0.5605 |
| Decision Tree | 0.8300 | 0.8867 | 0.8319 | 0.8300 | 0.8302 | 0.7738 |
| K-Nearest Neighbors | 0.5000 | 0.7697 | 0.5211 | 0.5000 | 0.5054 | 0.3350 |
| Naive Bayes | 0.8100 | 0.9506 | 0.8113 | 0.8100 | 0.8105 | 0.7468 |
| Random Forest | 0.8800 | 0.9769 | 0.8796 | 0.8800 | 0.8797 | 0.8400 |
| XGBoost | 0.9225 | 0.9937 | 0.9226 | 0.9225 | 0.9225 | 0.8967 |

=== OBSERVATIONS TABLE ===

Copy this format for your observations:

| ML Model Name | Observation about model performance |
|---------------|--------------------------------------|
| Logistic Regression | [Your observation based on your results] |
| Decision Tree | [Your observation based on your results] |
| K-Nearest Neighbors | [Your obs

In [17]:
# Get missing metrics
print("=== MISSING METRICS ===")
print("Run this code to get Logistic Regression and Decision Tree metrics:\n")

# Filter for Logistic Regression and Decision Tree
missing_models = results_df[results_df['Model'].isin(['Logistic Regression', 'Decision Tree'])]

for index, row in missing_models.iterrows():
    print(f"{row['Model']}:")
    print(f"  Accuracy: {row['Accuracy']:.4f}")
    print(f"  AUC: {row['AUC']:.4f}")
    print(f"  Precision: {row['Precision']:.4f}")
    print(f"  Recall: {row['Recall']:.4f}")
    print(f"  F1: {row['F1']:.4f}")
    print(f"  MCC: {row['MCC']:.4f}")
    print()

=== MISSING METRICS ===
Run this code to get Logistic Regression and Decision Tree metrics:

Logistic Regression:
  Accuracy: 0.6700
  AUC: 0.8968
  Precision: 0.6809
  Recall: 0.6700
  F1: 0.6746
  MCC: 0.5605

Decision Tree:
  Accuracy: 0.8300
  AUC: 0.8867
  Precision: 0.8319
  Recall: 0.8300
  F1: 0.8302
  MCC: 0.7738



In [18]:
streamlit run app.py

SyntaxError: invalid syntax (3737097518.py, line 1)