In [2]:
import pandas as pd

# Load the dataset
file_path = 'smartphones.csv'
df = pd.read_csv(file_path)

# Inspect the dataset
print(df.head())
print(df.info())
print(df.describe())


                                     Smartphone     Brand           Model  \
0            Realme C55 8/256GB Sunshower Libre    Realme             C55   
1      Samsung Galaxy M23 5G 4/128GB Azul Libre   Samsung      Galaxy M23   
2  Motorola Moto G13 4/128GB Azul Lavanda Libre  Motorola        Moto G13   
3      Xiaomi Redmi Note 11S 6/128GB Gris Libre    Xiaomi  Redmi Note 11S   
4       Nothing Phone (2) 12/512GB Blanco Libre   Nothing       Phone (2)   

    RAM  Storage   Color Free  Final Price  
0   8.0    256.0  Yellow  Yes       231.60  
1   4.0    128.0    Blue  Yes       279.00  
2   4.0    128.0    Blue  Yes       179.01  
3   6.0    128.0    Gray  Yes       279.99  
4  12.0    512.0   White  Yes       799.00  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1816 entries, 0 to 1815
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Smartphone   1816 non-null   object 
 1   Brand        1816 non-null   o

In [6]:
df.isna().sum()

Smartphone     0
Brand          0
Model          0
RAM            0
Storage        0
Color          0
Free           0
Final Price    0
dtype: int64

In [3]:
from sklearn.preprocessing import LabelEncoder

# Fill missing values with median
df['RAM'].fillna(df['RAM'].median(), inplace=True)
df['Storage'].fillna(df['Storage'].median(), inplace=True)

# Encode categorical variables
label_encoders = {}
for column in ['Brand', 'Model', 'Color', 'Free']:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

# Split data into features and target
X = df.drop(['Smartphone', 'Final Price'], axis=1)
y = df['Final Price']

# Split data into training and testing sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize/scale the numerical features
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Dictionary to store the evaluation metrics for each model
results = {}

# Function to evaluate a model
def evaluate_model(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return {
        'RMSE': mean_squared_error(y_test, y_pred, squared=False),
        'MAE': mean_absolute_error(y_test, y_pred)
    }


# Decision Tree Regressor
decision_tree = DecisionTreeRegressor()
results['Decision Tree'] = evaluate_model(decision_tree, X_train, X_test, y_train, y_test)

# Random Forest Regressor
random_forest = RandomForestRegressor()
results['Random Forest'] = evaluate_model(random_forest, X_train, X_test, y_train, y_test)

# Support Vector Machine (SVR)
svr = SVR()
results['SVR'] = evaluate_model(svr, X_train, X_test, y_train, y_test)

# k-Nearest Neighbors (k-NN)
knn = KNeighborsRegressor()
results['k-NN'] = evaluate_model(knn, X_train, X_test, y_train, y_test)

# Gradient Boosting Regressor
gbr = GradientBoostingRegressor()
results['Gradient Boosting'] = evaluate_model(gbr, X_train, X_test, y_train, y_test)

# Gaussian Naive Bayes (not typically used for regression, but included for completeness)
# Convert target variable to categorical for Naive Bayes
y_train_nb = pd.cut(y_train, bins=5, labels=False)
y_test_nb = pd.cut(y_test, bins=5, labels=False)
naive_bayes = GaussianNB()
results['Naive Bayes'] = evaluate_model(naive_bayes, X_train, X_test, y_train_nb, y_test_nb)

# Print the results
for model, metrics in results.items():
    print(f"Model: {model}")
    for metric, score in metrics.items():
        print(f"{metric}: {score:.4f}")
    print("\n")

# Determine the best model based on a chosen metric (e.g., RMSE)
best_model = min(results, key=lambda x: results[x]['RMSE'])
print(f"The best model is: {best_model}")


Model: Decision Tree
RMSE: 223.9503
MAE: 138.8084


Model: Random Forest
RMSE: 201.1033
MAE: 124.4030


Model: SVR
RMSE: 436.8912
MAE: 272.3355


Model: k-NN
RMSE: 262.8670
MAE: 173.7760


Model: Gradient Boosting
RMSE: 192.2926
MAE: 125.0495


Model: Naive Bayes
RMSE: 2.4596
MAE: 1.8187


The best model is: Naive Bayes
