In [50]:
import pandas as pd
import numpy as np

In [51]:
ds = pd.read_csv('smartphones.csv') 

In [52]:
ds.shape


(1816, 8)

In [53]:
ds.isnull().sum()

Smartphone       0
Brand            0
Model            0
RAM            483
Storage         25
Color            0
Free             0
Final Price      0
dtype: int64

In [54]:
ds.iloc[0:1816]

Unnamed: 0,Smartphone,Brand,Model,RAM,Storage,Color,Free,Final Price
0,Realme C55 8/256GB Sunshower Libre,Realme,C55,8.0,256.0,Yellow,Yes,231.60
1,Samsung Galaxy M23 5G 4/128GB Azul Libre,Samsung,Galaxy M23,4.0,128.0,Blue,Yes,279.00
2,Motorola Moto G13 4/128GB Azul Lavanda Libre,Motorola,Moto G13,4.0,128.0,Blue,Yes,179.01
3,Xiaomi Redmi Note 11S 6/128GB Gris Libre,Xiaomi,Redmi Note 11S,6.0,128.0,Gray,Yes,279.99
4,Nothing Phone (2) 12/512GB Blanco Libre,Nothing,Phone (2),12.0,512.0,White,Yes,799.00
...,...,...,...,...,...,...,...,...
1811,Xiaomi Redmi Note 8 4/64GB Azul Libre,Xiaomi,Redmi Note 8,4.0,64.0,Blue,Yes,249.01
1812,Xiaomi Redmi Note 8T 4/128GB Azul Estelar Libre,Xiaomi,Redmi Note 8T,4.0,128.0,Blue,Yes,200.00
1813,Xiaomi Redmi Note 9 4/128GB Blanco Libre,Xiaomi,Redmi Note 9,4.0,128.0,White,Yes,269.00
1814,Xiaomi Redmi Note 9S 4/64GB Dual SIM Gris Libre,Xiaomi,Note 9S,4.0,64.0,Gray,Yes,211.00


In [55]:

ds.duplicated().sum()

0

# DATA PREPARATION

#there is no duplicate value but there are 485,25 null values pressent so we have to remove the following rows 

In [56]:
df = ds.dropna()
#ds.dropna(inplace=True)
#ds.shape

In [57]:
df.isnull().sum()

Smartphone     0
Brand          0
Model          0
RAM            0
Storage        0
Color          0
Free           0
Final Price    0
dtype: int64

In [58]:
dataset= df.drop(columns=['Smartphone', 'Brand', 'Model', 'Color', 'Free']) 

In [59]:
dataset.columns
dataset.iloc[0:10]

Unnamed: 0,RAM,Storage,Final Price
0,8.0,256.0,231.6
1,4.0,128.0,279.0
2,4.0,128.0,179.01
3,6.0,128.0,279.99
4,12.0,512.0,799.0
5,4.0,64.0,148.52
6,12.0,256.0,699.0
7,8.0,128.0,352.59
8,4.0,128.0,279.0
9,8.0,256.0,329.99


In [60]:
#dataset.sort_values('Storage')
dataset[dataset['Final Price']>600]

Unnamed: 0,RAM,Storage,Final Price
4,12.0,512.0,799.00
6,12.0,256.0,699.00
54,12.0,256.0,699.00
74,12.0,256.0,899.00
81,12.0,256.0,702.59
...,...,...,...
1756,12.0,256.0,747.18
1760,8.0,128.0,1000.00
1793,12.0,256.0,1802.94
1794,12.0,512.0,1922.93


In [61]:
dataset.rename(columns={'RAM': 'ram', 'Storage': 'storage', 'Final Price': 'price'}, inplace=True)
dataset.columns

Index(['ram', 'storage', 'price'], dtype='object')

# data visualization - smartphones.csv


In [62]:
import matplotlib.pyplot as plt 
import seaborn as sns
import plotly.express as pl
import plotly.graph_objects as go


In [63]:
#dataset.price.value_counts().plot(kind='bar')

#plt.title("graph of no of items in price range ")
#plt.xlabel("price")
#plt.ylabel("No of items ")
#plt.show()

In [64]:
fig = pl.scatter_3d(dataset, x='ram', y='storage', z='price',
                   color='price', size='price',
                   title='Mobile Price vs RAM & Storage',
                   labels={'price': 'Price ($)', 'ram': 'RAM (GB)', 'storage': 'Storage (GB)'})
fig.show()

In [65]:
corr_matrix = df[['ram', 'storage', 'price']].corr()
plt.figure(figsize=(8, 6))
sns.heatmap(corr_matrix, 
            annot=True,          
            cmap='coolwarm',     
            center=0,            
            square=True,         
            linewidths=0.5)      
plt.title('Mobile Price: RAM vs Storage Correlation')
plt.tight_layout()
plt.show()

KeyError: "None of [Index(['ram', 'storage', 'price'], dtype='object')] are in the [columns]"

# MACHINE LEARNING MODEL

#feature training

In [68]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

In [69]:
X = dataset[['ram', 'storage']]  
y = dataset['price'] 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

In [70]:
print(X_train.shape)
print(X_test.shape)
print(y_test.shape)
print(y_train.shape)

(1066, 2)
(267, 2)
(267,)
(1066,)


In [71]:

model = LinearRegression()
model.fit(X_train,y_train)

In [72]:
#X_test
pred = model.predict(X_test)

In [73]:
pred

array([2.10952494e+02, 1.52889340e+02, 5.90232634e+02, 8.22485249e+02,
       2.10952494e+02, 3.76087977e+02, 1.28384753e+02, 1.28384753e+02,
       3.76087977e+02, 5.90232634e+02, 3.27078802e+02, 3.76087977e+02,
       5.90232634e+02, 3.27078802e+02, 7.03215988e+01, 2.59961669e+02,
       5.90232634e+02, 2.59961669e+02, 8.22485249e+02, 7.03215988e+01,
       1.28384753e+02, 7.03215988e+01, 8.22485249e+02, 4.92214284e+02,
       7.03215988e+01, 3.76087977e+02, 7.03215988e+01, 4.92214284e+02,
       3.76087977e+02, 1.28384753e+02, 3.76087977e+02, 5.90232634e+02,
       2.10952494e+02, 3.76087977e+02, 4.74106326e+02, 4.92214284e+02,
       3.76087977e+02, 5.90232634e+02, 2.59961669e+02, 2.59961669e+02,
       4.92214284e+02, 2.10952494e+02, 4.92214284e+02, 4.92214284e+02,
       5.90232634e+02, 1.16132459e+02, 3.27078802e+02, 5.80693051e+01,
       4.92214284e+02, 2.10952494e+02, 2.59961669e+02, 8.22485249e+02,
       1.52889340e+02, 4.74106326e+02, 1.28384753e+02, 3.76087977e+02,
      

In [None]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

#rf_r2score=r2_score(y_test,pred)
#print(rf_r2score)  
#mean_absolute_error=mean_absolute_error(y_test,pred)
#print(mean_absolute_error)  
#mean_squared_error=mean_squared_error(y_test,pred)
#print(mean_squared_error)  


0.5380246035630138


In [90]:
# Multiple Linear Regression
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
lr_pred = lr_model.predict(X_test)
lr_r2score = r2_score(y_test, lr_pred)

In [91]:
#  Decision Tree
dt_model = DecisionTreeRegressor(max_depth=5, random_state=42)
dt_model.fit(X_train, y_train)
dt_pred = dt_model.predict(X_test)
dt_r2score = r2_score(y_test, dt_pred)


In [92]:
# 3. Random Forest
rf_model = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
rf_r2score = r2_score(y_test, rf_pred)

In [100]:
print("R² Scores Comparison :")
print(f"Multiple Linear Regression: {lr_r2score:.3f}")
print(f"Decision Tree: {dt_r2score:.3f}")
print(f"Random Forest: {rf_r2score:.3f}")

best_model = max([('Linear', lr_r2score), ('Decision Tree', dt_r2score), ('Random Forest', rf_r2score)], 
                 key=lambda x: x[1])
print(f"BEST MODEL : {best_model[0]} (R² = {best_model[1]:.3%} )")

R² Scores Comparison :
Multiple Linear Regression: 0.538
Decision Tree: 0.521
Random Forest: 0.516
BEST MODEL : Linear (R² = 53.802% )
