In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt

In [2]:
# Load the cleaned mobile-phone dataset (path is relative to the notebook's working directory).
df1 = pd.read_csv(filepath_or_buffer='mobiles_cleaned.csv')

In [3]:
# Lift pandas' display truncation so every row and column is rendered.
# NOTE: displaying a large frame in full will produce very long outputs.
pd.options.display.max_rows = None
pd.options.display.max_columns = None

In [4]:
# Preview the first rows (head() defaults to 5) to sanity-check the load.
df1.head()

Unnamed: 0,mobile_name,brand,price,rating,5g,nfc,ir_blaster,processor_brand,processor_name,num_cores,processor_speed,ram,memory,battery_capacity,fast_charging_capacity,Fast_charging,screen_size,resolution_width,resolution_height,refresh_rate,Punch_Hole,Notch,primary_camera,front_camera,num_rear_camera,num_front_camera,Extra_storage_supported,Extra_storage(GB),os_name,os_version
0,OPPO Reno 11,oppo,29990,4.7,True,True,True,Dimensity,Dimensity 8200,8,3.1,8.0,256.0,4800.0,67.0,True,6.7,1080.0,2412.0,120.0,True,False,50.0,32,3,1,False,,Android,14.0
1,Poco X6 Pro 5G,poco,19999,4.5,True,True,True,Dimensity,Dimensity 8300 Ultra,8,3.35,12.0,256.0,5500.0,90.0,True,6.67,1220.0,2712.0,120.0,True,False,64.0,16,3,1,False,,Android,14.0
2,Xiaomi Redmi Note 13 Pro Plus,xiaomi,31999,4.35,True,True,True,Dimensity,Dimensity 7200 Ultra,8,2.8,8.0,256.0,5000.0,120.0,True,6.67,1220.0,2712.0,120.0,True,False,200.0,16,3,1,False,,Android,13.0
3,OPPO Reno 11 Pro,oppo,40990,4.75,True,True,True,Dimensity,Dimensity 8200,8,3.1,12.0,256.0,4600.0,80.0,True,6.74,1240.0,2772.0,120.0,True,False,50.0,32,3,1,False,,Android,14.0
4,Xiaomi Redmi Note 13 Pro Max 5G,xiaomi,33999,4.0,True,True,True,Snapdragon,Snapdragon 7 Gen1,8,2.4,12.0,256.0,5200.0,120.0,True,6.67,1220.0,2712.0,144.0,True,False,200.0,32,3,1,False,,Android,13.0


#### Data preprocessing


1. removing unnecessary columns

2.reducing categories in categorical cols

3.filling missing values

4.transformation based on EDA

5.Encoding

In [6]:
# Remove the two free-text columns; `brand` and `processor_brand` are kept.
df1 = df1.drop(columns=['mobile_name', 'processor_name'])

In [7]:
# Keep phones priced at or below 200,000; rows with a missing price fail the
# comparison and are dropped as well.
df1 = df1.loc[df1['price'].le(200000)]

In [8]:
# Diagnostic only: skewness of log(price). The result is displayed, not stored --
# df1['price'] itself is left untransformed.
df1['price'].pipe(np.log).skew()

0.425479353379509

In [9]:
# Keep rows whose battery_capacity is at most 11000; rows where it is missing
# fail the comparison and are dropped too.
df1 = df1.loc[df1['battery_capacity'].le(11000)]

In [10]:
# Keep rows whose screen_size is strictly greater than 4.5; rows where it is
# missing fail the comparison and are dropped too.
df1 = df1.loc[df1['screen_size'].gt(4.5)]

In [11]:
# Keep rows whose refresh_rate is at most 165; rows where it is missing fail
# the comparison and are dropped too.
df1 = df1.loc[df1['refresh_rate'].le(165)]

In [12]:
# Boolean flag: True when screen_size is 8 or more.
df1 = df1.assign(is_fold=df1['screen_size'].ge(8))

In [13]:
# (rows, columns) after the filters above and the added is_fold column.
df1.shape

(1392, 29)

In [14]:
# Number of rows that exactly repeat an earlier row (first occurrence is not counted).
df1.duplicated(keep='first').sum()

5

In [15]:
# Discard exact duplicate rows, keeping the first occurrence of each.
df1 = df1.drop_duplicates()

In [16]:
# Impute missing ratings in two passes:
#   1) the mean rating of the same brand,
#   2) the overall mean for anything still missing (brands with no rating at all).
df1['rating'] = (
    df1.groupby('brand')['rating']
    .transform(lambda ratings: ratings.fillna(ratings.mean()))
)
df1['rating'] = df1['rating'].fillna(value=df1['rating'].mean())


In [17]:
# Pass 1: fill missing processor_speed with the mean of the same processor_brand.
df1['processor_speed'] = (
    df1.groupby('processor_brand')['processor_speed']
    .transform(lambda speeds: speeds.fillna(speeds.mean()))
)

# Pass 2: groups whose values are all NaN are still missing; use the overall mean.
df1['processor_speed'] = df1['processor_speed'].fillna(value=df1['processor_speed'].mean())


In [18]:
# Pass 1: fill missing fast_charging_capacity with the mean of the same brand.
df1['fast_charging_capacity'] = df1.groupby('brand')['fast_charging_capacity'].transform(
    lambda x: x.fillna(x[x.notna()].mean())
)

# Pass 2: a brand whose values are all NaN has a NaN mean, so its rows are still
# missing after pass 1. Fall back to the overall mean, matching the fallback used
# by the rating / processor_speed / os_version imputation cells.
df1['fast_charging_capacity'] = df1['fast_charging_capacity'].fillna(
    df1['fast_charging_capacity'].mean()
)


In [19]:
# Drop the Extra_storage(GB) column; Extra_storage_supported is kept.
df1 = df1.drop(columns=['Extra_storage(GB)'])

In [20]:
# Pass 1: fill missing os_version with the median version of the same os_name.
df1['os_version'] = (
    df1.groupby('os_name')['os_version']
    .transform(lambda versions: versions.fillna(versions.median()))
)

# Pass 2: anything still missing (os_name groups with no known version) gets
# the overall median.
df1['os_version'] = df1['os_version'].fillna(value=df1['os_version'].median())


In [21]:
# Cast front_camera to float. Note that astype does NOT strip text: a value that
# cannot be parsed as a number raises instead of being cleaned.
df1['front_camera'] = df1['front_camera'].astype('float')

# Pass 1: fill each missing value with the mean front_camera of the row's own
# brand (compared case-insensitively). Previously every gap was filled with the
# 'realme' mean regardless of brand; for realme rows the result is unchanged.
# fillna() is used (rather than assigning the transform result) so existing
# values are never overwritten.
df1['front_camera'] = df1['front_camera'].fillna(
    df1.groupby(df1['brand'].str.lower())['front_camera'].transform(lambda x: x.mean())
)

# Pass 2: brands with no known front_camera at all fall back to the overall mean.
df1['front_camera'] = df1['front_camera'].fillna(df1['front_camera'].mean())


In [22]:
# Renumber rows 0..n-1 after the filtering above; the old index is moved into
# a new 'index' column.
df1 = df1.reset_index()

In [23]:
# Discard the 'index' column (the pre-reset row labels).
df1 = df1.drop(columns=['index'])

In [24]:
# Print column dtypes and non-null counts to confirm the imputation left no gaps.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1387 entries, 0 to 1386
Data columns (total 28 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   brand                    1387 non-null   object 
 1   price                    1387 non-null   int64  
 2   rating                   1387 non-null   float64
 3   5g                       1387 non-null   bool   
 4   nfc                      1387 non-null   bool   
 5   ir_blaster               1387 non-null   bool   
 6   processor_brand          1387 non-null   object 
 7   num_cores                1387 non-null   int64  
 8   processor_speed          1387 non-null   float64
 9   ram                      1387 non-null   float64
 10  memory                   1387 non-null   float64
 11  battery_capacity         1387 non-null   float64
 12  fast_charging_capacity   1387 non-null   float64
 13  Fast_charging            1387 non-null   bool   
 14  screen_size             

In [25]:
# Label encoding
from sklearn.preprocessing import LabelEncoder

# Encode a copy so df1 keeps its original, human-readable values.
df_encoded = df1.copy()

# 1. Cast boolean columns to 0/1 integers.
bool_cols = df_encoded.select_dtypes(include=['bool']).columns
df_encoded[bool_cols] = df_encoded[bool_cols].astype(int)

# 2. Label-encode object columns, using ONE fitted encoder PER column.
#    Refitting a single shared encoder keeps only the last column's mapping,
#    so the codes of the other columns could not be decoded or re-applied to
#    new data. `label_encoders[col].inverse_transform(...)` recovers the labels.
obj_cols = df_encoded.select_dtypes(include=['object']).columns
label_encoders = {}
le = LabelEncoder()  # keeps `le` defined even when there are no object columns

for col in obj_cols:
    le = LabelEncoder()
    df_encoded[col] = le.fit_transform(df_encoded[col].astype(str))
    label_encoders[col] = le

df_encoded.head()


Unnamed: 0,brand,price,rating,5g,nfc,ir_blaster,processor_brand,num_cores,processor_speed,ram,memory,battery_capacity,fast_charging_capacity,Fast_charging,screen_size,resolution_width,resolution_height,refresh_rate,Punch_Hole,Notch,primary_camera,front_camera,num_rear_camera,num_front_camera,Extra_storage_supported,os_name,os_version,is_fold
0,14,29990,4.7,1,1,1,1,8,3.1,8.0,256.0,4800.0,67.0,1,6.7,1080.0,2412.0,120.0,1,0,50.0,32.0,3,1,0,0,14.0,0
1,16,19999,4.5,1,1,1,1,8,3.35,12.0,256.0,5500.0,90.0,1,6.67,1220.0,2712.0,120.0,1,0,64.0,16.0,3,1,0,0,14.0,0
2,22,31999,4.35,1,1,1,1,8,2.8,8.0,256.0,5000.0,120.0,1,6.67,1220.0,2712.0,120.0,1,0,200.0,16.0,3,1,0,0,13.0,0
3,14,40990,4.75,1,1,1,1,8,3.1,12.0,256.0,4600.0,80.0,1,6.74,1240.0,2772.0,120.0,1,0,50.0,32.0,3,1,0,0,14.0,0
4,22,33999,4.0,1,1,1,5,8,2.4,12.0,256.0,5200.0,120.0,1,6.67,1220.0,2712.0,144.0,1,0,200.0,32.0,3,1,0,0,13.0,0


In [26]:
from sklearn.model_selection import train_test_split

# Predictors: every encoded column except the target.
X = df_encoded.drop(columns='price')

# Target: the price column.
y = df_encoded['price']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)


Train shape: (1109, 27)
Test shape: (278, 27)


In [27]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Fit a 200-tree random forest on the training split (fixed seed, all CPU cores).
# fit() returns the estimator itself, so construction and fitting are chained.
rf = RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1).fit(X_train, y_train)

# Predict prices for the held-out rows.
y_pred_rf = rf.predict(X_test)

# Evaluate on the test split: MAE, RMSE (square root of the MSE) and R².
mae = mean_absolute_error(y_test, y_pred_rf)
rmse = np.sqrt(mean_squared_error(y_test, y_pred_rf))
r2 = r2_score(y_test, y_pred_rf)

print("Random Forest Results:")
print(f"MAE:  {mae:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"R²:   {r2:.6f}")


Random Forest Results:
MAE:  5656.3652
RMSE: 9818.0298
R²:   0.894410


In [28]:
from xgboost import XGBRegressor

# Gradient-boosted trees: 500 rounds, learning rate 0.05, depth-6 trees,
# fixed seed, all CPU cores.
xgb = XGBRegressor(n_estimators=500, learning_rate=0.05, max_depth=6, random_state=42, n_jobs=-1)
xgb.fit(X_train, y_train)

# Predict prices for the held-out rows.
y_pred_xgb = xgb.predict(X_test)

# Evaluate on the test split: MAE, RMSE (square root of the MSE) and R².
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
rmse_xgb = np.sqrt(mean_squared_error(y_test, y_pred_xgb))
r2_xgb = r2_score(y_test, y_pred_xgb)

print("XGBoost Results:")
print(f"MAE:  {mae_xgb:.4f}")
print(f"RMSE: {rmse_xgb:.4f}")
print(f"R²:   {r2_xgb:.6f}")


XGBoost Results:
MAE:  5230.8879
RMSE: 9540.7102
R²:   0.900291


In [29]:
# Persist the cleaned, imputed (but NOT label-encoded) frame; the row index is omitted.
df1.to_csv(path_or_buf='after_missing_value_imputation.csv', index=False)