In [54]:
import pandas as pd 
import numpy as np

import pickle as pkl
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.feature_selection import RFECV

In [2]:
# load the dataset
df = pd.read_csv('smartphones.csv')

In [3]:
df.shape

(1009, 14)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1009 entries, 0 to 1008
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   brand             1009 non-null   object 
 1   processor_name    1009 non-null   object 
 2   processor_cores   1009 non-null   float64
 3   processor_speed   1009 non-null   float64
 4   os                1009 non-null   object 
 5   ram               1009 non-null   float64
 6   storage           1009 non-null   float64
 7   battery_capacity  1009 non-null   int64  
 8   DualSim           1009 non-null   int64  
 9   is_5G             1009 non-null   int64  
 10  wifi              1009 non-null   int64  
 11  ppi               1009 non-null   float64
 12  camera            1009 non-null   float64
 13  price             1009 non-null   int64  
dtypes: float64(6), int64(5), object(3)
memory usage: 110.5+ KB


In [5]:
df.isnull().sum()

brand               0
processor_name      0
processor_cores     0
processor_speed     0
os                  0
ram                 0
storage             0
battery_capacity    0
DualSim             0
is_5G               0
wifi                0
ppi                 0
camera              0
price               0
dtype: int64

In [6]:
df.duplicated().sum()

0

In [7]:
# separating input variable and target variable 
X = df.drop(['price'],axis=1)
y = df['price']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.15,random_state=42)

# Categorical and numerical feature columns by index
categorical_features_indices = [0,1,4]  
numerical_features_indices = [2,3,5,6,7,8,9,10,11,12] 

# ColumnTransformer using column indices 
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first'), categorical_features_indices),
        ('num', StandardScaler(), numerical_features_indices)
    ],
    remainder='passthrough' 
)

### Linear Regression :

In [8]:
# Creating the pipeline with preprocessor and Linear Regression 
pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LinearRegression())
])

pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
lr = r2_score(y_test,y_pred)
# print(lr)

# calculating cross_val_score for accurate selection
print(cross_val_score(pipe, X, y, cv=5, scoring='r2').mean())
print(cross_val_score(pipe, X, y, cv=5, scoring='neg_mean_absolute_error').mean())

0.7116221971294155
-10082.498689719718


### SVM :

In [9]:
# Creating the pipeline with preprocessor and SVM 
pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', SVR())
])

pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
svr = r2_score(y_test,y_pred)
# print(svr)

# calculating cross_val_score for accurate selection
print(cross_val_score(pipe, X, y, cv=5, scoring='r2').mean())
print(cross_val_score(pipe, X, y, cv=5, scoring='neg_mean_absolute_error').mean())

-0.1297004440778669
-18045.845341820706


### Decision Tree :

In [10]:
# Creating the pipeline with preprocessor and Decision Tree 
pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', DecisionTreeRegressor(max_depth=9, min_samples_split=10))
])

pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
dt = r2_score(y_test,y_pred)
# print(dt)

# calculating cross_val_score for accurate selection
print(cross_val_score(pipe, X, y, cv=5, scoring='r2').mean())
print(cross_val_score(pipe, X, y, cv=5, scoring='neg_mean_absolute_error').mean())

0.7162251027729114
-7833.17975997631


### Random Forest :

In [13]:
# Creating the pipeline with preprocessor and Random Forest
pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(n_estimators=100,max_depth=20,n_jobs=-1))
])

pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
rf = r2_score(y_test,y_pred)
# print(rf)

# calculating cross_val_score for accurate selection
print(cross_val_score(pipe, X, y, cv=5, scoring='r2').mean())
print(cross_val_score(pipe, X, y, cv=5, scoring='neg_mean_absolute_error').mean())

0.8465784104147858
-6108.592347157055


### XGBoost :

In [12]:
# Creating the pipeline with preprocessor and XGBoost
pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', xgb.XGBRegressor(n_estimators=500))
])

pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
xgboost = r2_score(y_test,y_pred)
# print(xgboost)

# calculating cross_val_score for accurate selection
print(cross_val_score(pipe, X, y, cv=5, scoring='r2').mean())
print(cross_val_score(pipe, X, y, cv=5, scoring='neg_mean_absolute_error').mean())

0.8229102476107893
-6246.82784762782


# Hyperparameter Tuning

### 1. Random Forest

In [None]:
# Hyperparameter tuning with random forest
param_grid = {
    'model__n_estimators': [100, 300, 500],
    'model__max_depth': [None, 5, 10, 20, 25, 30],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4],
    'model__bootstrap': [True, False]
}

pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(random_state=42,n_jobs=-1))
])

grid = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5, n_jobs=-1)

grid.fit(X_train, y_train)
print(grid.best_params_)

y_pred = grid.predict(X_test)

print(r2_score(y_test, y_pred))

### 2. Decision Tree :

In [None]:
# Hyperparameter tuning with random forest
param_grid = {
            'model__max_depth': [None, 5, 8, 9, 10, 20, 30],
            'model__min_samples_split': [2, 10, 15, 20],
            'model__min_samples_leaf': [1, 2, 3, 4, 5],
}

pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', DecisionTreeRegressor())
])

grid = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5, n_jobs=-1)

grid.fit(X_train, y_train)
print(grid.best_params_)

y_pred = grid.predict(X_test)

print(r2_score(y_test, y_pred))

# Dimensionality Reduction 

### Using Random_Forest_feature_importance :

In [14]:
print(pipe)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('cat',
                                                  OneHotEncoder(drop='first'),
                                                  [0, 1, 4]),
                                                 ('num', StandardScaler(),
                                                  [2, 3, 5, 6, 7, 8, 9, 10, 11,
                                                   12])])),
                ('model', RandomForestRegressor(max_depth=20, n_jobs=-1))])


In [15]:
model = pipe.named_steps['model']
features_imp = model.feature_importances_

preprocessor = pipe.named_steps['preprocessor']
features_names = preprocessor.get_feature_names_out()

# Creating a DataFrame to display feature importances with corresponding feature names
importance_df = pd.DataFrame({
    'Feature': features_names,
    'Importance': features_imp
}).sort_values(by='Importance', ascending=False)

# importance_df

In [49]:
# removing unimportant columns ---> wifi, processor_name
new_df = df.drop(['processor_name','wifi'],axis=1)

new_df.head()

Unnamed: 0,brand,processor_cores,processor_speed,os,ram,storage,battery_capacity,DualSim,is_5G,ppi,camera,price
0,OnePlus,8.0,3.2,Android,12.0,256.0,5000,1,1,525.921017,66.0,54999
1,OnePlus,8.0,2.2,Android,6.0,128.0,5000,1,1,401.024751,80.0,19989
2,Samsung,8.0,2.4,Android,4.0,64.0,5000,1,1,399.864072,63.0,16499
3,Motorola,8.0,2.2,Android,6.0,128.0,5000,1,1,401.802361,66.0,14999
4,Realme,8.0,2.6,Android,6.0,128.0,5000,1,1,394.440763,124.0,24999


In [50]:
new_df.shape

(1009, 12)

In [51]:
# separating input variable and target variable 
X = new_df.drop(['price'],axis=1)
y = new_df['price']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.15,random_state=42)

# Categorical and numerical feature columns by index
categorical_features_indices = [0,3]  
numerical_features_indices = [1,2,4,5,6,7,8,9,10] 

# ColumnTransformer using column indices 
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first'), categorical_features_indices),
        ('num', StandardScaler(), numerical_features_indices)
    ],
    remainder='passthrough' 
)

In [52]:
# Creating the pipeline with preprocessor and Random Forest
pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(n_estimators=100,max_depth=20,n_jobs=-1))
])

pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
rf = r2_score(y_test,y_pred)
# print(rf)

# calculating cross_val_score for accurate selection
print(cross_val_score(pipe, X, y, cv=5, scoring='r2').mean())
print(cross_val_score(pipe, X, y, cv=5, scoring='neg_mean_absolute_error').mean())

0.8439371277711121
-6152.864966138128


In [53]:
pipe

In [56]:
req_df = df[['brand','os']]

In [58]:
# getting pipeline and df
with open('pipe.pkl', 'wb') as file:
    pkl.dump(pipe, file)
with open('data.pkl', 'wb') as file:
    pkl.dump(req_df, file)