In [153]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import time
import joblib
import os

from sklearn.preprocessing import StandardScaler,PolynomialFeatures
from sklearn.feature_selection import mutual_info_regression
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.model_selection import GridSearchCV,cross_val_predict
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline

In [154]:
dataset_path = "mobile phone price prediction.csv"
random_state = 29
K_cv = [8]
polynom_degree = [2,3]
alpha_values = [0.01,0.05,0.1,0.2,0.5,1,2,5,10,11,12,13,14,15,18,20]
max_iterations = [10000, 20000, 30000, 40000, 50000]
tolerance = [1, 1e-1, 1e-2, 1e-3]

feature_scaler = StandardScaler()
polynomial_feature_scaler = PolynomialFeatures()

models = {
    "Simple":LinearRegression(), # Linear Regression
    "Poly":LinearRegression(), # Poly Regression
    "Lasso":Lasso(), # L1 Regression
    "Ridge":Ridge() # L2 Regression
}

param_grids = {
    "Simple": {
        "model__fit_intercept": [True, False],
    },
    "Poly": {
        "poly__degree": polynom_degree,
        "model__fit_intercept": [True, False], 
    },
    "Lasso": {
        "model__alpha": alpha_values,
        "model__max_iter": max_iterations,
        "model__tol": tolerance
    },
    "Ridge": {
        "model__alpha": alpha_values,
        "model__max_iter": max_iterations,
        "model__solver": ["auto", "saga"],
        "model__tol": tolerance
    }
}

In [155]:
dataset_original = pd.read_csv(dataset_path)
length_dataset = len(dataset_original)
#print(dataset_original.info())
#print(dataset_original.head(3))

# renaming so all columns start with Capital letter
dataset_original.rename(columns={'Processor': 'Processor_core','company':'Company','fast_charging':'Fast_charging'},inplace=True)

## CHOSING WHICH COLUMNS ARE THE BEST FOR OUR PRICE PREDICTIONS

In [156]:
categorical_columns = ["Processor_core" , "Company" , "Processor_name"]
numerical_columns = ["Ram" , "Battery" , "Display" , "Android_version" , "Inbuilt_memory" , "Fast_charging", "Camera"]
target_column = ["Price"]

In [157]:
# function to remove outliers using our knowledge 
# it means for example RAM column should have values between 2 and 16 for example , LIKE WE KNOW THIS IN REAL LIFE ( OUR REAL LIFE KNOWLEDGE )
def remove_outliers_with_domain_knowledge(df, lower_bound, upper_bound):
    """
    Removing outliers with our real life knowledge on the topic
    
    Args:
        df: The Target Dataframe.
        lower_bound: Lowest values for our features.
        upper_bound: Highest values for our features.
    
    Returns:
        A Dataframe with clipped borders thus no extreme values, without the outliers.
    """
    df_cleaned = df.copy()
    for idx, col in enumerate(numerical_columns + target_column):
        #print(f"Before: Min = {df[col].min()}, Max = {df[col].max()}")
        # REMOVING VALUES THAT ARE LOWER OR BIGGER THE THE BOUNDS
        df_cleaned[col] = df[col].clip(lower=lower_bound[idx], upper=upper_bound[idx])
        #print(f"After: Min = {df_cleaned[col].min()}, Max = {df_cleaned[col].max()}")
        
    return df_cleaned

In [158]:
def box_plot_visualization(df,state):
    """
    Box plot visualization 
    
    Args:
        df: The Target Dataframe.
        state: State of our df, 'before or after'
    
    Returns:
        
    """
    columns = numerical_columns + target_column

    # Set the number of columns for each row (in this case, 2)
    num_columns = 2
    num_rows = (len(columns) + 1) // num_columns  # Calculate the number of rows needed

    # Create subplots
    _, axes = plt.subplots(num_rows, num_columns, figsize=(15, 5 * num_rows))
    
    # Flatten the axes array for easier iteration
    axes = axes.flatten()
    
    # Plot each histogram on the corresponding subplot
    for idx, col in enumerate(columns):
        sns.boxplot(x=df[col],ax=axes[idx])
        axes[idx].set_title(f"{col} State: {state} Boxplot")
        
    # Adjust layout to prevent overlap
    plt.tight_layout()
    plt.show()

In [159]:
def hist_plot_visualization(df):
    """
    Histograme plot visualization 
    
    Args:
        df: The Target Dataframe.

    Returns:
        
    """
    # Combine numerical columns and target column
    columns = numerical_columns + target_column

    # Set the number of columns for each row (in this case, 2)
    num_columns = 2
    num_rows = (len(columns) + 1) // num_columns  # Calculate the number of rows needed

    # Create subplots
    _, axes = plt.subplots(num_rows, num_columns, figsize=(15, 5 * num_rows))
    
    # Flatten the axes array for easier iteration
    axes = axes.flatten()

    # Plot each histogram on the corresponding subplot
    for idx, col in enumerate(columns):
        sns.histplot(df[col], kde=True, bins=30, ax=axes[idx])
        axes[idx].set_title(f"{col} : Capacity Distribution")
        
    # Adjust layout to prevent overlap
    plt.tight_layout()
    plt.show()

In [160]:
def heatmap_visualization(df):
    """
    Heatmap, Correlation plot visualization 
    
    Args:
        df: The Target Dataframe.

    Returns:
    
    """
    correlation_matrix = df[numerical_columns+target_column].corr()
    # Plot the correlation heatmap
    plt.figure(figsize=(12, 10))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
    plt.title('Feature Correlation Matrix')
    plt.show()

In [161]:
def feature_importance_visualization(X,y):
    """
    Feature importance visualization
    
    Args:
        X: Our Features.
        y: Target Feature.

    Returns:
    
    """
    importance = mutual_info_regression(X, y.values.ravel())

    feature_importance = pd.DataFrame({
        'Feature': X.columns,
        'Importance': importance
    })

    # Sort features by importance
    feature_importance = feature_importance.sort_values(by='Importance', ascending=False)

    plt.figure(figsize=(12, 10))
    sns.barplot(
        data=feature_importance, 
        x='Importance', 
        y='Feature', 
        hue='Feature',  # Assign the `y` variable (Feature) to `hue`
        dodge=False,    # Ensure no separation by hue
        palette='viridis'
    )
    plt.title('Feature Importance Based on Mutual Information')
    plt.xlabel('Mutual Information Score')
    plt.ylabel('Features')
    plt.show()


In [162]:
def train_evaluate_with_gridsearch(X, y, models, param_grids):
    """
    Train and evaluate models using GridSearchCV for hyperparameter tuning.
    
    Args:
        X: Features dataframe.
        y: Target dataframe.
        models: Dictionary of model names and their initialized estimators.
        param_grids: Dictionary of model names and their hyperparameter grids.
    
    Returns:
        A DataFrame with results for each model and hyperparameter combination.
    """
    results = []
    scoring = {
        'r2': 'r2',
        'neg_mean_squared_error': 'neg_mean_squared_error',
        'neg_mean_absolute_error': 'neg_mean_absolute_error'
    }
    
    for name, model in models.items():
        for k_split in K_cv:
            param_grid = param_grids[name]
            # Create the preprocessing pipeline to include features scaling
            if 'Poly' in name:
                pipeline = Pipeline([
                ("poly", polynomial_feature_scaler),
                ("scaler", feature_scaler),
                ("model", model)
                ])
            else:
                pipeline = Pipeline([
                    ("scaler", feature_scaler),
                    ("model", model)
                ])
            
            # GridSearchCV setup
            grid_search = GridSearchCV(
                estimator=pipeline,
                param_grid=param_grid,
                scoring=scoring,  # Multiple scoring metrics
                refit="r2",  # Choose the best model based on R2
                cv=k_split,
                n_jobs=-1  # Parallelize for faster execution
            )

            begin_train_time = time.time()
            # Train GridSearchCV
            grid_search.fit(X, y)
            end_train_time = time.time()
            
            y_pred = cross_val_predict(grid_search.best_estimator_, X, y, cv=k_split)
            y = np.array(y).ravel() ## making y to 1-d array to match with y_pred shape
            
            # Calculating the metrics :
            y_mean = np.mean(y)
            mse_calculated = np.mean((y - y_pred) ** 2) # mse = 1/n * sum( (y_real - y_predicted)**2 )
            # note : the mse will be so big cause of the y values here ( prices ) are big numbers.
            mape_calculated = np.mean(np.abs((y - y_pred) / y)) # mape = 1/n * sum( abs(y_real - y_predicted) )
            
            sce = np.sum((y_pred - y_mean) ** 2)
            sst = np.sum((y - y_mean) ** 2)
            r2_calculated = sce / sst # r2 = SCE / SST
            
            results.append({
                "model_name": name,
                "model": grid_search.best_estimator_,
                "K_cv": k_split,
                "best_params": grid_search.best_params_,
                "train_time": end_train_time - begin_train_time,
                "mse_calculated":mse_calculated,
                "mape_calculated":mape_calculated,
                "r2_calculated":r2_calculated
            })
            
    results_df = pd.DataFrame(results)
    return results_df

In [163]:
def predict_price(input_data, real_prices, model, X, lower_bound, upper_bound):
    """
    Train and evaluate models using GridSearchCV for hyperparameter tuning.
    
    Args:
        input_data: Input Dataframe.
        real_prices: Real prices list that corresponds to our input_data.
        model: Dictionary of best model name and it initialized estimators.
        X: Features dataframe.
        lower_bound: Lowest values for our features.
        upper_bound: Highest values for our features.
    
    Returns:
        
    """
    input_data_encoded = pd.get_dummies(input_data, columns=categorical_columns)
    input_data_encoded = input_data_encoded.reindex(columns=X.columns, fill_value=0)

    prediction = model.predict(input_data_encoded)
    # Clip to bounds in case of bad predictions
    prediction = np.clip(prediction, lower_bound[-1], upper_bound[-1])

    for idx, pred in enumerate(prediction):
        #print(f"Type of prediction: {type(pred)}, Value: {pred}")
        print(f"The predicted price of the phone is: {pred.item():.2f}")
        print(f"Difference in price: {int(pred.item()) - real_prices[idx]}")

In [164]:
def save_model(model,want_save):
    """
    Saving Model Based on Condition.
    
    Args:
        model: Dictionary of best model name and it initialized estimators.
        want_save : Boolean to confirm the saving process.
    Returns:
        
    """
    if not want_save:
        return
    model_filename = f"{model['model_name']}_model_k{model['K_cv']}_r2_{model['mean_test_r2']:.3f}.joblib"
    joblib.dump(model['model'], model_filename)
    print(f"Model saved as: {model_filename}")

## DROPING COLUMNS THAT ARE NOT SUITED FOR OUR PREDICTIONS

In [165]:
for col in dataset_original.columns:
    if col not in categorical_columns+numerical_columns+target_column:
        dataset_original.drop(col,axis=1,inplace=True)
        
dataset_original.drop_duplicates(inplace=True)
dataset_original.dropna(axis=0,how='all',inplace=True)

In [None]:
box_plot_visualization(dataset_original,'before')

## FOR CATEGORICAL DATA

In [167]:
dataset_original["Processor_core"] = dataset_original["Processor_core"].str.strip()
dataset_original.loc[dataset_original["Processor_core"].str.contains(r'\d+\.?\d*',regex=True,na=False), "Processor_core"] = np.nan
dataset_original["Processor_core"].replace({
    'Octa Core Processor': 'Octa Core',
    'Nine-Cores': 'Nine Core',
    'Nine Cores': 'Nine Core',
    'Deca Core Processor': 'Deca Core'
},regex=False,inplace=True)

In [168]:
dataset_original["Company"] = dataset_original["Company"].astype(str)
dataset_original["Company"].replace("Nothing",np.nan,regex=False,inplace=True)

In [169]:
dataset_original["Processor_name"] = dataset_original["Processor_name"].astype(str)

dataset_original.loc[dataset_original["Processor_name"].str.contains(r'Core|Wifi|GB', na=False, case=False,regex=True), "Processor_name"] = np.nan
dataset_original.loc[dataset_original["Processor_name"].str.contains("Samsung", na=False, case=False), "Processor_name"] = "Exynos"
dataset_original.loc[dataset_original["Processor_name"].str.contains(r"Sanpdragon|Snapdragon",regex=True,na=False, case=False), "Processor_name"] = "Snapdragon"
dataset_original.loc[dataset_original["Processor_name"].str.contains("Dimensity", na=False, case=False), "Processor_name"] = "Dimensity"

dataset_original["Processor_name"].replace(r'\s+\d+\w*|\s+\w+',"",regex=True,inplace=True)
dataset_original["Processor_name"].replace("",np.nan,inplace=True)

## FILLING MISSING VALUES WITH MODE FOR CATEGORICAL DATA

In [None]:
for col in categorical_columns:
    dataset_original[col] = dataset_original[col].str.strip().str.lower()
    print(f"the ratio of nan {col}: {dataset_original[col].isna().sum()*100 / len(dataset_original):.2f}")
    dataset_original[col].fillna(dataset_original[col].mode()[0],inplace=True)
    print(dataset_original[col].unique())

## ONE HOT ENCODING FOR OUR CATEGORICAL DATA

In [171]:
dataset_original = pd.get_dummies(dataset_original, columns=categorical_columns)

## FOR NUMERICAL DATA

In [172]:
dataset_original["Ram"].replace(r"\s*GB\s*RAM", "",regex=True,inplace=True)
dataset_original["Battery"].replace(r"\s*mAh\s*Battery", "",regex=True,inplace=True)
dataset_original["Display"].replace(r"\s*inches", "",regex=True,inplace=True)
dataset_original["Inbuilt_memory"].replace(r"\s*GB\s*inbuilt", "",regex=True,inplace=True)

dataset_original["Fast_charging"] = dataset_original["Fast_charging"].str.extract(r"(\d+\.?\d*)")

dataset_original["Camera"] = dataset_original["Camera"].str.strip().str.lower()
dataset_original.loc[dataset_original["Camera"].str.contains(r'display|memory', case=False, na=False),'Camera'] = np.nan
dataset_original["Camera_rear"] = dataset_original["Camera"].str.extract(r"(\d+)\s*mp")
dataset_original["Camera_front"] = dataset_original["Camera"].str.extract(r";\s*(\d+)\s*mp")
dataset_original.drop("Camera",axis=1,inplace=True)

dataset_original["Price"].replace(',','', regex=True,inplace=True)

In [173]:
# here we remove camera column and change it to new 2 columns ( camera rear and camera front )
numerical_columns.pop()
numerical_columns.extend(["Camera_rear", "Camera_front"])
# regressor columns are columns that we will fill using another model predictions cause these columns have NAN values ratio > 5-30%
# so its better then using mean or median
regressor_columns = ["Android_version","Fast_charging","Camera_rear", "Camera_front"]

### casting the numerical columns to be float using pandas.to_numeric and removing the nan values except for regressor columns

In [None]:
for col in numerical_columns+target_column:
    dataset_original[col] = pd.to_numeric(dataset_original[col],errors='coerce')
    print(f"the ratio of nan {col}: {dataset_original[col].isna().sum()*100 / len(dataset_original):.2f}")
    if col not in regressor_columns:
        dataset_original[col].fillna(dataset_original[col].median(),inplace=True)

# REMOVING OUTLIERS AFTER GOT DETECTED USING BOX PLOT WITH OUR KNOWLEDGE

In [175]:
# HERE THE COLUMNS ORDER IS :
# Ram , Battery , Display , Android_version , Inbuilt_memory , Fast_charging , Camera_rear, Camera_front , Price
# for example here LOWER BOUND : lowest ram = 2 , lowest battery = 2500 , lowest display = 5.5 ETC
# SAME FOR UPPER BOUND                                 
lower_bound = [2,2500,5.5,8,16,10,5,2,6950]
upper_bound = [16,6500,7.5,14,256,140,80,60,70000]

dataset_cleaned_encoded = remove_outliers_with_domain_knowledge(dataset_original, lower_bound, upper_bound)

# FILLING NAN VALUES FOR THE REGRESSOR COLUMNS WITH ANOTHER MODEL

### note : ["Android_version","Fast_charging","Camera_rear", "Camera_front"] are the regressor columns cause : 
#### they have the highest nan values presented , so its better to use advanced technique 'Filling With Regressor Predictions' instead of the regular mean or median

In [None]:
# MAKING A DATASET FOR ALL COLUMNS EXCEPT THE TARGET COLUMN ( PRICE ) , ITS BETTER SO WE DONT OVERRIDE OUR ORIGINAL DATASET
dataset_for_regressor = dataset_cleaned_encoded[[col for col in dataset_cleaned_encoded.columns if col not in target_column]].copy()
# we use random forest regressor to fill nan values here
regressor = RandomForestRegressor(n_estimators=100,random_state=random_state)

for col in regressor_columns:
    # making train dataset using rows that have NOT NULL values in the regressor column 
    # meaning for example all rows where android_version is not null
    train_dataset_for_regressor = dataset_for_regressor[dataset_for_regressor[col].notna()]
    # making test dataset using rows that have NULL values in the regressor column
    test_dataset_for_regressor = dataset_for_regressor[dataset_for_regressor[col].isna()]

    # we drop the column that we want to predict FROM THE DATASET WHERE col is NOT NULL , means all rows where COL was NOT NULL SO WE COULD MAKE PREDICTIONS
    X_train = train_dataset_for_regressor.drop(columns=col)
    # y is the column that we want to predict FROM THE DATASET WHERE col is NOT NULL , so we could make predictions 
    y_train = train_dataset_for_regressor[col]
    # we drop the column that we want to predict FROM THE DATASET WHERE col is NULL , means all rows where COL was NULL
    X_test = test_dataset_for_regressor.drop(columns=col)

    regressor.fit(X_train,y_train)
    print(f"prediction for column {col} is done !")
    # FILLING THE COLUMNS WHERE ROW OF col IS NULL WITH THE RESULT OF THE REGRESSOR PRECITION ON THE TEST DATA
    dataset_cleaned_encoded.loc[dataset_cleaned_encoded[col].isna(),col] = regressor.predict(X_test)

# NOW " DATASET_CLEANED_ENCODED " READY FOR ALL PREDICTIONS !

In [None]:
hist_plot_visualization(dataset_cleaned_encoded)
box_plot_visualization(dataset_cleaned_encoded,'after')
heatmap_visualization(dataset_cleaned_encoded)

In [178]:
dataset_cleaned_encoded.to_csv('dataset_cleaned_encoded.csv',index=False) # saving the cleaned dataset to a csv file

In [179]:
dataset_cleaned_encoded = pd.read_csv('dataset_cleaned_encoded.csv')
X = dataset_cleaned_encoded.drop(target_column,axis=1)
y = dataset_cleaned_encoded[target_column]

In [None]:
results_df = train_evaluate_with_gridsearch(X, y, models, param_grids)
results_df.to_csv('grid_search_results.csv',index=False) # saving the results to a csv file

In [None]:
# iterate over all rows and print the value of each column in the rows
for _, row in results_df.iterrows():
    for column in results_df.columns:
        # rounding the value in case of float number 
        if isinstance(row[column], float):
            print(f"{column}: {row[column]:.4f}") # rounding float values
        else:
            print(f"{column}: {row[column]}")
    print("\n")
#save_model(best_model,'yes')

## dataset for making prediction

In [182]:
input_data =pd.DataFrame({
    'Ram': [4,6,6],
    'Battery': [5100,5000,6000],
    'Display': [6.67,6.71,6.5],
    'Android_version': [14,12,14],
    'Inbuilt_memory': [128,128,128],
    'Fast_charging': [45,15,25],
    'Camera_rear': [8,50,50],
    'Camera_front': [5,5,13],
    'Processor_core': ["octa core","octa core","octa core"],
    'Company': ["oppo","poco","samsung"],
    'Processor_name': ["dimensity","helio","dimensity"]
    })
real_prices = [13500,11000,14500]

#### loading the downloaded model ' best model '

In [None]:
"""
for file in os.listdir(os.getcwd()):
    if file.endswith('.joblib'):
        model_path = os.path.join(os.getcwd(),file)
print(model_path)
"""

### making prediction for each best model

In [None]:
for model_name,best_model in zip(results_df['model_name'], results_df['model']):
    print(f"\n{model_name} model predictions :")
    predict_price(input_data, real_prices, best_model, X, lower_bound, upper_bound)

### Simple model predictions :
The predicted price of the phone is: 16279.51
Difference in price: 2779
The predicted price of the phone is: 11132.38
Difference in price: 132
The predicted price of the phone is: 9468.65
Difference in price: -5032

---
### Poly model predictions :
The predicted price of the phone is: 23630.33
Difference in price: 10130
The predicted price of the phone is: 6950.00
Difference in price: -4050
The predicted price of the phone is: 15176.92
Difference in price: 676

---
### Lasso model predictions :
The predicted price of the phone is: 16252.78
Difference in price: 2752
The predicted price of the phone is: 11373.38
Difference in price: 373
The predicted price of the phone is: 9492.55
Difference in price: -5008

---
### Ridge model predictions :
The predicted price of the phone is: 18878.68
Difference in price: 5378
The predicted price of the phone is: 9261.51
Difference in price: -1739
The predicted price of the phone is: 15923.85
Difference in price: 1423

In [None]:

#best_model = joblib.load(model_path)
filtered = results_df.loc[(results_df['r2_calculated'] > 0.0) & (results_df['r2_calculated'] < 1.0) ]
best_model = results_df.loc[filtered['r2_calculated'].idxmax()] # chosing best model based on the highest r2 score
## we dont need to do predictions again , we already made them above
print(best_model)

model_name                                                     Lasso

---
model              (StandardScaler(), Lasso(alpha=20, max_iter=10...

---
K_cv                                                               8

---
best_params        {'model__alpha': 20, 'model__max_iter': 10000,...

---
train_time                                                 14.624879

---
mse_calculated                                      193589163.421738

---
mape_calculated                                             0.448297

---
r2_calculated                                               0.647146

---
Name: 2, dtype: object