# **Smartphones Sale Datasets**

## **Library imports**

In [None]:
import pandas as pd # For data manipulation and analysis.
import numpy as np # For numerical operations, especially with arrays.
import matplotlib.pyplot as plt # For creating static, interactive, and animated visualizations.
import missingno as msno # For displaying the missing values in columns.


import seaborn as sns # For statistical data visualization.
from scipy.stats import skewnorm # For working with skewed normal distributions.
from sklearn.model_selection import train_test_split # For splitting data into training and testing sets.
from statsmodels.graphics.mosaicplot import mosaic # For creating mosaic plots.

from sklearn.preprocessing import LabelEncoder # Label encoding for categorical variables.
from sklearn.preprocessing import StandardScaler # For standardizing features by removing the mean and scaling to unit variance.
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score # For evaluating regression models.
from sklearn.linear_model import LinearRegression # For implementing linear regression.
from sklearn.ensemble import RandomForestRegressor # For implementing random forest regression.

## Connect to my Google Drive

In [None]:
# Mount Google Drive at /content/drive so that files stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the smartphone sales dataset from the mounted Drive folder into a DataFrame.
df_SmartphonesSales = pd.read_csv('/content/drive/MyDrive/CMP6202/datasets/SmartphonesSales.csv')
# Preview the first six rows to sanity-check the parsed columns.
df_SmartphonesSales.head(6)

Check columns and rows

In [None]:

# Preview the first rows of the dataset (head() with its default row count).
df_SmartphonesSales.head()

In [None]:
# Count the missing values in each column before any cleaning is applied.
df_SmartphonesSales.isnull().sum()

In [None]:
# Print the DataFrame summary (columns, non-null counts, dtypes, memory usage).
df_SmartphonesSales.info()

In [None]:
# Compute the mean of 'Rating' (missing values are skipped). This only displays the
# value; nothing is written back to the DataFrame here.
df_SmartphonesSales['Rating'].mean()

In [None]:
# Re-check the missing values per column.
# NOTE: no imputation has been applied at this point in the notebook (the previous cell
# only displays the mean), so these counts match the earlier check. 'Rating' is filled
# later, in the imputation section.
df_SmartphonesSales.isnull().sum()

**Splitting the dataset**

In [None]:
# Hold out 20% of the rows for testing; the remaining 80% are used for training.

# y is the target column (Selling Price); x is every other column.
y = df_SmartphonesSales['Selling Price']
x = df_SmartphonesSales.drop(columns='Selling Price')

# A fixed random_state makes the shuffle, and therefore the split, repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

### **2.3 EDA Process and Results**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Summary statistics for every column; include='all' also covers non-numeric columns.
df_SmartphonesSales.describe(include='all')

**Univariate Analysis**

Distribution of Smartphone Selling Prices

In [None]:
# Histogram of the selling price with a KDE overlay, to inspect the shape of the
# distribution (expected to be right-skewed; the skewness is computed further down).
plt.figure(figsize=(8, 5))
hist_ax = sns.histplot(df_SmartphonesSales['Selling Price'], bins=20, kde=True, color='skyblue')
hist_ax.set_title('Distribution of Smartphone Selling Prices')
hist_ax.set_xlabel('Selling Price (₹)')
hist_ax.set_ylabel('Frequency')
plt.show()

Identify Outliers

In [None]:
# Horizontal boxplot of the selling price; points beyond the whiskers are the
# candidate outliers.
plt.figure(figsize=(8,5))
box_ax = sns.boxplot(x=df_SmartphonesSales['Selling Price'])
box_ax.set(
    title="Boxplot of Smartphone Selling Prices",
    xlabel="Selling Price (₹)",
)
plt.show()

**Skewness Calculation**

In [None]:
# Sample skewness of 'Selling Price'; a positive value indicates a longer right tail.
df_SmartphonesSales['Selling Price'].skew()

**Bivariate Analysis**

**Relationship between original and selling prices**

In [None]:
# Scatter plot of selling price against original price, one point per row.
plt.figure(figsize=(8, 5))
price_ax = sns.scatterplot(
    data=df_SmartphonesSales,
    x='Original Price',
    y='Selling Price',
    alpha=0.6,
)
price_ax.set(
    title='Relationship between Original and Selling Prices',
    xlabel='Original Price (₹)',
    ylabel='Selling Price (₹)',
)
plt.show()

**Correlation analysis**

In [None]:
# Pairwise correlation matrix of the key numerical columns, drawn as an annotated heatmap.
Corr = df_SmartphonesSales.loc[
    :, ['Selling Price', 'Original Price', 'Rating', 'discount percentage']
].corr()
sns.heatmap(data=Corr, cmap='Blues', annot=True)
plt.show()

Correlation analysis between the Selling Price and Memory

In [None]:
# Rebuild a separate frame from the feature matrix `x` and the target `y`, so the plots
# in this section use the raw values rather than the preprocessed ones.
df_temp_for_plots = pd.concat([x, y], axis=1)

# Keep only the first run of digits in 'Memory' and store it as a float.
memory_digits = df_temp_for_plots['Memory'].astype(str).str.extract('(\\d+)')
df_temp_for_plots['Memory'] = memory_digits.astype(float)

# Selling Price vs Memory
plt.figure(figsize=(8, 5))
memory_ax = sns.scatterplot(data=df_temp_for_plots, x='Memory', y='Selling Price', alpha=0.8)
memory_ax.set_title('Selling Price vs Memory')
memory_ax.set_xlabel('Memory (GB)')
memory_ax.set_ylabel('Selling Price (₹)')
plt.show()


Correlation analysis between the Selling Price and Storage

In [None]:
# Keep only the first run of digits in 'Storage' and store it as a float.
storage_digits = df_temp_for_plots['Storage'].astype(str).str.extract('(\\d+)')
df_temp_for_plots['Storage'] = storage_digits.astype(float)

# Selling Price vs Storage
plt.figure(figsize=(8, 5))
storage_ax = sns.scatterplot(data=df_temp_for_plots, x='Storage', y='Selling Price', alpha=0.8)
storage_ax.set_title('Selling Price vs Storage')
storage_ax.set_xlabel('Storage (GB)')
storage_ax.set_ylabel('Selling Price (₹)')
plt.show()

Correlation analysis between the Selling Price and Discount Amount

In [None]:
# Selling Price vs Discount Amount, using the unscaled plotting frame.
plt.figure(figsize=(8, 5))
discount_ax = sns.scatterplot(data=df_temp_for_plots, x='Discount', y='Selling Price', alpha=0.6)
discount_ax.set(
    title='Selling Price vs Discount Amount',
    xlabel='Discount Amount (₹)',
    ylabel='Selling Price (₹)',
)
plt.show()

**Missing values check**

In [None]:
# Count the missing values in each column ahead of the cleaning and imputation steps.
df_SmartphonesSales.isnull().sum()

**Detecting missing data visually using Missingno library**

In [None]:
# Draw the missingno bar chart for the DataFrame to inspect missing data per column.
msno.bar(df_SmartphonesSales)

Cleaning data

In [None]:
import re

# Convert 'Memory' and 'Storage' from text to floats by keeping the first run of digits
# in each value (NaN when a value contains no digits).
#
# - The pattern is a raw string: in a normal string literal '\d' is an invalid escape
#   sequence, which newer Python versions warn about.
# - .astype(str) matches the cleaning used for the plotting frame and tolerates
#   non-string entries; missing values become text without digits and so stay NaN.
# - expand=False returns a Series (one capture group) instead of a one-column DataFrame.
#
# NOTE(review): only the leading integer is kept and the unit suffix is discarded, so a
# value given in MB would land on the same scale as one in GB, and any decimal part
# would be lost - confirm the raw columns use a single unit and whole numbers.
df_SmartphonesSales['Memory'] = (
    df_SmartphonesSales['Memory'].astype(str).str.extract(r'(\d+)', expand=False).astype(float)
)
df_SmartphonesSales['Storage'] = (
    df_SmartphonesSales['Storage'].astype(str).str.extract(r'(\d+)', expand=False).astype(float)
)

Ensure the conversion was successful

In [None]:
# Show up to 20 distinct values from each converted column to confirm they are numeric.
# Both results are printed explicitly: a notebook cell only auto-displays its final
# expression, so the 'Memory' values would otherwise never be shown.
print("Memory unique values:", df_SmartphonesSales['Memory'].unique()[:20])
print("Storage unique values:", df_SmartphonesSales['Storage'].unique()[:20])

Confirm all columns are numeric

In [None]:
# List the dtype of every column to confirm 'Memory' and 'Storage' are now numeric.
df_SmartphonesSales.dtypes

# **Imputing Missing Values**

**Mode Imputation for Categorical Columns (Memory and Storage)**

Finding the mode values for the Memory and Storage data

In [None]:
# Report the most frequent value of each column that is about to be mode-imputed.
for mode_column in ('Memory', 'Storage'):
    print(f"Mode of {mode_column}:", df_SmartphonesSales[mode_column].mode()[0])

Imputing the mode values for the Memory and Storage data

In [None]:
# Fill the gaps in 'Memory' and 'Storage' with each column's most frequent value.
for mode_column in ('Memory', 'Storage'):
    most_frequent = df_SmartphonesSales[mode_column].mode()[0]
    df_SmartphonesSales[mode_column] = df_SmartphonesSales[mode_column].fillna(most_frequent)

Label Encoding application for Categorical Variables

In [None]:
# Label encoding for categorical variables.

# Original code used for column names: 'Brands', 'Colors', 'Models'.
# This is to implement a safe mapping to handle slight naming differences.
def find_col(df, patterns):
    """Return the name of the first column in ``df`` that matches one of ``patterns``.

    Patterns are tried in the order given and compared case-insensitively. For each
    pattern an exact name match is preferred; only when no column has exactly that
    name is the first column whose name contains the pattern returned. This stops a
    column such as 'Brand Rating' from shadowing a later column named 'Brand'.

    Args:
        df: Any object exposing a ``columns`` iterable of column labels.
        patterns: Candidate column names, most preferred first.

    Returns:
        The matching label exactly as it appears in ``df.columns``, or ``None`` when
        no pattern matches any column.
    """
    # Lower-case each label once; str() keeps non-string labels (e.g. ints) from raising.
    lowered = [(col, str(col).lower()) for col in df.columns]
    for patt in patterns:
        needle = patt.lower()
        for col, low in lowered:
            if low == needle:
                return col
        for col, low in lowered:
            if needle in low:
                return col
    return None

# Resolve the actual column names for brand, model and colour (None when absent).
brand_col = find_col(df_SmartphonesSales, ['Brands', 'Brand', 'brand'])
model_col = find_col(df_SmartphonesSales, ['Models', 'Model', 'model'])
color_col = find_col(df_SmartphonesSales, ['Colors', 'Colour', 'Color'])

print("\nDetected categorical columns -> Brand:", brand_col, "Model:", model_col, "Color:", color_col)

# Replace each detected column with integer codes and keep the fitted encoder, keyed by
# column name, so the codes can be turned back into the original labels later.
label_encoders = {}
for categorical_col in (brand_col, model_col, color_col):
    if categorical_col is None:
        continue
    encoder = LabelEncoder()
    # Cast to str first so every value (including missing ones) is encoded as text.
    as_text = df_SmartphonesSales[categorical_col].astype(str)
    df_SmartphonesSales[categorical_col] = encoder.fit_transform(as_text)
    label_encoders[categorical_col] = encoder
    print(f"Label-encoded {categorical_col}: {len(encoder.classes_)} classes")

**Mean Imputation for Numerical Columns (Rating)**

Finding the mean for the Rating data

In [None]:
# Print the mean of 'Rating' (missing values are skipped); this is the value used to
# fill the gaps in the next cell.
print("Mean Rating:", df_SmartphonesSales['Rating'].mean())

Imputing the mean value for the Rating data

In [None]:
# Fill the missing 'Rating' values with the column mean.
# NOTE(review): the mean is taken over the whole dataset, before the modelling
# train/test split further down, so test rows contribute to the fill value.
df_SmartphonesSales['Rating'] = df_SmartphonesSales['Rating'].fillna(df_SmartphonesSales['Rating'].mean())

Scaling Numerical Features

In [None]:
from sklearn.preprocessing import StandardScaler

# Pick the numerical feature columns to standardise, skipping any that are not present.
numeric_cols = [
    c
    for c in ['Memory', 'Storage', 'Original Price', 'discount percentage', 'Discount', 'Rating']
    if c in df_SmartphonesSales.columns
]
print("\nNumeric columns selected for scaling:", numeric_cols)

# Standardise the selected columns in place (zero mean, unit variance).
# NOTE(review): the scaler is fitted on the full dataset, before the train/test split.
scaler = StandardScaler()
scaler.fit(df_SmartphonesSales[numeric_cols])
df_SmartphonesSales[numeric_cols] = scaler.transform(df_SmartphonesSales[numeric_cols])

Data imputation check

Verifying Missing Values After Cleaning

In [None]:
# Last sanity check before modelling: column dtypes and remaining missing-value counts.
print("\nData types after preprocessing:", df_SmartphonesSales.dtypes, sep="\n")
print("\nAny remaining missing values?", df_SmartphonesSales.isnull().sum(), sep="\n")

# Predictive Modelling / Model Development

**Modelling Process**

Separate data

In [None]:
# Separate features and target.
target = 'Selling Price'
if target not in df_SmartphonesSales.columns:
    raise KeyError(f"Target column '{target}' not found in dataset")

# Convert the 'Camera' column to numerical values: 1 for Yes, 0 for No.
# The dtype guard makes the cell safe to re-run: mapping an already-numeric column
# through the Yes/No dictionary would turn every value into NaN.
if 'Camera' in df_SmartphonesSales.columns and not pd.api.types.is_numeric_dtype(df_SmartphonesSales['Camera']):
    camera_raw = df_SmartphonesSales['Camera']
    camera_encoded = camera_raw.map({'Yes': 1, 'No': 0})
    # Values that were present but are neither 'Yes' nor 'No' become NaN; report them
    # instead of letting them surface later as a model-fitting error.
    unmapped_count = int((camera_encoded.isna() & camera_raw.notna()).sum())
    if unmapped_count:
        print(f"Warning: {unmapped_count} 'Camera' value(s) were neither 'Yes' nor 'No' and became NaN.")
    df_SmartphonesSales['Camera'] = camera_encoded
    print("Converted 'Camera' column to numerical (1=Yes, 0=No).")

# Drop textual duplicate column 'Mobile' if it exists, as it's represented by brand+model.
mobile_col = find_col(df_SmartphonesSales, ['Mobile', 'mobile'])
drop_cols = [mobile_col] if mobile_col is not None else []

# Create feature matrix X and target vector y.
# NOTE(review): 'Original Price', 'Discount' and 'discount percentage' all stay in X. If
# 'Discount' is the original price minus the selling price, the target can be
# reconstructed exactly from the features - confirm this is intended.
X = df_SmartphonesSales.drop(columns=[target] + drop_cols)
y = df_SmartphonesSales[target].copy()

print("\nFeatures used for modeling:", X.columns.tolist())

In [None]:
# Perform a train-test split (80/20).
from sklearn.model_selection import train_test_split

# 80/20 split of the final feature matrix and target; the fixed seed keeps it repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Report the (rows, columns) shape of each feature split.
for split_label, split_features in (("Training samples:", X_train), ("Testing samples:", X_test)):
    print(split_label, split_features.shape)

Linear Regression Training

In [None]:
# Train a Linear Regression Model as a baseline.
from sklearn.linear_model import LinearRegression


# Baseline model: ordinary least squares with default settings, fitted on the training split.
lr_model = LinearRegression().fit(X_train, y_train)

# Predict on both splits so train and test errors can be compared.
y_train_pred_lr, y_test_pred_lr = (lr_model.predict(part) for part in (X_train, X_test))

Evaluating the Linear Regression on “Seen” (Train) and “Unseen” (Test) Data

In [None]:
# Evaluation for Linear Regression.
def eval_regression(y_true, y_pred, label="Model"):
    """Print MAE, RMSE and R² for a set of predictions and return them.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values, aligned with ``y_true``.
        label: Name shown in the header line of the printed report.

    Returns:
        A ``(mae, rmse, r2)`` tuple.
    """
    # RMSE is the square root of the mean squared error.
    metrics = (
        mean_absolute_error(y_true, y_pred),
        np.sqrt(mean_squared_error(y_true, y_pred)),
        r2_score(y_true, y_pred),
    )
    mae, rmse, r2 = metrics
    print(
        f"\n{label} performance:",
        f"MAE : {mae:.2f}",
        f"RMSE: {rmse:.2f}",
        f"R²  : {r2:.4f}",
        sep="\n",
    )
    return metrics

# Score the baseline on the data it was fitted on and on the held-out split.
eval_regression(y_train, y_train_pred_lr, "Linear Regression (TRAIN)")
eval_regression(y_test, y_test_pred_lr, "Linear Regression (TEST)")

# Residuals (actual minus predicted) against the prediction, test set only; the dashed
# red line marks zero error.
residuals_lr = y_test - y_test_pred_lr
plt.figure(figsize=(6,4))
residual_ax = sns.scatterplot(x=y_test_pred_lr, y=residuals_lr, alpha=0.6)
residual_ax.axhline(0, color='red', linestyle='--')
residual_ax.set_xlabel("Predicted (₹)")
residual_ax.set_ylabel("Residuals")
residual_ax.set_title("Residuals vs Predicted - Linear Regression (Test)")
plt.show()

Random Forest Training

In [None]:
# Train a Random Forest Regressor model.
from sklearn.ensemble import RandomForestRegressor

# Random Forest settings: 100 trees, a fixed seed for reproducibility, all CPU cores.
rf_settings = {
    'n_estimators': 100,
    'random_state': 42,
    'n_jobs': -1,
}
rf_model = RandomForestRegressor(**rf_settings)
rf_model.fit(X_train, y_train)

# Predict on both splits so train and test errors can be compared.
y_train_pred_rf = rf_model.predict(X_train)
y_test_pred_rf = rf_model.predict(X_test)

Evaluating the Random Forest on “Seen” (Train) and “Unseen” (Test) Data

In [None]:
# Score the Random Forest on the data it was fitted on and on the held-out split.
eval_regression(y_train, y_train_pred_rf, "Random Forest (TRAIN)")
eval_regression(y_test, y_test_pred_rf, "Random Forest (TEST)")

# Predicted against actual price on the test set; points on the dashed red diagonal
# are perfect predictions.
diagonal = [y_test.min(), y_test.max()]
plt.figure(figsize=(6,6))
fit_ax = sns.scatterplot(x=y_test, y=y_test_pred_rf, alpha=0.6)
fit_ax.plot(diagonal, diagonal, 'r--')
fit_ax.set_xlabel("Actual Selling Price")
fit_ax.set_ylabel("Predicted Selling Price (₹)")
fit_ax.set_title("Actual vs Predicted - Random Forest (Test)")
plt.show()

Model Improvement

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated R² for the Random Forest over the full feature matrix.
cv_scores = cross_val_score(estimator=rf_model, X=X, y=y, scoring='r2', cv=5, n_jobs=-1)
print("\nRandom Forest 5-fold CV R² scores:", np.round(cv_scores, 4))
mean_cv_r2 = cv_scores.mean()
print("Mean CV R²:", np.round(mean_cv_r2, 4))

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score

# Hyperparameter tuning (GridSearchCV).
# Set do_grid_search to False to skip the search, which refits the forest once per
# parameter combination and fold.
do_grid_search = True
if do_grid_search:
    param_grid = {
        'n_estimators': [100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5]
    }
    # 3-fold search on the training split only, selecting by mean absolute error.
    grid = GridSearchCV(RandomForestRegressor(random_state=42, n_jobs=-1),
                        param_grid, scoring='neg_mean_absolute_error', cv=3, n_jobs=-1)
    grid.fit(X_train, y_train)

    # Reporting stays inside the guard: `grid` only exists when the search has run, so
    # using it unconditionally would raise NameError with do_grid_search=False.
    print("\nGridSearch best params:", grid.best_params_)
    best_rf = grid.best_estimator_
    # Evaluate the best estimator from GridSearch on the held-out test split.
    y_test_pred_best = best_rf.predict(X_test)
    eval_regression(y_test, y_test_pred_best, "Random Forest (GridSearch Best)")
else:
    print("\nGrid search skipped (do_grid_search=False).")

In [None]:
# Report the tuned Random Forest.
# NOTE: this cell repeats the reporting statements found at the end of the grid-search
# cell above and needs the `grid` object created there.
print("\nGridSearch best params:", grid.best_params_)
best_rf = grid.best_estimator_
# Evaluate the best estimator from GridSearch on the held-out test split.
y_test_pred_best = best_rf.predict(X_test)
eval_regression(y_test, y_test_pred_best, "Random Forest (GridSearch Best)")

In [None]:
# Side-by-side test-set R² of the two untuned models.
lr_test_r2 = r2_score(y_test, y_test_pred_lr)
rf_test_r2 = r2_score(y_test, y_test_pred_rf)

print("\n--- Final Summary ---")
print(f"Linear Regression Test R²: {lr_test_r2:.4f}")
print(f"Random Forest Test R²:     {rf_test_r2:.4f}")
print("\nIf RF R² >> LR R², the pricing relationship is likely non-linear and benefits from the ensemble model.")

Linear Regression Algorithm

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Linear Regression error metrics on the training data (the data the model was fitted on).
lr_train_pred = lr_model.predict(X_train)

lr_mae = mean_absolute_error(y_train, lr_train_pred)
lr_rmse = np.sqrt(mean_squared_error(y_train, lr_train_pred))
lr_r2 = r2_score(y_train, lr_train_pred)

print("Linear Regression (Training Data)")
for metric_label, metric_value in (("MAE:", lr_mae), ("RMSE:", lr_rmse), ("R² Score:", lr_r2)):
    print(metric_label, metric_value)

Perform Random Forest Regressor

In [None]:
# Random Forest error metrics on the training data (the data the model was fitted on).
rf_train_pred = rf_model.predict(X_train)

rf_mae = mean_absolute_error(y_train, rf_train_pred)
rf_rmse = np.sqrt(mean_squared_error(y_train, rf_train_pred))
rf_r2 = r2_score(y_train, rf_train_pred)

print("\nRandom Forest (Training Data)")
for metric_label, metric_value in (("MAE:", rf_mae), ("RMSE:", rf_rmse), ("R² Score:", rf_r2)):
    print(metric_label, metric_value)

# Algorithms Comparison

## Generate Cross-Validation Scores for Linear Regression


In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation for Linear Regression over the full feature matrix.
# The scores are R² (scoring='r2'), so the printed labels say "R²" rather than "R".
lr_cv_scores = cross_val_score(lr_model, X, y, cv=5, scoring='r2', n_jobs=-1)
print("\nLinear Regression 5-fold CV R² scores:", np.round(lr_cv_scores, 4))
print("Mean CV R²:", np.round(lr_cv_scores.mean(), 4))

## Generate Cross-Validation Scores for Random Forest

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation for Random Forest over the full feature matrix.
# The scores are R² (scoring='r2'), so the printed labels say "R²" rather than "R".
rf_cv_scores = cross_val_score(rf_model, X, y, cv=5, scoring='r2', n_jobs=-1)
print("\nRandom Forest 5-fold CV R² scores:", np.round(rf_cv_scores, 4))
print("Mean CV R²:", np.round(rf_cv_scores.mean(), 4))

In [None]:
# One box per model, built from the five per-fold R² scores of each.
cv_scores_df = pd.DataFrame(
    {'Linear Regression': lr_cv_scores, 'Random Forest': rf_cv_scores}
)
cv_ax = sns.boxplot(data=cv_scores_df)
cv_ax.set_title('5-fold Cross-Validation R² Scores Comparison')
cv_ax.set_ylabel('R² Score')
cv_ax.set_xlabel('Model')
plt.show()

## EDA Questions with Visualizations

To fully address this project's EDA questions, I implemented the following visualisations:

-   **Q1: How do specifications like RAM, storage, and camera quality affect the price?**
    -   Provides visualisations showing the relationship between 'Memory', 'Storage', 'Camera' and 'Selling Price'.
-   **Q2: Outliers discussion.**
    -   Provides a markdown explanation regarding the identified outliers and their non-treatment given the model performance.
-   **Q4: Does brand influence pricing independently of technical specifications?**
    -   Provides a visualisation showing the distribution of 'Selling Price' across different brands.
-   **Q5: How do discounts interact with selling price?**
    -   Provides a visualisation directly showing the interaction between the 'Discount' amount and 'Selling Price'.

### Q1: Relationship between Specifications (Memory, Storage, Camera) and Selling Price

In [None]:
# Q1 answer
# 'Memory' and 'Storage' in df_SmartphonesSales have been standardised by this point,
# so plotting them would put z-scores on axes labelled in GB. The two specification
# plots therefore read from df_temp_for_plots, which holds the cleaned but unscaled
# values (its 'Memory' and 'Storage' columns are cleaned in the scatter-plot cells of
# the bivariate analysis, which must have been run first).

# Relationship between Memory and Selling Price
plt.figure(figsize=(12, 6))
sns.boxplot(x='Memory', y='Selling Price', data=df_temp_for_plots)
plt.title('Selling Price by Memory (GB)')
plt.xlabel('Memory (GB)')
plt.ylabel('Selling Price (₹)')
plt.xticks(rotation=45)
plt.show()

# Relationship between Storage and Selling Price
plt.figure(figsize=(12, 6))
sns.boxplot(x='Storage', y='Selling Price', data=df_temp_for_plots)
plt.title('Selling Price by Storage (GB)')
plt.xlabel('Storage (GB)')
plt.ylabel('Selling Price (₹)')
plt.xticks(rotation=45)
plt.show()

# Relationship between Camera and Selling Price
# 'Camera' is 0/1 encoded and is not among the scaled columns, so the main DataFrame
# can be used directly here.
plt.figure(figsize=(6, 5))
sns.boxplot(x='Camera', y='Selling Price', data=df_SmartphonesSales)
plt.title('Selling Price by Camera Availability')
plt.xlabel('Camera (1=Yes, 0=No)')
plt.ylabel('Selling Price (₹)')
plt.xticks([0, 1], ['No', 'Yes'])
plt.show()

### Q2: Discussion on Outliers

The discussion for Q2 is provided in the report.

### Q3: Which features are most correlated with the selling price?

In [None]:
# Q3 answer
# Importance of each feature according to the fitted Random Forest, sorted ascending so
# the most important feature ends up at the top of the horizontal bar chart.
feat_names = X.columns
importances = rf_model.feature_importances_
feat_imp = pd.Series(data=importances, index=feat_names).sort_values(ascending=True)

plt.figure(figsize=(8,6))
feat_imp.plot.barh()
plt.title("Feature Importance (Random Forest)")
plt.xlabel("Importance")
plt.show()

# Same data as a table, largest first, limited to ten rows.
print("\nTop features by importance:")
top_ten = feat_imp.sort_values(ascending=False).head(10)
print(top_ten)

### Q4: Does brand influence pricing independently of technical specifications?

In [None]:
# Q4 answer
# Brand names were label-encoded for modelling; they are decoded here, for the plot
# only, so the axis shows readable names instead of integer codes.

# Retrieve the LabelEncoder used for the brand column. It is looked up under the
# detected column name (`brand_col`), which is the key it was stored under in
# `label_encoders`, rather than a hard-coded 'Brands'.
original_brands_le = label_encoders.get(brand_col)

if original_brands_le is not None:
    # Create a temporary DataFrame for plotting with original brand names
    temp_df = df_SmartphonesSales.copy()
    temp_df['Brands_decoded'] = original_brands_le.inverse_transform(temp_df[brand_col])

    # Plotting: Selling Price by Brand (top N brands for clarity)
    # Get top 10 brands by average selling price for better visualisation
    top_brands = temp_df.groupby('Brands_decoded')['Selling Price'].mean().nlargest(10).index
    plt.figure(figsize=(14, 7))
    sns.boxplot(x='Brands_decoded', y='Selling Price', data=temp_df[temp_df['Brands_decoded'].isin(top_brands)], order=top_brands)
    plt.title('Selling Price Distribution by Top 10 Brands')
    plt.xlabel('Brand')
    plt.ylabel('Selling Price (₹)')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()
else:
    print("LabelEncoder for 'Brands' not found or not created. Cannot decode for visualization.")

### Q5: How do discounts interact with selling price?

In [None]:
# Q5 answer
# 'Discount' and 'discount percentage' in df_SmartphonesSales have been standardised by
# this point, so plotting them would put z-scores on axes labelled in ₹ / percent.
# Both plots therefore read from df_temp_for_plots, which keeps the unscaled values.

# Relationship between Discount amount and Selling Price
plt.figure(figsize=(10, 6))
sns.scatterplot(x='Discount', y='Selling Price', data=df_temp_for_plots, alpha=0.6)
plt.title('Interaction between Discount Amount and Selling Price')
plt.xlabel('Discount (₹)')
plt.ylabel('Selling Price (₹)')
plt.show()

# It's also relevant to visualise how discount percentage relates to selling price,
# even though the 'Original Price' is the primary driver here.
# This helps to see whether higher discount percentages go with lower selling prices:
plt.figure(figsize=(10, 6))
sns.scatterplot(x='discount percentage', y='Selling Price', data=df_temp_for_plots, alpha=0.6)
plt.title('Interaction between Discount Percentage and Selling Price')
plt.xlabel('Discount Percentage')
plt.ylabel('Selling Price (₹)')
plt.show()