Load the dataset from the uploaded CSV file



In [2]:
import pandas as pd


# Read the CSV (path is relative, so it is resolved against the working directory) into `df`.
df = pd.read_csv(filepath_or_buffer='BDSMARTS.csv')

# Only reached when read_csv did not raise, i.e. the file was found and parsed.
print("✅ Dataset loaded successfully!")

✅ Dataset loaded successfully!


Data overview (first 5 rows, column summary, and descriptive statistics for the whole dataset)

In [3]:

# Preview: the first five records of the frame.
print("--- First 5 Rows ---", df.head(), sep="\n")

# Structure: column names, non-null counts and dtypes (info() writes to stdout itself).
print("\n--- Dataset Info ---")
df.info()

# Summary statistics as produced by describe() with its default settings.
print("\n--- Numerical Stats ---", df.describe(), sep="\n")

--- First 5 Rows ---
     Brand Released Year          OS Display (inches) Display (pixels) Camera  \
0   Doogee          2020  Android 10             6.3"        1080x2340  2.1MP   
1  Ulefone          2021  Android 11            6.52"        1080x2400   20MP   
2  Oukitel          2023  Android 13            6.58"        1080x2408   48MP   
3  Oukitel          2023  Android 13            6.58"        1080x2408   48MP   
4      ZTE          2023  Android 13            12.1"        1600x2560   13MP   

   Camera Resolution   RAM Battery Capacity       Price  
0               1080   6GB         10000mAh  ৳18,000.00  
1               2160   4GB         10000mAh  ৳16,000.00  
2               2160  16GB         10000mAh  ৳35,000.00  
3               2160   8GB         10000mAh  ৳35,000.00  
4               1080  16GB         10000mAh  ৳60,000.00  

--- Dataset Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3338 entries, 0 to 3337
Data columns (total 10 columns):
 #   Column    

In [5]:


# Report the frame's dimensions prior to removing incomplete rows.
print(f"Shape before dropping missing value: {df.shape}")

# Discard, in place, every row in which at least one column is missing.
df.dropna(axis=0, how='any', inplace=True)

# Report the dimensions again so the number of removed rows is visible.
print(f"Shape after dropping missing value: {df.shape}")

# info() should now list the same non-null count for every column.
print("\n--- Cleaned Data Info (No Missing Values) ---")
df.info()

Shape before dropping missing value: (3337, 10)
Shape after dropping missing value: (3337, 10)

--- Cleaned Data Info (No Missing Values) ---
<class 'pandas.core.frame.DataFrame'>
Index: 3337 entries, 0 to 3337
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Brand              3337 non-null   object
 1   Released Year      3337 non-null   object
 2   OS                 3337 non-null   object
 3   Display (inches)   3337 non-null   object
 4   Display (pixels)   3337 non-null   object
 5   Camera             3337 non-null   object
 6   Camera Resolution  3337 non-null   int64 
 7   RAM                3337 non-null   object
 8   Battery Capacity   3337 non-null   object
 9   Price              3337 non-null   object
dtypes: int64(1), object(9)
memory usage: 286.8+ KB


In [11]:


# Function to safely clean and convert a text column to numeric
def clean_and_convert_numeric(df, column, char_to_remove, new_dtype):
    """Strip unwanted substrings from a text column and convert it to numbers.

    The column is modified on the DataFrame that is passed in, and that same
    DataFrame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the column to clean.
    column : str
        Name of the column to clean.
    char_to_remove : str, iterable of str, or None
        Literal substring(s) to delete from every value (no regex). Pass a
        list/tuple such as ['৳', ','] to remove several symbols in a single
        call; all of them are removed BEFORE the numeric conversion, so values
        like '৳18,000.00' are parsed correctly.
    new_dtype : type
        Accepted for backward compatibility only. The resulting dtype is the
        one inferred by ``pd.to_numeric``; this argument is not applied.

    Returns
    -------
    pandas.DataFrame
        ``df`` with ``column`` converted. Values that still cannot be parsed
        after the removal become NaN (``errors='coerce'``).
    """
    values = df[column]

    # Only text-like columns are cleaned. Besides the classic 'object' dtype this
    # also covers pandas' dedicated string dtypes; everything else is left alone.
    is_text = pd.api.types.is_object_dtype(values) or pd.api.types.is_string_dtype(values)
    if not is_text:
        print(f"Column '{column}' is already of type {df[column].dtype}. Skipping cleaning.")
        return df

    # Normalise the argument to a list of non-empty literal substrings.
    if char_to_remove is None:
        substrings = []
    elif isinstance(char_to_remove, str):
        substrings = [char_to_remove]
    else:
        substrings = list(char_to_remove)

    cleaned = values.astype(str)
    for substring in substrings:
        if substring:
            cleaned = cleaned.str.replace(substring, '', regex=False)

    # Convert once, after every substring is gone; unparseable values become NaN.
    df[column] = pd.to_numeric(cleaned, errors='coerce')
    return df

# Clean the Price column.
# Values look like '৳18,000.00'. The thousands separator must be removed BEFORE
# the numeric conversion: clean_and_convert_numeric() coerces the column to
# numbers on its first call, so a value that still contains ',' would turn into
# NaN, and a later "remove the comma" call would be skipped because the column
# is already numeric by then.
if not pd.api.types.is_numeric_dtype(df['Price']):
    df['Price'] = df['Price'].astype(str).str.replace(',', '', regex=False)
df = clean_and_convert_numeric(df, 'Price', '৳', float)

print("Cleaned 'Price' column. First 5 values:")
print(df['Price'].head())


# Clean the RAM column (strip the 'GB' unit suffix)
df = clean_and_convert_numeric(df, 'RAM', 'GB', int)

print("\nCleaned 'RAM' column. First 5 values:")
print(df['RAM'].head())


# Clean the Battery Capacity column (strip the 'mAh' unit suffix)
df = clean_and_convert_numeric(df, 'Battery Capacity', 'mAh', int)

print("\nCleaned 'Battery Capacity' column. First 5 values:")
print(df['Battery Capacity'].head())


# Clean the Display (inches) column
# Remove the inch symbol ("). The helper removes it as a literal substring
# (regex=False), so no escaping is needed.
df = clean_and_convert_numeric(df, 'Display (inches)', '"', float)

print("\nCleaned 'Display (inches)' column. First 5 values:")
print(df['Display (inches)'].head())


# Clean the Released Year column
# There is nothing to strip here; the column is converted directly, and any
# value that is not a number becomes NaN (errors='coerce').
if not pd.api.types.is_numeric_dtype(df['Released Year']):
    df['Released Year'] = pd.to_numeric(df['Released Year'], errors='coerce')
else:
    print(f"Column 'Released Year' is already of type {df['Released Year'].dtype}. Skipping cleaning.")


print("\nCleaned 'Released Year' column. First 5 values:")
print(df['Released Year'].head())

Column 'Price' is already of type float64. Skipping cleaning.
Column 'Price' is already of type float64. Skipping cleaning.
Cleaned 'Price' column. First 5 values:
0    18000.0
1    16000.0
2    35000.0
3    35000.0
4    60000.0
Name: Price, dtype: float64
Column 'RAM' is already of type float64. Skipping cleaning.

Cleaned 'RAM' column. First 5 values:
0     6.0
1     4.0
2    16.0
3     8.0
4    16.0
Name: RAM, dtype: float64
Column 'Battery Capacity' is already of type int64. Skipping cleaning.

Cleaned 'Battery Capacity' column. First 5 values:
0    10000
1    10000
2    10000
3    10000
4    10000
Name: Battery Capacity, dtype: int64

Cleaned 'Display (inches)' column. First 5 values:
0     6.30
1     6.52
2     6.58
3     6.58
4    12.10
Name: Display (inches), dtype: float64

Cleaned 'Released Year' column. First 5 values:
0    2020.0
1    2021.0
2    2023.0
3    2023.0
4    2023.0
Name: Released Year, dtype: float64


In [12]:

# Re-inspect dtypes and non-null counts after the numeric conversion. A non-null
# count below the number of rows means some values could not be parsed and were
# coerced to NaN.
print("\n--- Data Info After Cleaning ---")
df.info()


--- Data Info After Cleaning ---
<class 'pandas.core.frame.DataFrame'>
Index: 3337 entries, 0 to 3337
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Brand              3337 non-null   object 
 1   Released Year      3310 non-null   float64
 2   OS                 3337 non-null   object 
 3   Display (inches)   3311 non-null   float64
 4   Display (pixels)   3337 non-null   object 
 5   Camera             3337 non-null   object 
 6   Camera Resolution  3337 non-null   int64  
 7   RAM                3337 non-null   float64
 8   Battery Capacity   3337 non-null   int64  
 9   Price              3337 non-null   float64
dtypes: float64(4), int64(2), object(4)
memory usage: 286.8+ KB


In [13]:
# One-Hot Encoding

print(f"Original shape of the data: {df.shape}")

# Encode only the categorical columns that are still present. `df` is overwritten
# below, so on a second run of this cell 'Brand' and 'OS' no longer exist and an
# unguarded pd.get_dummies(..., columns=['Brand', 'OS']) call would raise a KeyError.
categorical_columns = [name for name in ['Brand', 'OS'] if name in df.columns]

if categorical_columns:
    # drop_first=True drops one indicator per encoded column, which avoids
    # perfectly collinear dummy columns (multicollinearity).
    df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

print(f"New shape of the data after One-Hot Encoding: {df.shape}")

# Preview the new columns. set_option lifts the column display limit for the
# rest of the session, not just for this cell.
print("\n--- First 5 rows of the new DataFrame ---")
pd.set_option('display.max_columns', None)
print(df.head())

Original shape of the data: (3337, 10)
New shape of the data after One-Hot Encoding: (3337, 141)

--- First 5 rows of the new DataFrame ---
   Released Year  Display (inches) Display (pixels) Camera  Camera Resolution  \
0         2020.0              6.30        1080x2340  2.1MP               1080   
1         2021.0              6.52        1080x2400   20MP               2160   
2         2023.0              6.58        1080x2408   48MP               2160   
3         2023.0              6.58        1080x2408   48MP               2160   
4         2023.0             12.10        1600x2560   13MP               1080   

    RAM  Battery Capacity    Price  Brand_Alcatel  Brand_Allview  Brand_Apple  \
0   6.0             10000  18000.0          False          False        False   
1   4.0             10000  16000.0          False          False        False   
2  16.0             10000  35000.0          False          False        False   
3   8.0             10000  35000.0          False

In [14]:
# Drop the remaining text columns that were not converted to numbers.
# errors='ignore' keeps this cell re-runnable: the drop happens in place, so on a
# second run the columns are already gone and a plain drop would raise a KeyError.
df.drop(columns=['Display (pixels)', 'Camera'], errors='ignore', inplace=True)
print("\n--- Final DataFrame Info ---")
df.info()



--- Final DataFrame Info ---
<class 'pandas.core.frame.DataFrame'>
Index: 3337 entries, 0 to 3337
Columns: 139 entries, Released Year to OS_iOS 9.3.2
dtypes: bool(133), float64(4), int64(2)
memory usage: 615.9 KB


In [16]:
# Define Features and Target

# Target vector: the 'Price' column is the value to be predicted.
y = df['Price']

# Feature matrix: every column of the frame except the target.
X = df.drop(columns='Price')

# Show the feature column names and the matrix dimensions.
print("--- Features (X) ---", X.columns, sep="\n")
print(f"\nShape of X: {X.shape}")

# Show the first few target values.
print("\n--- Target (y) ---", y.head(), sep="\n")

--- Features (X) ---
Index(['Released Year', 'Display (inches)', 'Camera Resolution', 'RAM',
       'Battery Capacity', 'Brand_Alcatel', 'Brand_Allview', 'Brand_Apple',
       'Brand_Archos', 'Brand_Asus',
       ...
       'OS_iOS 14', 'OS_iOS 14.1', 'OS_iOS 15', 'OS_iOS 16', 'OS_iOS 17',
       'OS_iOS 6', 'OS_iOS 7', 'OS_iOS 8', 'OS_iOS 9', 'OS_iOS 9.3.2'],
      dtype='object', length=138)

Shape of X: (3337, 138)

--- Target (y) ---
0    18000.0
1    16000.0
2    35000.0
3    35000.0
4    60000.0
Name: Price, dtype: float64


In [25]:
# Select and Train the Model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# The numeric conversion earlier in the notebook uses errors='coerce', which can
# leave NaN in the features, and LinearRegression refuses NaN input. Instead of
# relying on another cell having removed them, keep only fully populated rows
# here. When X and y are already free of missing values this selects every row,
# so the split below is unchanged.
complete_rows = X.notna().all(axis=1) & y.notna()
if not complete_rows.all():
    print(f"Excluding {int((~complete_rows).sum())} rows with missing values before training.")

# Split the data into training (80%) and testing (20%) sets.
# random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X[complete_rows], y[complete_rows], test_size=0.2, random_state=42
)

# Create the Linear Regression model blueprint
lr_model = LinearRegression()

# Train the model using the training data
lr_model.fit(X_train, y_train)

print("\n✅ Model has been trained successfully!")


✅ Model has been trained successfully!


In [26]:
# Evaluate the Model ---
from sklearn.metrics import mean_absolute_error, r2_score

# Predict prices for the held-out test rows.
y_pred = lr_model.predict(X_test)

# Score the predictions against the true test prices.
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

# Print the evaluation report, one argument per output line.
print(
    "\n================= MODEL EVALUATION =================",
    f"Mean Absolute Error (MAE): {mae:.2f} BDT",
    f"This means, on average, our model's price prediction is off by ~{int(mae)} Taka.",
    f"\nR-squared (R²): {r2:.2f}",
    f"This means our model can explain about {r2:.0%} of the variation in smartphone prices.",
    "==================================================",
    sep="\n",
)

# Put true and predicted prices side by side and show the first ten rows.
comparison_df = pd.DataFrame(data={'Actual Price': y_test, 'Predicted Price': y_pred})
print("\n--- Sample of Predictions vs. Actual Prices ---", comparison_df.head(10), sep="\n")


Mean Absolute Error (MAE): 12705.10 BDT
This means, on average, our model's price prediction is off by ~12705 Taka.

R-squared (R²): 0.61
This means our model can explain about 61% of the variation in smartphone prices.

--- Sample of Predictions vs. Actual Prices ---
      Actual Price  Predicted Price
2252       35000.0     37592.028630
1105       38000.0     60363.448397
1688       22000.0     31141.014266
1684       42000.0     57098.039529
1481       11990.0     11111.635721
1641       26990.0     30071.730016
883        38990.0     45604.183880
247        15000.0     39826.904825
2622       20000.0     31955.887148
2712        8590.0     10542.116664
