In [1]:
import pandas as pd
from pandas import json_normalize
from sklearn.ensemble import RandomForestRegressor

In [2]:
df = pd.read_json('datasets/evomag_2024_11_13.json')

In [3]:
def extract_smartphones(df):
    """
    Extract all products that have "Smartphone": "Da" in their specifications column.
    
    Parameters:
    df (pandas.DataFrame): DataFrame containing product information with a 'specifications' column
                          that contains dictionaries with product specs
    
    Returns:
    pandas.DataFrame: A new DataFrame containing only smartphone products
    """
    # Create a mask to filter products where specifications contains "Smartphone": "Da"
    smartphone_mask = df['specifications'].apply(
        lambda specs: isinstance(specs, dict) and specs.get('Smartphone') == 'Da'
    )
    
    # Apply the mask to get only smartphone products
    smartphones_df = df[smartphone_mask].copy()
    
    return smartphones_df

# Example usage:
# smartphones = extract_smartphones(df)
# print(f"Found {len(smartphones)} smartphones out of {len(df)} total products")


In [4]:
import pandas as pd
from pandas import json_normalize

def flatten_json_column(df, json_column):
    """
    Flatten a JSON column in a DataFrame so that the fields become separate columns.
    
    Parameters:
    -----------
    df : pandas.DataFrame
        The DataFrame containing the JSON column to flatten
    json_column : str
        The name of the column containing the JSON data to flatten
        
    Returns:
    --------
    pandas.DataFrame
        A new DataFrame with the JSON column flattened into separate columns
    """
    # Create a copy to avoid modifying the original DataFrame
    result_df = df.copy()
    
    # Check if the JSON column exists in the DataFrame
    if json_column not in result_df.columns:
        raise ValueError(f"Column '{json_column}' not found in DataFrame")
    
    # Normalize the JSON column
    try:
        # Handle cases where some rows might have None/NaN values in the JSON column
        mask = result_df[json_column].notna()
        
        if mask.any():
            # Apply json_normalize only to rows that have valid JSON
            normalized_df = json_normalize(result_df.loc[mask, json_column])
            
            # Drop the original JSON column from the result
            result_subset = result_df.loc[mask].drop(json_column, axis=1)
            
            # Combine the original DataFrame (minus the JSON column) with the normalized data
            flattened_subset = pd.concat([result_subset.reset_index(drop=True), 
                                          normalized_df.reset_index(drop=True)], 
                                         axis=1)
            
            # Merge back with rows that had None/NaN values
            if (~mask).any():
                result_df = pd.concat([flattened_subset, 
                                       result_df.loc[~mask]]).sort_index()
            else:
                result_df = flattened_subset
        
        return result_df
        
    except Exception as e:
        raise ValueError(f"Error flattening JSON column: {str(e)}")

In [5]:
df_smartphone = extract_smartphones(df)
df_smartphone_normalised = flatten_json_column(df_smartphone, 'specifications')

In [6]:
df_smartphone_normalised.head()

Unnamed: 0,timestamp,name,price,rating,number_of_reviews,is_in_stoc,url,product_code,online_mag,manufacturer,...,Editie,Model Procesor,Tip incarcator,Rezistent la apa si praf,Frecventa (MHz),Ecran secundar,Versiunea terminalului,Blitz Camera Fata,DNLA,Limba utilizare
0,1976-05-31 17:19:00.029,"Telefon Mobil Motorola Moto G24, Procesor Octa...",439.99,0,0,1,https://www.evomag.ro/telefoane-tablete-acceso...,PB180003PL,evomag,Motorola,...,,,,,,,,,,
1,1976-05-31 17:19:00.029,"Telefon Mobil Apple iPhone 16 Pro Max, LTPO Su...",7399.99,0,0,1,https://www.evomag.ro/telefoane-tablete-acceso...,4181235,evomag,Apple,...,,,,,,,,,,
2,1976-05-31 17:19:00.029,"Telefon Mobil Samsung Galaxy A05s, Procesor Oc...",549.99,0,0,1,https://www.evomag.ro/telefoane-tablete-acceso...,SM-A057GZKUEUE,evomag,Samsung,...,,,,,,,,,,
3,1976-05-31 17:19:00.029,"Telefon Mobil Xiaomi 13T Pro, Procesor Mediate...",2499.99,5,10,1,https://www.evomag.ro/telefoane-tablete-acceso...,4121640,evomag,Xiaomi,...,,,,,,,,,,
4,1976-05-31 17:19:00.029,"Telefon Mobil Apple iPhone 16 Pro Max, LTPO Su...",7499.99,0,0,1,https://www.evomag.ro/telefoane-tablete-acceso...,4181250,evomag,Apple,...,Natural Titanium,,,,,,,,,


Raw features that we use in prediction

In [7]:
df_model_training = pd.DataFrame()
df_model_training['5G'] = df_smartphone_normalised['5G']
df_model_training['4G'] = df_smartphone_normalised['4G']

df_model_training[['resolution width', 'resolution height']] = df_smartphone_normalised['Rezolutie maxima (px)'].str.split(' x ', expand=True)

df_model_training['Diagonala'] = df_smartphone_normalised['Diagonala (inch)']
df_model_training['Numar nuclee'] = df_smartphone_normalised['Numar nuclee']
df_model_training['Memorie Flash'] = df_smartphone_normalised['Memorie Flash']
df_model_training['Memorie RAM'] = df_smartphone_normalised['Memorie RAM']
df_model_training['Incarcare Wireless'] = df_smartphone_normalised['Incarcare Wireless']
df_model_training['Capacitate Baterie'] = df_smartphone_normalised['Capacitate'] 
df_model_training['Dual SIM'] = df_smartphone_normalised['Dual SIM']
# df_model_training['Manufacturer'] = df_smartphone_normalised['manufacturer']
df_model_training['price'] = df_smartphone_normalised['price']

df_model_training.head()

Unnamed: 0,5G,4G,resolution width,resolution height,Diagonala,Numar nuclee,Memorie Flash,Memorie RAM,Incarcare Wireless,Capacitate Baterie,Dual SIM,price
0,,Da,1612.0,720.0,6.56,8 (Octa Core),128 GB,4 GB,,5000 mAh,Da,439.99
1,Da,,1320.0,2868.0,6.9,6 (Hexa-Core),256 GB,,Da,,,7399.99
2,,Da,1080.0,2400.0,6.71,8 (Octa Core),64 GB,4 GB,,5000 mAh,Da,549.99
3,Da,,,,6.67,8 (Octa Core),512 GB,12 GB,,5000 mAh,Da,2499.99
4,Da,,1320.0,2868.0,6.9,6 (Hexa-Core),256 GB,,Da,,,7499.99


Cleaned up features that we use in predicitons

In [8]:
df_model_training['5G'].fillna(0, inplace=True)
df_model_training['5G'].replace('Da', 1, inplace=True)
df_model_training['5G'].replace('Nu', 0, inplace=True)


df_model_training['4G'].fillna(0, inplace=True)
df_model_training['4G'].replace('Da', 1, inplace=True)
df_model_training['4G'].replace('Nu', 0, inplace=True)


df_model_training['resolution width'] = pd.to_numeric(df_model_training['resolution width'], errors='coerce')
df_model_training['resolution height'] = pd.to_numeric(df_model_training['resolution height'], errors='coerce')

df_model_training['resolution height'].fillna(0, inplace=True)
df_model_training['resolution width'].fillna(0, inplace=True)

df_model_training['Diagonala'] = pd.to_numeric(df_model_training['Diagonala'], errors='coerce')
df_model_training['Diagonala'].fillna(0, inplace=True)

df_model_training['Numar nuclee'] = df_model_training['Numar nuclee'].str.split('(').str[0]
df_model_training['Numar nuclee'] = pd.to_numeric(df_model_training['Numar nuclee'], errors='coerce')
df_model_training['Numar nuclee'].fillna(0, inplace=True)

df_model_training['Memorie RAM'] = df_model_training['Memorie RAM'].str.split(' ').str[0]
df_model_training['Memorie RAM'] = pd.to_numeric(df_model_training['Memorie RAM'], errors='coerce')
df_model_training['Memorie RAM'].fillna(0, inplace=True)

df_model_training['Memorie Flash'] = df_model_training['Memorie Flash'].str.split(' ').str[0]
df_model_training['Memorie Flash'] = pd.to_numeric(df_model_training['Memorie Flash'], errors='coerce')
df_model_training['Memorie Flash'].fillna(0, inplace=True)

df_model_training['Incarcare Wireless'].fillna(0, inplace=True)
df_model_training['Incarcare Wireless'].replace('Da', 1, inplace=True)
df_model_training['Incarcare Wireless'].replace('Nu', 0, inplace=True)


df_model_training['Capacitate Baterie'] = df_model_training['Capacitate Baterie'].str.split(' ').str[0]
df_model_training['Capacitate Baterie'] = pd.to_numeric(df_model_training['Capacitate Baterie'], errors='coerce')
df_model_training['Capacitate Baterie'].fillna(0, inplace=True)

df_model_training['Dual SIM'].fillna(0, inplace=True)
df_model_training['Dual SIM'].replace('Da', 1, inplace=True)
df_model_training['Dual SIM'].replace('Nu', 0, inplace=True)


# manufacturers = df_model_training['Manufacturer'].unique()
# manufacturer_mapping = {manufacturer: i for i, manufacturer in enumerate(sorted(manufacturers))}

# # Apply the mapping to create a new encoded column
# df_model_training['Manufacturer'] = df_model_training['Manufacturer'].map(manufacturer_mapping)


df_model_training

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_model_training['5G'].fillna(0, inplace=True)
  df_model_training['5G'].replace('Da', 1, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_model_training['4G'].fillna(0, inplace=True)
  df_model_training['4G'].replace('Nu', 0, inplace=True)
The behavior will chan

Unnamed: 0,5G,4G,resolution width,resolution height,Diagonala,Numar nuclee,Memorie Flash,Memorie RAM,Incarcare Wireless,Capacitate Baterie,Dual SIM,price
0,0,1,1612.0,720.0,6.56,8.0,128.0,4.0,0,5000.0,1,439.99
1,1,0,1320.0,2868.0,6.90,6.0,256.0,0.0,1,0.0,0,7399.99
2,0,1,1080.0,2400.0,6.71,8.0,64.0,4.0,0,5000.0,1,549.99
3,1,0,0.0,0.0,6.67,8.0,512.0,12.0,0,5000.0,1,2499.99
4,1,0,1320.0,2868.0,6.90,6.0,256.0,0.0,1,0.0,0,7499.99
...,...,...,...,...,...,...,...,...,...,...,...,...
1473,0,1,576.0,1156.0,6.00,8.0,64.0,4.0,0,6300.0,1,647.99
1474,0,1,576.0,1156.0,6.00,8.0,64.0,4.0,0,6300.0,1,647.99
1475,0,1,576.0,1280.0,6.52,8.0,256.0,6.0,0,10600.0,1,1019.99
1476,0,1,576.0,1280.0,6.52,8.0,256.0,6.0,0,10600.0,1,1019.99


In [9]:
df_reg = df_model_training.drop('price', axis=1)
y = df_model_training['price']

In [10]:
# Approach 1: If df_model_training and y have different lengths
# Make sure they have the same index and align them
df_reg = df_reg.loc[y.index]  # If y is a Series
# OR
y = y[df_reg.index]  # Adjust y to match df_model_training

# Approach 2: If using the wrong dataframe for feature names
rf = RandomForestRegressor()
rf.fit(df_reg, y)

# Get feature importance using the correct columns
rf_importance = pd.DataFrame({
    'Feature': df_reg.columns,  # Use the same dataframe you used for training
    'Importance': rf.feature_importances_
})
print(rf_importance.sort_values('Importance', ascending=False))

               Feature  Importance
8   Incarcare Wireless    0.601863
4            Diagonala    0.106319
6        Memorie Flash    0.080716
3    resolution height    0.069940
0                   5G    0.041457
9   Capacitate Baterie    0.032738
7          Memorie RAM    0.029995
2     resolution width    0.022381
5         Numar nuclee    0.011945
1                   4G    0.001448
10            Dual SIM    0.001198


In [11]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import cross_val_score

# For regression models
predictions = rf.predict(df_reg)
print(f"R² Score: {r2_score(y, predictions)}")
print(f"Mean Squared Error: {mean_squared_error(y, predictions)}")
print(f"Root Mean Squared Error: {mean_squared_error(y, predictions, squared=False)}")
print(f"Mean Absolute Error: {mean_absolute_error(y, predictions)}")

# Cross-validation (more robust evaluation)
cv_scores = cross_val_score(rf, df_model_training, y, cv=5, scoring='r2')
print(f"Cross-validation R² scores: {cv_scores}")
print(f"Mean CV R² score: {cv_scores.mean()}")

R² Score: 0.985375825298916
Mean Squared Error: 68974.87270051593
Root Mean Squared Error: 262.6306773789306
Mean Absolute Error: 137.00879818303835




Cross-validation R² scores: [0.99983943 0.99992984 0.99998058 0.9999674  0.99877573]
Mean CV R² score: 0.9996985946145026


Save the model using pickle!

In [12]:
import pickle

with open('random_forest_model.pkl', 'wb') as file:
    pickle.dump(rf, file)