In [None]:
import pandas as pd
from pandas import json_normalize
from sklearn.ensemble import RandomForestRegressor

: 

In [None]:
# Location of the JSON dataset, kept in one named place so it is easy to swap.
DATASET_PATH = 'datasets/evomag_2024_11_13.json'

# Read the whole JSON file into a DataFrame.
df = pd.read_json(DATASET_PATH)

: 

In [None]:
def extract_smartphones(df):
    """
    Extract all products that have "Smartphone": "Da" in their specifications column.

    A row is kept only when its 'specifications' value is a dict whose
    'Smartphone' entry equals the string 'Da'. Rows whose specifications are
    missing, are not a dict, or carry any other value are dropped.

    Parameters:
    df (pandas.DataFrame): DataFrame containing product information with a 'specifications' column
                          that contains dictionaries with product specs

    Returns:
    pandas.DataFrame: A new DataFrame (a copy, original index labels kept) containing
                      only smartphone products. An empty input yields an empty frame
                      that still has all of the input's columns.

    Raises:
    KeyError: If df has no 'specifications' column.
    """
    # Build the mask with an explicit bool dtype. Series.apply on an empty
    # column returns an object-dtype Series, which pandas does not recognise as
    # a boolean indexer, so an empty input would lose its columns.
    is_smartphone = pd.Series(
        [
            isinstance(specs, dict) and specs.get('Smartphone') == 'Da'
            for specs in df['specifications']
        ],
        index=df.index,
        dtype=bool,
    )

    # .copy() so later edits to the result never write through to df.
    return df.loc[is_smartphone].copy()

# Example usage:
# smartphones = extract_smartphones(df)
# print(f"Found {len(smartphones)} smartphones out of {len(df)} total products")


: 

In [None]:
import pandas as pd
from pandas import json_normalize

def flatten_json_column(df, json_column):
    """
    Flatten a JSON column in a DataFrame so that the fields become separate columns.

    Every row of the input appears exactly once in the result, in the same
    order and with the same index label. Rows whose JSON value is missing
    (None/NaN) get NaN in all of the flattened columns. The JSON column itself
    is always removed from the result.

    Parameters:
    -----------
    df : pandas.DataFrame
        The DataFrame containing the JSON column to flatten. It is not modified.
    json_column : str
        The name of the column containing the JSON data to flatten

    Returns:
    --------
    pandas.DataFrame
        A new DataFrame with the JSON column flattened into separate columns

    Raises:
    -------
    ValueError
        If `json_column` is not a column of `df`, or if flattening fails for
        any reason (the underlying exception is chained as the cause).
    """
    # Check if the JSON column exists in the DataFrame
    if json_column not in df.columns:
        raise ValueError(f"Column '{json_column}' not found in DataFrame")

    try:
        has_json = df[json_column].notna()

        # drop() returns a new frame, so the caller's DataFrame stays untouched.
        remaining = df.drop(columns=[json_column])

        if not has_json.any():
            # Nothing to flatten: there are no fields to turn into columns.
            return remaining

        # Only rows that actually hold JSON are normalized.
        normalized = json_normalize(df.loc[has_json, json_column].tolist())

        # json_normalize numbers its rows 0..k-1. Re-seat them on the row
        # *positions* they came from, pad the skipped positions with NaN rows,
        # then adopt the original labels. Working by position keeps this
        # correct even when the input index contains duplicate labels.
        positions = [pos for pos, present in enumerate(has_json) if present]
        normalized.index = positions
        normalized = normalized.reindex(range(len(df)))
        normalized.index = df.index

        # Both frames now share an identical index, so this is a plain
        # side-by-side concatenation.
        return pd.concat([remaining, normalized], axis=1)

    except Exception as e:
        raise ValueError(f"Error flattening JSON column: {str(e)}") from e

: 

In [None]:
# Keep only the smartphone rows, then expand their 'specifications' dicts
# into one column per field.
df_smartphone = df.pipe(extract_smartphones)
df_smartphone_normalised = df_smartphone.pipe(flatten_json_column, 'specifications')

: 

In [None]:
# Preview the first rows of the flattened smartphone frame.
df_smartphone_normalised.head()

: 

Raw features that we use in prediction

In [None]:
# Assemble the raw (still unparsed) feature table, column by column, from the
# flattened smartphone frame. Column order below is the order in the result.
df_model_training = pd.DataFrame()

# Network flags keep their source names.
for feature in ('5G', '4G'):
    df_model_training[feature] = df_smartphone_normalised[feature]

# The resolution is split on ' x ' into two columns.
resolution_parts = df_smartphone_normalised['Rezolutie maxima (px)'].str.split(' x ', expand=True)
df_model_training[['resolution width', 'resolution height']] = resolution_parts

# feature name in the training frame -> source column in the flattened frame
# ('Manufacturer' <- 'manufacturer' is intentionally left out for now.)
source_columns = {
    'Diagonala': 'Diagonala (inch)',
    'Numar nuclee': 'Numar nuclee',
    'Memorie Flash': 'Memorie Flash',
    'Memorie RAM': 'Memorie RAM',
    'Incarcare Wireless': 'Incarcare Wireless',
    'Capacitate Baterie': 'Capacitate',
    'Dual SIM': 'Dual SIM',
    'price': 'price',
}
for feature, source in source_columns.items():
    df_model_training[feature] = df_smartphone_normalised[source]

df_model_training.head()

: 

Cleaned up features that we use in predictions

In [None]:
def _yes_no_to_int(series):
    """Encode 'Da'/'Nu' flags as 1/0; missing or unrecognised values become 0."""
    # The numeric keys let already-encoded values pass through unchanged, so
    # running this cell a second time does not wipe the column.
    return series.map({'Da': 1, 'Nu': 0, 1: 1, 0: 0}).fillna(0).astype(int)


def _to_number(series):
    """Parse a column as numbers; unparseable or missing entries become 0."""
    return pd.to_numeric(series, errors='coerce').fillna(0)


def _leading_number(series, separator):
    """Parse the text before the first `separator` as a number; failures become 0."""
    # astype(str) first: the .str accessor turns non-string values into NaN
    # (and raises on a fully numeric column), which would silently zero out
    # values that are already numbers.
    leading_text = series.astype(str).str.split(separator).str[0]
    return _to_number(leading_text)


# Each column is reassigned explicitly. Calling fillna/replace with
# inplace=True on df[col] is a chained in-place update, which pandas
# deprecates and which has no effect under Copy-on-Write.

# Yes/no flags -> 1/0
for flag_column in ('5G', '4G', 'Incarcare Wireless', 'Dual SIM'):
    df_model_training[flag_column] = _yes_no_to_int(df_model_training[flag_column])

# Columns expected to hold a bare number
for numeric_column in ('resolution width', 'resolution height', 'Diagonala'):
    df_model_training[numeric_column] = _to_number(df_model_training[numeric_column])

# Columns where only the text before a separator is numeric
# NOTE(review): assumes values look like '<number><separator><rest>' - confirm against the data.
leading_number_separators = {
    'Numar nuclee': '(',
    'Memorie RAM': ' ',
    'Memorie Flash': ' ',
    'Capacitate Baterie': ' ',
}
for text_column, separator in leading_number_separators.items():
    df_model_training[text_column] = _leading_number(df_model_training[text_column], separator)


# manufacturers = df_model_training['Manufacturer'].unique()
# manufacturer_mapping = {manufacturer: i for i, manufacturer in enumerate(sorted(manufacturers))}

# # Apply the mapping to create a new encoded column
# df_model_training['Manufacturer'] = df_model_training['Manufacturer'].map(manufacturer_mapping)


df_model_training

: 

In [None]:
# Target: the 'price' column. Features: every remaining column.
y = df_model_training['price']
df_reg = df_model_training.drop(columns=['price'])

: 

In [None]:
# df_reg and y are both taken from df_model_training, so they already share the
# same rows in the same order. Re-selecting one by the other's index labels is
# unnecessary, and would multiply rows if the index held duplicate labels.

# A fixed random_state makes the fitted forest, and therefore the importances
# printed below, reproducible between runs.
rf = RandomForestRegressor(random_state=42)
rf.fit(df_reg, y)

# Pair each importance with the column it belongs to; the names must come from
# the same frame that was passed to fit().
rf_importance = pd.DataFrame({
    'Feature': df_reg.columns,
    'Importance': rf.feature_importances_
})
print(rf_importance.sort_values('Importance', ascending=False))

: 

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import cross_val_score

# In-sample metrics: the model is scored on the same rows it was fitted on, so
# these numbers are optimistic. The cross-validation below is the fairer estimate.
predictions = rf.predict(df_reg)
mse = mean_squared_error(y, predictions)
print(f"R² Score: {r2_score(y, predictions)}")
print(f"Mean Squared Error: {mse}")
# RMSE is taken as the square root of the MSE; the `squared=False` argument of
# mean_squared_error is deprecated and absent from recent scikit-learn releases.
print(f"Root Mean Squared Error: {mse ** 0.5}")
print(f"Mean Absolute Error: {mean_absolute_error(y, predictions)}")

# Cross-validation (more robust evaluation)
# Use the feature frame df_reg here: df_model_training still contains the
# 'price' column, i.e. the target itself, which would leak into the features.
cv_scores = cross_val_score(rf, df_reg, y, cv=5, scoring='r2')
print(f"Cross-validation R² scores: {cv_scores}")
print(f"Mean CV R² score: {cv_scores.mean()}")

: 