In [2]:
import pandas as pd
from pandas import json_normalize
from sklearn.ensemble import RandomForestRegressor

In [3]:
df = pd.read_json('datasets/evomag_2024_11_13.json')
df.head()

Unnamed: 0,timestamp,name,price,rating,number_of_reviews,is_in_stoc,url,product_code,online_mag,specifications,manufacturer,category
0,1976-05-31 17:18:52.234,Nou! Laptop Gaming ASUS TUF A15 FA507NUR (Proc...,5599.99,0,0,1,https://www.evomag.ro/portabile-laptopuri-note...,ASFA507NUR-LP104,evomag,"{'Model': 'TUF A15 FA507', 'Tip Laptop': 'Gami...",ASUS,Laptopuri / Notebook
1,1976-05-31 17:18:52.234,Laptop HP 250 G10 (Procesor Intel® Core™ i3-13...,1449.99,0,0,1,https://www.evomag.ro/portabile-laptopuri-note...,725M8EA,evomag,"{'Model': '250 G10', 'Tip Laptop': 'Business',...",HP,Laptopuri / Notebook
2,1976-05-31 17:18:52.234,Nou! Laptop ASUS Vivobook F1504ZA (Procesor In...,1299.99,5,1,1,https://www.evomag.ro/portabile-laptopuri-note...,F1504ZA-BQ1618,evomag,"{'Model': 'VivoBook F1504ZA', 'Tip Laptop': 'M...",ASUS,Laptopuri / Notebook
3,1976-05-31 17:18:52.234,Laptop ASUS Vivobook X1504VA (Procesor Intel C...,2199.99,5,1,1,https://www.evomag.ro/portabile-laptopuri-note...,X1504VA-BQ004,evomag,"{'Model': 'VivoBook X1504', 'Tip Laptop': 'Mul...",ASUS,Laptopuri / Notebook
4,1976-05-31 17:18:52.234,Laptop Gaming ASUS TUF A15 FA506NF (Procesor A...,2849.99,5,4,1,https://www.evomag.ro/portabile-laptopuri-note...,FA506NF-HN044,evomag,"{'Model': 'TUF A15 FA506', 'Tip Laptop': 'Gami...",ASUS,Laptopuri / Notebook


In [4]:
def extract_smartphones(df):
    """
    Extract all products that have "Smartphone": "Da" in their specifications column.
    
    Parameters:
    df (pandas.DataFrame): DataFrame containing product information with a 'specifications' column
                          that contains dictionaries with product specs
    
    Returns:
    pandas.DataFrame: A new DataFrame containing only smartphone products
    """
    # Create a mask to filter products where specifications contains "Smartphone": "Da"
    smartphone_mask = df['specifications'].apply(
        lambda specs: isinstance(specs, dict) and specs.get('Smartphone') == 'Da'
    )
    
    # Apply the mask to get only smartphone products
    smartphones_df = df[smartphone_mask].copy()
    
    return smartphones_df

# Example usage:
# smartphones = extract_smartphones(df)
# print(f"Found {len(smartphones)} smartphones out of {len(df)} total products")


In [5]:
import pandas as pd
from pandas import json_normalize

def flatten_json_column(df, json_column):
    """
    Flatten a JSON column in a DataFrame so that the fields become separate columns.
    
    Parameters:
    -----------
    df : pandas.DataFrame
        The DataFrame containing the JSON column to flatten
    json_column : str
        The name of the column containing the JSON data to flatten
        
    Returns:
    --------
    pandas.DataFrame
        A new DataFrame with the JSON column flattened into separate columns
    """
    # Create a copy to avoid modifying the original DataFrame
    result_df = df.copy()
    
    # Check if the JSON column exists in the DataFrame
    if json_column not in result_df.columns:
        raise ValueError(f"Column '{json_column}' not found in DataFrame")
    
    # Normalize the JSON column
    try:
        # Handle cases where some rows might have None/NaN values in the JSON column
        mask = result_df[json_column].notna()
        
        if mask.any():
            # Apply json_normalize only to rows that have valid JSON
            normalized_df = json_normalize(result_df.loc[mask, json_column])
            
            # Drop the original JSON column from the result
            result_subset = result_df.loc[mask].drop(json_column, axis=1)
            
            # Combine the original DataFrame (minus the JSON column) with the normalized data
            flattened_subset = pd.concat([result_subset.reset_index(drop=True), 
                                          normalized_df.reset_index(drop=True)], 
                                         axis=1)
            
            # Merge back with rows that had None/NaN values
            if (~mask).any():
                result_df = pd.concat([flattened_subset, 
                                       result_df.loc[~mask]]).sort_index()
            else:
                result_df = flattened_subset
        
        return result_df
        
    except Exception as e:
        raise ValueError(f"Error flattening JSON column: {str(e)}")

In [6]:
df_smartphone = extract_smartphones(df)
df_smartphone_normalised = flatten_json_column(df_smartphone, 'specifications')

In [7]:
df_smartphone_normalised.head()

Unnamed: 0,timestamp,name,price,rating,number_of_reviews,is_in_stoc,url,product_code,online_mag,manufacturer,...,Editie,Model Procesor,Tip incarcator,Rezistent la apa si praf,Frecventa (MHz),Ecran secundar,Versiunea terminalului,Blitz Camera Fata,DNLA,Limba utilizare
0,1976-05-31 17:19:00.029,"Telefon Mobil Motorola Moto G24, Procesor Octa...",439.99,0,0,1,https://www.evomag.ro/telefoane-tablete-acceso...,PB180003PL,evomag,Motorola,...,,,,,,,,,,
1,1976-05-31 17:19:00.029,"Telefon Mobil Apple iPhone 16 Pro Max, LTPO Su...",7399.99,0,0,1,https://www.evomag.ro/telefoane-tablete-acceso...,4181235,evomag,Apple,...,,,,,,,,,,
2,1976-05-31 17:19:00.029,"Telefon Mobil Samsung Galaxy A05s, Procesor Oc...",549.99,0,0,1,https://www.evomag.ro/telefoane-tablete-acceso...,SM-A057GZKUEUE,evomag,Samsung,...,,,,,,,,,,
3,1976-05-31 17:19:00.029,"Telefon Mobil Xiaomi 13T Pro, Procesor Mediate...",2499.99,5,10,1,https://www.evomag.ro/telefoane-tablete-acceso...,4121640,evomag,Xiaomi,...,,,,,,,,,,
4,1976-05-31 17:19:00.029,"Telefon Mobil Apple iPhone 16 Pro Max, LTPO Su...",7499.99,0,0,1,https://www.evomag.ro/telefoane-tablete-acceso...,4181250,evomag,Apple,...,Natural Titanium,,,,,,,,,


Raw features that we use in prediction

In [8]:
df_model_training = pd.DataFrame()
df_model_training['5G'] = df_smartphone_normalised['5G']
df_model_training['4G'] = df_smartphone_normalised['4G']

df_model_training[['resolution width', 'resolution height']] = df_smartphone_normalised['Rezolutie maxima (px)'].str.split(' x ', expand=True)

df_model_training['Diagonala'] = df_smartphone_normalised['Diagonala (inch)']
df_model_training['Numar nuclee'] = df_smartphone_normalised['Numar nuclee']
df_model_training['Memorie Flash'] = df_smartphone_normalised['Memorie Flash']
df_model_training['Memorie RAM'] = df_smartphone_normalised['Memorie RAM']
df_model_training['Incarcare Wireless'] = df_smartphone_normalised['Incarcare Wireless']
df_model_training['Capacitate Baterie'] = df_smartphone_normalised['Capacitate'] 
df_model_training['Dual SIM'] = df_smartphone_normalised['Dual SIM']
# df_model_training['Manufacturer'] = df_smartphone_normalised['manufacturer']
df_model_training['price'] = df_smartphone_normalised['price']

df_model_training.head()

Unnamed: 0,5G,4G,resolution width,resolution height,Diagonala,Numar nuclee,Memorie Flash,Memorie RAM,Incarcare Wireless,Capacitate Baterie,Dual SIM,price
0,,Da,1612.0,720.0,6.56,8 (Octa Core),128 GB,4 GB,,5000 mAh,Da,439.99
1,Da,,1320.0,2868.0,6.9,6 (Hexa-Core),256 GB,,Da,,,7399.99
2,,Da,1080.0,2400.0,6.71,8 (Octa Core),64 GB,4 GB,,5000 mAh,Da,549.99
3,Da,,,,6.67,8 (Octa Core),512 GB,12 GB,,5000 mAh,Da,2499.99
4,Da,,1320.0,2868.0,6.9,6 (Hexa-Core),256 GB,,Da,,,7499.99


Cleaned up features that we use in predicitons

In [9]:
df_model_training['5G'].fillna(0, inplace=True)
df_model_training['5G'].replace('Da', 1, inplace=True)
df_model_training['5G'].replace('Nu', 0, inplace=True)


df_model_training['4G'].fillna(0, inplace=True)
df_model_training['4G'].replace('Da', 1, inplace=True)
df_model_training['4G'].replace('Nu', 0, inplace=True)


df_model_training['resolution width'] = pd.to_numeric(df_model_training['resolution width'], errors='coerce')
df_model_training['resolution height'] = pd.to_numeric(df_model_training['resolution height'], errors='coerce')

df_model_training['resolution height'].fillna(0, inplace=True)
df_model_training['resolution width'].fillna(0, inplace=True)

df_model_training['Diagonala'] = pd.to_numeric(df_model_training['Diagonala'], errors='coerce')
df_model_training['Diagonala'].fillna(0, inplace=True)

df_model_training['Numar nuclee'] = df_model_training['Numar nuclee'].str.split('(').str[0]
df_model_training['Numar nuclee'] = pd.to_numeric(df_model_training['Numar nuclee'], errors='coerce')
df_model_training['Numar nuclee'].fillna(0, inplace=True)

df_model_training['Memorie RAM'] = df_model_training['Memorie RAM'].str.split(' ').str[0]
df_model_training['Memorie RAM'] = pd.to_numeric(df_model_training['Memorie RAM'], errors='coerce')
df_model_training['Memorie RAM'].fillna(0, inplace=True)

df_model_training['Memorie Flash'] = df_model_training['Memorie Flash'].str.split(' ').str[0]
df_model_training['Memorie Flash'] = pd.to_numeric(df_model_training['Memorie Flash'], errors='coerce')
df_model_training['Memorie Flash'].fillna(0, inplace=True)

df_model_training['Incarcare Wireless'].fillna(0, inplace=True)
df_model_training['Incarcare Wireless'].replace('Da', 1, inplace=True)
df_model_training['Incarcare Wireless'].replace('Nu', 0, inplace=True)


df_model_training['Capacitate Baterie'] = df_model_training['Capacitate Baterie'].str.split(' ').str[0]
df_model_training['Capacitate Baterie'] = pd.to_numeric(df_model_training['Capacitate Baterie'], errors='coerce')
df_model_training['Capacitate Baterie'].fillna(0, inplace=True)

df_model_training['Dual SIM'].fillna(0, inplace=True)
df_model_training['Dual SIM'].replace('Da', 1, inplace=True)
df_model_training['Dual SIM'].replace('Nu', 0, inplace=True)


# manufacturers = df_model_training['Manufacturer'].unique()
# manufacturer_mapping = {manufacturer: i for i, manufacturer in enumerate(sorted(manufacturers))}

# # Apply the mapping to create a new encoded column
# df_model_training['Manufacturer'] = df_model_training['Manufacturer'].map(manufacturer_mapping)


df_model_training

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_model_training['5G'].fillna(0, inplace=True)
  df_model_training['5G'].replace('Da', 1, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_model_training['4G'].fillna(0, inplace=True)
  df_model_training['4G'].replace('Nu', 0, inplace=True)
The behavior will chan

Unnamed: 0,5G,4G,resolution width,resolution height,Diagonala,Numar nuclee,Memorie Flash,Memorie RAM,Incarcare Wireless,Capacitate Baterie,Dual SIM,price
0,0,1,1612.0,720.0,6.56,8.0,128.0,4.0,0,5000.0,1,439.99
1,1,0,1320.0,2868.0,6.90,6.0,256.0,0.0,1,0.0,0,7399.99
2,0,1,1080.0,2400.0,6.71,8.0,64.0,4.0,0,5000.0,1,549.99
3,1,0,0.0,0.0,6.67,8.0,512.0,12.0,0,5000.0,1,2499.99
4,1,0,1320.0,2868.0,6.90,6.0,256.0,0.0,1,0.0,0,7499.99
...,...,...,...,...,...,...,...,...,...,...,...,...
1473,0,1,576.0,1156.0,6.00,8.0,64.0,4.0,0,6300.0,1,647.99
1474,0,1,576.0,1156.0,6.00,8.0,64.0,4.0,0,6300.0,1,647.99
1475,0,1,576.0,1280.0,6.52,8.0,256.0,6.0,0,10600.0,1,1019.99
1476,0,1,576.0,1280.0,6.52,8.0,256.0,6.0,0,10600.0,1,1019.99


In [10]:
df_reg = df_model_training.drop('price', axis=1)
y = df_model_training['price']
df_reg

Unnamed: 0,5G,4G,resolution width,resolution height,Diagonala,Numar nuclee,Memorie Flash,Memorie RAM,Incarcare Wireless,Capacitate Baterie,Dual SIM
0,0,1,1612.0,720.0,6.56,8.0,128.0,4.0,0,5000.0,1
1,1,0,1320.0,2868.0,6.90,6.0,256.0,0.0,1,0.0,0
2,0,1,1080.0,2400.0,6.71,8.0,64.0,4.0,0,5000.0,1
3,1,0,0.0,0.0,6.67,8.0,512.0,12.0,0,5000.0,1
4,1,0,1320.0,2868.0,6.90,6.0,256.0,0.0,1,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...
1473,0,1,576.0,1156.0,6.00,8.0,64.0,4.0,0,6300.0,1
1474,0,1,576.0,1156.0,6.00,8.0,64.0,4.0,0,6300.0,1
1475,0,1,576.0,1280.0,6.52,8.0,256.0,6.0,0,10600.0,1
1476,0,1,576.0,1280.0,6.52,8.0,256.0,6.0,0,10600.0,1


In [11]:
# Approach 1: If df_model_training and y have different lengths
# Make sure they have the same index and align them
df_reg = df_reg.loc[y.index]  # If y is a Series
# OR
y = y[df_reg.index]  # Adjust y to match df_model_training

# Approach 2: If using the wrong dataframe for feature names
rf = RandomForestRegressor()
rf.fit(df_reg, y)

# Get feature importance using the correct columns
rf_importance = pd.DataFrame({
    'Feature': df_reg.columns,  # Use the same dataframe you used for training
    'Importance': rf.feature_importances_
})
print(rf_importance.sort_values('Importance', ascending=False))

               Feature  Importance
8   Incarcare Wireless    0.605981
4            Diagonala    0.104090
6        Memorie Flash    0.080940
3    resolution height    0.069165
0                   5G    0.040977
9   Capacitate Baterie    0.033222
7          Memorie RAM    0.032305
2     resolution width    0.018637
5         Numar nuclee    0.011982
1                   4G    0.001364
10            Dual SIM    0.001337


In [12]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import cross_val_score

# For regression models
predictions = rf.predict(df_reg)
print(f"R² Score: {r2_score(y, predictions)}")
print(f"Mean Squared Error: {mean_squared_error(y, predictions)}")
print(f"Root Mean Squared Error: {mean_squared_error(y, predictions, squared=False)}")
print(f"Mean Absolute Error: {mean_absolute_error(y, predictions)}")

# Cross-validation (more robust evaluation)
cv_scores = cross_val_score(rf, df_model_training, y, cv=5, scoring='r2')
print(f"Cross-validation R² scores: {cv_scores}")
print(f"Mean CV R² score: {cv_scores.mean()}")

R² Score: 0.9856364508004773
Mean Squared Error: 67745.63336495374
Root Mean Squared Error: 260.27991348729495
Mean Absolute Error: 136.89787725609332




Cross-validation R² scores: [0.99984961 0.99993068 0.9999817  0.99998154 0.99863656]
Mean CV R² score: 0.9996760163460495


Save the model using pickle!

In [13]:
# import pickle

# with open('random_forest_model.pkl', 'wb') as file:
#     pickle.dump(rf, file)

In [14]:
import xgboost as xgb

# Approach 1: If df_model_training and y have different lengths
# Make sure they have the same index and align them
df_reg = df_reg.loc[y.index]  # If y is a Series
# OR
y = y[df_reg.index]  # Adjust y to match df_model_training

# Approach 2: Using XGBoost instead of RandomForest
xgb_model = xgb.XGBRegressor()
xgb_model.fit(df_reg, y)

# Get feature importance using the correct columns
xgb_importance = pd.DataFrame({
    'Feature': df_reg.columns,  # Use the same dataframe you used for training
    'Importance': xgb_model.feature_importances_
})
print(xgb_importance.sort_values('Importance', ascending=False))

               Feature  Importance
8   Incarcare Wireless    0.860631
5         Numar nuclee    0.027420
3    resolution height    0.024263
0                   5G    0.024176
6        Memorie Flash    0.018672
4            Diagonala    0.016015
7          Memorie RAM    0.014261
9   Capacitate Baterie    0.005339
2     resolution width    0.005281
1                   4G    0.002347
10            Dual SIM    0.001595


In [16]:
print(f"R² Score: {r2_score(y, predictions)}")
print(f"Mean Squared Error: {mean_squared_error(y, predictions)}")
print(f"Root Mean Squared Error: {mean_squared_error(y, predictions, squared=False)}")
print(f"Mean Absolute Error: {mean_absolute_error(y, predictions)}")

# Cross-validation (more robust evaluation)
cv_scores = cross_val_score(xgb_model, df_reg, y, cv=5, scoring='r2')
print(f"Cross-validation R² scores: {cv_scores}")
print(f"Mean CV R² score: {cv_scores.mean()}")

R² Score: 0.9856364508004773
Mean Squared Error: 67745.63336495374
Root Mean Squared Error: 260.27991348729495
Mean Absolute Error: 136.89787725609332




Cross-validation R² scores: [0.91417447 0.95024314 0.94628768 0.9674191  0.90939792]
Mean CV R² score: 0.9375044624975498


In [17]:
from catboost import CatBoostRegressor

# Approach 1: If df_model_training and y have different lengths
# Make sure they have the same index and align them
df_reg = df_reg.loc[y.index]  # If y is a Series
# OR
y = y[df_reg.index]  # Adjust y to match df_model_training

# Approach 2: Using CatBoost instead of RandomForest
catboost_model = CatBoostRegressor()
catboost_model.fit(df_reg, y)

# Get feature importance using the correct columns
catboost_importance = pd.DataFrame({
    'Feature': df_reg.columns,  # Use the same dataframe you used for training
    'Importance': catboost_model.get_feature_importance()
})
print(catboost_importance.sort_values('Importance', ascending=False))

Learning rate set to 0.04355
0:	learn: 2103.9933975	total: 52.1ms	remaining: 52s
1:	learn: 2037.3656289	total: 54.9ms	remaining: 27.4s
2:	learn: 1976.7369718	total: 58.7ms	remaining: 19.5s
3:	learn: 1914.1928804	total: 61.8ms	remaining: 15.4s
4:	learn: 1860.9662665	total: 64.4ms	remaining: 12.8s
5:	learn: 1805.1188804	total: 67.1ms	remaining: 11.1s
6:	learn: 1756.0831227	total: 69.9ms	remaining: 9.92s
7:	learn: 1705.1616233	total: 73ms	remaining: 9.06s
8:	learn: 1656.9711119	total: 76.1ms	remaining: 8.38s
9:	learn: 1610.0890552	total: 79.1ms	remaining: 7.83s
10:	learn: 1568.8812354	total: 82ms	remaining: 7.37s
11:	learn: 1522.7572881	total: 85.1ms	remaining: 7s
12:	learn: 1480.1539744	total: 88.2ms	remaining: 6.7s
13:	learn: 1441.9557155	total: 91.3ms	remaining: 6.43s
14:	learn: 1406.1316276	total: 94.3ms	remaining: 6.19s
15:	learn: 1369.9022746	total: 97.3ms	remaining: 5.98s
16:	learn: 1336.1199796	total: 101ms	remaining: 5.84s
17:	learn: 1303.3630421	total: 106ms	remaining: 5.76s
18:

In [18]:
# Calculate the same metrics
print(f"R² Score: {r2_score(y, predictions)}")
print(f"Mean Squared Error: {mean_squared_error(y, predictions)}")
print(f"Root Mean Squared Error: {mean_squared_error(y, predictions, squared=False)}")
print(f"Mean Absolute Error: {mean_absolute_error(y, predictions)}")

# Cross-validation (more robust evaluation)
cv_scores = cross_val_score(catboost_model, df_reg, y, cv=5, scoring='r2')
print(f"Cross-validation R² scores: {cv_scores}")
print(f"Mean CV R² score: {cv_scores.mean()}")


R² Score: 0.9856364508004773
Mean Squared Error: 67745.63336495374
Root Mean Squared Error: 260.27991348729495
Mean Absolute Error: 136.89787725609332
Learning rate set to 0.042039
0:	learn: 2124.5658404	total: 1.06ms	remaining: 1.06s
1:	learn: 2061.5169093	total: 1.9ms	remaining: 948ms
2:	learn: 2003.1928138	total: 3.4ms	remaining: 1.13s
3:	learn: 1956.0510224	total: 4.2ms	remaining: 1.05s
4:	learn: 1902.8662601	total: 5.84ms	remaining: 1.16s
5:	learn: 1848.0711799	total: 7.02ms	remaining: 1.16s
6:	learn: 1798.3364348	total: 8.11ms	remaining: 1.15s
7:	learn: 1752.5791296	total: 9.01ms	remaining: 1.12s
8:	learn: 1704.8365548	total: 9.83ms	remaining: 1.08s
9:	learn: 1659.2187286	total: 10.6ms	remaining: 1.05s
10:	learn: 1617.3407751	total: 11.3ms	remaining: 1.01s
11:	learn: 1574.9974446	total: 12ms	remaining: 987ms
12:	learn: 1536.8745603	total: 12.7ms	remaining: 961ms
13:	learn: 1497.5052341	total: 13.3ms	remaining: 940ms
14:	learn: 1462.7513160	total: 14.1ms	remaining: 925ms
15:	learn



182:	learn: 470.6094860	total: 155ms	remaining: 694ms
183:	learn: 469.2114143	total: 157ms	remaining: 698ms
184:	learn: 468.5312431	total: 160ms	remaining: 704ms
185:	learn: 468.2711268	total: 161ms	remaining: 705ms
186:	learn: 467.6866235	total: 162ms	remaining: 705ms
187:	learn: 467.3002361	total: 163ms	remaining: 705ms
188:	learn: 466.3310702	total: 164ms	remaining: 705ms
189:	learn: 465.5704563	total: 165ms	remaining: 703ms
190:	learn: 465.2994600	total: 166ms	remaining: 702ms
191:	learn: 464.7038632	total: 166ms	remaining: 700ms
192:	learn: 464.0539321	total: 167ms	remaining: 699ms
193:	learn: 463.0367835	total: 168ms	remaining: 696ms
194:	learn: 462.6036572	total: 168ms	remaining: 694ms
195:	learn: 461.7977701	total: 169ms	remaining: 692ms
196:	learn: 461.3893315	total: 169ms	remaining: 690ms
197:	learn: 460.8751454	total: 171ms	remaining: 692ms
198:	learn: 459.7685868	total: 173ms	remaining: 696ms
199:	learn: 459.1828873	total: 175ms	remaining: 701ms
200:	learn: 458.0877965	tota

In [19]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import cross_val_score

# Define and fit your Linear Regression model
lr_model = LinearRegression()
lr_model.fit(df_reg, y)

# Get predictions from the fitted model
predictions = lr_model.predict(df_reg)

# Calculate the same metrics
print(f"R² Score: {r2_score(y, predictions)}")
print(f"Mean Squared Error: {mean_squared_error(y, predictions)}")
print(f"Root Mean Squared Error: {mean_squared_error(y, predictions, squared=False)}")
print(f"Mean Absolute Error: {mean_absolute_error(y, predictions)}")

# Cross-validation (more robust evaluation)
cv_scores = cross_val_score(lr_model, df_reg, y, cv=5, scoring='r2')
print(f"Cross-validation R² scores: {cv_scores}")
print(f"Mean CV R² score: {cv_scores.mean()}")

# If you want to examine the coefficients
coefficients = pd.DataFrame({
    'Feature': df_reg.columns,
    'Coefficient': lr_model.coef_
})
print("Model Coefficients:")
print(coefficients.sort_values('Coefficient', ascending=False))
print(f"Intercept: {lr_model.intercept_}")

R² Score: 0.7177135866066916
Mean Squared Error: 1331402.9561918057
Root Mean Squared Error: 1153.8643577959265
Mean Absolute Error: 759.7838803661426
Cross-validation R² scores: [0.73227958 0.70527868 0.67256073 0.68954291 0.69015335]
Mean CV R² score: 0.6979630523837701
Model Coefficients:
               Feature  Coefficient
8   Incarcare Wireless  2168.156780
4            Diagonala   547.067763
0                   5G   519.473090
6        Memorie Flash     1.611279
3    resolution height     0.209690
7          Memorie RAM     0.157715
9   Capacitate Baterie    -0.017274
2     resolution width    -0.064784
5         Numar nuclee   -59.950382
1                   4G  -326.669817
10            Dual SIM -1094.412351
Intercept: -1066.46062631845




In [21]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import cross_val_score

# Create polynomial features (degree=2 for quadratic, increase for more complex curves)
poly_model = make_pipeline(
    PolynomialFeatures(degree=2),
    LinearRegression()
)

# Fit and evaluate
poly_model.fit(df_reg, y)
poly_predictions = poly_model.predict(df_reg)

# Calculate metrics
print(f"Polynomial R² Score: {r2_score(y, poly_predictions)}")
print(f"Polynomial Mean Squared Error: {mean_squared_error(y, poly_predictions)}")
print(f"Polynomial Root Mean Squared Error: {mean_squared_error(y, poly_predictions, squared=False)}")
print(f"Polynomial Mean Absolute Error: {mean_absolute_error(y, poly_predictions)}")

# Cross-validation (more robust evaluation)
cv_scores = cross_val_score(poly_model, df_reg, y, cv=5, scoring='r2')
print(f"Polynomial Cross-validation R² scores: {cv_scores}")
print(f"Polynomial Mean CV R² score: {cv_scores.mean()}")

# If you want to examine the coefficients (a bit more complex for polynomial regression)
# Extract the polynomial feature names and coefficients
poly_features = poly_model.named_steps['polynomialfeatures']
linear_model = poly_model.named_steps['linearregression']

# Get feature names (intercept is handled separately)
feature_names = poly_features.get_feature_names_out(df_reg.columns)

# Create a DataFrame with features and coefficients
coefficients = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': linear_model.coef_
})

print("Polynomial Model Coefficients:")
print(coefficients.sort_values('Coefficient', ascending=False))
print(f"Polynomial Intercept: {linear_model.intercept_}")

Polynomial R² Score: 0.83659353822993
Polynomial Mean Squared Error: 770706.0486768428
Polynomial Root Mean Squared Error: 877.8986551287358
Polynomial Mean Absolute Error: 592.7988655974905




Polynomial Cross-validation R² scores: [  0.55862452   0.78669293   0.55252886  -5.35601656 -56.73707088]
Polynomial Mean CV R² score: -12.039048226585615
Polynomial Model Coefficients:
                        Feature  Coefficient
16                 5G Diagonala  3114.666374
74  Incarcare Wireless Dual SIM  2606.076070
26                 4G Diagonala  1399.756916
72         Incarcare Wireless^2  1188.136496
9            Incarcare Wireless  1188.136495
..                          ...          ...
2                            4G -3451.219157
22                  5G Dual SIM -3749.789422
5                     Diagonala -4193.849618
1                            5G -7443.660607
12                         5G^2 -7443.660618

[78 rows x 2 columns]
Polynomial Intercept: 19822.395575662842
