In [674]:
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
matplotlib.use('TkAgg')

In [675]:
df = pd.read_csv(r"C:\Users\andre\OneDrive\Documente\GitHub\Smartphone dataset price\SmartphonePriceDataset\smartphones.csv")
df['Storage'] = df['Storage'].apply(lambda x: 32 if x < 32 else x)
df.fillna(df.median(numeric_only=True), inplace=True)
bins = [60, 200, 500, 1000, 2200]
labels = ['low', 'medium', 'high', 'premium']
df['Price Category'] = pd.cut(df['Final Price'], bins=bins, labels=labels)
df



Unnamed: 0,Smartphone,Brand,Model,RAM,Storage,Color,Free,Final Price,Price Category
0,Realme C55 8/256GB Sunshower Libre,Realme,C55,8.0,256.0,Yellow,Yes,231.6,medium
1,Samsung Galaxy M23 5G 4/128GB Azul Libre,Samsung,Galaxy M23,4.0,128.0,Blue,Yes,279.0,medium
2,Motorola Moto G13 4/128GB Azul Lavanda Libre,Motorola,Moto G13,4.0,128.0,Blue,Yes,179.01,low
3,Xiaomi Redmi Note 11S 6/128GB Gris Libre,Xiaomi,Redmi Note 11S,6.0,128.0,Gray,Yes,279.99,medium
4,Nothing Phone (2) 12/512GB Blanco Libre,Nothing,Phone (2),12.0,512.0,White,Yes,799.0,high
5,Motorola Moto E32s 4/64GB Gris Libre,Motorola,Moto E32s,4.0,64.0,Gray,Yes,148.52,low
6,Nothing Phone (2) 12/256GB Blanco Libre,Nothing,Phone (2),12.0,256.0,White,Yes,699.0,high
7,Realme 9 Pro 5G 8/128GB Negro Libre,Realme,9 Pro,8.0,128.0,Black,Yes,352.59,medium
8,Samsung Galaxy M23 5G 4/128GB Verde Libre,Samsung,Galaxy M23,4.0,128.0,Green,Yes,279.0,medium
9,Xiaomi Redmi Note 12 Pro 8/256GB Gris Grafito ...,Xiaomi,Redmi Note 12,8.0,256.0,Gray,Yes,329.99,medium


In [676]:
Q1 = df['Final Price'].quantile(0.25)
Q3 = df['Final Price'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

df = df[(df['Final Price'] >= lower_bound) & (df['Final Price'] <= upper_bound)]


In [677]:
text = '''Datasetul este format din 1722 randuri si este pe 9 coloane:
1. Numele Smartphone-ului
2. Modelul
3. RAM (Memorie cu Acces Aleator)
4. Spatiul de Stocare
5. Culoare
6. Free (indica daca are sau nu un contract cu o companie de telefonie mobila)
7. Pretul smartphone-ului
8. Categorie de Pret : consta luxul telefonului bazat pe intru-n interval de pret
'''
print(text)


Datasetul este format din 1722 randuri si este pe 9 coloane:
1. Numele Smartphone-ului
2. Modelul
3. RAM (Memorie cu Acces Aleator)
4. Spatiul de Stocare
5. Culoare
6. Free (indica daca are sau nu un contract cu o companie de telefonie mobila)
7. Pretul smartphone-ului
8. Categorie de Pret : consta luxul telefonului bazat pe intru-n interval de pret



In [678]:
print(f"Numar de randuri: {df.shape[0]}")
print(f"Numar de coloane: {df.shape[1]}")
print("Coloanele datasetului sunt:")
print(df.columns)
print("\n")
print("Tipurile de dataset-ului :\n")
print(df.dtypes)
print("\n")
print(df.describe())
print("\n")

print("Valori Not A Number : ")
print(df.isnull().sum())
print(df.describe())


Numar de randuri: 1722
Numar de coloane: 9
Coloanele datasetului sunt:
Index(['Smartphone', 'Brand', 'Model', 'RAM', 'Storage', 'Color', 'Free',
       'Final Price', 'Price Category'],
      dtype='object')


Tipurile de dataset-ului :

Smartphone          object
Brand               object
Model               object
RAM                float64
Storage            float64
Color               object
Free                object
Final Price        float64
Price Category    category
dtype: object


               RAM      Storage  Final Price
count  1722.000000  1722.000000  1722.000000
mean      5.901278   145.119628   430.052178
std       2.257046   101.003683   300.486656
min       1.000000    32.000000    60.460000
25%       4.000000    64.000000   199.000000
50%       6.000000   128.000000   327.990000
75%       8.000000   256.000000   589.000000
max      12.000000  1000.000000  1326.030000


Valori Not A Number : 
Smartphone        0
Brand             0
Model             0
RAM          

In [679]:
print(df.columns.tolist())


['Smartphone', 'Brand', 'Model', 'RAM', 'Storage', 'Color', 'Free', 'Final Price', 'Price Category']


In [680]:
plt.figure(figsize=(8, 6))
df['Final Price'].hist(bins=30, edgecolor='black')
plt.title('Distributia Preturilor')
plt.xlabel('Final Price')
plt.ylabel('Frecventa')
plt.show(block=True)

In [681]:
np.random.seed(42)

brands = [
    'Realme', 'Samsung', 'Motorola', 'Xiaomi', 'Nothing', 'POCO', 'Apple', 'OPPO', 'Alcatel',
    'Vivo', 'ZTE', 'OnePlus', 'TCL', 'CAT', 'SPC', 'Cubot', 'Google', 'Ulefone', 'Nokia',
    'Honor', 'Huawei', 'Sony', 'Hammer', 'Qubo', 'Blackview', 'Asus', 'Microsoft', 'BQ',
    'Crosscall', 'Doro', 'Fairphone', 'Funker', 'Gigaset', 'LG', 'Maxcom', 'Swissvoice', 'Lenovo'
]

colors = ['Yellow', 'Blue', 'Gray', 'White', 'Black', 'Green', 'Silver',
          'Gold', 'Brown', 'Orange', 'Purple', 'Turquoise', 'Pink', 'Red', 'Bronze']

# Generare DataFrame de baza
df = pd.DataFrame({
    'Brand': np.random.choice(brands, 100),
    'RAM': np.random.randint(2, 16, 100),
    'Storage': np.random.choice([32, 64, 128, 256, 512], 100),
    'Color': np.random.choice(colors, 100)
})

# Atribuire scoruri pentru Brand
brand_manual_score = {
    'Apple': 5, 'Samsung': 5, 'Google': 4.5, 'Sony': 4.5,
    'OnePlus': 4, 'Xiaomi': 4, 'Motorola': 3.5, 'Huawei': 3.5, 'Nokia': 3.5, 'OPPO': 3.5,
}
df['Brand_Score'] = df['Brand'].map(brand_manual_score).fillna(2)

# Scoruri pentru culoare, invers proporționale cu frecvența
color_freq = df['Color'].value_counts(normalize=True)
color_score_map = {c: 1 / f for c, f in color_freq.items()}
df['Color_Score'] = df['Color'].map(color_score_map)



# Exemplu temporar (doar dacă nu ai deja 'Final Price' — ȘTERGE dacă există deja!)
df['Final Price'] = (
    df['RAM'] * 50 +
    df['Storage'] * 2 +
    df['Brand_Score'] * 100 +
    df['Color_Score'] * 20 +
    np.random.normal(0, 50, size=len(df))
).round(2)

# 🟢 Matrice de corelație folosind doar 'Final Price'
cols_order = ['Final Price', 'RAM', 'Storage', 'Color_Score', 'Brand_Score']
numeric_ordered = df[cols_order]
corr = numeric_ordered.corr()

plt.figure(figsize=(8, 6))
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
plt.title('Matricea de Corelație cu Final Price')
plt.tight_layout()
plt.show()


In [682]:
def gini_index_from_numeric(data, column, bins=5):
    """Transformă o coloană numerică în categorii și calculează Gini Index."""
    # Bin numeric column into equal-width bins
    binned = pd.cut(data[column], bins=bins, labels=False)
    class_counts = binned.value_counts(normalize=True)
    gini = 1 - sum(class_counts ** 2)
    return gini

# Exemplu de utilizare pentru toate coloanele din matricea de corelație
cols = ['Final Price', 'RAM', 'Storage', 'Color_Score', 'Brand_Score']
for col in cols:
    gini = gini_index_from_numeric(df, col)
    print(f"Gini Index pentru '{col}': {gini:.4f}")


Gini Index pentru 'Final Price': 0.6580
Gini Index pentru 'RAM': 0.7896
Gini Index pentru 'Storage': 0.4994
Gini Index pentru 'Color_Score': 0.2744
Gini Index pentru 'Brand_Score': 0.4876


In [683]:
import pandas as pd

# Încarcă CSV-ul
df = pd.read_csv(r"C:\Users\andre\OneDrive\Documente\GitHub\Smartphone dataset price\SmartphonePriceDataset\smartphones.csv")

# Asigură-te că ai coloana 'Final Price'
print(df.columns)  # verifică că 'Final Price' e acolo

# Creează coloana 'Price Category' dacă nu există deja
bins = [60, 200, 500, 1000, 2200]
labels = ['low', 'medium', 'high', 'premium']
df['Price Category'] = pd.cut(df['Final Price'], bins=bins, labels=labels)

# Funcția Gini Index
def gini_index(data, target_column):
    class_counts = data[target_column].value_counts(normalize=True)
    gini = 1 - sum(class_counts**2)
    return gini

# Funcția Gini pentru fiecare grupă
def gini_for_attribute(data, attribute, target_column):
    grouped = data.groupby(attribute)
    gini_values = {}
    for group, subset in grouped:
        gini_values[group] = gini_index(subset, target_column)
    return gini_values

# Apelul funcției
gini_values_for_brand = gini_for_attribute(df, 'Brand', 'Price Category')
print("Gini Index pentru fiecare brand:", gini_values_for_brand)


Index(['Smartphone', 'Brand', 'Model', 'RAM', 'Storage', 'Color', 'Free',
       'Final Price'],
      dtype='object')
Gini Index pentru fiecare brand: {'Alcatel': 0.24489795918367352, 'Apple': 0.7068399324451117, 'Asus': 0.6666666666666667, 'BQ': 0.0, 'Blackview': 0.49382716049382713, 'CAT': 0.6111111111111112, 'Crosscall': 0.48979591836734704, 'Cubot': 0.4377162629757785, 'Doro': 0.4444444444444444, 'Fairphone': 0.0, 'Funker': 0.0, 'Gigaset': 0.4444444444444444, 'Google': 0.6666666666666667, 'Hammer': 0.5804988662131519, 'Honor': 0.470414201183432, 'Huawei': 0.5921822099107419, 'LG': 0.0, 'Lenovo': 0.0, 'Maxcom': 0.0, 'Microsoft': 0.0, 'Motorola': 0.6531240381655894, 'Nokia': 0.6035502958579881, 'Nothing': 0.19753086419753085, 'OPPO': 0.6304347826086957, 'OnePlus': 0.5330578512396694, 'POCO': 0.4219202494987747, 'Qubo': 0.0, 'Realme': 0.5785667324128863, 'SPC': 0.0, 'Samsung': 0.7177304017848629, 'Sony': 0.5, 'Swissvoice': 0.0, 'TCL': 0.4521604938271605, 'Ulefone': 0.5711111111111111

In [684]:
import numpy as np
import pandas as pd

# Functie pentru calculul entropiei
def entropy(data, target_column='Price Category'):
    counts = data[target_column].value_counts(normalize=True)
    return -np.sum(counts * np.log2(counts + 1e-9))  # adăugăm epsilon pentru a evita log(0)

# Functie pentru calcularea Information Gain
def information_gain(data, attribute, target_column='Price Category'):
    parent_entropy = entropy(data, target_column)

    grouped = data.groupby(attribute)
    weighted_entropy = 0
    for group, subset in grouped:
        weighted_entropy += (len(subset) / len(data)) * entropy(subset, target_column)

    return parent_entropy - weighted_entropy

# Calcularea Information Gain pentru diverse atribute
info_gain_storage = information_gain(df, 'Storage')
info_gain_ram = information_gain(df, 'RAM')
info_gain_brand = information_gain(df, 'Brand')
info_gain_color = information_gain(df, 'Color')

print(f"Information Gain pentru 'Storage': {info_gain_storage:.4f}")
print(f"Information Gain pentru 'RAM': {info_gain_ram:.4f}")
print(f"Information Gain pentru 'Brand': {info_gain_brand:.4f}")
print(f"Information Gain pentru 'Color': {info_gain_color:.4f}")


Information Gain pentru 'Storage': 0.5132
Information Gain pentru 'RAM': 1.0585
Information Gain pentru 'Brand': 0.3064
Information Gain pentru 'Color': 0.0722


In [685]:
# Definim scorul manual pentru branduri, dacă nu există deja
brand_manual_score = {
    'Apple': 5, 'Samsung': 5, 'Google': 4.5, 'Sony': 4.5,
    'OnePlus': 4, 'Xiaomi': 4, 'Motorola': 3.5, 'Huawei': 3.5, 'Nokia': 3.5, 'OPPO': 3.5,
}

# Aplicăm scorul brandurilor; pentru cele ce nu sunt în dicționar, le atribuim 2
df['Brand_Score'] = df['Brand'].map(brand_manual_score).fillna(2)

# Calculăm frecvența culorilor și atribuim un scor invers proporțional cu frecvența
color_freq = df['Color'].value_counts(normalize=True)
color_score_map = {c: 1/f for c, f in color_freq.items()}
df['Color_Score'] = df['Color'].map(color_score_map)


In [686]:
correlation_storage = df['Final Price'].corr(df['Storage'])
print(f"Corelatia dintre 'Final Price' si 'Storage': {correlation_storage:.4f}")

correlation_ram = df['Final Price'].corr(df['RAM'])
print(f"Corelatia dintre 'Final Price' si 'RAM': {correlation_ram:.4f}")

correlation_brand_score = df['Final Price'].corr(df['Brand_Score'])
print(f"Corelatia dintre 'Final Price' si 'Brand_Score': {correlation_brand_score:.4f}")

correlation_color_score = df['Final Price'].corr(df['Color_Score'])
print(f"Corelatia dintre 'Final Price' si 'Color_Score': {correlation_color_score:.4f}")


Corelatia dintre 'Final Price' si 'Storage': 0.6972
Corelatia dintre 'Final Price' si 'RAM': 0.6907
Corelatia dintre 'Final Price' si 'Brand_Score': 0.4251
Corelatia dintre 'Final Price' si 'Color_Score': 0.0206


In [687]:
numeric_columns = df.select_dtypes(include=['number'])
numeric_columns

Unnamed: 0,RAM,Storage,Final Price,Brand_Score,Color_Score
0,8.0,256.0,231.6,2.0,75.666667
1,4.0,128.0,279.0,5.0,5.356932
2,4.0,128.0,179.01,3.5,5.356932
3,6.0,128.0,279.99,4.0,9.265306
4,12.0,512.0,799.0,2.0,10.682353
5,4.0,64.0,148.52,3.5,9.265306
6,12.0,256.0,699.0,2.0,10.682353
7,8.0,128.0,352.59,2.0,3.38175
8,4.0,128.0,279.0,5.0,10.088889
9,8.0,256.0,329.99,4.0,9.265306


In [688]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='median')  # mediană e mai robustă la valori aberante
df[['RAM', 'Storage']] = imputer.fit_transform(df[['RAM', 'Storage']])


In [689]:
# Codificarea Brand-ului cu LabelEncoder
label_encoder = LabelEncoder()
df.loc[:, 'Brand'] = label_encoder.fit_transform(df['Brand'])

X = df[['RAM', 'Storage', 'Brand','Color_Score']]
y = df['Final Price']


# Impartirea datelor in seturi de antrenament si test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardizarea datelor
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Crearea modelului de regresie liniara
model = LinearRegression()

# Antrenarea modelului
model.fit(X_train_scaled, y_train)

# Predictiile pe setul de test
y_pred = model.predict(X_test_scaled)

# Evaluarea performantei
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse}")
print(f"R²: {r2}")

# Coefficients si interceptul
print(f"Coefficients: {model.coef_}")
print(f"Intercept: {model.intercept_}")


RMSE: 294.5691207825349
R²: 0.5392085379159861
Coefficients: [ 89.88863279 213.78999204 -69.38142319   2.3196697 ]
Intercept: 484.6123140495868


In [690]:
display(plt.gcf())

# Codificarea Brand-ului cu LabelEncoder
label_encoder = LabelEncoder()
df.loc[:, 'Brand'] = label_encoder.fit_transform(df['Brand'])

X = df[['RAM', 'Storage', 'Brand','Color_Score']]
y = df['Final Price']  # inlocuit 'Final Price' cu 'price'

# Impartirea datelor in seturi de antrenament si test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardizarea datelor
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Crearea modelului de regresie liniara
model = LinearRegression()

# Antrenarea modelului
model.fit(X_train_scaled, y_train)

# Predictiile pe setul de test
y_pred = model.predict(X_test_scaled)

# Evaluarea performantei
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

# Afisarea performantei
print(f"RMSE: {rmse}")
print(f"R²: {r2}")

# Coefficients
print(f"Coefficients: {model.coef_}")
print(f"Intercept: {model.intercept_}")


<Figure size 640x480 with 0 Axes>

RMSE: 294.5691207825349
R²: 0.5392085379159861
Coefficients: [ 89.88863279 213.78999204 -69.38142319   2.3196697 ]
Intercept: 484.6123140495868


In [691]:
# Codificare Brand
label_encoder = LabelEncoder()
df['Brand'] = label_encoder.fit_transform(df['Brand'])

# Set features și target
X = df[['RAM', 'Storage', 'Brand','Color_Score']]
y = df['Final Price']

# Standardizare
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Model
model = LinearRegression()
model.fit(X_scaled, y)

# Predictii pe toate datele
y_pred_all = model.predict(X_scaled)

# Adaugă coloana cu predicții în dataframe
df['Predicted_Price'] = y_pred_all

df_reset = df.reset_index(drop=True)

# Afișare tabel cu toate coloanele + Predicted_Price
print(df_reset.head(20))



                                           Smartphone  Brand           Model  \
0                  Realme C55 8/256GB Sunshower Libre     27             C55   
1            Samsung Galaxy M23 5G 4/128GB Azul Libre     29      Galaxy M23   
2        Motorola Moto G13 4/128GB Azul Lavanda Libre     20        Moto G13   
3            Xiaomi Redmi Note 11S 6/128GB Gris Libre     35  Redmi Note 11S   
4             Nothing Phone (2) 12/512GB Blanco Libre     22       Phone (2)   
5                Motorola Moto E32s 4/64GB Gris Libre     20       Moto E32s   
6             Nothing Phone (2) 12/256GB Blanco Libre     22       Phone (2)   
7                 Realme 9 Pro 5G 8/128GB Negro Libre     27           9 Pro   
8           Samsung Galaxy M23 5G 4/128GB Verde Libre     29      Galaxy M23   
9   Xiaomi Redmi Note 12 Pro 8/256GB Gris Grafito ...     35   Redmi Note 12   
10                     POCO M4 5G 6/128GB Negro Libre     25              M4   
11                      Realme C31 4/64G

In [692]:
# Setări pentru afișare completă
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Codificarea Brand-ului
label_encoder = LabelEncoder()
df['Brand'] = label_encoder.fit_transform(df['Brand'])

# Selectare caracteristici și target
X = df[['RAM', 'Storage', 'Brand','Color_Score']]
y = df['Final Price']

# Standardizare
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Model și antrenare
model = LinearRegression()
model.fit(X_scaled, y)

# Predictii pe toate datele
y_pred_all = model.predict(X_scaled)

# DataFrame cu rezultate
results_df = df.copy()
results_df['Predicted Price'] = y_pred_all
results_df['Error'] = results_df['Final Price'] - results_df['Predicted Price']

# Afișare completă
display(results_df[['Brand', 'RAM', 'Storage', 'Final Price', 'Predicted Price', 'Error']])

# Scatter plot Real vs Predicted
plt.figure(figsize=(8,6))
plt.scatter(y, y_pred_all, color='blue', alpha=0.4, s=20)
plt.plot([y.min(), y.max()], [y.min(), y.max()], color='red', linestyle='--')
plt.title('Real vs Predicted Price (toate datele)')
plt.xlabel('Real Price')
plt.ylabel('Predicted Price')
plt.xlim(y.min() * 0.9, y.max() * 1.1)
plt.ylim(y.min() * 0.9, y.max() * 1.1)
plt.tight_layout()
plt.show()

# Residual Plot
residuals = y - y_pred_all
plt.figure(figsize=(8,6))
plt.scatter(y_pred_all, residuals, color='purple', alpha=0.4, s=20)
plt.axhline(y=0, color='red', linestyle='--')
plt.title('Residual Plot (toate datele)')
plt.xlabel('Predicted Price')
plt.ylabel('Residuals')
plt.xlim(y_pred_all.min() * 0.9, y_pred_all.max() * 1.1)
plt.tight_layout()
plt.show()


Unnamed: 0,Brand,RAM,Storage,Final Price,Predicted Price,Error
0,27,8.0,256.0,231.6,699.945447,-468.345447
1,29,4.0,128.0,279.0,330.331645,-51.331645
2,20,4.0,128.0,179.01,379.506584,-200.496584
3,35,6.0,128.0,279.99,368.941862,-88.951862
4,22,12.0,512.0,799.0,1292.721865,-493.721865
5,20,4.0,64.0,148.52,273.179504,-124.659504
6,22,12.0,256.0,699.0,866.692044,-167.692044
7,27,8.0,128.0,352.59,483.594523,-131.004523
8,29,4.0,128.0,279.0,330.550029,-51.550029
9,35,8.0,256.0,329.99,653.169908,-323.179908


In [693]:
from sklearn.ensemble import RandomForestRegressor

# Presupun că ai deja df și coloanele necesare pregătite
# Codificarea Brand-ului
label_encoder = LabelEncoder()
df['Brand'] = label_encoder.fit_transform(df['Brand'])

# Selectarea caracteristicilor și target-ului
X = df[['RAM', 'Storage', 'Brand','Color_Score']]
y = df['Final Price']

# Standardizarea caracteristicilor
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Crearea si antrenarea modelului Random Forest
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train_scaled, y_train)

# Predictii
y_pred_rf = rf_model.predict(X_test_scaled)

# Evaluare
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
r2_rf = r2_score(y_test, y_pred_rf)


# Predicții pe toate datele
y_pred_rf_all = rf_model.predict(X_scaled)

# Crearea DataFrame-ului cu rezultatele
results_rf_df = df.copy()
results_rf_df['Predicted Price'] = y_pred_rf_all
results_rf_df['Error'] = results_rf_df['Final Price'] - results_rf_df['Predicted Price']

# Afișare rezultate
display(results_rf_df[['Brand', 'RAM', 'Storage', 'Final Price', 'Predicted Price', 'Error']])

# Scatter plot Real vs Predicted
plt.figure(figsize=(8, 6))
plt.scatter(y, y_pred_rf_all, color='green', alpha=0.4, s=20)
plt.plot([y.min(), y.max()], [y.min(), y.max()], color='red', linestyle='--')
plt.title('Random Forest: Real vs Predicted Final Price (toate datele)')
plt.xlabel('Real Price')
plt.ylabel('Predicted Price')
plt.xlim(y.min() * 0.9, y.max() * 1.1)
plt.ylim(y.min() * 0.9, y.max() * 1.1)
plt.tight_layout()
plt.show()

# Residual Plot
residuals_rf = y - y_pred_rf_all
plt.figure(figsize=(8, 6))
plt.scatter(y_pred_rf_all, residuals_rf, color='orange', alpha=0.4, s=20)
plt.axhline(y=0, color='red', linestyle='--')
plt.title('Random Forest: Residual Plot (toate datele)')
plt.xlabel('Predicted Price')
plt.ylabel('Residuals')
plt.xlim(y_pred_rf_all.min() * 0.9, y_pred_rf_all.max() * 1.1)
plt.tight_layout()
plt.show()


Unnamed: 0,Brand,RAM,Storage,Final Price,Predicted Price,Error
0,27,8.0,256.0,231.6,369.275942,-137.675942
1,29,4.0,128.0,279.0,240.664804,38.335196
2,20,4.0,128.0,179.01,263.593385,-84.583385
3,35,6.0,128.0,279.99,299.725199,-19.735199
4,22,12.0,512.0,799.0,930.46461,-131.46461
5,20,4.0,64.0,148.52,155.7394,-7.2194
6,22,12.0,256.0,699.0,670.109,28.891
7,27,8.0,128.0,352.59,411.119736,-58.529736
8,29,4.0,128.0,279.0,227.341163,51.658837
9,35,8.0,256.0,329.99,448.009916,-118.019916


In [694]:
# Crearea și antrenarea modelului XGBoost
xgb_model = XGBRegressor(n_estimators=100, random_state=42)
xgb_model.fit(X_scaled, y)

# Predictii pe toate datele
y_pred_xgb_all = xgb_model.predict(X_scaled)

# DataFrame cu rezultate
results_xgb = df.copy()
results_xgb['Predicted Price (XGB)'] = y_pred_xgb_all
results_xgb['Error (XGB)'] = results_xgb['Final Price'] - results_xgb['Predicted Price (XGB)']

# Crearea si antrenarea modelului XGBoost
xgb_model = XGBRegressor(n_estimators=100, random_state=42)
xgb_model.fit(X_train_scaled, y_train)

# Predictii
y_pred_xgb = xgb_model.predict(X_test_scaled)

# Evaluare
rmse_xgb = np.sqrt(mean_squared_error(y_test, y_pred_xgb))
r2_xgb = r2_score(y_test, y_pred_xgb)


# Evaluare performanța
rmse_xgb_all = np.sqrt(mean_squared_error(y, y_pred_xgb_all))
r2_xgb_all = r2_score(y, y_pred_xgb_all)

print(f"XGBoost RMSE (all data): {rmse_xgb_all:.2f}")
print(f"XGBoost R² (all data): {r2_xgb_all:.4f}")

# Scatter Plot: Real vs Predicted
plt.figure(figsize=(8, 6))
plt.scatter(y, y_pred_xgb_all, color='red', alpha=0.6, s=20)
plt.plot([y.min(), y.max()], [y.min(), y.max()], color='black', linestyle='--')
plt.title('XGBoost: Real vs Predicted Final Price (toate datele)')
plt.xlabel('Real Price')
plt.ylabel('Predicted Price')
plt.xlim(y.min() * 0.9, y.max() * 1.1)
plt.ylim(y.min() * 0.9, y.max() * 1.1)
plt.tight_layout()
plt.show()

# Residual Plot
residuals_xgb = y - y_pred_xgb_all
plt.figure(figsize=(8, 6))
plt.scatter(y_pred_xgb_all, residuals_xgb, color='blue', alpha=0.5, s=20)
plt.axhline(y=0, color='black', linestyle='--')
plt.title('XGBoost: Residual Plot (toate datele)')
plt.xlabel('Predicted Price')
plt.ylabel('Residuals')
plt.xlim(y_pred_xgb_all.min() * 0.9, y_pred_xgb_all.max() * 1.1)
plt.tight_layout()
plt.show()


XGBoost RMSE (all data): 184.21
XGBoost R² (all data): 0.7863


In [695]:
text = '''Dupa parerea mea cel mai bun algoritm pe care il pot folosi in momentul de fata este "XGBoost  184.21  0.7863" '''

print(text)


Dupa parerea mea cel mai bun algoritm pe care il pot folosi in momentul de fata este "XGBoost  184.21  0.7863" 


In [696]:
text = '''Am ales dupa cel care se prezinta mai bine la RMSE si la R2 acel algoritm fiind XGBoost '''

print(text)

Am ales dupa cel care se prezinta mai bine la RMSE si la R2 acel algoritm fiind XGBoost 


In [706]:
label_encoder = LabelEncoder()
df['Brand'] = label_encoder.fit_transform(df['Brand'].astype(str))

# Definire variabile
X = df[['RAM', 'Storage', 'Brand','Color_Score']]
y = df['Final Price']

# Împartire train-test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scalare
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Antrenare model ---
rf = RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42)
rf.fit(X_train_scaled, y_train)

# --- Predictii si evaluare ---
y_pred = rf.predict(X_test_scaled)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
print(f"Random Forest RMSE: {rmse:.4f}")
print(f"Random Forest R2: {r2:.4f}")

# --- Vizualizări finale ---

# 1. Importanța caracteristicilor
feat_imp_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': rf.feature_importances_
}).sort_values(by='Importance', ascending=True)

plt.figure(figsize=(8,5))
sns.barplot(x='Importance', y='Feature', data=feat_imp_df, color='royalblue')
plt.title('Importanta caracteristicilor (Random Forest)')
plt.tight_layout()
plt.show()

# 2. Real vs Predicted - Scatter plot
plt.figure(figsize=(7,7))
plt.scatter(y_test, y_pred, alpha=0.6, color='green')
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
plt.xlabel('Pret real')
plt.ylabel('Pret prezis')
plt.title('Pret real vs Pret prezis (Random Forest)')
plt.tight_layout()
plt.show()

# 3. Plotul rezidualilor
residuals = y_test - y_pred
plt.figure(figsize=(8,5))
sns.histplot(residuals, bins=30, kde=True, color='coral')
plt.title('Distribuția rezidualilor')
plt.xlabel('Rezidual (eroare reala - prezisa)')
plt.ylabel('Frecventa')
plt.tight_layout()
plt.show()

# 4. Residual plot (rezidualii vs valorile prezise)
plt.figure(figsize=(8,5))
plt.scatter(y_pred, residuals, alpha=0.6, color='purple')
plt.axhline(y=0, linestyle='--', color='black')
plt.xlabel('Pret prezis')
plt.ylabel('Rezidual')
plt.title('Rezidual vs Pret prezis')
plt.tight_layout()
plt.show()


Random Forest RMSE: 251.3009
Random Forest R2: 0.6646
