# Случайный лес для прогнозирования цены на основе характеристик

In [41]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression

In [19]:
data = pd.read_csv("https://raw.githubusercontent.com/AleksKids/hedonistic-demand-from-ebay/main/preprocessed.tsv", delimiter = '\t')

In [20]:
data

Unnamed: 0,Title,Price,Shipping_cost,Sales_Count,Sales_Value,Link,Condition,Seller Notes,Brand,Model,...,Dual-Band,FM,Global Version,HDMI Micro,Headphone Jack,Infrared,Lightning,Quad-Band,Tri-Band,Wireless charging
0,Apple iPhone 11 64GB Factory Unlocked 4G LTE S...,337.59,0.00,19787.0,6679958.00,https://www.ebay.com/itm/254604777645?nordt=tr...,Very Good - Refurbished,“This iPhone 11 is in Very Good condition and ...,apple,iPhone 11,...,0,0,0,0,0,0,0,0,0,0
1,Apple iPhone XR 64GB Factory Unlocked Smartpho...,266.28,0.00,12440.0,3312560.60,https://www.ebay.com/itm/254187678666?nordt=tr...,Very Good - Refurbished,“This Apple iPhone XR 64GB Factory Unlocked Sm...,apple,iPhone XR,...,0,0,0,0,0,0,0,0,0,0
2,Apple iPhone 11 64GB Unlocked Smartphone - Ver...,359.11,16.93,12224.0,4389742.37,https://www.ebay.com/itm/363183815277?nordt=tr...,Very Good - Refurbished,"“This is a B+ Stock item, meaning unit is in v...",apple,Apple iPhone 11,...,0,0,0,0,0,0,0,0,0,0
3,Apple iPhone X 64GB Factory Unlocked Phone - V...,231.98,16.50,11781.0,2732899.88,https://www.ebay.com/itm/382605209867?nordt=tr...,Very Good - Refurbished,"“This is a B+ Stock item, meaning unit is in v...",apple,Apple iPhone X,...,1,1,1,1,1,1,1,1,1,1
4,Apple iPhone X 256GB Unlocked Smartphone - Ver...,263.59,15.34,10020.0,2641203.85,https://www.ebay.com/itm/382446518910?nordt=tr...,Very Good - Refurbished,"“This is a B+ Stock item, meaning unit is in v...",apple,Apple iPhone X,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3076,Samsung Galaxy Note8 SM-N950U - 64GB (Unlocked...,128.40,0.00,37.0,4750.91,https://www.ebay.com/itm/265635248944?nordt=tr...,Used,“These devices contain a SPOT on the display. ...,samsung,Samsung Galaxy Note8,...,0,0,0,0,0,0,0,0,0,0
3077,Brand New STALA509DCP TCL A3 32GB Storage 3GB ...,43.13,0.00,34.0,1466.36,https://www.ebay.com/itm/265731617536?nordt=tr...,New,"“This is a B+ Stock item, meaning unit is in v...",tcl,TCL A3,...,0,0,0,0,0,0,0,0,0,0
3078,Apple iPhone 12 Pro Max - 128GB - Fully Unlock...,554.31,0.00,37.0,20509.63,https://www.ebay.com/itm/265936106963?nordt=tr...,Used,“Phones Have Been Certified By Our Industry-Le...,apple,Apple iPhone 12 Pro Max,...,0,0,0,0,0,0,1,0,0,0
3079,Apple iPhone 13 Mini 128GB - T-Mobile / Metro ...,380.90,0.00,33.0,12569.67,https://www.ebay.com/itm/266138817726?nordt=tr...,Good - Refurbished,“LOCKED TO T-MOBILE Network - Fully tested 100...,apple,Apple iPhone 13 mini,...,0,0,0,0,0,0,0,0,0,0


In [68]:
y = data['Price']
X = data.drop(columns = ['Title', 'Price', 'Sales_Value', 'Link', 'Seller Notes', 'Model','Lock Status', 'Contract', 'Model Number', 'Style', 'MPN', 'Connectivity', 'Features', 'Memory Card Type', 'Processor', 'Color'])

In [69]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10)

In [70]:
categorical = list(X_train.dtypes[X_train.dtypes == "object"].index)
numeric = list(X_train.dtypes[X_train.dtypes == np.number].index)
X_train[categorical] = X_train[categorical]
X_test[categorical] = X_test[categorical]

In [71]:
column_transformer = ColumnTransformer([
    ('ohe', OneHotEncoder(handle_unknown="ignore"), categorical),
    ('scaling', StandardScaler(), numeric)
])

pipeline = Pipeline(steps=[
    ('ohe_and_scaling', column_transformer),
    ('regression', RandomForestRegressor())
])

In [72]:
model = pipeline.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("Test RMSE = %.4f" % mean_squared_error(y_test, y_pred, squared=False))

Test RMSE = 80.1074


# Линейная регрессия для прогнозирования спроса на основе характеристик и цены

In [135]:
y2 = data['Sales_Count']
X2 = data.drop(columns = ['Title', 'Sales_Value', 'Sales_Count', 'Link', 'Seller Notes', 'Model','Lock Status', 'Contract', 'Model Number', 'Style', 'MPN', 'Connectivity', 'Features', 'Memory Card Type', 'Processor', 'Color'])

In [136]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.3, random_state=10)

In [137]:
X_train2

Unnamed: 0,Price,Shipping_cost,Condition,Brand,Storage Capacity,Network,Camera Resolution,Screen Size,RAM,Operating System,...,Dual-Band,FM,Global Version,HDMI Micro,Headphone Jack,Infrared,Lightning,Quad-Band,Tri-Band,Wireless charging
1995,57.62,6.99,Used,samsung,32.0,Operator,12.0,5.100000,4.000000,Android,...,0,0,0,0,0,0,0,0,0,0
825,696.98,0.00,Excellent - Refurbished,samsung,128.0,Operator,32.0,6.800000,12.000000,Android,...,0,0,0,0,0,0,0,0,0,0
3057,58.09,4.67,Used,samsung,32.0,Operator,12.0,5.100000,4.000000,Android,...,0,0,0,0,0,0,0,0,0,0
370,216.81,14.16,Open box,samsung,128.0,Unlocked,12.0,6.900000,12.000000,Android,...,0,0,0,0,0,0,0,0,0,0
608,296.14,0.00,Excellent - Refurbished,apple,64.0,Operator,12.0,5.800000,3.000000,IOS,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2009,77.37,0.00,Used,samsung,128.0,Unlocked,48.0,6.500000,6.000000,Other,...,0,0,0,0,0,0,0,0,0,0
1180,193.80,7.74,Good - Refurbished,apple,64.0,Unlocked,12.0,4.700000,2.000000,IOS,...,0,0,0,0,0,0,1,0,0,0
1344,252.62,0.00,New,apple,16.0,Operator,12.0,4.700000,2.000000,IOS,...,1,1,1,1,1,1,1,1,1,1
527,61.43,25.00,New,motorola,16.0,Operator,8.0,5.916407,2.000000,Android,...,0,0,0,0,0,0,0,0,0,0


In [138]:
y_train2

1995     79.0
825     196.0
3057     45.0
370     389.0
608     247.0
        ...  
2009     81.0
1180    119.0
1344     98.0
527     351.0
1289    115.0
Name: Sales_Count, Length: 2156, dtype: float64

In [139]:
categorical2 = list(X_train2.dtypes[X_train2.dtypes == "object"].index)
numeric2 = list(X_train2.dtypes[X_train2.dtypes == np.number].index)
X_train2[categorical2] = X_train2[categorical2]
X_test2[categorical2] = X_test2[categorical2]

In [140]:
column_transformer2 = ColumnTransformer([
    ('ohe', OneHotEncoder(handle_unknown="ignore"), categorical2),
    ('scaling', StandardScaler(), numeric2)
])

pipeline2 = Pipeline(steps=[
    ('ohe_and_scaling', column_transformer2),
    ('regression', LinearRegression())
])

In [141]:
model2 = pipeline2.fit(X_train2, y_train2)
y_pred2 = model2.predict(X_test2)
print("Test RMSE = %.4f" % mean_squared_error(y_test2, y_pred2, squared=False))
print("Test MAE = %.4f" % mean_absolute_error(y_test2, y_pred2))

Test RMSE = 710.2118
Test MAE = 285.2663
