In [36]:
import os
import sys
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re

import lightgbm

from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import mean_absolute_error

from nltk.corpus import stopwords 
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV

In [2]:
df = pd.read_csv('Diamond Price Prediction.csv')
df.head(10)

Unnamed: 0,Carat(Weight of Daimond),Cut(Quality),Color,Clarity,Depth,Table,Price(in US dollars),X(length),Y(width),Z(Depth)
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
5,0.24,Very Good,J,VVS2,62.8,57.0,336,3.94,3.96,2.48
6,0.24,Very Good,I,VVS1,62.3,57.0,336,3.95,3.98,2.47
7,0.26,Very Good,H,SI1,61.9,55.0,337,4.07,4.11,2.53
8,0.22,Fair,E,VS2,65.1,61.0,337,3.87,3.78,2.49
9,0.23,Very Good,H,VS1,59.4,61.0,338,4.0,4.05,2.39


In [3]:
df.isna().sum()

Carat(Weight of Daimond)    0
Cut(Quality)                0
Color                       0
Clarity                     0
Depth                       0
Table                       0
Price(in US dollars)        0
X(length)                   0
Y(width)                    0
Z(Depth)                    0
dtype: int64

In [4]:
cat_columns = [cname for cname in df.columns if df[cname].dtype == 'object']
encoder = preprocessing.LabelEncoder()
for col in cat_columns:
    df[col] = encoder.fit_transform(df[col])

In [5]:
df.head(10)

Unnamed: 0,Carat(Weight of Daimond),Cut(Quality),Color,Clarity,Depth,Table,Price(in US dollars),X(length),Y(width),Z(Depth)
0,0.23,2,1,3,61.5,55.0,326,3.95,3.98,2.43
1,0.21,3,1,2,59.8,61.0,326,3.89,3.84,2.31
2,0.23,1,1,4,56.9,65.0,327,4.05,4.07,2.31
3,0.29,3,5,5,62.4,58.0,334,4.2,4.23,2.63
4,0.31,1,6,3,63.3,58.0,335,4.34,4.35,2.75
5,0.24,4,6,7,62.8,57.0,336,3.94,3.96,2.48
6,0.24,4,5,6,62.3,57.0,336,3.95,3.98,2.47
7,0.26,4,4,2,61.9,55.0,337,4.07,4.11,2.53
8,0.22,0,1,5,65.1,61.0,337,3.87,3.78,2.49
9,0.23,4,4,4,59.4,61.0,338,4.0,4.05,2.39


In [6]:
X = df.drop('Price(in US dollars)', axis=1)
y = df['Price(in US dollars)']
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=1)

In [7]:
lr = LinearRegression()
scores = cross_val_score(lr, X_train, y_train, cv=5, scoring='neg_mean_absolute_error') #метод средней ошибки в долларах, то есть очень большая ошибка
print('LinearRegression =', - np.mean(scores))

LinearRegression = 864.6521477706298


In [12]:
dt = DecisionTreeRegressor()
scores = cross_val_score(dt, X_train, y_train, cv=5, scoring='neg_mean_absolute_error') #средняя абсолютая ошибка
print('DecisionTreeRegressor =', - np.mean(scores))

DecisionTreeRegressor = 372.2044177936305


In [9]:
rf = RandomForestRegressor()
scores = cross_val_score(rf, X_train, y_train, cv=5, scoring='neg_mean_absolute_error')
print('RandomForestRegressor =', - np.mean(scores))

RandomForestRegressor = 277.2731062725249


In [19]:
lgb = lightgbm.LGBMRegressor(random_state=0)
scores = cross_val_score(dt, X_train, y_train, cv=5, scoring='neg_mean_absolute_error')
print('LGBMRegressor =', - np.mean(scores))

LGBMRegressor = 371.0185638069569


## ПОДБОР ПАРАМЕТРОВ ВЫБРАНОЙ МОДЕЛИ

In [22]:
def cv_params(model, param_grid):
    scoring='neg_mean_absolute_error'
    
    opt_params = GridSearchCV(
        estimator=model,        #модель
        param_grid=param_grid,  #параметры
        scoring=scoring,        #стратегия валидации
        cv=5,                   #кол во слоев кросс-валид
        n_jobs=-1               #кол во потоков для обучения(все)
    )
    
    opt_params.fit(X_train, y_train)
    params = opt_params.best_params_
    best_score = opt_params.best_score_
    
    print(f'Best score: {round(-best_score, 2)}')
    print(f'Best prarams: {params}\n')
    
    return params

In [30]:
lgb_params_grid = {
    'max_depth': [4, 10, 15, -1],                #макс глубина дерева
    'num_leaves': [25, 35, 45],                  #макс кол во листтьев на дереве (развитие модели вширь)
    'n_estimators': [41, 100, 250, 500, 600]    #кол во деревьев
}

lgb_clean = lightgbm.LGBMRegressor(random_state=1)
lgb_params = cv_params(lgb_clean, lgb_params_grid)

Best score: 273.85
Best prarams: {'max_depth': 10, 'n_estimators': 600, 'num_leaves': 45}



In [31]:
rf_param_grid = {
    'max_depth': [20, 25],                #макс глубина дерева
    'n_estimators': [500, 800]            #кол во деревьев
}

rf_clean = RandomForestRegressor(random_state=1)
rf_params = cv_params(rf_clean, rf_param_grid)

Best score: 276.05
Best prarams: {'max_depth': 25, 'n_estimators': 800}



In [38]:
lgb = lightgbm.LGBMRegressor(**lgb_params)
lgb.fit(X_train, y_train)

preds = lgb.predict(X_test)
print(f'MAPE: {round(mean_absolute_percentage_error(y_test, preds) * 100,2)}')
print(f'MAE: {round(mean_absolute_error(y_test, preds), 2)}')

MAPE: 6.63
MAE: 263.79


In [39]:
results = pd.DataFrame({'Model': np.round(preds), 'Actual': y_test}) #сравним что предсказала мрдель с актуальными данными
results = results.reset_index().drop('index', axis=1)
results.head(15) 

Unnamed: 0,Model,Actual
0,506.0,564
1,5628.0,5914
2,2569.0,2562
3,530.0,537
4,5893.0,5964
5,1250.0,984
6,5025.0,5247
7,604.0,611
8,10847.0,9645
9,1078.0,1162


In [None]:
lgb.fit(X, y)