In [62]:
import numpy as np
import pandas as pd
from sklearn.datasets import * 
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.set(style ="ticks")


from sklearn.metrics import mean_absolute_error, mean_squared_error, median_absolute_error, r2_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer, MissingIndicator
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler, StandardScaler, Normalizer

In [48]:
# Load the GPU specification table from the CSV file and preview its first rows.
df = pd.read_csv('gpu_specs_v6.csv')
df.head()

Unnamed: 0,manufacturer,productName,releaseYear,memSize,memBusWidth,gpuClock,memClock,unifiedShader,tmu,rop,pixelShader,vertexShader,igp,bus,memType,gpuChip
0,NVIDIA,GeForce RTX 4050,2023.0,8.0,128.0,1925,2250.0,3840.0,120,48,,,No,PCIe 4.0 x16,GDDR6,AD106
1,Intel,Arc A350M,2022.0,4.0,64.0,300,1500.0,768.0,48,24,,,No,PCIe 4.0 x8,GDDR6,DG2-128
2,Intel,Arc A370M,2022.0,4.0,64.0,300,1500.0,1024.0,64,32,,,No,PCIe 4.0 x8,GDDR6,DG2-128
3,Intel,Arc A380,2022.0,4.0,64.0,300,1500.0,1024.0,64,32,,,No,PCIe 4.0 x8,GDDR6,DG2-128
4,Intel,Arc A550M,2022.0,8.0,128.0,300,1500.0,2048.0,128,64,,,No,PCIe 4.0 x16,GDDR6,DG2-512


In [49]:
# Drop the columns that are not used further on
# (releaseYear, memSize, memBusWidth, gpuClock and memClock are kept).
unused_columns = [
    'manufacturer', 'productName', 'unifiedShader', 'tmu', 'rop', 'pixelShader',
    'vertexShader', 'igp', 'bus', 'memType', 'gpuChip',
]
df = df.drop(columns=unused_columns)
df.head()

Unnamed: 0,releaseYear,memSize,memBusWidth,gpuClock,memClock
0,2023.0,8.0,128.0,1925,2250.0
1,2022.0,4.0,64.0,300,1500.0
2,2022.0,4.0,64.0,300,1500.0
3,2022.0,4.0,64.0,300,1500.0
4,2022.0,8.0,128.0,300,1500.0


In [50]:
# Summary of the remaining columns: dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2889 entries, 0 to 2888
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   releaseYear  2845 non-null   float64
 1   memSize      2477 non-null   float64
 2   memBusWidth  2477 non-null   float64
 3   gpuClock     2889 non-null   int64  
 4   memClock     2477 non-null   float64
dtypes: float64(4), int64(1)
memory usage: 113.0 KB


In [51]:
# Count the missing values in every column
df.isna().sum()

releaseYear     44
memSize        412
memBusWidth    412
gpuClock         0
memClock       412
dtype: int64

In [52]:
# Replace the missing values of the memory-related columns with 0.
# NOTE(review): 0 presumably stands for "no dedicated memory" — confirm this is
# the intended meaning rather than a placeholder for unknown values.
columns_to_replace = ['memSize', 'memBusWidth', 'memClock']
df[columns_to_replace] = df[columns_to_replace].fillna(0)

In [53]:
# Re-count the missing values per column.
df.isna().sum()

releaseYear    44
memSize         0
memBusWidth     0
gpuClock        0
memClock        0
dtype: int64

In [54]:
# Discard the rows whose release year is missing, then re-count the gaps.
df = df.dropna(subset=['releaseYear'])
df.isna().sum()

releaseYear    0
memSize        0
memBusWidth    0
gpuClock       0
memClock       0
dtype: int64

In [55]:
# Feature matrix: every column except the target.
X = df.drop(columns=['gpuClock'])
# Regression target: the gpuClock column.
y = df['gpuClock']

In [56]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state= 45)
# random_state seeds the random number generator, so the same shuffled split is reproduced on every run

In [57]:
# Shape of the training split
X_train.shape, y_train.shape

((2133, 4), (2133,))

In [58]:
# Shape of the test split
X_test.shape, y_test.shape

((712, 4), (712,))

In [59]:
# Sorted distinct target values in the training split.
# NOTE(review): the target is a numeric regression target, so this listing is long;
# y_train.describe() would likely be a more compact summary.
np.unique(y_train)

array([  10,   25,   40,   41,   44,   45,   53,   60,   62,   66,   75,
         80,   83,   84,   90,   96,  100,  103,  105,  118,  120,  125,
        130,  133,  134,  135,  143,  144,  147,  148,  150,  160,  162,
        166,  175,  183,  187,  199,  200,  220,  225,  230,  235,  240,
        243,  250,  260,  270,  275,  276,  280,  290,  295,  297,  300,
        311,  320,  324,  325,  327,  331,  333,  336,  337,  350,  351,
        360,  375,  378,  380,  384,  390,  398,  400,  405,  412,  416,
        419,  420,  424,  425,  430,  440,  444,  445,  450,  452,  459,
        460,  470,  473,  475,  479,  480,  485,  488,  491,  493,  495,
        496,  497,  500,  503,  506,  507,  508,  513,  514,  515,  518,
        519,  520,  523,  525,  530,  533,  535,  540,  549,  550,  552,
        554,  557,  560,  561,  562,  567,  574,  575,  576,  580,  585,
        587,  589,  590,  593,  594,  598,  600,  601,  602,  606,  608,
        610,  612,  615,  620,  625,  628,  630,  6

In [60]:
# Sorted distinct target values in the test split.
# NOTE(review): the target is a numeric regression target, so this listing is long;
# y_test.describe() would likely be a more compact summary.
np.unique(y_test)

array([  10,   40,   50,   53,   60,   75,   83,   84,   90,  100,  105,
        120,  125,  130,  143,  147,  160,  166,  175,  183,  200,  222,
        230,  233,  235,  240,  250,  270,  275,  276,  277,  280,  300,
        324,  325,  327,  336,  350,  371,  375,  378,  380,  391,  398,
        400,  412,  425,  444,  450,  459,  470,  475,  480,  500,  513,
        520,  523,  525,  527,  540,  550,  557,  560,  562,  567,  570,
        574,  575,  580,  585,  587,  589,  590,  594,  598,  600,  601,
        602,  606,  608,  610,  620,  625,  640,  641,  648,  650,  651,
        660,  667,  668,  669,  672,  675,  680,  688,  700,  706,  709,
        719,  720,  722,  725,  730,  732,  736,  738,  745,  750,  753,
        760,  765,  771,  772,  773,  775,  777,  779,  780,  783,  784,
        795,  796,  797,  800,  810,  817,  823,  824,  825,  840,  850,
        854,  863,  875,  876,  880,  900,  902,  907,  909,  915,  918,
        920,  924,  925,  926,  928,  930,  940,  9

In [63]:
# Feature scaling
# The min-max scaler is fitted on the training split only and then applied to
# both splits, so no statistics of the test split enter the preprocessing.
scaler = MinMaxScaler().fit(X_train)
# transform() returns a plain array by default, so the column names and the row
# labels are restored explicitly. Keeping the original index matters: y_train
# and y_test still carry the original row labels, and a fresh RangeIndex on the
# features would silently misalign any later index-based operation.
X_train = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)
X_train.describe()

Unnamed: 0,releaseYear,memSize,memBusWidth,memClock
count,2133.0,2133.0,2133.0,2133.0
mean,0.687204,0.02176,0.0299,0.328045
std,0.171644,0.057658,0.078859,0.245591
min,0.0,0.0,0.0,0.0
25%,0.555556,0.001,0.007812,0.110767
50%,0.694444,0.004,0.015625,0.354453
75%,0.805556,0.015625,0.03125,0.498449
max,1.0,1.0,1.0,1.0


In [64]:
def test_model(model):
    """Print regression quality metrics of ``model`` on the test split.

    Uses the notebook-level ``X_test`` and ``y_test``. Four lines are printed:
    mean absolute error, mean squared error, median absolute error and R^2.

    Parameters
    ----------
    model
        A fitted estimator exposing ``predict``.
    """
    # Predict once and share the result across all four metrics.
    y_pred = model.predict(X_test)
    print("mean_absolute_error:",
          mean_absolute_error(y_test, y_pred))
    print("mean_squared_error:",
          mean_squared_error(y_test, y_pred))
    print("median_absolute_error:",
          median_absolute_error(y_test, y_pred))
    print("r2_score:",
          r2_score(y_test, y_pred))

In [65]:
# Fit a k-nearest-neighbours regressor with k = 10 on the training split.
reg_10 = KNeighborsRegressor(n_neighbors=10)
reg_10.fit(X_train, y_train)

In [66]:
# Print the test-split metrics of the k = 10 model.
test_model(reg_10)

mean_absolute_error: 103.2741573033708
mean_squared_error: 27854.493511235953
median_absolute_error: 63.30000000000001
r2_score: 0.8177414976569988


In [69]:
# NOTE(review): test_model is defined several times in this notebook; a single
# definition near the top would be enough.
def test_model(model):
    """Print regression quality metrics of ``model`` on the test split.

    Uses the notebook-level ``X_test`` and ``y_test``. Four lines are printed:
    mean absolute error, mean squared error, median absolute error and R^2.

    Parameters
    ----------
    model
        A fitted estimator exposing ``predict``.
    """
    # Predict once and share the result across all four metrics.
    y_pred = model.predict(X_test)
    print("mean_absolute_error:",
          mean_absolute_error(y_test, y_pred))
    print("mean_squared_error:",
          mean_squared_error(y_test, y_pred))
    print("median_absolute_error:",
          median_absolute_error(y_test, y_pred))
    print("r2_score:",
          r2_score(y_test, y_pred))

In [70]:
# Fit a k-nearest-neighbours regressor with k = 20 on the training split.
# NOTE(review): the variable is still named reg_10 although k is 20 here; the
# assignment rebinds the name, so the previous model is no longer reachable.
reg_10 = KNeighborsRegressor(n_neighbors=20)
reg_10.fit(X_train, y_train)

In [71]:
# Print the test-split metrics of the k = 20 model (held in reg_10 at this point).
test_model(reg_10)

mean_absolute_error: 107.73054775280897
mean_squared_error: 31305.38659761236
median_absolute_error: 65.75
r2_score: 0.7951614925524337


In [72]:
# NOTE(review): test_model is defined several times in this notebook; a single
# definition near the top would be enough.
def test_model(model):
    """Print regression quality metrics of ``model`` on the test split.

    Uses the notebook-level ``X_test`` and ``y_test``. Four lines are printed:
    mean absolute error, mean squared error, median absolute error and R^2.

    Parameters
    ----------
    model
        A fitted estimator exposing ``predict``.
    """
    # Predict once and share the result across all four metrics.
    y_pred = model.predict(X_test)
    print("mean_absolute_error:",
          mean_absolute_error(y_test, y_pred))
    print("mean_squared_error:",
          mean_squared_error(y_test, y_pred))
    print("median_absolute_error:",
          median_absolute_error(y_test, y_pred))
    print("r2_score:",
          r2_score(y_test, y_pred))

In [73]:
# Fit a k-nearest-neighbours regressor with k = 100 on the training split.
# NOTE(review): the variable is still named reg_10 although k is 100 here; the
# assignment rebinds the name, so the previous model is no longer reachable.
reg_10 = KNeighborsRegressor(n_neighbors=100)
reg_10.fit(X_train, y_train)

In [74]:
# Print the test-split metrics of the k = 100 model (held in reg_10 at this point).
test_model(reg_10)

mean_absolute_error: 119.09692415730338
mean_squared_error: 36710.89427233146
median_absolute_error: 75.255
r2_score: 0.759791984476457


In [75]:
# NOTE(review): test_model is defined several times in this notebook; a single
# definition near the top would be enough.
def test_model(model):
    """Print regression quality metrics of ``model`` on the test split.

    Uses the notebook-level ``X_test`` and ``y_test``. Four lines are printed:
    mean absolute error, mean squared error, median absolute error and R^2.

    Parameters
    ----------
    model
        A fitted estimator exposing ``predict``.
    """
    # Predict once and share the result across all four metrics.
    y_pred = model.predict(X_test)
    print("mean_absolute_error:",
          mean_absolute_error(y_test, y_pred))
    print("mean_squared_error:",
          mean_squared_error(y_test, y_pred))
    print("median_absolute_error:",
          median_absolute_error(y_test, y_pred))
    print("r2_score:",
          r2_score(y_test, y_pred))

In [76]:
# Fit a k-nearest-neighbours regressor with k = 8 on the training split.
# NOTE(review): the variable is still named reg_10 although k is 8 here; the
# assignment rebinds the name, so the previous model is no longer reachable.
reg_10 = KNeighborsRegressor(n_neighbors=8)
reg_10.fit(X_train, y_train)

In [77]:
# Print the test-split metrics of the k = 8 model (held in reg_10 at this point).
test_model(reg_10)

mean_absolute_error: 102.29529494382022
mean_squared_error: 27819.60603932584
median_absolute_error: 61.875
r2_score: 0.8179697745911425
