In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import math as mth
import matplotlib.patches as patches
from scipy import stats as st
plt.rcParams.update({'figure.max_open_warning': 0})
from plotly.offline import init_notebook_mode, iplot
import plotly
import plotly.graph_objs as go

In [3]:
# Load the GPU specification table; every following cell works on `df`.
DATA_PATH = 'gpu_specs_v6.csv'
df = pd.read_csv(DATA_PATH)

In [4]:
# Preview the first five rows of the freshly loaded table.
df.head(n=5)

Unnamed: 0,manufacturer,productName,releaseYear,memSize,memBusWidth,gpuClock,memClock,unifiedShader,tmu,rop,pixelShader,vertexShader,igp,bus,memType,gpuChip
0,NVIDIA,GeForce RTX 4050,2023.0,8.0,128.0,1925,2250.0,3840.0,120,48,,,No,PCIe 4.0 x16,GDDR6,AD106
1,Intel,Arc A350M,2022.0,4.0,64.0,300,1500.0,768.0,48,24,,,No,PCIe 4.0 x8,GDDR6,DG2-128
2,Intel,Arc A370M,2022.0,4.0,64.0,300,1500.0,1024.0,64,32,,,No,PCIe 4.0 x8,GDDR6,DG2-128
3,Intel,Arc A380,2022.0,4.0,64.0,300,1500.0,1024.0,64,32,,,No,PCIe 4.0 x8,GDDR6,DG2-128
4,Intel,Arc A550M,2022.0,8.0,128.0,300,1500.0,2048.0,128,64,,,No,PCIe 4.0 x16,GDDR6,DG2-512


In [5]:
# Print column dtypes and non-null counts to see which columns have gaps.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2889 entries, 0 to 2888
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   manufacturer   2889 non-null   object 
 1   productName    2889 non-null   object 
 2   releaseYear    2845 non-null   float64
 3   memSize        2477 non-null   float64
 4   memBusWidth    2477 non-null   float64
 5   gpuClock       2889 non-null   int64  
 6   memClock       2477 non-null   float64
 7   unifiedShader  2065 non-null   float64
 8   tmu            2889 non-null   int64  
 9   rop            2889 non-null   int64  
 10  pixelShader    824 non-null    float64
 11  vertexShader   824 non-null    float64
 12  igp            2889 non-null   object 
 13  bus            2889 non-null   object 
 14  memType        2889 non-null   object 
 15  gpuChip        2889 non-null   object 
dtypes: float64(7), int64(3), object(6)
memory usage: 361.2+ KB


In [6]:
# Summary statistics of the numeric columns, transposed so each column is one row.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
releaseYear,2845.0,2010.691388,6.193125,1986.0,2006.0,2011.0,2015.0,2023.0
memSize,2477.0,3.113803,7.175399,3.2e-05,0.256,1.024,3.0,128.0
memBusWidth,2477.0,274.874445,653.163896,32.0,128.0,128.0,256.0,8192.0
gpuClock,2889.0,661.126687,374.48145,10.0,400.0,600.0,875.0,2331.0
memClock,2477.0,868.578119,509.987396,5.0,400.0,837.0,1250.0,2257.0
unifiedShader,2065.0,1032.93753,1662.834618,8.0,144.0,384.0,1280.0,17408.0
tmu,2889.0,47.429214,73.014849,0.0,8.0,20.0,56.0,880.0
rop,2889.0,18.750087,25.067896,0.0,4.0,8.0,24.0,256.0
pixelShader,824.0,6.739078,8.091586,0.0,2.0,4.0,8.0,48.0
vertexShader,824.0,2.622573,2.579388,0.0,0.0,2.0,4.0,24.0


In [7]:
# Count missing values per column before any cleaning.
df.isna().sum()

manufacturer        0
productName         0
releaseYear        44
memSize           412
memBusWidth       412
gpuClock            0
memClock          412
unifiedShader     824
tmu                 0
rop                 0
pixelShader      2065
vertexShader     2065
igp                 0
bus                 0
memType             0
gpuChip             0
dtype: int64

In [8]:
# Numeric columns whose missing entries are replaced with 0.
# NOTE(review): after this fill a 0 no longer distinguishes "missing" from a
# measured value; confirm that is intended before using these columns in
# statistics or scaling.
columns_to_replace = ['memSize', 'memBusWidth', 'memClock', 'unifiedShader', 'pixelShader', 'vertexShader']
# Fill all listed columns in a single vectorized call.
df[columns_to_replace] = df[columns_to_replace].fillna(0)

In [9]:
# Re-check missing counts after the zero-fill; only releaseYear should still have gaps.
df.isna().sum()

manufacturer      0
productName       0
releaseYear      44
memSize           0
memBusWidth       0
gpuClock          0
memClock          0
unifiedShader     0
tmu               0
rop               0
pixelShader       0
vertexShader      0
igp               0
bus               0
memType           0
gpuChip           0
dtype: int64

In [10]:
# Share of rows without a release year: 1 minus the fraction of non-null entries.
# (`year` holds a fraction, not a year; the printed message is Russian for
# "Missing values in releaseYear".)
year = 1 - int(df['releaseYear'].count()) / len(df)
print('Пропущенных значений в releaseYear: {:.1%}'.format(year))

Пропущенных значений в releaseYear: 1.5%


In [11]:
# Drop the rows that have no release year.
# `df` is rebound rather than mutated with inplace=True, matching the
# non-inplace drop_duplicates() used later; the redundant axis=0 (the default)
# is omitted.
df = df.dropna(subset=['releaseYear'])
# Confirm that no column has missing values any more.
df.isna().sum()

manufacturer     0
productName      0
releaseYear      0
memSize          0
memBusWidth      0
gpuClock         0
memClock         0
unifiedShader    0
tmu              0
rop              0
pixelShader      0
vertexShader     0
igp              0
bus              0
memType          0
gpuChip          0
dtype: int64

In [12]:
# Show the column labels before they are lower-cased in the next cell.
df.columns

Index(['manufacturer', 'productName', 'releaseYear', 'memSize', 'memBusWidth',
       'gpuClock', 'memClock', 'unifiedShader', 'tmu', 'rop', 'pixelShader',
       'vertexShader', 'igp', 'bus', 'memType', 'gpuChip'],
      dtype='object')

In [13]:
# Normalise every column label to lower case. From here on the columns are
# 'releaseyear', 'memsize', ... — the camelCase names used above no longer exist.
df = df.rename(columns=str.lower)
df.columns

Index(['manufacturer', 'productname', 'releaseyear', 'memsize', 'membuswidth',
       'gpuclock', 'memclock', 'unifiedshader', 'tmu', 'rop', 'pixelshader',
       'vertexshader', 'igp', 'bus', 'memtype', 'gpuchip'],
      dtype='object')

In [14]:
# Count rows that repeat an earlier row in every column.
df.duplicated().sum()

32

In [15]:
# Remove fully duplicated rows, keeping the first occurrence of each.
df = df.drop_duplicates(keep='first')

In [16]:
# Verify that no duplicated rows remain after drop_duplicates().
df.duplicated().sum()

0

In [30]:
# Inspect ten random rows of the cleaned table.
# A fixed random_state makes the same rows appear on every Restart & Run All;
# without it the displayed sample changes on each execution.
df.sample(n=10, random_state=42)

Unnamed: 0,manufacturer,productname,releaseyear,memsize,membuswidth,gpuclock,memclock,unifiedshader,tmu,rop,pixelshader,vertexshader,igp,bus,memtype,gpuchip
1091,AMD,Radeon HD 8330E,2013.0,0.0,0.0,497,0.0,128.0,8,4,0.0,0.0,Yes,IGP,System Shared,Kalindi
1025,NVIDIA,GeForce GTX 780M Mac Edition,2013.0,4.0,256.0,771,1250.0,1536.0,128,32,0.0,0.0,No,MXM-B (3.0),GDDR5,GK104
973,NVIDIA,GeForce 705M,2013.0,1.024,64.0,738,900.0,48.0,8,4,0.0,0.0,No,PCIe 2.0 x16,DDR3,GF119
780,AMD,Radeon R9 370 1024SP,2015.0,2.0,256.0,925,1400.0,1024.0,64,32,0.0,0.0,No,PCIe 3.0 x16,GDDR5,Trinidad
624,AMD,Radeon R7 M465X,2016.0,2.0,128.0,900,1125.0,512.0,32,16,0.0,0.0,No,PCIe 3.0 x16,GDDR5,Tropo
1632,ATI,Mobility Radeon HD 5850 Mac Edition,2010.0,1.024,128.0,628,1000.0,800.0,40,16,0.0,0.0,No,MXM-B (3.0),GDDR5,Broadway
2006,NVIDIA,GeForce 9100,2007.0,0.0,0.0,500,0.0,16.0,8,4,0.0,0.0,Yes,PCI,System Shared,C78
359,NVIDIA,Quadro P5200 Max-Q,2018.0,16.0,256.0,1316,1804.0,2560.0,160,64,0.0,0.0,No,MXM-B (3.0),GDDR5,GP104
1437,NVIDIA,Quadro 5000 SDI,2011.0,2.5,320.0,513,750.0,352.0,44,40,0.0,0.0,No,PCIe 2.0 x16,GDDR5,GF100
1550,NVIDIA,GeForce 310M,2010.0,0.512,64.0,625,790.0,16.0,8,4,0.0,0.0,No,PCIe 2.0 x16,DDR3,GT218


In [43]:
# One-hot encode `manufacturer`: one indicator column per manufacturer, named
# 'manufacturer_<value>'.
# NOTE(review): the saved output of the next cell shows a `time_go` column that
# no visible cell creates (execution counts jump from 30 to 43). The cell that
# defines it appears to be missing; restore it before this cell so that
# Restart & Run All reproduces the shown output.
new_dummies = pd.get_dummies(df['manufacturer'], prefix='manufacturer')
# Replace the original text column with its indicator columns, appended at the end.
new_data = pd.concat([df.drop(columns='manufacturer'), new_dummies], axis=1)

In [44]:
# Display the one-hot encoded frame (the rich repr truncates rows and columns).
new_data

Unnamed: 0,productname,releaseyear,memsize,membuswidth,gpuclock,memclock,unifiedshader,tmu,rop,pixelshader,...,gpuchip,time_go,manufacturer_3dfx,manufacturer_AMD,manufacturer_ATI,manufacturer_Intel,manufacturer_Matrox,manufacturer_NVIDIA,manufacturer_Sony,manufacturer_XGI
0,GeForce RTX 4050,2023.0,8.000000,128.0,1925,2250.0,3840.0,120,48,0.0,...,AD106,0.0,0,0,0,0,0,1,0,0
1,Arc A350M,2022.0,4.000000,64.0,300,1500.0,768.0,48,24,0.0,...,DG2-128,1.0,0,0,0,1,0,0,0,0
2,Arc A370M,2022.0,4.000000,64.0,300,1500.0,1024.0,64,32,0.0,...,DG2-128,1.0,0,0,0,1,0,0,0,0
3,Arc A380,2022.0,4.000000,64.0,300,1500.0,1024.0,64,32,0.0,...,DG2-128,1.0,0,0,0,1,0,0,0,0
4,Arc A550M,2022.0,8.000000,128.0,300,1500.0,2048.0,128,64,0.0,...,DG2-512,1.0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2840,EGA Wonder 800,1987.0,0.000256,32.0,10,8.0,0.0,0,1,1.0,...,16899-0,36.0,0,0,1,0,0,0,0,0
2841,Graphics Solution Plus,1987.0,0.000064,32.0,10,5.0,0.0,0,0,0.0,...,CW16800-B,36.0,0,0,1,0,0,0,0,0
2842,VGA Improved Performance,1987.0,0.000256,32.0,10,10.0,0.0,0,1,1.0,...,16899-0,36.0,0,0,1,0,0,0,0,0
2843,Color Emulation Card,1986.0,0.000032,32.0,10,5.0,0.0,0,0,0.0,...,CW16800-A,37.0,0,0,1,0,0,0,0,0


In [45]:
# NOTE(review): consider moving this import to the import cell at the top.
from sklearn.preprocessing import MinMaxScaler

# Min-max scale `gpuclock` into a new column; the original column is kept.
# The scaler needs a 2-D input, hence the single-column frame.
gpuclock_frame = new_data[['gpuclock']]
scaler = MinMaxScaler().fit(gpuclock_frame)
new_data['gpuclock_scaled'] = scaler.transform(gpuclock_frame)

In [46]:
# Display the frame again, now with the gpuclock_scaled column appended.
new_data

Unnamed: 0,productname,releaseyear,memsize,membuswidth,gpuclock,memclock,unifiedshader,tmu,rop,pixelshader,...,time_go,manufacturer_3dfx,manufacturer_AMD,manufacturer_ATI,manufacturer_Intel,manufacturer_Matrox,manufacturer_NVIDIA,manufacturer_Sony,manufacturer_XGI,gpuclock_scaled
0,GeForce RTX 4050,2023.0,8.000000,128.0,1925,2250.0,3840.0,120,48,0.0,...,0.0,0,0,0,0,0,1,0,0,0.825075
1,Arc A350M,2022.0,4.000000,64.0,300,1500.0,768.0,48,24,0.0,...,1.0,0,0,0,1,0,0,0,0,0.124946
2,Arc A370M,2022.0,4.000000,64.0,300,1500.0,1024.0,64,32,0.0,...,1.0,0,0,0,1,0,0,0,0,0.124946
3,Arc A380,2022.0,4.000000,64.0,300,1500.0,1024.0,64,32,0.0,...,1.0,0,0,0,1,0,0,0,0,0.124946
4,Arc A550M,2022.0,8.000000,128.0,300,1500.0,2048.0,128,64,0.0,...,1.0,0,0,0,1,0,0,0,0,0.124946
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2840,EGA Wonder 800,1987.0,0.000256,32.0,10,8.0,0.0,0,1,1.0,...,36.0,0,0,1,0,0,0,0,0,0.000000
2841,Graphics Solution Plus,1987.0,0.000064,32.0,10,5.0,0.0,0,0,0.0,...,36.0,0,0,1,0,0,0,0,0,0.000000
2842,VGA Improved Performance,1987.0,0.000256,32.0,10,10.0,0.0,0,1,1.0,...,36.0,0,0,1,0,0,0,0,0,0.000000
2843,Color Emulation Card,1986.0,0.000032,32.0,10,5.0,0.0,0,0,0.0,...,37.0,0,0,1,0,0,0,0,0,0.000000


In [47]:
# NOTE(review): consider moving this import to the import cell at the top.
from sklearn.preprocessing import StandardScaler

# Standardise `memclock` into a new column; the original column is kept.
# NOTE(review): this rebinds `scaler`, so the MinMaxScaler fitted on gpuclock in
# the earlier cell is no longer reachable under that name. Use distinct names
# if either scaler is needed again (e.g. for inverse_transform).
# NOTE(review): memclock contains zeros written by the earlier fillna(0);
# they take part in the statistics fitted here — confirm that is intended.
memclock_frame = new_data[['memclock']]
scaler = StandardScaler().fit(memclock_frame)
new_data['memclock_scaled'] = scaler.transform(memclock_frame)

In [48]:
# Display the frame again, now with the memclock_scaled column appended.
new_data

Unnamed: 0,productname,releaseyear,memsize,membuswidth,gpuclock,memclock,unifiedshader,tmu,rop,pixelshader,...,manufacturer_3dfx,manufacturer_AMD,manufacturer_ATI,manufacturer_Intel,manufacturer_Matrox,manufacturer_NVIDIA,manufacturer_Sony,manufacturer_XGI,gpuclock_scaled,memclock_scaled
0,GeForce RTX 4050,2023.0,8.000000,128.0,1925,2250.0,3840.0,120,48,0.0,...,0,0,0,0,0,1,0,0,0.825075,2.684124
1,Arc A350M,2022.0,4.000000,64.0,300,1500.0,768.0,48,24,0.0,...,0,0,0,1,0,0,0,0,0.124946,1.341579
2,Arc A370M,2022.0,4.000000,64.0,300,1500.0,1024.0,64,32,0.0,...,0,0,0,1,0,0,0,0,0.124946,1.341579
3,Arc A380,2022.0,4.000000,64.0,300,1500.0,1024.0,64,32,0.0,...,0,0,0,1,0,0,0,0,0.124946,1.341579
4,Arc A550M,2022.0,8.000000,128.0,300,1500.0,2048.0,128,64,0.0,...,0,0,0,1,0,0,0,0,0.124946,1.341579
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2840,EGA Wonder 800,1987.0,0.000256,32.0,10,8.0,0.0,0,1,1.0,...,0,0,1,0,0,0,0,0,0.000000,-1.329190
2841,Graphics Solution Plus,1987.0,0.000064,32.0,10,5.0,0.0,0,0,0.0,...,0,0,1,0,0,0,0,0,0.000000,-1.334561
2842,VGA Improved Performance,1987.0,0.000256,32.0,10,10.0,0.0,0,1,1.0,...,0,0,1,0,0,0,0,0,0.000000,-1.325610
2843,Color Emulation Card,1986.0,0.000032,32.0,10,5.0,0.0,0,0,0.0,...,0,0,1,0,0,0,0,0,0.000000,-1.334561
