## CDS 301

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
def read_csv(file_path):
    """Load the CSV file at ``file_path`` into a pandas DataFrame.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.
    """
    # pandas does not close a handle it did not open itself, so use a context
    # manager to make sure the file is released once parsing is done.
    with open(file_path) as csv_handle:
        return pd.read_csv(csv_handle)

# Relative locations of the three raw CSV tables.
gpu_specs_path, gpu_scores_path, gpu_benchmark_path = (
    "./data/GPU_Specs.csv",
    "./data/GPU_Scores.csv",
    "./data/GPU_Benchmarks.csv",
)

# Load each table, in the same order as the paths above, into its own DataFrame.
df_specs, df_scores, df_benchmarks = (
    read_csv(csv_path)
    for csv_path in (gpu_specs_path, gpu_scores_path, gpu_benchmark_path)
)

## Data Previewing

In [3]:
### SIZE OF DATAFRAMES ###
def preview_size(idx, dataframe):
    """Print ``idx`` and ``dataframe.shape`` on one line, separated by " - "."""
    pieces = (idx, "-", dataframe.shape)
    print(" ".join(str(piece) for piece in pieces))

# Report the shape of every loaded table, one line per table.
for table_label, table_frame in (
    ("Specs", df_specs),
    ("Scores", df_scores),
    ("Benchmarks", df_benchmarks),
):
    preview_size(table_label, table_frame)

Specs - (3056, 16)
Scores - (1213, 6)
Benchmarks - (2317, 9)


In [4]:
### DATA DICTIONARY ###
def data_dictionary(dataframe):
    """Summarise the dtype and missing-value counts of every column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        The table to describe. Duplicate column labels are supported; each
        occurrence gets its own row.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``dataframe``, in column order, with:

        - ``"Column"``: the column label converted to ``str``
        - ``"Type"``: the column's dtype
        - ``"Missing Data"``: number of missing (NaN/None/NaT) entries
        - ``"Percentage Missing"``: missing entries as a percentage of the
          row count, rounded to two decimals (NaN when there are no rows)
    """
    # Compute the per-column counts once for the whole frame instead of
    # re-scanning each column and appending result rows one at a time.
    missing_counts = dataframe.isna().sum()
    missing_percentages = (missing_counts / len(dataframe) * 100).round(2)

    # Build from plain lists so the result gets a fresh 0..n-1 index and
    # duplicate column labels in the input cannot cause alignment problems.
    return pd.DataFrame({
        "Column": [str(label) for label in dataframe.columns],
        "Type": dataframe.dtypes.tolist(),
        "Missing Data": missing_counts.tolist(),
        "Percentage Missing": missing_percentages.tolist(),
    })


In [5]:
# Per-column dtype and missing-value summary of the raw specs table.
data_dictionary(df_specs)

Unnamed: 0,Column,Type,Missing Data,Percentage Missing
0,manufacturer,object,0,0.0
1,productName,object,0,0.0
2,releaseYear,float64,44,1.44
3,memSize,float64,441,14.43
4,memBusWidth,float64,2868,93.85
5,gpuClock,int64,0,0.0
6,memClock,float64,441,14.43
7,unifiedShader,float64,824,26.96
8,tmu,int64,0,0.0
9,rop,int64,0,0.0


In [6]:
# Per-column dtype and missing-value summary of the raw scores table.
data_dictionary(df_scores)

Unnamed: 0,Column,Type,Missing Data,Percentage Missing
0,Manufacturer,object,0,0.0
1,Device,object,0,0.0
2,CUDA,float64,947,78.07
3,Metal,float64,972,80.13
4,OpenCL,float64,237,19.54
5,Vulkan,float64,584,48.15


In [7]:
# Per-column dtype and missing-value summary of the raw benchmarks table.
data_dictionary(df_benchmarks)

Unnamed: 0,Column,Type,Missing Data,Percentage Missing
0,gpuName,object,0,0.0
1,G3Dmark,int64,0,0.0
2,G2Dmark,int64,0,0.0
3,price,float64,1764,76.13
4,gpuValue,float64,1764,76.13
5,TDP,float64,1625,70.13
6,powerPerformance,float64,1625,70.13
7,testDate,int64,0,0.0
8,category,object,0,0.0


In [8]:
# Peek at the first rows of the raw specs table before any cleaning.
df_specs.head()

Unnamed: 0,manufacturer,productName,releaseYear,memSize,memBusWidth,gpuClock,memClock,unifiedShader,tmu,rop,pixelShader,vertexShader,igp,bus,memType,gpuChip
0,NVIDIA,GeForce RTX 5090,2025.0,28.0,448.0,900,1200.0,8192.0,256,128,,,No,PCIe 4.0 x16,HBM2e,Arctic Sound
1,NVIDIA,GeForce RTX 5080,2025.0,16.0,256.0,900,1215.0,6912.0,432,192,,,No,PCIe 4.0 x16,HBM2e,GA100
2,NVIDIA,GeForce RTX 5070,2025.0,12.0,192.0,1825,2000.0,5120.0,320,128,,,No,PCIe 4.0 x16,GDDR6,Navi 21
3,NVIDIA,GeForce RTX 5060 Mobile,2025.0,8.0,128.0,2235,2500.0,4608.0,144,48,,,No,PCIe 5.0 x16,GDDR7,GB206
4,NVIDIA,GeForce RTX 5060,2025.0,8.0,128.0,1825,2250.0,4608.0,288,192,,,No,PCIe 4.0 x16,GDDR6,Navi 31


## Data Cleaning

In [79]:
### Cleaning df_specs ###
def _fill_missing_with_year_median(frame, column):
    """Return ``frame[column]`` with NaNs replaced by the median of the rows
    that share the same ``release_year``.

    A release-year group with no observed values has a NaN median, so the
    rows of such a group are left as NaN.
    """
    return frame.groupby("release_year")[column].transform(
        lambda values: values.fillna(values.median())
    )


# Build the cleaned table in a single chain from the untouched raw frame, so
# re-running this cell always gives the same result.
df_specs_final = (
    df_specs
    # memBusWidth is ~94% missing according to the raw data dictionary;
    # pixelShader/vertexShader are dropped as well.
    .drop(columns=["memBusWidth", "pixelShader", "vertexShader"])
    .rename(columns={
        "productName": "product_name",
        "releaseYear": "release_year",
        "memSize": "memory_size",
        "gpuClock": "gpu_clock",
        "memClock": "memory_clock",
        "unifiedShader": "unified_shader",
        "tmu": "texture_mapping_units",
        "rop": "render_output_units",
        "igp": "integrated_graphics_processor",
        "memType": "memory_type",
        "gpuChip": "gpu_chip",
    })
    # Rows without a release year cannot be assigned to a per-year group for
    # the median imputation below, so they are removed first.
    .dropna(subset=["release_year"])
    .assign(
        # Compare against "Yes" directly. Mapping to 1/0 and casting to bool
        # would turn any missing or unexpected value into NaN and then into
        # True; with eq() only an explicit "Yes" becomes True.
        integrated_graphics_processor=lambda frame: frame["integrated_graphics_processor"].eq("Yes"),
        memory_size=lambda frame: _fill_missing_with_year_median(frame, "memory_size"),
        memory_clock=lambda frame: _fill_missing_with_year_median(frame, "memory_clock"),
        # A constant fill does not depend on the release year, so no groupby
        # is needed. NOTE(review): presumably a missing value means the GPU
        # has no unified shaders - confirm against the data source.
        unified_shader=lambda frame: frame["unified_shader"].fillna(0),
    )
)
data_dictionary(df_specs_final)

Unnamed: 0,Column,Type,Missing Data,Percentage Missing
0,manufacturer,object,0,0.0
1,product_name,object,0,0.0
2,release_year,float64,0,0.0
3,memory_size,float64,0,0.0
4,gpu_clock,int64,0,0.0
5,memory_clock,float64,0,0.0
6,unified_shader,float64,0,0.0
7,texture_mapping_units,int64,0,0.0
8,render_output_units,int64,0,0.0
9,integrated_graphics_processor,bool,0,0.0


In [80]:
# Peek at the first rows of the cleaned specs table.
df_specs_final.head()

Unnamed: 0,manufacturer,product_name,release_year,memory_size,gpu_clock,memory_clock,unified_shader,texture_mapping_units,render_output_units,integrated_graphics_processor,bus,memory_type,gpu_chip
0,NVIDIA,GeForce RTX 5090,2025.0,28.0,900,1200.0,8192.0,256,128,False,PCIe 4.0 x16,HBM2e,Arctic Sound
1,NVIDIA,GeForce RTX 5080,2025.0,16.0,900,1215.0,6912.0,432,192,False,PCIe 4.0 x16,HBM2e,GA100
2,NVIDIA,GeForce RTX 5070,2025.0,12.0,1825,2000.0,5120.0,320,128,False,PCIe 4.0 x16,GDDR6,Navi 21
3,NVIDIA,GeForce RTX 5060 Mobile,2025.0,8.0,2235,2500.0,4608.0,144,48,False,PCIe 5.0 x16,GDDR7,GB206
4,NVIDIA,GeForce RTX 5060,2025.0,8.0,1825,2250.0,4608.0,288,192,False,PCIe 4.0 x16,GDDR6,Navi 31
