## CDS 301

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [5]:
def read_csv(file_path):
    """Load a CSV file into a pandas DataFrame.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.
    """
    # Open through a context manager so the handle is closed as soon as
    # parsing finishes; a bare open() passed straight to pandas is never
    # closed explicitly and leaks the file descriptor.
    with open(file_path) as csv_file:
        return pd.read_csv(csv_file)

# Locations of the three raw GPU datasets, relative to the notebook.
gpu_specs_path, gpu_scores_path, gpu_benchmark_path = (
    "./data/GPU_Specs.csv",
    "./data/GPU_Scores.csv",
    "./data/GPU_Benchmarks.csv",
)

# Load each dataset, in the same order, into its own DataFrame.
df_specs, df_scores, df_benchmarks = map(
    read_csv, (gpu_specs_path, gpu_scores_path, gpu_benchmark_path)
)

## Data Previewing

In [6]:
### SIZE OF DATAFRAMES ###
def preview_size(idx, dataframe):
    """Print ``idx``, a dash, and ``dataframe.shape`` on one line.

    Parameters
    ----------
    idx : object
        Label printed in front of the shape.
    dataframe : object with a ``shape`` attribute
        Frame whose dimensions are reported.
    """
    pieces = (idx, "-", dataframe.shape)
    # Space-separated string forms, matching print()'s default formatting.
    print(" ".join(str(piece) for piece in pieces))

# Report the dimensions of each loaded table, one line per table.
for label, frame in (
    ("Specs", df_specs),
    ("Scores", df_scores),
    ("Benchmarks", df_benchmarks),
):
    preview_size(label, frame)

Specs - (3056, 16)
Scores - (1213, 6)
Benchmarks - (2317, 9)


In [10]:
### DATA DICTIONARY ###
def data_dictionary(dataframe):
    """Summarise the columns of ``dataframe`` as a two-column table.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns should be described.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``dataframe`` (in column order), where
        ``"Column"`` holds the column label converted to ``str`` and
        ``"Type"`` holds that column's dtype.
    """
    # Build the table in a single construction instead of appending one row
    # at a time, and take dtypes positionally from ``dataframe.dtypes`` so
    # that repeated column labels do not break the per-column lookup.
    return pd.DataFrame(
        {
            "Column": [str(label) for label in dataframe.columns],
            "Type": list(dataframe.dtypes),
        }
    )


In [11]:
# Column names and dtypes of the specs table.
data_dictionary(df_specs)

Unnamed: 0,Column,Type
0,manufacturer,object
1,productName,object
2,releaseYear,float64
3,memSize,float64
4,memBusWidth,float64
5,gpuClock,int64
6,memClock,float64
7,unifiedShader,float64
8,tmu,int64
9,rop,int64


In [12]:
# Column names and dtypes of the scores table.
data_dictionary(df_scores)

Unnamed: 0,Column,Type
0,Manufacturer,object
1,Device,object
2,CUDA,float64
3,Metal,float64
4,OpenCL,float64
5,Vulkan,float64


In [13]:
# Column names and dtypes of the benchmarks table.
data_dictionary(df_benchmarks)

Unnamed: 0,Column,Type
0,gpuName,object
1,G3Dmark,int64
2,G2Dmark,int64
3,price,float64
4,gpuValue,float64
5,TDP,float64
6,powerPerformance,float64
7,testDate,int64
8,category,object


## Data Cleaning

In [31]:
### CHANGING COLUMN NAMES ###
# Both product-name columns become "Device", the key the tables are joined on.
df_specs = df_specs.rename({"productName": "Device"}, axis="columns")
df_benchmarks = df_benchmarks.rename({"gpuName": "Device"}, axis="columns")

In [15]:
### CLEANING GPU_Specs.csv ###
# Number of missing values in each column of the specs table.
df_specs.isna().sum()

manufacturer        0
productName         0
releaseYear        44
memSize           441
memBusWidth      2868
gpuClock            0
memClock          441
unifiedShader     824
tmu                 0
rop                 0
pixelShader      2232
vertexShader     2232
igp                 0
bus                 0
memType           441
gpuChip             0
dtype: int64

In [32]:
# Peek at the first rows of the benchmarks table.
df_benchmarks.head()

Unnamed: 0,Device,G3Dmark,G2Dmark,price,gpuValue,TDP,powerPerformance,testDate,category
0,GeForce RTX 3090 Ti,29094,1117,2099.99,13.85,450.0,64.65,2022,Unknown
1,GeForce RTX 3080 Ti,26887,1031,1199.99,22.41,350.0,76.82,2021,Desktop
2,GeForce RTX 3090,26395,999,1749.99,15.08,350.0,75.41,2020,Desktop
3,Radeon RX 6900 XT,25458,1102,1120.31,22.72,300.0,84.86,2020,Desktop
4,GeForce RTX 3080,24853,1003,999.0,24.88,320.0,77.66,2020,Desktop


In [17]:
# Peek at the first rows of the scores table.
df_scores.head()

Unnamed: 0,Manufacturer,Device,CUDA,Metal,OpenCL,Vulkan
0,Nvidia,GeForce RTX 3090 Ti,260346.0,,229738.0,141134.0
1,Nvidia,A100 80GB PCIe,259828.0,,214586.0,
2,Nvidia,A100-PCIE-80GB,256292.0,,207124.0,
3,Nvidia,GeForce RTX 3090,238123.0,,204921.0,138859.0
4,Nvidia,A100-SXM4-40GB,237220.0,,190489.0,


In [33]:
# Left-join scores and then benchmarks onto the specs table by "Device",
# keeping every specs row even when no score or benchmark matches.
df_joined = (
    df_specs
    .merge(df_scores, how="left", on="Device")
    .merge(df_benchmarks, how="left", on="Device")
)

In [34]:
# Drop rows that are exact duplicates across all columns.
df_joined = df_joined.drop_duplicates()

In [35]:
# Sanity check: count the fully duplicated rows still present in the joined table.
df_joined.duplicated().sum()

np.int64(0)

In [36]:
# Dimensions (rows, columns) of the joined table.
df_joined.shape

(3037, 29)

In [37]:
# Write the joined table to "joined_data.csv" without the index column.
df_joined.to_csv("joined_data.csv", index=False)