In [1]:
import pandas as pd
import re
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

In [2]:
code_flops_df = pd.read_csv("../../data/raw_data/codename_to_flops.csv")
raw_list = pd.read_json("../../data/raw_data/final_cpu_list.json")
raw_list = raw_list[raw_list['Market:'] == 'Desktop']

In [3]:
new_list = raw_list[['name','brand','Released:', 'Generation:','Codename:','Frequency:','# of Cores:']].dropna()

In [4]:
from dateutil.parser import parse
def cpu_date_parser(df):
    """cleans the gpu dataframe removing entries without dates and converting the date column to date format"""
    df['Released:'] = df[df['Released:'].isin(["Unknown","Never Released"]) == False]['Released:']
    df = df.dropna().copy()
    df['Released:'] = df["Released:"].apply(parse, fuzzy=True)
    return df

def prefixed_number_to_int(cell):
    """used to convert texture rate with letter (100M) to int"""
    val,mod = cell.split()
    mod_val = {"G":1,"M":1/1000}[mod[0]]
    return pd.to_numeric(val)*mod_val

def generation_scrubber(text):
    "removes the newline character and all following charaters from text"
    out = re.search(r"(.*)\n", text)
    return out.groups(0)[0]

In [5]:
new_list.columns

Index(['name', 'brand', 'Released:', 'Generation:', 'Codename:', 'Frequency:',
       '# of Cores:'],
      dtype='object')

In [6]:
raw_list

Unnamed: 0,name,brand,link,Socket:,Foundry:,Process Size:,Transistors:,Die Size:,Package:,tCaseMax:,Frequency:,Turbo Clock:,Base Clock:,Multiplier:,Multiplier Unlocked:,Voltage:,TDP:,Market:,Production Status:,Released:,Codename:,Generation:,Part#:,Memory Support:,# of Cores:,# of Threads:,SMP # CPUs:,Integrated Graphics:,Cache L1:,Cache L2:,Cache L3:
0,Pentium 4 1.4,Intel,https://www.techpowerup.com/cpudb/296/pentium-...,Intel Socket 423,Intel,180 nm,42 million,217 mm²,µPGA,72°C,1400 MHz,,100 MHz,14.0x,No,1.75 V,55 W,Desktop,End-of-life,Nov 2000,Willamette,Pentium 4\n(Willamette),SL4SGSL4SCSL4X2SL4WS,unknown,1,1,1,,8K,256K,
1,Pentium 4 1.5,Intel,https://www.techpowerup.com/cpudb/297/pentium-...,Intel Socket 423,Intel,180 nm,42 million,217 mm²,µPGA,73°C,1500 MHz,,100 MHz,15.0x,No,1.75 V,58 W,Desktop,End-of-life,Nov 2000,Willamette,Pentium 4\n(Willamette),SL4TYSL4WTSL4SHSL5TNSL5SXSL4X3,unknown,1,1,1,,8K,256K,
2,Pentium III 1133,Intel,https://www.techpowerup.com/cpudb/1290/pentium...,Intel Socket 370,Intel,180 nm,44 million,80 mm²,µPGA,69°C,1133 MHz,,133 MHz,8.5x,No,1.75 V,29 W,Desktop,End-of-life,Jul 2000,Coppermine,Pentium III\n(Coppermine),SL5B2SL4YV,unknown,1,1,1,,8K,256K,
3,Pentium 4 1.3,Intel,https://www.techpowerup.com/cpudb/1269/pentium...,Intel Socket 478,Intel,180 nm,42 million,217 mm²,µPGA,70°C,1300 MHz,,100 MHz,13.0x,No,1.75 V,52 W,Desktop,End-of-life,Jan 2001,Willamette,Pentium 4\n(Willamette),SL5FWSL4SFSL4QDSL5GC,"DDR1, DDR2",1,1,1,,8K,256K,
4,Pentium 4 1.4,Intel,https://www.techpowerup.com/cpudb/1244/pentium...,Intel Socket 478,Intel,180 nm,42 million,217 mm²,µPGA,72°C,1400 MHz,,100 MHz,14.0x,No,1.75 V,55 W,Desktop,End-of-life,Aug 2001,Willamette,Pentium 4\n(Willamette),SL5N7SL59USL5UESL5TG,"DDR1, DDR2",1,1,1,,8K,256K,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1971,Ryzen 9 3900X,AMD,https://www.techpowerup.com/cpudb/2128/ryzen-9...,AMD Socket AM4,TSMC,7 nm,"19,200 million",unknown,,unknown,3.8 GHz,up to 4.6 GHz,100 MHz,38.0x,Yes,variable,125 W,Desktop,Active,Jul 2019,Matisse,Ryzen 9\n(Zen 2 (Matisse)),unknown,DDR4,12,24,1,,96K (per core),512K (per core),64MB
1972,Ryzen 9 3950X,AMD,https://www.techpowerup.com/cpudb/2203/ryzen-9...,AMD Socket AM4,TSMC,7 nm,"19,200 million",unknown,,unknown,3.5 GHz,up to 4.7 GHz,100 MHz,35.0x,Yes,variable,105 W,Desktop,Active,Nov 2019,Matisse,Ryzen 9\n(Zen 2 (Matisse)),unknown,DDR4,16,32,1,,96K (per core),512K (per core),64MB
1973,Ryzen Threadripper 3960X,AMD,https://www.techpowerup.com/cpudb/2268/ryzen-t...,AMD Socket TR4,TSMC,7 nm,"19,200 million",unknown,sTRX4,unknown,3.8 GHz,up to 4.5 GHz,100 MHz,38.0x,Yes,variable,280 W,Desktop,Active,Nov 2019,Matisse,Ryzen Threadripper\n(Zen 2 (Matisse)),unknown,DDR4,24,48,1,,96K (per core),512K (per core),128MB
1974,Ryzen Threadripper 3970X,AMD,https://www.techpowerup.com/cpudb/2269/ryzen-t...,AMD Socket TR4,TSMC,7 nm,"19,200 million",unknown,sTRX4,unknown,3.7 GHz,up to 4.5 GHz,100 MHz,37.0x,Yes,variable,280 W,Desktop,Active,Nov 2019,Matisse,Ryzen Threadripper\n(Zen 2 (Matisse)),unknown,DDR4,32,64,1,,96K (per core),512K (per core),128MB


In [7]:
df = cpu_date_parser(new_list)
df = df.set_index('Released:')
df.columns = ["name",'brand','generation',"codename",'frequency','num_cores']

In [8]:
df['frequency'] = df['frequency'].apply(prefixed_number_to_int)
df['generation'] = df['generation'].apply(generation_scrubber)

In [9]:
len(df)

1025

In [10]:
df = df.merge(code_flops_df[['Code name',"dp_flops", 'sp_flops']], left_on='codename', right_on='Code name')

In [11]:
df['gflops'] = df['frequency']*df['num_cores']*df['sp_flops']

In [12]:
df.to_pickle("../../data/preprocessed_data/final_cpu_df.pkl")

In [13]:
p = pd.read_pickle("../../data/preprocessed_data/final_cpu_df.pkl")

In [14]:
p

Unnamed: 0,name,brand,generation,codename,frequency,num_cores,Code name,dp_flops,sp_flops,gflops
0,Pentium 4 1.4,Intel,Pentium 4,Willamette,1.4,1,Willamette,2.0,4.0,5.6
1,Pentium 4 1.5,Intel,Pentium 4,Willamette,1.5,1,Willamette,2.0,4.0,6.0
2,Pentium 4 1.3,Intel,Pentium 4,Willamette,1.3,1,Willamette,2.0,4.0,5.2
3,Pentium 4 1.4,Intel,Pentium 4,Willamette,1.4,1,Willamette,2.0,4.0,5.6
4,Pentium 4 1.5,Intel,Pentium 4,Willamette,1.5,1,Willamette,2.0,4.0,6.0
...,...,...,...,...,...,...,...,...,...,...
1020,Ryzen 9 3900X,AMD,Ryzen 9,Matisse,3.8,12,Matisse,8.0,16.0,729.6
1021,Ryzen 9 3950X,AMD,Ryzen 9,Matisse,3.5,16,Matisse,8.0,16.0,896.0
1022,Ryzen Threadripper 3960X,AMD,Ryzen Threadripper,Matisse,3.8,24,Matisse,8.0,16.0,1459.2
1023,Ryzen Threadripper 3970X,AMD,Ryzen Threadripper,Matisse,3.7,32,Matisse,8.0,16.0,1894.4
