In [1]:
import pandas as pd
import re
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

In [2]:
code_flops_df = pd.read_csv("../../data/raw_data/codename_to_flops.csv")
raw_list = pd.read_json("../../data/raw_data/final_cpu_list.json")
raw_list = raw_list[raw_list['Market:'] == 'Desktop']

In [3]:
new_list = raw_list[['name','brand','Released:', 'Generation:','Codename:','Frequency:','# of Cores:']].dropna()

In [4]:
from dateutil.parser import parse
def cpu_date_parser(df):
    """cleans the gpu dataframe removing entries without dates and converting the date column to date format"""
    df['Released:'] = df[df['Released:'].isin(["Unknown","Never Released"]) == False]['Released:']
    df = df.dropna().copy()
    df['Released:'] = df["Released:"].apply(parse, fuzzy=True)
    return df

def prefixed_number_to_int(cell):
    """used to convert texture rate with letter (100M) to int"""
    val,mod = cell.split()
    mod_val = {"G":1,"M":1/1000}[mod[0]]
    return pd.to_numeric(val)*mod_val

def generation_scrubber(text):
    "removes the newline character and all following charaters from text"
    out = re.search(r"(.*)\n", text)
    return out.groups(0)[0]

In [5]:
df = cpu_date_parser(new_list)
df.columns = ["name",'brand','released','generation',"codename",'frequency','num_cores']

In [6]:
df['frequency'] = df['frequency'].apply(prefixed_number_to_int)
df['generation'] = df['generation'].apply(generation_scrubber)

In [7]:
df = df.merge(code_flops_df[['Code name',"dp_flops", 'sp_flops']], left_on='codename', right_on='Code name')

In [8]:
df['gflops'] = df['frequency']*df['num_cores']*df['sp_flops']

In [9]:
df = df.set_index('released')

In [10]:
df.to_pickle("../../data/preprocessed_data/final_cpu_df.pkl")

In [11]:
p = pd.read_pickle("../../data/preprocessed_data/final_cpu_df.pkl")

In [12]:
p

Unnamed: 0_level_0,name,brand,generation,codename,frequency,num_cores,Code name,dp_flops,sp_flops,gflops
released,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2000-11-14,Pentium 4 1.4,Intel,Pentium 4,Willamette,1.4,1,Willamette,2.0,4.0,5.6
2000-11-14,Pentium 4 1.5,Intel,Pentium 4,Willamette,1.5,1,Willamette,2.0,4.0,6.0
2001-01-14,Pentium 4 1.3,Intel,Pentium 4,Willamette,1.3,1,Willamette,2.0,4.0,5.2
2001-08-14,Pentium 4 1.4,Intel,Pentium 4,Willamette,1.4,1,Willamette,2.0,4.0,5.6
2001-08-14,Pentium 4 1.5,Intel,Pentium 4,Willamette,1.5,1,Willamette,2.0,4.0,6.0
...,...,...,...,...,...,...,...,...,...,...
2019-07-14,Ryzen 9 3900X,AMD,Ryzen 9,Matisse,3.8,12,Matisse,8.0,16.0,729.6
2019-11-14,Ryzen 9 3950X,AMD,Ryzen 9,Matisse,3.5,16,Matisse,8.0,16.0,896.0
2019-11-14,Ryzen Threadripper 3960X,AMD,Ryzen Threadripper,Matisse,3.8,24,Matisse,8.0,16.0,1459.2
2019-11-14,Ryzen Threadripper 3970X,AMD,Ryzen Threadripper,Matisse,3.7,32,Matisse,8.0,16.0,1894.4
