# Cleaning Data for ECO400 project

### Gabrielle Martinez
### Oct 15, 2020


In this notebook, I'll be using R to clean and merge my gathered datasets. The first is historical data on fuel prices by state from the US Energy Information Administration, discussed more [here](https://www.eia.gov/tools/faqs/faq.php?id=26&t=10). Prices are in dollars per million British thermal units ($/MMBtu) ([eia](https://www.eia.gov/tools/faqs/faq.php?id=26&t=10)). The second dataset is patents data I gathered from the PatentsView API ranging from 1976 to 1987. 

In [1]:
library(tidyverse)

"package 'tidyverse' was built under R version 3.6.3"-- Attaching packages ------------------------------------------------------------------------------- tidyverse 1.3.0 --
v ggplot2 3.3.2     v purrr   0.3.4
v tibble  3.0.4     v dplyr   1.0.2
v tidyr   1.1.2     v stringr 1.4.0
v readr   1.4.0     v forcats 0.5.0
"package 'forcats' was built under R version 3.6.3"-- Conflicts ---------------------------------------------------------------------------------- tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag()    masks stats::lag()


In [2]:
#set working dir
setwd("~/eco590_Data_Analysis_Python_R/Data Cleaning")

In [34]:
#load data
gas_by_state <- read.csv("1970_gasoline.csv")

#let's look at our data
gas_by_state %>% head(1)

Data_Status,State,MSN,X1970,X1971,X1972,X1973,X1974,X1975,X1976,...,X2009,X2010,X2011,X2012,X2013,X2014,X2015,X2016,X2017,X2018
2018F,AK,ARICD,0.57,0.81,0.82,0.85,1.74,1.8,1.62,...,13.37,13.56,15.73,17.66,16.96,16.33,13.93,10.61,10.5,12.86


In [35]:
#make a list of years to refrence col names when selecting
c(1970:1987)->c
paste(c) -> c
gsub('1','X1',c) %>% gsub('7X1','71',.) %>% gsub('8X1','81',.) ->c
c

In [36]:
#filter and cleaning
gas_by_state %>%
    select(State, MSN,all_of(c)) %>%
    filter(MSN=="MGACD") %>%
    filter(State!="US") -> gas_by_state

#num of rows = num of states?
nrow(gas_by_state) #yep, 51 including DC

#show new df
gas_by_state

State,MSN,X1970,X1971,X1972,X1973,X1974,X1975,X1976,X1977,X1978,X1979,X1980,X1981,X1982,X1983,X1984,X1985,X1986,X1987
AK,MGACD,3.18,3.25,3.21,3.46,4.76,5.15,5.36,5.69,6.09,7.59,10.2,11.58,11.35,10.46,10.07,9.83,7.86,7.9
AL,MGACD,2.82,2.85,2.95,3.08,4.14,4.26,4.7,5.12,5.27,7.04,9.89,11.24,10.61,9.07,8.98,9.15,6.99,7.27
AR,MGACD,2.74,2.84,2.77,2.96,4.09,4.6,4.82,5.15,5.3,7.06,9.93,11.04,10.39,8.82,8.55,8.8,6.84,7.28
AZ,MGACD,2.8,2.87,2.83,3.06,4.12,4.62,4.84,5.09,5.42,6.98,9.68,10.83,10.34,8.87,8.94,9.06,7.24,7.77
CA,MGACD,2.8,2.84,2.79,3.08,4.47,4.84,5.04,5.33,5.48,7.47,10.19,11.33,10.82,8.96,8.74,8.68,6.68,6.95
CO,MGACD,2.72,2.93,2.81,3.11,4.2,4.67,4.9,5.13,5.05,6.85,9.36,10.23,10.15,9.02,8.99,9.28,6.76,7.59
CT,MGACD,2.96,3.02,3.01,3.28,4.38,4.61,4.8,5.11,5.34,7.26,10.1,11.38,10.3,9.56,9.11,9.37,7.45,7.93
DC,MGACD,2.86,2.93,3.01,3.12,4.49,4.85,5.08,5.39,5.46,7.02,9.97,11.16,10.54,9.83,9.83,10.28,8.23,8.05
DE,MGACD,2.86,2.92,2.98,3.1,4.1,4.54,4.58,4.81,4.94,6.83,9.6,10.66,10.02,9.39,8.95,9.39,6.97,7.61
FL,MGACD,2.81,2.77,2.88,3.05,4.1,4.39,4.57,4.7,5.18,7.05,9.8,10.94,10.51,9.13,8.92,9.03,6.7,7.11


In [37]:
#lets get rid of those Xs
gsub('X','',names(gas_by_state)) -> names(gas_by_state)

#show new col names
head(gas_by_state,1)

State,MSN,1970,1971,1972,1973,1974,1975,1976,1977,1978,1979,1980,1981,1982,1983,1984,1985,1986,1987
AK,MGACD,3.18,3.25,3.21,3.46,4.76,5.15,5.36,5.69,6.09,7.59,10.2,11.58,11.35,10.46,10.07,9.83,7.86,7.9


In [38]:
#make sure there are no missing elements

gas_by_state %>% 
    filter(is.na(gas_by_state)==T) #no missing elements

#is.na(gas_by_state)

State,MSN,1970,1971,1972,1973,1974,1975,1976,1977,1978,1979,1980,1981,1982,1983,1984,1985,1986,1987


There are no empty elements.

Now we pivot the data to be long

In [39]:
#list of years
c(1970:1987)->c
#make ints to strings
paste(c) -> c

#pivot = make data long instead of wide
gas_by_state %>%
    select(!MSN) %>% #cut out MSN
    pivot_longer(names_to = "Years", values_to = "fuel_price", c) -> gas_by_state_longer

gas_by_state_longer

Note: Using an external vector in selections is ambiguous.
i Use `all_of(c)` instead of `c` to silence this message.
i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
This message is displayed once per session.


State,Years,fuel_price
AK,1970,3.18
AK,1971,3.25
AK,1972,3.21
AK,1973,3.46
AK,1974,4.76
AK,1975,5.15
AK,1976,5.36
AK,1977,5.69
AK,1978,6.09
AK,1979,7.59


In [40]:
#look at data types (important for merging later)
gas_by_state_longer %>%
    sapply(class)

#change Years type to int
as.integer(gas_by_state_longer$Years) -> gas_by_state_longer$Years
gas_by_state_longer %>%
    sapply(class)

## Adding Patents Data from API

In [10]:
#load patents data
total_patents <- read.csv("total_patents_1976to87.csv")
head(total_patents)

years,patent_number,uspc_subclass_id,assignee_organization,assignee_type,assignee_id,assignee_country,assignee_state,assignee_key_id,num_inventors,specialization
1976,3934417,60/516,Robertshaw Controls Company,2,org_WLFbXEQXv0uuNglpaujt,US,VA,405316,4,60/516
1976,3937017,60/516,Maschinenfabrik Augsburg-Nurnberg Aktiengesellschaft,3,org_d7Cgs8GkaSDEwmfFOL1Z,DT,,119492,3,60/516
1976,3953715,701/123,Societe des Procedes Modernes d'Injection Sopromi,3,org_Z8mWQfh54whRqC1GqzOM,FR,,443684,1,701/123
1976,3953716,701/123,Agence Nationale de Valorisation de la Recherche (ANVAR),7,org_7IZPx0no0y4uuAMGwkei,FR,,54873,1,701/123
1976,3956895,60/516,The United States of America as represented by the National Institute of Health,6,org_aaQSmBL5udG3Y90mfh5v,US,DC,76455,4,60/516
1976,3956898,60/698,"Combustion Engineering, Inc.",2,org_5jkJ96hMiw938qSgh5Mq,US,CT,40134,1,60/698


In [11]:
#drop assignee_id bc its not any different from using the assignee_key_id
total_patents %>% select(!assignee_id) -> total_patents
head(total_patents)

years,patent_number,uspc_subclass_id,assignee_organization,assignee_type,assignee_country,assignee_state,assignee_key_id,num_inventors,specialization
1976,3934417,60/516,Robertshaw Controls Company,2,US,VA,405316,4,60/516
1976,3937017,60/516,Maschinenfabrik Augsburg-Nurnberg Aktiengesellschaft,3,DT,,119492,3,60/516
1976,3953715,701/123,Societe des Procedes Modernes d'Injection Sopromi,3,FR,,443684,1,701/123
1976,3953716,701/123,Agence Nationale de Valorisation de la Recherche (ANVAR),7,FR,,54873,1,701/123
1976,3956895,60/516,The United States of America as represented by the National Institute of Health,6,US,DC,76455,4,60/516
1976,3956898,60/698,"Combustion Engineering, Inc.",2,US,CT,40134,1,60/698


In [12]:
#sort by years
total_patents %>%
    arrange(years,desc())
#dataset does include 1987, I was a bit worried about that

years,patent_number,uspc_subclass_id,assignee_organization,assignee_type,assignee_country,assignee_state,assignee_key_id,num_inventors,specialization
1976,3934417,60/516,Robertshaw Controls Company,2,US,VA,405316,4,60/516
1976,3937017,60/516,Maschinenfabrik Augsburg-Nurnberg Aktiengesellschaft,3,DT,,119492,3,60/516
1976,3953715,701/123,Societe des Procedes Modernes d'Injection Sopromi,3,FR,,443684,1,701/123
1976,3953716,701/123,Agence Nationale de Valorisation de la Recherche (ANVAR),7,FR,,54873,1,701/123
1976,3956895,60/516,The United States of America as represented by the National Institute of Health,6,US,DC,76455,4,60/516
1976,3956898,60/698,"Combustion Engineering, Inc.",2,US,CT,40134,1,60/698
1976,3958418,60/272,General Motors Corporation,2,US,MI,38960,2,701/123
1976,3964443,701/103,The Bendix Corporation,2,US,MI,47634,1,701/103
1976,3966362,60/698,"Airco, Inc.",2,US,NJ,297100,1,60/698
1976,3967097,701/123,General Motors Corporation,2,US,MI,38960,1,701/123


In [13]:
#get info on what type each column is
as.factor(total_patents$assignee_type) -> total_patents$assignee_type #change assignee_type from int to factor
total_patents %>%
    sapply(class)


In [14]:
#filter 180/subclasses 65.1 through 65.8

#make a list of strings
paste(c(1:8)) -> c

a=c()
for (i in c){
    a[i] <- paste('180/65.',i,sep="")
}
a #list of subclasses to search for

#filter
total_patents %>%
    filter(str_detect(uspc_subclass_id, paste(a, collapse = "|"))) -> subclasses_180

subclasses_180

#str_detect: https://stackoverflow.com/questions/22850026/filter-rows-which-contain-a-certain-string
#collapse: https://stackoverflow.com/questions/35962426/multiple-strings-with-str-detect-r
# | = or 

years,patent_number,uspc_subclass_id,assignee_organization,assignee_type,assignee_country,assignee_state,assignee_key_id,num_inventors,specialization
1976,3937293,180/65.6,Siemens Aktiengesellschaft,3,DE,,345236,1,180/65.6
1976,3943420,180/65.8,Crompton Electricars Ltd.,3,,,183565,1,180/65.8
1976,3943726,180/65.31,"Lawrence Peska Associates, Inc.",12,US,NY,333476,1,180/65.31
1976,3965971,180/65.8,EATON CORPORATION,2,US,OH,452507,1,180/65.8
1976,3970163,180/65.26,"Nissan Motor Co., Ltd.",3,JA,,43058,1,180/90
1976,3984742,180/65.1,Firma Deutsche Automobilgesellschaft mbH,3,JA,,63586,1,180/65.1
1976,3986095,180/65.1,The Japan Tobacco & Salt Public Corporation,3,JA,,404011,2,180/65.1
1977,4005759,180/65.1,Lucas Industries public limited company,3,EN,,175705,1,180/65.1
1977,4020916,180/65.1,"The Raymond Lee Organization, Inc.",12,US,NY,33742,1,180/65.1
1977,4021677,180/65.25,"Petro-Electric Motors, Ltd.",2,US,NY,451293,2,180/65.25


In [15]:
#drop all class 180 rows
total_patents %>%
    filter(!str_detect(uspc_subclass_id,'180')) %>% #-> pats_filtered
    bind_rows(subclasses_180,id=NULL) -> total_patents #add subclasses back in and save to total_patents

nrow(total_patents)
total_patents #work with this dataset from now on

years,patent_number,uspc_subclass_id,assignee_organization,assignee_type,assignee_country,assignee_state,assignee_key_id,num_inventors,specialization
1976,3934417,60/516,Robertshaw Controls Company,2,US,VA,405316,4,60/516
1976,3937017,60/516,Maschinenfabrik Augsburg-Nurnberg Aktiengesellschaft,3,DT,,119492,3,60/516
1976,3953715,701/123,Societe des Procedes Modernes d'Injection Sopromi,3,FR,,443684,1,701/123
1976,3953716,701/123,Agence Nationale de Valorisation de la Recherche (ANVAR),7,FR,,54873,1,701/123
1976,3956895,60/516,The United States of America as represented by the National Institute of Health,6,US,DC,76455,4,60/516
1976,3956898,60/698,"Combustion Engineering, Inc.",2,US,CT,40134,1,60/698
1976,3958418,60/272,General Motors Corporation,2,US,MI,38960,2,701/123
1976,3964443,701/103,The Bendix Corporation,2,US,MI,47634,1,701/103
1976,3966362,60/698,"Airco, Inc.",2,US,NJ,297100,1,60/698
1976,3967097,701/123,General Motors Corporation,2,US,MI,38960,1,701/123


In [16]:
#there's something a bit weird about the country codes JA and JP. Both are used for japan making dupes later. Let's fix that

#change type
as.character(total_patents$assignee_country) -> total_patents$assignee_country
sapply(total_patents,class)

#replace JA w JP
total_patents$assignee_country[total_patents$assignee_country == 'JA'] <- 'JP'

#check that there are no JAs left
total_patents %>%
    filter(assignee_country == 'JA') #no JAs yay!

#change type back to factor
as.factor(total_patents$assignee_country) -> total_patents$assignee_country
sapply(total_patents,class)

#show new df
total_patents #Nissan Motor Co., Ltd. used to be JA now its JP. Fixed!

#replace:https://www.programmingr.com/tutorial/how-to-%EF%BB%BFreplace-values-in-a-data-frame-in-r/

years,patent_number,uspc_subclass_id,assignee_organization,assignee_type,assignee_country,assignee_state,assignee_key_id,num_inventors,specialization


years,patent_number,uspc_subclass_id,assignee_organization,assignee_type,assignee_country,assignee_state,assignee_key_id,num_inventors,specialization
1976,3934417,60/516,Robertshaw Controls Company,2,US,VA,405316,4,60/516
1976,3937017,60/516,Maschinenfabrik Augsburg-Nurnberg Aktiengesellschaft,3,DT,,119492,3,60/516
1976,3953715,701/123,Societe des Procedes Modernes d'Injection Sopromi,3,FR,,443684,1,701/123
1976,3953716,701/123,Agence Nationale de Valorisation de la Recherche (ANVAR),7,FR,,54873,1,701/123
1976,3956895,60/516,The United States of America as represented by the National Institute of Health,6,US,DC,76455,4,60/516
1976,3956898,60/698,"Combustion Engineering, Inc.",2,US,CT,40134,1,60/698
1976,3958418,60/272,General Motors Corporation,2,US,MI,38960,2,701/123
1976,3964443,701/103,The Bendix Corporation,2,US,MI,47634,1,701/103
1976,3966362,60/698,"Airco, Inc.",2,US,NJ,297100,1,60/698
1976,3967097,701/123,General Motors Corporation,2,US,MI,38960,1,701/123


### Export new total_patents dataset

In [29]:
write.csv(total_patents, file="total_patents_filtered_1976.csv",
          row.names=FALSE)

### Fixing Country Codes
Let's just use the already filtered dataset and rewrite it to the same csv (bc I'm lazy)

In [21]:
#read in new filtered dataset - no need to rerun all the previous stuff
total_patents <- read.csv("total_patents_filtered_1976.csv")
head(total_patents)

years,patent_number,uspc_subclass_id,assignee_organization,assignee_type,assignee_country,assignee_state,assignee_key_id,num_inventors,specialization
1976,3934417,60/516,Robertshaw Controls Company,2,US,VA,405316,4,60/516
1976,3937017,60/516,Maschinenfabrik Augsburg-Nurnberg Aktiengesellschaft,3,DT,,119492,3,60/516
1976,3953715,701/123,Societe des Procedes Modernes d'Injection Sopromi,3,FR,,443684,1,701/123
1976,3953716,701/123,Agence Nationale de Valorisation de la Recherche (ANVAR),7,FR,,54873,1,701/123
1976,3956895,60/516,The United States of America as represented by the National Institute of Health,6,US,DC,76455,4,60/516
1976,3956898,60/698,"Combustion Engineering, Inc.",2,US,CT,40134,1,60/698


In [26]:
#UGH DE & DT are both Germany
#SW & SE are both Sweden
#EN & GB are both England/UK

#change type
as.character(total_patents$assignee_country) -> total_patents$assignee_country
sapply(total_patents,class)

#replace DE w DT
total_patents$assignee_country[total_patents$assignee_country == 'DT'] <- 'DE'

#check that there are no JAs left
total_patents %>%
    filter(assignee_country == 'DT') #no DTs yay!


#replace EN w GB
total_patents$assignee_country[total_patents$assignee_country == 'EN'] <- 'GB'

#check that there are no JAs left
total_patents %>%
    filter(assignee_country == 'EN') #no ENs yay!


#replace SW w SE new ISO uses SE
total_patents$assignee_country[total_patents$assignee_country == 'SW'] <- 'SE'

#check that there are no JAs left
total_patents %>%
    filter(assignee_country == 'SW') #no SWs yay!

#change type back to factor
as.factor(total_patents$assignee_country) -> total_patents$assignee_country
sapply(total_patents,class)

#show new df
total_patents


years,patent_number,uspc_subclass_id,assignee_organization,assignee_type,assignee_country,assignee_state,assignee_key_id,num_inventors,specialization


years,patent_number,uspc_subclass_id,assignee_organization,assignee_type,assignee_country,assignee_state,assignee_key_id,num_inventors,specialization


years,patent_number,uspc_subclass_id,assignee_organization,assignee_type,assignee_country,assignee_state,assignee_key_id,num_inventors,specialization


years,patent_number,uspc_subclass_id,assignee_organization,assignee_type,assignee_country,assignee_state,assignee_key_id,num_inventors,specialization
1976,3934417,60/516,Robertshaw Controls Company,2,US,VA,405316,4,60/516
1976,3937017,60/516,Maschinenfabrik Augsburg-Nurnberg Aktiengesellschaft,3,DE,,119492,3,60/516
1976,3953715,701/123,Societe des Procedes Modernes d'Injection Sopromi,3,FR,,443684,1,701/123
1976,3953716,701/123,Agence Nationale de Valorisation de la Recherche (ANVAR),7,FR,,54873,1,701/123
1976,3956895,60/516,The United States of America as represented by the National Institute of Health,6,US,DC,76455,4,60/516
1976,3956898,60/698,"Combustion Engineering, Inc.",2,US,CT,40134,1,60/698
1976,3958418,60/272,General Motors Corporation,2,US,MI,38960,2,701/123
1976,3964443,701/103,The Bendix Corporation,2,US,MI,47634,1,701/103
1976,3966362,60/698,"Airco, Inc.",2,US,NJ,297100,1,60/698
1976,3967097,701/123,General Motors Corporation,2,US,MI,38960,1,701/123


### Export Again

In [27]:
# overwrite old w new df w corrected country codes
write.csv(total_patents, file="total_patents_filtered_1976.csv",
          row.names=FALSE)

## Making new variables and collapsing to firm level data 
Instead of patent level data, I want to group by firm to get firm level data. I'll need new variables for this:
- number of inventors per firm per year    
- number of patents per firm per year   
- specialization of firm per year   

In [28]:
#groupby to find num of inventors per year for each firm
total_patents %>%
    group_by(assignee_key_id,years) %>%
    summarize(number_of_inventors = sum(num_inventors)) -> num_inventors

#number of rows should be the same as num_pats_by_year
nrow(num_inventors)

#show df
num_inventors

`summarise()` regrouping output by 'assignee_key_id' (override with `.groups` argument)


assignee_key_id,years,number_of_inventors
633,1980,1
1544,1977,1
2267,1976,118
2267,1977,198
2267,1978,217
2267,1979,213
2267,1980,292
2267,1981,12
2267,1982,20
2267,1983,19


In [29]:
# extract assignee data 
total_patents %>%
    select(!patent_number & !uspc_subclass_id & !num_inventors) -> assignee_data
#nrow(total_patents) #7998 rows


assignee_data %>%
    group_by(assignee_key_id,years,assignee_organization,assignee_type,assignee_country,assignee_state) %>%
    summarise(number_of_patents = n()) -> assignee_data

nrow(assignee_data)    
assignee_data

#1291 rows bc companies moved over time. 
#I think its important to keep companies changing states.
#I'll group by all of these factors to get num_pats



#I didn't include country the first time I ran the group_by - ignore this
#get assignee country 
# total_patents %>% 
#     group_by(assignee_key_id,years,assignee_country) %>% count() %>%  #2754 rows
#     select(!n)%>%
#     right_join(assignee_data,by=c('assignee_key_id','years')) %>% 
#     distinct() %>% nrow() #2776 rows

`summarise()` regrouping output by 'assignee_key_id', 'years', 'assignee_organization', 'assignee_type', 'assignee_country' (override with `.groups` argument)


assignee_key_id,years,assignee_organization,assignee_type,assignee_country,assignee_state,number_of_patents
633,1980,Blackstone Corporation,2,US,NY,1
1544,1977,American Motors Corporation,2,US,MI,1
2267,1976,Toyota Jidosha Kabushiki Kaisha,3,JP,,55
2267,1977,Toyota Jidosha Kabushiki Kaisha,3,,,1
2267,1977,Toyota Jidosha Kabushiki Kaisha,3,JP,,85
2267,1978,Toyota Jidosha Kabushiki Kaisha,3,JP,,89
2267,1979,Toyota Jidosha Kabushiki Kaisha,3,JP,,81
2267,1980,Toyota Jidosha Kabushiki Kaisha,3,JP,,103
2267,1980,Toyota Jidosha Kabushiki Kaisha,3,US,,1
2267,1981,Toyota Jidosha Kabushiki Kaisha,3,JP,,4


In [30]:
#merge assignee_data w num_inventors
assignee_data %>%
    left_join(num_inventors,by=c('assignee_key_id','years')) -> firms
nrow(firms)
firms %>% arrange(assignee_key_id,desc()) #sort by...

assignee_key_id,years,assignee_organization,assignee_type,assignee_country,assignee_state,number_of_patents,number_of_inventors
633,1980,Blackstone Corporation,2,US,NY,1,1
1544,1977,American Motors Corporation,2,US,MI,1,1
2267,1976,Toyota Jidosha Kabushiki Kaisha,3,JP,,55,118
2267,1977,Toyota Jidosha Kabushiki Kaisha,3,,,1,198
2267,1977,Toyota Jidosha Kabushiki Kaisha,3,JP,,85,198
2267,1978,Toyota Jidosha Kabushiki Kaisha,3,JP,,89,217
2267,1979,Toyota Jidosha Kabushiki Kaisha,3,JP,,81,213
2267,1980,Toyota Jidosha Kabushiki Kaisha,3,JP,,103,292
2267,1980,Toyota Jidosha Kabushiki Kaisha,3,US,,1,292
2267,1981,Toyota Jidosha Kabushiki Kaisha,3,JP,,4,12


### Redoing Specialization variable

In [31]:
total_patents %>%
    group_by(assignee_key_id,years,uspc_subclass_id) %>%
    summarise(count=n()) %>%
    group_by(assignee_key_id,years) %>% 
    mutate(max = max(count)) %>%
    mutate(specialization = uspc_subclass_id[which.max(count)]) %>%
    select(!count & !max & !uspc_subclass_id) %>% #dropping columns
    distinct() -> specialization  #dropping dupes and saving to df

nrow(specialization)
specialization

#which.max(count) gives the index of the max value. I don't really need the max col - just for visual checking
#if there's a tie, which.max returns the first instance
#https://stackoverflow.com/questions/12039681/find-max-per-group-and-return-another-column

`summarise()` regrouping output by 'assignee_key_id', 'years' (override with `.groups` argument)


assignee_key_id,years,specialization
633,1980,123/41.8
1544,1977,123/553
2267,1976,123/547
2267,1977,123/568.29
2267,1978,123/260
2267,1979,123/260
2267,1980,123/676
2267,1981,123/556
2267,1982,701/105
2267,1983,701/110


In [32]:
#merge w firms df 
firms %>%
    left_join(specialization,by=c('assignee_key_id','years')) -> firms
#try not to run this twice, you get weird results since its saving back into itself

nrow(firms)
firms

assignee_key_id,years,assignee_organization,assignee_type,assignee_country,assignee_state,number_of_patents,number_of_inventors,specialization
633,1980,Blackstone Corporation,2,US,NY,1,1,123/41.8
1544,1977,American Motors Corporation,2,US,MI,1,1,123/553
2267,1976,Toyota Jidosha Kabushiki Kaisha,3,JP,,55,118,123/547
2267,1977,Toyota Jidosha Kabushiki Kaisha,3,,,1,198,123/568.29
2267,1977,Toyota Jidosha Kabushiki Kaisha,3,JP,,85,198,123/568.29
2267,1978,Toyota Jidosha Kabushiki Kaisha,3,JP,,89,217,123/260
2267,1979,Toyota Jidosha Kabushiki Kaisha,3,JP,,81,213,123/260
2267,1980,Toyota Jidosha Kabushiki Kaisha,3,JP,,103,292,123/676
2267,1980,Toyota Jidosha Kabushiki Kaisha,3,US,,1,292,123/676
2267,1981,Toyota Jidosha Kabushiki Kaisha,3,JP,,4,12,123/556


## Merge Fuel Data

In [41]:
firms %>%
    left_join(gas_by_state_longer, by=c('assignee_state'='State','years'='Years')) -> firms
#same here, only run once!

nrow(firms)
firms

assignee_key_id,years,assignee_organization,assignee_type,assignee_country,assignee_state,number_of_patents,number_of_inventors,specialization,fuel_price
633,1980,Blackstone Corporation,2,US,NY,1,1,123/41.8,10.26
1544,1977,American Motors Corporation,2,US,MI,1,1,123/553,5.23
2267,1976,Toyota Jidosha Kabushiki Kaisha,3,JP,,55,118,123/547,
2267,1977,Toyota Jidosha Kabushiki Kaisha,3,,,1,198,123/568.29,
2267,1977,Toyota Jidosha Kabushiki Kaisha,3,JP,,85,198,123/568.29,
2267,1978,Toyota Jidosha Kabushiki Kaisha,3,JP,,89,217,123/260,
2267,1979,Toyota Jidosha Kabushiki Kaisha,3,JP,,81,213,123/260,
2267,1980,Toyota Jidosha Kabushiki Kaisha,3,JP,,103,292,123/676,
2267,1980,Toyota Jidosha Kabushiki Kaisha,3,US,,1,292,123/676,
2267,1981,Toyota Jidosha Kabushiki Kaisha,3,JP,,4,12,123/556,


Now we have our full dataset (finally!)
## Filter out NAs
Recoding NAs and empty values to '.'

In [42]:
#find empty values
# firms %>%
#     filter(assignee_state == '') #I know its mostly in the states tho there are some empty values in the country codes

#check datatype - we can only replace using chars
sapply(firms,class)

#set to char
as.character(firms$assignee_state) -> firms$assignee_state
as.character(firms$assignee_country) -> firms$assignee_country
as.numeric(firms$fuel_price) -> firms$fuel_price
sapply(firms,class)

#replace empty w/ '.'
firms$assignee_state[firms$assignee_state==''] <- '.'
firms$assignee_country[firms$assignee_country==''] <- '.'

#replace NA w/ '.'
firms$fuel_price[is.na(firms$fuel_price)==TRUE] <- '.'

#change fuel_price back to numeric type

sapply(firms,class) #hmm R doesn't like that. I changed it back to NAs
as.numeric(firms$fuel_price) -> firms$fuel_price
head(firms)


"NAs introduced by coercion"

assignee_key_id,years,assignee_organization,assignee_type,assignee_country,assignee_state,number_of_patents,number_of_inventors,specialization,fuel_price
633,1980,Blackstone Corporation,2,US,NY,1,1,123/41.8,10.26
1544,1977,American Motors Corporation,2,US,MI,1,1,123/553,5.23
2267,1976,Toyota Jidosha Kabushiki Kaisha,3,JP,.,55,118,123/547,
2267,1977,Toyota Jidosha Kabushiki Kaisha,3,.,.,1,198,123/568.29,
2267,1977,Toyota Jidosha Kabushiki Kaisha,3,JP,.,85,198,123/568.29,
2267,1978,Toyota Jidosha Kabushiki Kaisha,3,JP,.,89,217,123/260,


Note to self:
- filter 180/decimal subclasses out of 180 class - *done*
- redo specialization var - *done*
- merge to get correct df - *done*
- add fuel data - *done*
- make 1/0 dummy var out of assignee type
- summary stats for each var [summary()](https://www.statmethods.net/stats/descriptives.html) 

## Dummy Variables
Make the Domestic and Government dummy variables

Classification of assignee:
- 2 - US Company or Corporation, 
- 3 - Foreign Company or Corporation, 
- 4 - US Individual, 
- 5 - Foreign Individual, 
- 6 - US Government, 
- 7 - Foreign Government, 
- 8 - Country Government, (though I don't know what they mean by country gov)    
- 9 - State Government (US). 

Note: A "1" appearing before any of these codes signifies part interest

(from the [PatentsView API](https://api.patentsview.org/assignee.html))

Domestic == 2 and 6   
Government == 6-9  



     
If you want to know if an firm is a domestic public entity, both should be 1. If a firm is a foregin public entity, gov should be 1 and domestic should be 0. If a firm is a domestic private company, domestic should be 1 and gov 0. If a firm is a foreign private company, both should be 0. I hope this gets around the multicolliniarity problem. If not, then I'll drop the gov variable.

In [43]:
#make domestic var
firms %>%
    mutate(domestic = ifelse(assignee_type=='2' | assignee_type=='6' | assignee_type=='12',1,0)) -> firms
    #mutate(government = ifelse(assignee_type=='6',1,0))


#making gov var - tho there's nothing in the 8 and 9 codes
#lets code for all cases - never know when I'll need to run this again ;)

#list of codes as strings
paste(c(6:9)) ->c

firms %>%
    mutate(government = ifelse(str_detect(assignee_type, paste(c, collapse = "|")),1,0)) -> firms

#see what government codes are present in the data
firms %>%
    filter(assignee_type=='6'|assignee_type=='7') #%>% nrow() #6 and 7 have 30 entries, none for 8 and 9


assignee_key_id,years,assignee_organization,assignee_type,assignee_country,assignee_state,number_of_patents,number_of_inventors,specialization,fuel_price,domestic,government
22274,1980,Ministry of International Trade & Industry,7,JP,.,1,4,123/501,,0,1
50553,1978,Etablissement Public die: Agence Nationale de Valorisation de la Recherche CNT FRX COD 07,7,FR,.,1,2,123/1A,,0,1
54873,1976,Agence Nationale de Valorisation de la Recherche (ANVAR),7,FR,.,1,1,701/123,,0,1
54873,1980,Agence Nationale de Valorisation de la Recherche (ANVAR),7,FR,.,2,3,123/1A,,0,1
76455,1976,The United States of America as represented by the National Institute of Health,6,US,DC,1,4,60/516,5.08,1,1
99253,1977,The United States of America as represented by the United States National Aeronautics and Space Administration Office of General Counsel-Code GP,6,US,DC,1,2,123/41.33,5.39,1,1
231668,1976,The United States of America as represented by the United States National Institute of Health,6,US,DC,1,4,123/74R,5.08,1,1
234480,1979,The Secretary of State for Industry in Her Britannic Majesty's Government of the United Kingdom of Great Britain and Northern Ireland,7,GB,.,1,2,123/545,,0,1
249898,1977,The United States of America as represented by the United States Energy Research and Development Administration,6,US,DC,1,1,123/1A,5.39,1,1
269676,1977,Agency of Industrial Science and Technology,7,JP,.,1,2,180/65.1,,0,1


In [44]:
#make gov and domestic factor types
as.factor(firms$government) -> firms$government
as.factor(firms$domestic) -> firms$domestic
sapply(firms,class)
head(firms)

assignee_key_id,years,assignee_organization,assignee_type,assignee_country,assignee_state,number_of_patents,number_of_inventors,specialization,fuel_price,domestic,government
633,1980,Blackstone Corporation,2,US,NY,1,1,123/41.8,10.26,1,0
1544,1977,American Motors Corporation,2,US,MI,1,1,123/553,5.23,1,0
2267,1976,Toyota Jidosha Kabushiki Kaisha,3,JP,.,55,118,123/547,,0,0
2267,1977,Toyota Jidosha Kabushiki Kaisha,3,.,.,1,198,123/568.29,,0,0
2267,1977,Toyota Jidosha Kabushiki Kaisha,3,JP,.,85,198,123/568.29,,0,0
2267,1978,Toyota Jidosha Kabushiki Kaisha,3,JP,.,89,217,123/260,,0,0


### Exporting Cleaned Dataset

In [45]:
write.csv(firms, file="Gabrielle_Martinez_asn4.csv",
          row.names=FALSE)


## Summary Statistics

In [27]:
firms %>% 
    select(years,number_of_inventors,number_of_patents,fuel_price,government,domestic) %>%
    summary()

sprintf("standard deviation of number_of_patents: %f",sd(firms$number_of_patents))
sprintf("standard deviation of number_of_inventors: %f",sd(firms$number_of_inventors))
sprintf("standard deviation of fuel_price: %f",sd(firms$fuel_price,TRUE))

firms %>% 
    group_by(years) %>%
    count()

Adding missing grouping variables: `assignee_key_id`, `assignee_organization`, `assignee_type`, `assignee_country`


 assignee_key_id                      assignee_organization assignee_type
 Min.   :   633   Toyota Jidosha Kabushiki Kaisha:  14      3      :652  
 1st Qu.:116474   Hitachi, Ltd.                  :  12      2      :588  
 Median :221515   Nippondenso Co., Ltd.          :  12      6      : 19  
 Mean   :228978   Nissan Motor Co., Ltd.         :  12      7      : 11  
 3rd Qu.:344081   The Bendix Corporation         :  12      12     :  4  
 Max.   :455978   Ford Motor Company             :  11      15     :  1  
                  (Other)                        :1202      (Other):  0  
 assignee_country       years      number_of_inventors number_of_patents
 Length:1275        Min.   :1976   Min.   :  1.000     Min.   :  1.000  
 Class :character   1st Qu.:1977   1st Qu.:  1.000     1st Qu.:  1.000  
 Mode  :character   Median :1978   Median :  2.000     Median :  1.000  
                    Mean   :1979   Mean   :  7.738     Mean   :  3.425  
                    3rd Qu.:1980   3rd Qu.:

years,n
1976,238
1977,205
1978,239
1979,205
1980,192
1981,33
1982,28
1983,17
1984,31
1985,29


In [46]:
#get a list of countries
#read in csv
read.csv("Gabrielle_Martinez_asn4.csv") -> firms

In [47]:
#have to fix country codes - again
firms %>%
    filter(assignee_country == 'AU'| assignee_country == 'un') 
#UGH DE & DT are both Germany
#SW & SE are both Sweden
#EN & GB are both England/UK

#list
firms %>% group_by(assignee_country) %>% count() %>% arrange(n,desc()) %>% tail(11) #%>% select(assignee_country) -> list

assignee_key_id,years,assignee_organization,assignee_type,assignee_country,assignee_state,number_of_patents,number_of_inventors,specialization,fuel_price,domestic,government
69566,1977,F. B. J. Engineering Services Pty. Limited,3,AU,.,1,1,123/193.6,,0,0
145259,1977,Fairey Norbon Pty. Ltd.,3,AU,.,1,2,123/239,,0,0
244751,1976,The Zenith Carburetor Company Limited,3,AU,.,1,1,123/179.18,,0,0
246046,1978,Repco Research Proprietary Limited,3,AU,.,1,1,123/494,,0,0
286490,1978,Comalco Aluminium (Bell Bay) Limited,3,AU,.,1,1,123/193.2,,0,0
301692,1976,Ruapehu Pty. Ltd.,3,AU,.,1,1,123/54.3,,0,0
305104,1978,The Zenith Carburetter Company Limited,3,AU,.,1,2,123/179.18,,0,0
336276,1979,"""""""""""""""""""""""""""""""Osrodek Badawczo-Rozwojowy Samochodow Malolitrazowych """"""""""""""""""""""""""""""""Bosmal""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""",3,un,.,1,9,123/339.13,,0,0
423797,1976,Gippsland Institute of Advance Education,3,AU,.,1,3,123/242,,0,0
429828,1979,Solo Industries Pty. Limited,3,AU,.,1,3,123/651,,0,0


assignee_country,n
AU,9
NL,11
CH,13
SE,13
CA,17
IT,24
GB,71
FR,85
DE,121
JP,241


In [28]:
#bloopers and defunct code for my own reference (I don't like deleting code)
#group_by org and year to get the number of patents by year for each firm
# total_patents %>% 
#     group_by(assignee_key_id,years) %>% 
#     summarise(number_of_patents = n()) -> num_pats_by_year

# nrow(num_pats_by_year) #2693

# #show df
# num_pats_by_year


#get specialization info, remove dupes
#specialization is really weird. I'm gonna have to redo the variable
# total_patents %>%
#     select(specialization, assignee_key_id,years) %>%
#     group_by(specialization, assignee_key_id, years) %>%
#     count() %>%
#  #   select(!n) %>%
#     filter(assignee_key_id==38960) %>%
#     arrange(years,desc())
    
#-> specialization

#left_join merge
# assignee_data %>%
#     left_join(specialization,by=c('assignee_key_id','years')) %>% 
#     distinct() %>%
#     filter(assignee_organization == "General Motors Corporation")