-
Notifications
You must be signed in to change notification settings - Fork 39
/
Calculating-NumProp-Returns.Rmd
100 lines (86 loc) · 3.38 KB
/
Calculating-NumProp-Returns.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
---
title: "Calculating-NumProp-Returns"
author: "Eric He"
date: "August 28, 2017"
output: html_document
---
```{r setup, include=FALSE}
# Make every chunk echo its source code in the knitted output by default.
knitr::opts_chunk$set(echo = TRUE)
```
```{r}
library("quanteda")
library("dplyr")
library("purrr")
library("ggplot2")
library("reshape2")
library("gridExtra")
```
```{r}
# Item labels of the filing sections that are extracted and scored below.
sections <- c(
  "1", "1A", "1B", "2", "3", "4", "5", "6", "7", "7A",
  "8", "9", "9A", "9B", "10", "11", "12", "13", "14", "15"
)

# Load the filing index and force the filing identifier to character so it
# can be pasted into file paths and used as a join key later on.
masterIndex <- read.csv(file = "masterIndex.csv")
masterIndex$filing <- as.character(masterIndex$filing)
```
```{r}
# Per-document share of feature counts that belong to a dictionary.
#
# dfmObj: a quanteda dfm (it is passed to dfm_select() and rowSums()).
# dict:   the features to keep, handed to dfm_select() as its `pattern`.
#
# Returns, for every document (row) of dfmObj, the row sum of the selected
# features divided by the row sum of the full dfm.
dfmstat_ratio <- function(dfmObj, dict) {
  # `pattern` is the current name of the selector argument of dfm_select();
  # the old `features` name is deprecated, and the rest of this file already
  # uses `pattern`.
  matched_counts <- rowSums(dfm_select(dfmObj, pattern = dict))
  matched_counts / rowSums(dfmObj)
}
# Extract the text of one "Item <section>" block from a parsed filing.
#
# statement: the filing to search; it must expose a `doc_id` element and is
#            passed as-is to str_extract_all() (callers in this file pass the
#            result of readtext()).
# section:   section label, e.g. "1A".
#
# Returns a character value named "<doc_id>_<section>". The value is the
# candidate match with the most tokens (the last one on ties), or the literal
# string "empty" when nothing matched or when the best match is shorter than
# 250 tokens and merely incorporates its content by reference.
#
# NOTE(review): str_extract_all()/str_detect() come from stringr, which is not
# attached in the library chunk of this document -- confirm it is loaded.
section_extractor <- function(statement, section) {
  name <- statement$doc_id
  result_name <- paste(name, section, sep = "_")

  # Candidate sections are delimited by "°" markers in the parsed text
  # (presumably inserted by the upstream parsing step -- verify).
  pattern <- paste0("(?i)°Item ", section, "[^\\w|\\d]", ".*°")
  section_hits <- str_extract_all(statement, pattern, simplify = TRUE)

  if (is_empty(section_hits)) {
    print(paste("No hits for section", section, "of filing", name))
    return(setNames("empty", result_name))
  }

  # Keep the longest candidate; on ties the last one wins.
  word_counts <- map_int(section_hits, ntoken)
  max_hit <- which(word_counts == max(word_counts))
  max_filing <- section_hits[[max_hit[length(max_hit)]]]

  # Scalar condition: use the short-circuiting `&&` so the regex is only run
  # for short candidates.
  if (max(word_counts) < 250 &&
      str_detect(max_filing, pattern = "(?i)(incorporated by reference)|(incorporated herein by reference)")) {
    print(paste("Section", section, "of filing", name, "incorporates by reference its information"))
    return(setNames("empty", result_name))
  }

  setNames(max_filing, result_name)
}
# Proportion of tokens in a section that contain at least one digit.
#
# text_obj: a named character value as produced by section_extractor(); the
#           name is expected to look like "<filing>.<ext>_<section>".
#
# Returns a matrix holding the proportion, with the filing id (the part of
# the name before the first ".") as row name and
# "sec<section>num_prop" as column name.
numeric_proportion_calculator <- function(text_obj) {
  # Tokenise explicitly before building the dfm: calling dfm() directly on a
  # corpus is deprecated in current quanteda releases.
  dfm_obj <- corpus(text_obj) %>%
    tokens(remove_punct = TRUE) %>%
    dfm()

  # Features matching "\\d+" anywhere (unanchored regex) count as numeric.
  numeric_counts <- rowSums(
    dfm_select(dfm_obj, pattern = "\\d+", valuetype = "regex")
  )
  num_prop <- numeric_counts / rowSums(dfm_obj)

  # The document name carries both the filing id and the section label;
  # split it back apart to label the result.
  doc_label <- names(num_prop)
  filing_id <- str_extract(doc_label, pattern = ".*?(?=\\.)")
  section_number <- paste0("sec", str_extract(doc_label, pattern = "(?<=_).*"), "num_prop")

  matrix(num_prop, dimnames = list(filing_id, section_number))
}
# Trial run on a single parsed filing: extract every section, score each one,
# and bind the per-section results side by side.
file_location <- "parsed/1.txt"
filing <- readtext(file_location)
section_list <- sections %>%
  map(section_extractor, statement = filing) %>%
  map(numeric_proportion_calculator) %>%
  reduce(cbind)
# Compute the numeric-token proportion of every section of one filing.
#
# file_location: path of the parsed filing, read with readtext().
#
# Returns the per-section results of numeric_proportion_calculator() bound
# column-wise, for the section labels in the global `sections`.
numeric_proportion_algorithm <- function(file_location) {
  parsed_filing <- readtext(file_location)
  extracted <- map(sections, section_extractor, statement = parsed_filing)
  scored <- map(extracted, numeric_proportion_calculator)
  combined <- reduce(scored, cbind)
  print(paste("Successfully calculated for filing", file_location))
  combined
}
# Path of the parsed text file for every filing listed in the index.
file_locations <- paste0("parsed/", masterIndex$filing, ".txt")

# Score every filing and stack the per-filing results row-wise.
a <- reduce(map(file_locations, numeric_proportion_algorithm), rbind)

# A proportion of exactly zero cannot happen in a real section; it only
# shows up after a parse error, so treat it as missing.
a[a == 0] <- NA

# Carry the filing ids (row names) over as an ordinary character column so
# the results can be joined onto the index.
rows <- rownames(a)
b <- as_tibble(a)
b <- cbind(filing = rows, b, stringsAsFactors = FALSE)
masterIndex <- masterIndex %>%
  left_join(b, by = "filing")
write.csv(masterIndex, file = "index_numprop.csv", row.names = FALSE)
# Note: the \\d+ pattern also matches tokens such as "300-millimeter", 10-k,
# 8-q, 3-dimensional, 1.51, etc., so it is quite flexible.
```
Differences from the sentiment analysis: stop words and numbers are not removed, and no tf-idf weighting is applied.

Similarities: the sample population is still by year.