# ********************************************
# DATA SOURCING AND CLEANING FOR PELL PROJECT ----
# Author: Arafath Hossain
# Date: 06-06-2021
# Description:
#   The US Department of Education publishes data on Pell Grant recipients
#   as one Excel file per year, with no consistent layout or naming scheme.
#   This script serves three purposes:
#     1. Collect the data from the website,
#     2. Process the data to standardize its format,
#     3. Merge the yearly files into one single source of data for further use.
# ********************************************
# 0.0 PROJECT SET UP ----
# 0.1 Loading Libraries ----
library(readxl)    # read the raw Excel reports
library(rvest)     # scrape the download links
library(tidyverse) # wrangling and plotting
library(httr)      # download the report files
library(janitor)   # header clean-up helpers
library(patchwork) # combine ggplots
library(glue)      # string interpolation for progress messages
# 1.0 DATA SOURCING ----
# 1.1 Web Scraping ----
url <- "https://www2.ed.gov/finaid/prof/resources/data/pell-institution.html"
downloadString <- "https://www2.ed.gov/finaid/prof/resources/data/"
htmlOutput <- read_html(url)
# collect years
years <- htmlOutput %>%
html_nodes(".smallindent") %>%
html_text() %>%
substr(1, 7)
# collect download links
reportLinks <- htmlOutput %>%
html_nodes(".smallindent > a") %>%
html_attr('href') %>%
unique()
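# the page should list exactly one download link per year label; guard
# against an upstream layout change before pairing them up (a lightweight
# check, not something the page guarantees)
stopifnot(length(reportLinks) == length(years))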
# pair each year with its report link
reportSources <- tibble(
years = years,
report_link = paste0(downloadString, reportLinks)
)
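# quick peek before downloading: each row should pair a session label like
# "2019-20" with an .xls/.xlsx link (the extension list is an assumption
# about the site)
stopifnot(tools::file_ext(reportSources$report_link) %in% c("xls", "xlsx"))
head(reportSources)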
# 1.2 Collecting Data ----
downloadData <- function(reportSourcesDF, storageLocation){
  df <- tibble()
  for(i in seq_len(nrow(reportSourcesDF))){
    reportYear <- reportSourcesDF$years[i]
    reportLink <- reportSourcesDF$report_link[i]
    fileExt <- tools::file_ext(reportLink)
    outputLocation <- paste0(storageLocation, reportYear, ".csv")
    # download and save the file temporarily; tempfile() needs the dot
    # included in the extension
    GET(reportLink, write_disk(
      yearlyReport <- tempfile(fileext = paste0(".", fileExt))
    ))
    # remove the introductory paragraphs from the Excel file: rows with an
    # empty cell in any of the first three columns are prose, not data
    yearlyReport <- read_excel(yearlyReport, col_names = FALSE)
    yearlyReport <- yearlyReport[!is.na(yearlyReport[[1]]), ]
    yearlyReport <- yearlyReport[!is.na(yearlyReport[[2]]), ]
    yearlyReport <- yearlyReport[!is.na(yearlyReport[[3]]), ]
    # promote the first full row to column names
    yearlyReport <- yearlyReport %>%
      janitor::row_to_names(row_number = 1)
    # write back to the desired location
    yearlyReport %>%
      write_excel_csv(outputLocation)
    # record this year's column names for the summary report
    cols <- tibble(value = names(yearlyReport))
    cols$year <- reportYear
    df <- bind_rows(df, cols)
  }
  return(df)
}
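# downloadData() writes one CSV per year into storageLocation, so the folder
# must exist first (paths are assumed to be relative to the project root)
if (!dir.exists("data")) dir.create("data")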
reportSourcesSummary <- downloadData(reportSources, "data/")
# 1.3 Data Exploration ----
# Column counts differ across years
yearly_cols <- reportSourcesSummary %>%
ggplot(aes(y = year)) +
geom_bar(stat = "count", width = .5, show.legend = F, color = "#4a0202") +
labs(
y = "Year", x = "Total",
title = "Varying Column Counts for the Same Report Over the Years"
) +
fastEda::theme_fasteda(color = "#0a0a0a") +
theme(plot.title = element_text(size = 12))
# Non-standardized column names
# count how often column names containing a keyword appear across the years
similarColCount <- function(df, targetCol, keyword){
  cols <- unique(
    grep(keyword, pull(df, {{targetCol}}), ignore.case = TRUE, value = TRUE)
  )
  filter(df, {{targetCol}} %in% cols) %>%
    count({{targetCol}}, sort = TRUE)
}
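# example: tally every scraped column name containing a given keyword, e.g.
# the "state"-like columns (the keyword here is purely illustrative)
similarColCount(reportSourcesSummary, value, "state")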
uns_col_names <-
similarColCount(reportSourcesSummary, value, "name") %>%
mutate(info_type = "Institution Name Cols") %>%
rbind(
similarColCount(reportSourcesSummary, value, "awards") %>%
mutate(info_type = "Award Amount Cols")
) %>%
rbind(
similarColCount(reportSourcesSummary, value, "city") %>%
mutate(info_type = "City Name Cols")
) %>%
ggplot(aes(y = reorder(value, n), x = n)) +
geom_col(fill = "#4a0202", show.legend = FALSE) +
facet_grid(info_type ~ ., scales = "free", space = "free") +
labs(
title = "Varying Column Names Reporting Same Data",
y = NULL,
x = "Total"
) +
fastEda::theme_fasteda(color = "#0a0a0a") +
theme(
  panel.border = element_rect(color = "white", fill = NA, linewidth = 1),
  strip.background = element_rect(fill = "black"),
  strip.text = element_text(colour = "white"),
  plot.title = element_text(size = 12)
)
# combine the two diagnostic plots into one figure
yearly_cols + uns_col_names +
plot_annotation(
title = "A Brief Data Nightmare: Inconsistent Data Reporting",
subtitle = "Data Inconsistencies in the Web-Scraped Pell Grant Data",
caption = "Prepared By: Arafath Hossain \n #dash_app_pell",
theme = fastEda::theme_fasteda(color = "#e6e6e6") +
theme(text = element_text(color = "black"),
plot.title = element_text(size = 14, hjust = 0.5),
plot.subtitle = element_text(size = 10, hjust = 0.5, vjust = 5))
)
# 1.4 Data Cleaning ----
# Convert character columns to factors for easier summaries
str(reportSourcesSummary)
reportSourcesSummary <- reportSourcesSummary %>%
mutate_if(is.character, as.factor)
summary(reportSourcesSummary)
unique(reportSourcesSummary$year) %>% length()
table(reportSourcesSummary$value) %>% sort(decreasing = T)
# cross-tab of column name by year: which reports share exactly the same columns?
table(reportSourcesSummary)
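# a sketch of a sharper version of that question: collapse each year's
# sorted column names into one signature and tally the distinct layouts
reportSourcesSummary %>%
  group_by(year) %>%
  summarise(signature = paste(sort(value), collapse = " | ")) %>%
  count(signature, sort = TRUE)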
# 2.0 DATA PREPARATION ----
# 2.1 Matching old names to standardised names ----
# the standard names every report column should map onto
standardNames <- c(
"NAME",
"STATE",
"AWARD",
"RECIPIENT"
)
matchNewName <- function(oldNamesTbl, oldNameCol, standardNames){
  copyTbl <- tibble()
  oldNames <- pull(oldNamesTbl, {{oldNameCol}})
  # keep every row whose old name contains one of the standard names
  for(name in standardNames){
    df <- oldNamesTbl %>%
      mutate(
        newName = ifelse(
          grepl(pattern = name, x = oldNames, ignore.case = TRUE),
          name, NA
        )
      ) %>% drop_na()
    copyTbl <- copyTbl %>% bind_rows(df)
  }
  # rows that match no standard name keep their old name as newName
  unmatchedRows <- oldNamesTbl %>%
    filter(!{{oldNameCol}} %in% unique(pull(copyTbl, {{oldNameCol}}))) %>%
    mutate(newName = as.character({{oldNameCol}}))
  copyTbl <- copyTbl %>%
    bind_rows(unmatchedRows)
  return(unique(copyTbl))
}
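# toy illustration of matchNewName() (the input tibble is made up):
# "Institution Name" maps to NAME, "State Abbr" to STATE, while "Recips"
# matches no standard name and so keeps itself as its newName
matchNewName(
  tibble(value = c("Institution Name", "State Abbr", "Recips"), year = "demo"),
  value,
  standardNames
)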
reportSourcesSummaryCln <- matchNewName(reportSourcesSummary, value, standardNames)
# check how the old names map onto the standard names
reportSourcesSummaryCln %>%
  mutate(value = toupper(value)) %>%
  select(value, newName) %>%
  table()
table(reportSourcesSummaryCln$newName)
# "Institution" and "Recips" are the two values still left unstandardised
reportSourcesSummaryCln <- reportSourcesSummaryCln %>%
mutate(newName = ifelse(
value == "Institution", "NAME", ifelse(
value == "Recips", "RECIPIENT", newName
)
))
table(reportSourcesSummaryCln$newName)
# drop the source columns that do not map to a standard name
reportSourcesSummaryCln <- reportSourcesSummaryCln %>%
filter(newName %in% standardNames)
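# completeness check: every report year should now map to all four standard
# names; any year listed here still needs a manual fix like the ones above
reportSourcesSummaryCln %>%
  count(year, name = "matched_cols") %>%
  filter(matched_cols < length(standardNames))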
# 2.2 Data Processing ----
# load all downloaded yearly files and standardize their columns
files <- list.files("data/")
data_combo <- NULL
for(file in files){
  file_year <- tools::file_path_sans_ext(file)
  # read everything as character so differing column types across the years
  # cannot break the row bind; numeric columns are parsed afterwards
  data <- read_csv(paste0("data/", file), col_types = cols(.default = "c"))
  nameMatch <- reportSourcesSummaryCln %>%
    filter(year == file_year)
  print(glue("Year in process: {file_year}"))
  # keep only the matched columns and rename them to the standard names
  data <- data %>% select(all_of(as.character(nameMatch$value)))
  names(data) <- nameMatch$newName
  data <- data %>%
    mutate(YEAR = file_year)
  data_combo <- data_combo %>%
    bind_rows(data)
}
# split the session label into the session and its starting year
data_combo <- data_combo %>%
rename(SESSION = YEAR) %>%
separate(SESSION, sep = "-", into = c("YEAR", NA), remove = F)
# normalize institution name and state capitalization
data_combo <- data_combo %>%
mutate(
NAME = tools::toTitleCase(tolower(NAME)),
STATE = toupper(STATE)
)
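# AWARD and RECIPIENT were read in as text (see col_types above), and the
# raw reports sometimes format numbers with commas; parse_number() strips
# such characters (assumes these columns hold plain counts/dollar figures)
data_combo <- data_combo %>%
  mutate(
    AWARD = parse_number(AWARD),
    RECIPIENT = parse_number(RECIPIENT)
  )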
write_csv(data_combo, "data/pell_grant_data.csv")