-
Notifications
You must be signed in to change notification settings - Fork 0
/
firstIter.sh
executable file
·155 lines (115 loc) · 4.98 KB
/
firstIter.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
#!/bin/bash
########################################################################
########################################################################
#### ##
#### File Name: firstIter.sh ##
#### ##
#### Github Repo: https://github.com/drednaut/REntropy.git ##
#### ##
#### Author: Jared Knutson ##
#### ##
#### Email: jaredknutson@nevada.unr.edu ##
#### ##
#### Date: 5/19/2017 ##
#### ##
#### Dependencies: youtube-dl avconv ##
#### ##
#### Version: 0.4.0 ##
#### ##
#### Usage: ./firstIter.sh <User Channel> ##
#### ##
#### Notes: ##
#### ##
########################################################################
########################################################################
# Removes every working and output file handled by this script, so that a run
# never starts with leftovers from an earlier run.
# Arguments: none
# Returns:   the exit status of rm
file_gen() {
  # -f: a file that does not exist yet is not an error, and rm never stops to
  # ask for confirmation on a write-protected file.
  # --: nothing in the list can be mistaken for an option.
  rm -f -- firstIter firstSnip firstClean data-video-ids clean-dvi \
    clean-ytid clean_channel channel_raw temp_channel
}
# Downloads the channel listing page and collects the data-ytid attributes
# that correlate to the channels listed on it.
# Arguments: $1 - URL of the page to download
# Files:     firstIter  (written) the downloaded HTML
#            firstSnip  (written) every line of firstIter containing 'data-ytid='
#            firstClean (written) the first data-ytid="..." match of each
#                       firstSnip line, one per line; always created, empty
#                       when nothing matched
# Calls get_channel once firstClean has been written; the return status is
# that of get_channel.
get_ytid() {
  local line
  # Ids are limited to letters, digits, '_' and '-'. The bracket expression
  # deliberately contains no spaces: a ' -' pair inside it is read as a
  # character range that also covers the double quote.
  local ytid_regex='data-ytid="[A-Za-z0-9_-]*"'

  wget -O firstIter "$1"
  grep 'data-ytid=' firstIter > firstSnip

  # The loop reads through its own redirection, so the caller's stdin is left
  # untouched. -r keeps backslashes literal; IFS= keeps the line unmodified.
  while IFS= read -r line || [[ -n "$line" ]]; do
    if [[ "$line" =~ $ytid_regex ]]; then
      printf '%s\n' "${BASH_REMATCH[0]}"
    fi
  done < firstSnip > firstClean

  get_channel
}
# Formats the data-ytid matches so that they can be used in the next iteration
# of this program.
# Reads firstClean. Rewrites channel-ytids with the de-duplicated matches
# (first occurrence kept, every double quote turned into a space) and appends
# the second whitespace-separated field of each of those lines to clean-ytid.
scrape_ytid() {
  awk '
    seen[$0]++ { next }
    { gsub(/"/, " "); print }
  ' firstClean > channel-ytids
  awk '{ print $2 }' channel-ytids >> clean-ytid
}
# Searches the main HTML dump for the patterns that correspond to the titles
# of the channels within the channel.
# Arguments: none
# Files:     firstIter   (read)    the downloaded HTML
#            channel_raw (written) the first title match of each matching
#                        line, one per line; replaced on every call and left
#                        empty when nothing matched
# Calls scrape_channel once channel_raw has been written; the return status is
# that of scrape_channel.
# NOTE(review): the pattern only accepts letters after title=", so a title is
# cut at its first non-letter character - confirm that this is intended.
get_channel() {
  local line
  local title_regex='dir="ltr" title="[A-Za-z]*'

  # The loop reads through its own redirection, so the caller's stdin is left
  # untouched. The '|| [[ -n ... ]]' part keeps a last line that has no
  # trailing newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    if [[ "$line" =~ $title_regex ]]; then
      printf '%s\n' "${BASH_REMATCH[0]}"
    fi
  done < firstIter > channel_raw

  scrape_channel
}
# Formats the matches written by get_channel so that only the channel titles
# remain.
# Reads channel_raw. Rewrites temp_channel with the de-duplicated matches
# (first occurrence kept, every double quote turned into '$'), appends the
# fourth '$'-separated field of each of those lines to clean_channel, and
# finally strips every space from clean_channel.
scrape_channel() {
  awk '!seen[$0]++ { gsub(/"/, "$"); print }' channel_raw > temp_channel
  awk -F '$' '{ print $4 }' temp_channel >> clean_channel
  sed -i 's/ //g' clean_channel
}
# As its name suggests, this function removes the temporary files which are
# used to format the final output.
# Arguments: none
# Returns:   the exit status of rm
garbage_collection() {
  # -f: an intermediate file that was never created is not an error.
  rm -f -- firstSnip firstClean channel-ytids channel_raw temp_channel
}
#-------BEGIN PROGRAM------------
# Usage: ./firstIter.sh <User Channel>
# Without a channel name the URL below would be meaningless, so stop early.
if [[ $# -lt 1 || -z "$1" ]]; then
  printf 'Usage: %s <User Channel>\n' "${0##*/}" >&2
  exit 2
fi
# The argument is used verbatim; quoting keeps it from being word-split or
# glob-expanded on its way into the URL.
channel="https://www.youtube.com/user/$1/channels"
file_gen
get_ytid "$channel"
scrape_ytid
echo "Number of Channels found: $(wc -l clean_channel | awk '{print $1}')"
garbage_collection
#---------END PROGRAM-------------