# Prezentowanie stworzonych zasobów analizy

In [9]:
# Sample URLs that the regex experiments in this notebook run against.
sample_urls <- c(
    "https://www.3blue1brown.com/faq#music",
    "https://www.google.com/search?q=r+notebook+page+break",
    "https://calendar.google.com/calendar/u/0/r?tab=rc&pli=1",
    "https://www.microsoft.com/en-us/p/windows-terminal/9n0dx20hk701?activetab=pivot:overviewtab",
    "https://learn.datacamp.com/skill-tracks/anaconda-skills?version=1"
)

# One row per URL, keyed by a sequential integer id.
df <- data.frame(id = seq_along(sample_urls), url = sample_urls)
df

id,url
1,https://www.3blue1brown.com/faq#music
2,https://www.google.com/search?q=r+notebook+page+break
3,https://calendar.google.com/calendar/u/0/r?tab=rc&pli=1
4,https://www.microsoft.com/en-us/p/windows-terminal/9n0dx20hk701?activetab=pivot:overviewtab
5,https://learn.datacamp.com/skill-tracks/anaconda-skills?version=1


In [72]:
# Pattern: a single digit immediately followed by one or more lowercase
# ASCII letters (e.g. "3blue", "9n").
digits_regex <- "[\\d][a-z]+"
digits_regex

In [80]:
library("stringr")
library("stringi")
# For each URL, list every "digit followed by lowercase letters" token
# (one result per URL).
str_match_all(string = df[["url"]], pattern = digits_regex)

0
3blue
1brown

0
9n
0dx
0hk


In [81]:
# Same extraction as above, this time through stringi.
stri_extract_all_regex(str = df[["url"]], pattern = digits_regex)

In [82]:
# Number of digits_regex matches found in each URL.
stri_count_regex(str = df[["url"]], pattern = digits_regex)

In [83]:
# Pattern for the label directly in front of ".com". Digits and hyphens are
# allowed in the label so that a host such as "www.3blue1brown.com" yields
# "3blue1brown.com" rather than the truncated "brown.com".
domain_regex <- "[a-z0-9-]+\\.com"
# Display the input data frame again for reference.
df

id,url
1,https://www.3blue1brown.com/faq#music
2,https://www.google.com/search?q=r+notebook+page+break
3,https://calendar.google.com/calendar/u/0/r?tab=rc&pli=1
4,https://www.microsoft.com/en-us/p/windows-terminal/9n0dx20hk701?activetab=pivot:overviewtab
5,https://learn.datacamp.com/skill-tracks/anaconda-skills?version=1


In [84]:
# Pull every substring matching domain_regex out of each URL.
stri_extract_all_regex(str = df[["url"]], pattern = domain_regex)

In [43]:
# Regex that decomposes a URL into 14 capture groups:
#   1-2   scheme with and without the trailing ":" (only when "//" follows)
#   3     the "//" separator
#   4     authority; 5-7 "user:password@", user, password; 8 host; 9 port
#   10    path
#   11-12 query with and without the leading "?"
#   13-14 fragment with and without the leading "#"
# The pattern is assembled from three pieces only to keep source lines short.
complicated_url_regex <- paste0(
    "^(([^:/?#]+):(?=\\/\\/))?(\\/\\/)?((([^:]+)",
    "(?::([^@]+)?)?@)?([^@/?#:]*)(?::(\\d+)?)?)?",
    "([^?#]*)(\\?([^#]*))?(#(.*))?"
)
complicated_url_regex

In [91]:
# Capture groups for the first URL only (row 1 of the match matrix).
str_match(string = df[["url"]], pattern = complicated_url_regex)[1, ]

In [94]:
# Example URL that exercises every component the regex can capture:
# scheme, user:password, host, port, path, query and fragment.
complete_url <- paste0(
    "https://john.doe:geheim@www.example.com:8080",
    "/forum/questions/example.cgi",
    "?tag=networking&order=newest#top"
)
# Capture groups of the fully specified example URL, as a character vector.
str_match(string = complete_url, pattern = complicated_url_regex)[1, ]

In [102]:
source("split-url.r")
# Split the fully specified example URL into its parts with split_url()
# (presumably defined in the sourced split-url.r -- confirm there).
split_url(complete_url)

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
https://john.doe:geheim@www.example.com:8080/forum/questions/example.cgi?tag=networking&order=newest#top,https:,https,//,john.doe:geheim@www.example.com:8080,john.doe:geheim@,john.doe,geheim,www.example.com,8080,/forum/questions/example.cgi,?tag=networking&order=newest,tag=networking&order=newest,#top,top


In [104]:
# Apply the same split to every URL in the sample data frame.
split_url(df[["url"]])

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
https://www.3blue1brown.com/faq#music,https:,https,//,www.3blue1brown.com,,,,www.3blue1brown.com,,/faq,,,#music,music
https://www.google.com/search?q=r+notebook+page+break,https:,https,//,www.google.com,,,,www.google.com,,/search,?q=r+notebook+page+break,q=r+notebook+page+break,,
https://calendar.google.com/calendar/u/0/r?tab=rc&pli=1,https:,https,//,calendar.google.com,,,,calendar.google.com,,/calendar/u/0/r,?tab=rc&pli=1,tab=rc&pli=1,,
https://www.microsoft.com/en-us/p/windows-terminal/9n0dx20hk701?activetab=pivot:overviewtab,https:,https,//,www.microsoft.com,,,,www.microsoft.com,,/en-us/p/windows-terminal/9n0dx20hk701,?activetab=pivot:overviewtab,activetab=pivot:overviewtab,,
https://learn.datacamp.com/skill-tracks/anaconda-skills?version=1,https:,https,//,learn.datacamp.com,,,,learn.datacamp.com,,/skill-tracks/anaconda-skills,?version=1,version=1,,


In [123]:
# Keep the cleaned-up split of every sample URL for the feature helpers
# below; the wrapping parentheses also display the result.
(split_res <- clean_split_url(df[["url"]]))

url,protocol,host,path,query,fragment
https://www.3blue1brown.com/faq#music,https:,www.3blue1brown.com,/faq,,music
https://www.google.com/search?q=r+notebook+page+break,https:,www.google.com,/search,q=r+notebook+page+break,
https://calendar.google.com/calendar/u/0/r?tab=rc&pli=1,https:,calendar.google.com,/calendar/u/0/r,tab=rc&pli=1,
https://www.microsoft.com/en-us/p/windows-terminal/9n0dx20hk701?activetab=pivot:overviewtab,https:,www.microsoft.com,/en-us/p/windows-terminal/9n0dx20hk701,activetab=pivot:overviewtab,
https://learn.datacamp.com/skill-tracks/anaconda-skills?version=1,https:,learn.datacamp.com,/skill-tracks/anaconda-skills,version=1,


In [124]:
# Check whether clean_split_url() returned a matrix (TRUE) or some other
# structure such as a data frame (FALSE).
is.matrix(split_res)

In [106]:
source("url-lengths.r")
source("url-special-symbol-count.r")
source("url-ambiguity.r")

In [115]:
# Length features per URL. Judging by the displayed columns, these are
# character counts per component (*_l) and ratios between those counts
# (*_by_*) -- confirm against url-lengths.r.
url_lengths(split_res)

url_l,protocol_l,host_l,path_l,query_l,fragment_l,host_by_url,path_by_url,query_by_url,path_by_host,query_by_host,query_by_path
37,6,19,4,,5.0,0.5135135,0.1081081,,0.2105263,,
53,6,14,7,23.0,,0.2641509,0.1320755,0.4339623,0.5,1.6428571,3.2857143
55,6,19,15,12.0,,0.3454545,0.2727273,0.2181818,0.7894737,0.6315789,0.8
91,6,17,38,27.0,,0.1868132,0.4175824,0.2967033,2.2352941,1.5882353,0.7105263
65,6,18,29,9.0,,0.2769231,0.4461538,0.1384615,1.6111111,0.5,0.3103448


In [116]:
# Per-component counts; the lett_*, dig_* and symb_* output columns appear to
# be letters, digits and other symbols respectively -- confirm against
# url-special-symbol-count.r.
lett_dig_symb_count(split_res)

lett_url,lett_protocol,lett_host,lett_path,lett_query,lett_fragment,dig_url,dig_protocol,dig_host,dig_path,dig_query,dig_fragment,symb_url,symb_protocol,symb_host,symb_path,symb_query,symb_fragment
28,5,15,3,,5.0,2,0,2,0,,0.0,7,1,2,1,,0.0
42,5,12,6,19.0,,0,0,0,0,0.0,,7,1,2,1,0.0,
40,5,17,10,8.0,,2,0,0,1,1.0,,11,1,2,4,1.0,
70,5,15,25,25.0,,7,0,0,7,0.0,,13,1,2,6,1.0,
53,5,16,25,7.0,,1,0,0,0,1.0,,10,1,2,4,0.0,


In [117]:
# Per-component counts of letter-digit-letter patterns (ldl_* columns); the
# exact pattern is defined in the sourced helper, presumably url-ambiguity.r.
letter_digit_letter(split_res)

ldl_url,ldl_protocol,ldl_host,ldl_path,ldl_query,ldl_fragment
1,0,1,0,,0.0
0,0,0,0,0.0,
0,0,0,0,0.0,
2,0,0,2,0.0,
0,0,0,0,0.0,


In [118]:
# Per-component counts of digit-letter-digit patterns (dld_* columns); the
# exact pattern is defined in the sourced helper, presumably url-ambiguity.r.
digit_letter_digit(split_res)

dld_url,dld_protocol,dld_host,dld_path,dld_query,dld_fragment
0,0,0,0,,0.0
0,0,0,0,0.0,
0,0,0,0,0.0,
2,0,0,2,0.0,
0,0,0,0,0.0,


In [119]:
# Combined ambiguity feature per component (xyx_* columns).
# NOTE(review): how this combines the ldl/dld counts is not visible here --
# check the helper's definition (presumably in url-ambiguity.r).
combined_url_ambiguity(split_res)

xyx_url,xyx_protocol,xyx_host,xyx_path,xyx_query,xyx_fragment
1,0,1,0,,0.0
0,0,0,0,0.0,
0,0,0,0,0.0,
2,0,0,2,0.0,
0,0,0,0,0.0,
