# Scraping/pulling to photo source URL
### Author: Tess Wagner
### Date: 8/21/24

First, I'm looking at the old scrape for tips and to identify any url patterns.

In [2]:
import pandas as pd
import re

In [6]:
data = pd.read_csv('C://Users//csky2//Desktop//CS//FracTracker//cleaned_data.csv')
data.rename(columns={'Photo_src_URL': 'photo_src'}, inplace=True)

In [8]:
# making sure there arent repeats to identify a true pattern
checking_for_duplicate_srcs = list(data.duplicated('photo_src'))
if True in checking_for_duplicate_srcs:
    data.drop_duplicates(subset = ['photo_src'], inplace= True)
else:
    pass


In [9]:
pd.set_option('display.max_colwidth', None)
subset_data = data[['photo_src', 'URL', 'PhotoID']].sort_values('photo_src')
subset_data

Unnamed: 0,photo_src,URL,PhotoID
0,https://live.staticflickr.com/65535/49727911618_3a2f2bcaa8_o.jpg,https://www.flickr.com/photos/fractracker/49727911618/in/album-72157713728924116,49727911618
1,https://live.staticflickr.com/65535/49727914298_a476f79278_o.jpg,https://www.flickr.com/photos/fractracker/49727914298/in/album-72157713728924116,49727914298
2,https://live.staticflickr.com/65535/49727914638_bfe5a977dc_o.jpg,https://www.flickr.com/photos/fractracker/49727914638/in/album-72157713728924116,49727914638
3,https://live.staticflickr.com/65535/49727915018_a3e7f76c52_o.jpg,https://www.flickr.com/photos/fractracker/49727915018/in/album-72157713728924116,49727915018
4,https://live.staticflickr.com/65535/49727915083_77990d84d6_o.jpg,https://www.flickr.com/photos/fractracker/49727915083/in/album-72157713728924116,49727915083
...,...,...,...
2705,https://live.staticflickr.com/65535/53706158340_0cab5deb42_o.jpg,https://www.flickr.com/photos/fractracker/53706158340/in/album-72157714265378312,53706158340
2706,https://live.staticflickr.com/65535/53706158375_3aa0cbae5c_o.jpg,https://www.flickr.com/photos/fractracker/53706158375/in/album-72157714265378312,53706158375
2707,https://live.staticflickr.com/65535/53706158545_bf89ddbcbf_o.jpg,https://www.flickr.com/photos/fractracker/53706158545/in/album-72157714265378312,53706158545
2708,https://live.staticflickr.com/65535/53706158630_742558cf00_o.jpg,https://www.flickr.com/photos/fractracker/53706158630/in/album-72157714265378312,53706158630


## Checker:

## 1.

In [10]:
# it looks like the beginning part of src is the same lests see
def get_beginning_str_src(data):
    return data[0:len('https://live.staticflickr.com/')]

In [11]:
# if the list is longer that 1, there are different beginnings to the string. If it =1, all the strings start the same
len(subset_data['photo_src'].apply(get_beginning_str_src).unique())

1

In [12]:
print('The first', len('https://live.staticflickr.com/'), 'characters of `photo_src` are always https://live.staticflickr.com/' )

The first 30 characters of `photo_src` are always https://live.staticflickr.com/


## 2.

In [13]:
# it looks like 5 digits always follow the beginning https://live.staticflickr.com/
# to check this, i will grab all the characters between the third and fourth "/", check that there are 5 characters, and check that it is an integer
def get_text_between_third_and_fourth_slash(url):
    # Split the string by "/"
    parts = url.split('/')
    # Check if there are enough parts
    if len(parts) > 4:
        # Return the part between the third and fourth slash
        return parts[3]
    else:
        return "NaN"

In [14]:
five_digit_check = pd.DataFrame(subset_data.photo_src.apply(get_text_between_third_and_fourth_slash))

In [15]:
# if it is false, all of the values were successfully grabbed
five_digit_check.isna().value_counts()

photo_src
False        2710
Name: count, dtype: int64

In [16]:
print('it is originally an', five_digit_check.dtypes[0])
print('and we can successfully convert it to', five_digit_check.astype('int').dtypes[0], 'meaning that they are all integers')

it is originally an object
and we can successfully convert it to int32 meaning that they are all integers


  print('it is originally an', five_digit_check.dtypes[0])
  print('and we can successfully convert it to', five_digit_check.astype('int').dtypes[0], 'meaning that they are all integers')


In [17]:
print('number of digits | Percent of data')
five_digit_check.photo_src.apply(lambda x: len(x)).value_counts()/subset_data.shape[0]*100

number of digits | Percent of data


photo_src
5    100.0
Name: count, dtype: float64

## 3.

In [18]:
# it looks like the middle part of photo src is identical to he middle part of url and PhotoID, lets see
def get_middle_str_src(data):
    return data[36:47]

def get_middle_str_url(data):
    return data[42:53]

In [19]:
# running the functions and adding it to a dataframe
middle_chars = pd.DataFrame({'src_middle': subset_data['photo_src'].apply(get_middle_str_src), 'URL_middle': subset_data['URL'].apply(get_middle_str_url), 'PhotoID': subset_data['PhotoID'].astype('str')})

In [20]:
middle_chars

Unnamed: 0,src_middle,URL_middle,PhotoID
0,49727911618,49727911618,49727911618
1,49727914298,49727914298,49727914298
2,49727914638,49727914638,49727914638
3,49727915018,49727915018,49727915018
4,49727915083,49727915083,49727915083
...,...,...,...
2705,53706158340,53706158340,53706158340
2706,53706158375,53706158375,53706158375
2707,53706158545,53706158545,53706158545
2708,53706158630,53706158630,53706158630


In [21]:
# comparing the values. if all entries return true, then these sections are identical across the photo_src and URL
pd.Series(middle_chars['src_middle'] == middle_chars['URL_middle']).value_counts()

True    2710
Name: count, dtype: int64

In [22]:
pd.Series(middle_chars['src_middle'] == middle_chars['PhotoID']).value_counts()

True    2710
Name: count, dtype: int64

In [23]:
print('Character number', len('https://live.staticflickr.com/31337'), 'through character number', len('https://live.staticflickr.com/31337/50199457303'), 
      'of `photo_src` match character number', len('https://www.flickr.com/photos/fractracker'), 'through character number', 
      len('https://www.flickr.com/photos/fractracker/50199457303'), 'of `URL` and the `PhotoID`.')

Character number 35 through character number 47 of `photo_src` match character number 41 through character number 53 of `URL` and the `PhotoID`.


## 4.

In [24]:
# finding the length of the digits in the second middle squence
len('https://live.staticflickr.com/31337/50199457303_12742db461_o.jpg') - len('https://live.staticflickr.com/31337/50199457303_') - len('_o.jpg')

10

In [25]:
# it looks like every photo source url ends in this sequence: _ + 10 characters + _o. + 3 characters
def check_end_pattern(url):
    # Regular expression to match the pattern
    pattern = r'_[A-Za-z0-9]{10}_o\.[a-z]{3}$'

    # Search for the pattern in the string
    match = re.search(pattern, url)
    
    # Return True if the pattern is found, otherwise False
    return bool(match)

In [26]:
# if they all follow the pattern, they should all be true
subset_data.photo_src.apply(check_end_pattern).value_counts()

photo_src
True    2710
Name: count, dtype: int64