In [4]:
import logging
import os
import re
import sys
import time
from urllib.parse import urljoin, urlsplit

import requests
from bs4 import BeautifulSoup, UnicodeDammit
from faker import Faker
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry


In [244]:
class gallery_retrieval(object):
    '''
    Collects the photo-page URLs of a gallery and downloads the images.

    On construction the given URL is classified, the page behind it is
    fetched, the gallery name and gallery URL are resolved, every page of the
    gallery is walked to collect photo-page URLs into ``self.image_urls`` and
    a folder named after the gallery is created in the current working
    directory. ``save_images`` then downloads the images into that folder.
    '''

    # Seconds to wait for the server before an HTTP request is abandoned:
    request_timeout = 30

    def __init__(self, url):
        '''
        Resolve the gallery behind ``url`` and collect its photo-page URLs.

        Args:
            url: URL of a gallery page or of a single photo page.

        Raises:
            ValueError: if no gallery name can be parsed from the page.
            IndexError: if ``url`` is not a gallery URL and the page holds
                no link to a gallery.
        '''

        # What url is this:
        url_type = self.what_url_is_this(url)

        # Scheme + host; used to complete the site-relative links found on pages:
        url_parts = urlsplit(url)
        self.base_url = f'{url_parts.scheme}://{url_parts.netloc}'

        # Get things done:
        first_page_soup = self.fetch_url(url)
        self.gallery_name = self.get_gallery_name(first_page_soup)
        if self.gallery_name is None:
            # Without a name there is no folder to save into; fail with a clear message.
            raise ValueError(f'Could not extract a gallery name from: {url}')
        self.gallery_folder = os.path.join(os.getcwd(), self._safe_filename(self.gallery_name))

        # Based on the url type we need to get the gallery url. Anything that is
        # not classified as 'gallery' is treated as a page linking to its gallery;
        # urljoin leaves absolute links untouched and completes relative ones.
        if url_type == 'gallery':
            self.gallery_url = url
        else:
            self.gallery_url = urljoin(url, self.get_gallery_url(first_page_soup))

        print(f'URL type: {url_type}')
        print(f'Base URL: {self.base_url}')
        print(f'Gallery name: {self.gallery_name}')
        print(f'Gallery URL: {self.gallery_url}')

        # Initialize empty list for image urls:
        self.image_urls = []

        self.get_image_urls_from_gallery(self.gallery_url)

        # Create gallery folder (a failure is reported but not re-raised):
        try:
            os.makedirs(self.gallery_folder, exist_ok = True)
        except OSError:
            print ("Creation of the directory %s failed" % self.gallery_folder)
        else:
            print ("Successfully created the directory %s" % self.gallery_folder)

    def fetch_all_photos(self):
        '''
        Placeholder without an implementation yet; always returns 1.
        '''
        return 1

    @staticmethod
    def _safe_filename(name):
        '''
        Replace path separators in ``name`` so it stays a single path component.
        '''
        return name.replace('/', '_').replace(os.sep, '_').strip()

    @staticmethod
    def get_gallery_url(soup):
        '''
        Return the href of the first link on the page containing '/gallery.php?'.

        Anchors without an href attribute are skipped.

        Raises:
            IndexError: if the page has no such link.
        '''
        for anchor in soup.findAll('a'):
            href = anchor.get('href')
            if href and '/gallery.php?' in href:
                return href
        raise IndexError('No gallery link found on the page')

    @staticmethod
    def what_url_is_this(url):
        '''
        Classifying urls to 'gallery', 'photo' or 'other'
        '''

        # Raw strings keep the regex escapes from being read as string escapes.
        if re.match(r'https:\/\/www\.image.+pictures\/\d+\/.+', url):
            return 'gallery'
        elif re.match(r'https:\/\/www\.image.+photo\/\d+', url):
            return 'photo'
        else:
            return 'other'

    @staticmethod
    def fetch_url(url, timeout=None):
        '''
        Fetch the provided url and return a soup object.

        Args:
            url: address of the page to download.
            timeout: seconds to wait for the server; defaults to the
                class-level ``request_timeout``.
        '''
        if timeout is None:
            timeout = gallery_retrieval.request_timeout

        # A timeout keeps an unresponsive server from blocking the kernel forever.
        response = requests.get(url, timeout=timeout)

        # Returned html document, normalised to unicode:
        uhtml = UnicodeDammit(response.text)

        # Creating soup:
        return BeautifulSoup(uhtml.unicode_markup, features="html.parser")

    @staticmethod
    def get_gallery_name(soup):
        '''
        From a soup object, the parsed gallery name is returned.

        The name is the first non-empty line of the innermost table cell whose
        text contains 'Uploaded', with spaces replaced by underscores.
        Not 100% perfect: it depends on the page layout.

        Returns:
            The gallery name, or None if no matching cell was found.
        '''

        # Innermost cells only: a td that contains another td is a layout wrapper.
        tds = [td for td in soup.findAll('td') if 'Uploaded' in td.text and not td.find('td')]

        for td in tds:
            lines = [line.strip() for line in td.text.split('\n') if line.strip()]
            if lines:
                return lines[0].replace(' ', '_')

        return None

    def get_image_urls_from_gallery(self, gallery_url):
        """
        Walk the gallery page by page and collect the photo-page URLs.

        Every link that wraps an image and points to a '/photo' path is
        completed with ``self.base_url`` and appended to ``self.image_urls``.
        The ':: next ::' link is followed until a page has none.

        Args:
            gallery_url: URL of the first gallery page to read.
        """

        page_url = gallery_url
        visited = set()

        # Iterating (instead of recursing) copes with arbitrarily long galleries;
        # the visited set stops the walk if the pagination ever loops back.
        while page_url and page_url not in visited:
            visited.add(page_url)

            # Fetch gallery page:
            soup = self.fetch_url(page_url)
            anchors = soup.findAll('a')

            # Collect all photo page urls on this page:
            for a in anchors:
                href = a.get('href')  # None for anchors without an href
                if href and href.startswith('/photo') and a.find('img'):
                    # Adding completed URL to the list:
                    self.image_urls.append(f"{self.base_url}{href}")

            # Is there a next page:
            next_page_urls = [
                a.get('href') for a in anchors
                if a.text == ':: next ::' and a.get('href')
            ]

            # The next link is resolved against the current page, so a
            # query-only href replaces the query instead of being appended.
            page_url = urljoin(page_url, next_page_urls[0]) if next_page_urls else None

    def save_images(self):
        '''
        Download every collected photo page and save its image.

        Files are written into ``self.gallery_folder``. Pages without a usable
        file name or image link, and failed downloads, are reported and skipped.
        '''
        print('Fetching images...', end = '')
        for image_url in self.image_urls:

            # Fetch data:
            soup = self.fetch_url(image_url)

            # Extract image id (used as file name if the title yields none):
            id_match = re.search(r'\/photo\/(\d+)', image_url)
            image_id = id_match.group(1) if id_match else None

            # Extract image name: the title is cut at the first 'rn Pic ' and the
            # last three characters of the prefix are dropped.
            # NOTE(review): presumably strips a fixed site suffix - verify on a live page.
            title_tag = soup.find('title')
            image_name = title_tag.text.split('rn Pic ')[0][:-3] if title_tag else ''
            image_name = self._safe_filename(image_name) or image_id
            if not image_name:
                print(f'Failed to find image name for this page: {image_url}')
                continue

            # Extract image url:
            image_links = [img.get('src') for img in soup.findAll('img') if img.get('src') and '/images/full' in img.get('src')]
            if len(image_links) == 0:
                print(f'Failed to find image url for this page: {image_url}')
                continue

            # Save image; an error response must not be written to disk as an image:
            image_data = requests.get(urljoin(image_url, image_links[0]), timeout=self.request_timeout)
            if not image_data.ok:
                print(f'Failed to download image for this page: {image_url}')
                continue

            with open(os.path.join(self.gallery_folder, image_name), 'wb') as image_file:
                image_file.write(image_data.content)

            print('.', end = '')

        # Terminate the progress line:
        print()
        
# gr = gallery_retrieval('https://www.imagefap.com/pictures/390765/Amateur-Strike-Again%21-22')    # gallery
# gr = gallery_retrieval('https://www.imagefap.com/photo/2060212603/?pgid=&gid=2789739&page=0') # picture
# gr = gallery_retrieval('https://www.imagefap.com/pictures/9453495/Violet-Petite-cutie-strips-out-of-jeans-%5BBlue-Eyes%5D') # gallery
gr = gallery_retrieval('https://www.imagefap.com/photo/1585417929/') # photo



URL type: photo
Base URL: https://www.imagefap.com
Gallery name: Violet:_Petite_cutie_strips_out_of_jeans_[Blue_Eyes]
Gallery URL: https://www.imagefap.com/gallery.php?gid=9453495
Successfully created the directory /Users/dsuveges/project/random_notebooks/Violet:_Petite_cutie_strips_out_of_jeans_[Blue_Eyes]


In [245]:
gr.save_images()

Fetching images..............................................................................................................................

In [169]:

url = 'https://www.imagefap.com/photo/1684190954/?pgid=&gid=8420383&page=0#38'
# Download and parse the page; relies on a module-level `fetch_url` helper
# that must already be defined in the running kernel.
soup = fetch_url(url)

# BeautifulSoup filters on the CSS class with the keyword `class_`;
# `_class` would be matched against an attribute literally named "_class".
soup.find('div', class_='slideshow')


In [182]:
gallery_url = 'https://www.imagefap.com/pictures/8724883/wife-bisexual-woman-swing-couple-oral-blowjob-ass-1'

soup = fetch_url(gallery_url)

In [185]:
# Hrefs of all links that wrap an image and point to a '/photo' path.
# `a.get('href')` is None for anchors without an href, hence the `or ''` guard.
[
    a.get('href')
    for a in soup.findAll('a')
    if a.find('img') and (a.get('href') or '').startswith('/photo')
]

['/photo/360894076/?pgid=&gid=8724883&page=0',
 '/photo/117813974/?pgid=&gid=8724883&page=0',
 '/photo/1199163290/?pgid=&gid=8724883&page=0',
 '/photo/341278414/?pgid=&gid=8724883&page=0',
 '/photo/921124432/?pgid=&gid=8724883&page=0',
 '/photo/325939700/?pgid=&gid=8724883&page=0',
 '/photo/934341260/?pgid=&gid=8724883&page=0',
 '/photo/1713872469/?pgid=&gid=8724883&page=0',
 '/photo/1431521971/?pgid=&gid=8724883&page=0',
 '/photo/884350586/?pgid=&gid=8724883&page=0',
 '/photo/2121117270/?pgid=&gid=8724883&page=0',
 '/photo/1569406567/?pgid=&gid=8724883&page=0',
 '/photo/1677538498/?pgid=&gid=8724883&page=0',
 '/photo/2131442903/?pgid=&gid=8724883&page=0',
 '/photo/2115945624/?pgid=&gid=8724883&page=0',
 '/photo/1616470083/?pgid=&gid=8724883&page=0',
 '/photo/617837806/?pgid=&gid=8724883&page=0',
 '/photo/1081567058/?pgid=&gid=8724883&page=0',
 '/photo/38794906/?pgid=&gid=8724883&page=0',
 '/photo/1288684627/?pgid=&gid=8724883&page=0',
 '/photo/1329314726/?pgid=&gid=8724883&page=0',
 '

In [200]:
[a.get('href') for a in soup.findAll('a') if a.text == ':: next ::']


{'?gid=8724883&page=1&view=0'}

In [208]:
image_url = 'https://www.imagefap.com/photo/118569449/?pgid=&gid=9453495&page=0'

soup = fetch_url(image_url)



In [234]:
# Hrefs of all links that wrap an image and point to a '/photo' path.
# `a.get('href')` is None for anchors without an href, hence the `or ''` guard.
image_links = [
    a.get('href')
    for a in soup.findAll('a')
    if a.find('img') and (a.get('href') or '').startswith('/photo')
]
image_links

[]

In [240]:
il = [img.get('src') for img in soup.findAll('img') if img.get('src') and '/images/full' in img.get('src')]

In [243]:
image_data = requests.get(il[0])
with open('cicaful.jpg', 'wb') as image:
    image.write(image_data.content)

In [248]:
import pyspark.sql
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark import SparkContext
import re

# Settings handed to the session builder, in the order they are applied:
spark_settings = [
    ("spark.executor.memory", '10g'),
    ("spark.driver.bindAddress", "localhost"),
    ("spark.driver.memory", '10g'),
]

# Build the session (or reuse a running one) under the application name:
session_builder = pyspark.sql.SparkSession.builder.appName("phenodigm_parser")
for setting_name, setting_value in spark_settings:
    session_builder = session_builder.config(setting_name, setting_value)

spark = session_builder.getOrCreate()

# Report which Spark version the session runs on:
print('Spark version: ', spark.version)


Spark version:  3.0.0


In [249]:
# Location of the parquet dataset that is loaded into `tdf`:
target_file = '/Users/dsuveges/project_data/ot/target_index/targets'

# Read the dataset and mark it as persisted for reuse by later cells:
tdf = spark.read.parquet(target_file).persist()

In [255]:
# Export the `hallMarks` column of the ABI1 rows as JSON for inspection.
# The default save mode raises if the output path already exists, so the
# cell failed on every re-run; overwriting makes it repeatable.
(
    tdf
    .filter(col('approvedSymbol') == 'ABI1')
    .select(col('hallMarks'))
    .write
    .mode('overwrite')
    .json('cicaful.json')
)

In [257]:
%%bash

# Pretty-print the JSON part files written by Spark. The explicit identity
# filter `.` is given because older jq releases do not accept a call without
# a filter; passing the files directly also makes `cat` unnecessary.
jq . cicaful.json/*json

{
  "hallMarks": {
    "attributes": [
      {
        "pmid": 16025998,
        "attribute_name": "role in cancer",
        "description": "TSG"
      },
      {
        "pmid": 9694699,
        "attribute_name": "role in cancer",
        "description": "fusion"
      },
      {
        "pmid": 23552839,
        "attribute_name": "role in cancer",
        "description": "TSG"
      },
      {
        "pmid": 9694699,
        "attribute_name": "fusion partner",
        "description": "KMT2A"
      },
      {
        "pmid": 23552839,
        "attribute_name": "mouse model",
        "description": "development of prostatic intraepithelial neoplasia was observed in 8-month-old Abi1 knockout mice, but no progression beyond PIN was observed in mice as old as 12 months"
      },
      {
        "pmid": 28339046,
        "attribute_name": "role in cancer",
        "description": "oncogene"
      }
    ],
    "cancer_hallmarks": [
      {
        "pmid": 28339046,
        "description": "over

In [261]:
import requests
import json
from faker import Faker
from random import randint, choice
import time

# Initialize a localized faker:
fake = Faker('en_GB')

In [262]:
fake.credit_card_number(card_type='visa')

'4548251852346782'

In [280]:
import requests


# Page to request:
url = 'https://parcelforce-uk.com/delivery?tracking=EA584319218AF'

# Browser-like User-Agent header sent with the request:
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}

# requests applies no timeout by default; without one an unresponsive server
# blocks the kernel indefinitely.
response = requests.get(url, headers=headers, timeout=30)



In [282]:
response.headers['set-cookie']

'PHPSESSID=74a3ec5fee129975666dd604026a5458; path=/'

In [1]:
files = '/Users/dsuveges/project_data/interaction_fix/fix-interactions-parquet/interactionEvidence/'

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

# Start (or reuse) a Spark session with the 'local[*]' master:
spark = SparkSession.builder.master('local[*]').getOrCreate()

# Only the two target columns are needed for the check:
target_pairs = spark.read.parquet(files).select(F.col('targetB'), F.col('targetA'))

# A row is incomplete when either of its targets is null:
missing_target = F.col('targetB').isNull() | F.col('targetA').isNull()

# Number of incomplete rows (displayed as the cell output):
target_pairs.filter(missing_target).count()


0

In [3]:
(
    spark.read.parquet(files)
    .printSchema()
)

root
 |-- interactionTypeMiIdentifier: string (nullable = true)
 |-- targetB: string (nullable = true)
 |-- evidenceScore: double (nullable = true)
 |-- intBBiologicalRole: string (nullable = true)
 |-- interactionResources: struct (nullable = true)
 |    |-- databaseVersion: string (nullable = true)
 |    |-- sourceDatabase: string (nullable = true)
 |-- participantDetectionMethodB: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- miIdentifier: string (nullable = true)
 |    |    |-- shortName: string (nullable = true)
 |-- expansionMethodShortName: string (nullable = true)
 |-- interactionDetectionMethodShortName: string (nullable = true)
 |-- intA: string (nullable = true)
 |-- intBSource: string (nullable = true)
 |-- participantDetectionMethodA: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- miIdentifier: string (nullable = true)
 |    |    |-- shortName: string (nullable = true)
 |-- speciesB: struct (n

In [4]:
%%bash

# NOTE(review): `mkdir` is invoked without a directory operand, so no directory
# name is given here. The recorded cell output is a path, which suggests the
# output is stale or the cell was edited after running - TODO confirm the
# intended directory or remove this cell.
mkdir 


/Users/dsuveges/project/random_notebooks
