In [12]:
import pandas as pd
import numpy as np
import re
import json
import os

from bs4 import BeautifulSoup
import requests
import urllib.request

from parsel import Selector
from time import sleep
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys

# Nasa Satellite images

In [20]:
def download_image(url, folder_path, file_name):
    """
    Download the image from the given URL and save it with the specified file name in the specified folder path.
    """
    
    file_path = os.path.join(folder_path, file_name + '.jpg')
    urllib.request.urlretrieve(url, file_path)

# URL and folder path
url = "https://earthobservatory.nasa.gov/images"
folder_path = r"C:\Users\darvi\Desktop\GitHub\Python-Assignment1"
folder_path = folder_path.replace("\\", "/")

# Folder name
folder_name = "Nasa Satellite Images"
file_path = os.path.join(folder_path, folder_name)

# Create the folder if it doesn't exist
if not os.path.exists(file_path):
    os.makedirs(file_path)
    
opts = Options()
driver = webdriver.Chrome(options=opts, executable_path="chromedriver")

driver.get(url)
driver.maximize_window()

Number_of_click = 3 # Adjust the number of times you want to click the "Explore More" button
while Number_of_click > 0:
    Number_of_click -= 1
    explore_more_button = driver.find_element(By.XPATH, '//*[@class="explore-more"]')
    explore_more_button.click()
    sleep(1)  # Add a delay of 2 seconds to allow the content to load

page_source = driver.page_source
# driver.quit()

soup = BeautifulSoup(page_source, "lxml")

Nasa_Satellite_images = {}
image_links = soup.find_all("div", class_="thumbnail-image")
captions = soup.find_all("div", class_="caption")

for link, caption in zip(image_links, captions):
    
    # Get the download link, description, and title
    download_link = link.a.img["src"]
    description = caption.p.text
    title = caption.h4.a.text
    
    # Download the image and save it in the specified folder path
    download_image(download_link, file_path, title)
    
    # Store image information in the dictionary
    Nasa_Satellite_images[title] = {}
    Nasa_Satellite_images[title] = {
        "Image": download_link,
        "Description": description
    }

# Save the image information dictionary as a JSON file
with open('Nasa_Satellite.json', 'w') as f:
    json.dump(Nasa_Satellite_images, f, indent = 4)

# IMDB Gender images

In [3]:
    def download_image(url, folder_path, file_name):
    """
    Download the image from the given URL and save it with the specified file name in the specified folder path.
    """
    
    file_path = os.path.join(folder_path, file_name + '.jpg')
    urllib.request.urlretrieve(url, file_path)

# URL and folder path
folder_path = r"C:\Users\darvi\Desktop\GitHub\Python-Assignment1"
folder_path = folder_path.replace("\\", "/")

# Folder name
folder_name = "IMDB Gender images"
file_path = os.path.join(folder_path, folder_name)

# Create the folder if it doesn't exist
if not os.path.exists(file_path):
    os.makedirs(file_path)
    
    
target = 10001
page = list(range(1, target, 50))
celebrities = {}
categories = ["female", "male"]

for category in categories:
    # Loop through the pages
    for i in page:
        # Get the HTML content of the page
        url = f"https://www.imdb.com/search/name/?gender={category}&start={i}&ref_=rlm"
        html_text = requests.get(url).text
        soup = BeautifulSoup(html_text, "lxml")

        # Extract the information for each celebrity
        names = soup.find_all("h3", class_="lister-item-header")
        images = soup.find_all("img")
        infos = soup.find_all("div", class_="lister-item-content")

        for name, link, info in zip(names, images, infos):
            # Get the name and image link
            name = name.a.text.strip()
            link = link["src"]

            # Download the image and save it in the specified folder path
            download_image(link, file_path, name)

            # Store the image link in the celebrities dictionary
            if category not in celebrities:
                celebrities[category] = {}
            if name not in celebrities[category]:
                celebrities[category][name] = {}
                celebrities[category][name]["Image"] = link

            # Get the role and movie information
            info_element = info.find("p", class_="text-muted text-small")
            info_text = info_element.text.strip() if info_element else ""
            role, movie = info_text.split("|") if "|" in info_text else (info_text, "")
            celebrities[category][name]["Role"] = role.strip()
            celebrities[category][name]["Movie"] = movie.strip()

            # Get the additional history information
            info_paragraphs = info.find_all("p")
            history_text = info_paragraphs[1].text.strip() if len(info_paragraphs) > 1 else ""
            celebrities[category][name]["History"] = history_text

# Save the celebrities dictionary as a JSON file
json_file_path = os.path.join(folder_path, "IMDB_Gender.json")
with open(json_file_path, 'w') as f:
    json.dump(celebrities, f)

# LinkedIn

In [14]:
# Global parameters
country = 'Finland'
pages_max = 10  # maximum of pages with job posts extracting per 1 keyword (each page contains max 25 posts)
request_and = False  # if True the request will be 'Data AND Science' instead of just 'Data Science'
file_name = 'linked-in-jobs_data-science_' + country + '_AND=' + str(request_and) + '.csv'

# Keywords for searching requests
str_ds = 'Data Science, Big data, Machine learning, Data mining, Artificial intelligence, Predictive modeling, Statistical analysis, Data visualization, Deep learning, Natural language processing, Business intelligence, Data warehousing, Data management, Data cleaning, Feature engineering, Time series analysis, Text analytics, Database, SQL, NoSQL, Neural networks, Regression analysis, Clustering, Dimensionality reduction, Anomaly detection, Recommender systems, Data integration, Data governance'
str_ml = 'Machine learning, Data preprocessing, Feature selection, Feature engineering, Data visualization, Model selection, Hyperparameter tuning, Cross-validation, Ensemble methods, Neural networks, Deep learning, Convolutional neural networks, Recurrent neural networks, Natural language processing, Computer vision, Reinforcement learning, Unsupervised learning, Clustering, Dimensionality reduction, Bayesian methods, Time series analysis, Random forest, Gradient boosting, Support vector machines, Decision trees, Regression analysis'
DataScience = str_ds.split(', ')
MachineLearning = str_ml.split(', ')
united_set = set(DataScience) | set(MachineLearning)

# Creating a dictionary to store the job postings
company_posts = {}

def scrape_jobs(keyword, n_max=pages_max * 25):
    '''Extract the jobs from LinkedIn using keyword and store the job data in the company_posts dictionary'''
    site = 'https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?keywords={}&location={}&pageNum=0&start={}'

    for i in range(0, n_max, 25):
        url = site.format(keyword, 'Finland', str(i))
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
        response = requests.get(url, headers=headers).text

        soup = BeautifulSoup(response, 'lxml')
        jobs = soup.find_all(
            class_='base-card relative w-full hover:no-underline focus:no-underline base-card--link base-search-card base-search-card--link job-search-card')

        for job in jobs:
            job_title = job.find('h3', class_='base-search-card__title').text.strip()
            job_company = job.find('h4', class_='base-search-card__subtitle').text.strip()
            job_location = job.find('span', class_='job-search-card__location').text.strip()
            job_link = job.find('a', class_='base-card__full-link')['href'] if job.find('a', class_='base-card__full-link') else ' '
            job_date = job.find('time', class_="job-search-card__listdate")['datetime'] if job.find('time', class_="job-search-card__listdate") else ' '

            # Update the company_posts dictionary
            if job_company not in company_posts:
                company_posts[job_company] = {keyword: []}
            else:
                if keyword not in company_posts[job_company]:                   
                    company_posts[job_company][keyword] = []

            # Append the job details to the corresponding keyword list
            company_posts[job_company][keyword].append({
                'Title': job_title,
                'Location': job_location,
                'Link': job_link,
                'Date': job_date
            })

        time.sleep(0.15)


# Initialization a cycle with keywords from the united_set
separator = '%20AND%20' if request_and else '%20'
for item in united_set:
    print(item, '...', end=' ')
    scrape_jobs(keyword=item.replace(' ', separator))

# Writing the data to a CSV file
with open(file_name, 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['Company', 'Keyword', 'Title', 'Location', 'Link', 'Date'])

    for company, keyword_data in company_posts.items():
        for keyword, posts in keyword_data.items():
            for post in posts:
                writer.writerow([company, keyword, post['Title'], post['Location'], post['Link'], post['Date']])
                
# Save the image information dictionary as a JSON file
with open('LinkedIn.json', 'w') as f:
    json.dump(company_posts, f, indent = 4)              

Time series analysis ... Data cleaning ... Predictive modeling ... SQL ... Recommender systems ... Random forest ... Data preprocessing ... Ensemble methods ... Data integration ... Data governance ... Recurrent neural networks ... Neural networks ... Model selection ... Reinforcement learning ... Clustering ... Data Science ... Data visualization ... Machine learning ... Cross-validation ... Convolutional neural networks ... Data mining ... Anomaly detection ... Dimensionality reduction ... Text analytics ... Statistical analysis ... Data management ... Feature selection ... Deep learning ... Business intelligence ... Big data ... Computer vision ... Hyperparameter tuning ... Natural language processing ... Feature engineering ... Artificial intelligence ... Regression analysis ... Decision trees ... Data warehousing ... NoSQL ... Bayesian methods ... Database ... Gradient boosting ... Support vector machines ... Unsupervised learning ... 