
# <div style="text-align: center;">
  <span style="color: #505050; font-size: 30px;">**Selenium Webscraping and Automation: Instagram Use Case**</span>
</div>


## Import Modules

In [None]:
#imports here
import os
import re
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

## Login Page

In [None]:
# Credentials are read from the environment so real secrets never have to be
# typed into (and saved with) the notebook. Export INSTAGRAM_USERNAME and
# INSTAGRAM_PASSWORD before starting Jupyter. When a variable is not set, the
# original placeholder text is used, so the cell behaves exactly as before.
username_str = os.environ.get('INSTAGRAM_USERNAME', 'your username')
password_str = os.environ.get('INSTAGRAM_PASSWORD', 'your password')

In [None]:
# Start a Chrome session. No driver path is passed here, so chromedriver must
# be discoverable by Selenium on its own (for example on PATH).
driver = webdriver.Chrome()

# open the login page
driver.get("http://www.instagram.com")

# wait (up to 10 s each) until both login fields can be interacted with
wait = WebDriverWait(driver, 10)
username = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "input[name='username']")))
password = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "input[name='password']")))

# enter username and password
username.clear()
username.send_keys(username_str)
password.clear()
password.send_keys(password_str)

# target the login button and click it; click() returns None, so the result
# is deliberately not stored
WebDriverWait(driver, 2).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "button[type='submit']"))).click()

# Handle the "Not now" / "Not Now" prompts that can follow the login.
# A prompt that does not show up within 10 s is treated as "nothing to
# dismiss" instead of aborting the whole cell with a TimeoutException.
for prompt_xpath in (
    '//div[contains(text(), "Not now")]',
    '//button[contains(text(), "Not Now")]',
):
    try:
        wait.until(EC.element_to_be_clickable((By.XPATH, prompt_xpath))).click()
    except TimeoutException:
        print(f"Prompt not shown, skipping: {prompt_xpath}")

## Access Any Page

In [None]:
# Once logged in, any profile can be opened through its handle.
company = 'linkedinlearning'  # handle of the profile to visit
profile_url = f"http://www.instagram.com/{company}/"
driver.get(profile_url)

In [None]:
# Every cell below this one still drives the browser through `driver`, so
# closing it here would make the rest of the notebook fail on a top-to-bottom
# run. The shutdown is therefore opt-in: set the flag to True (or call
# driver.quit() by hand) only once all scraping cells have finished.
CLOSE_BROWSER = False
if CLOSE_BROWSER:
    driver.quit()

### Get Page Title

### Get Page Main Information: Number of Posts, Number of Followers and Number of Following

In [None]:
# Locate the elements that hold the three counters: posts, followers and
# following. An explicit wait (up to 10 s) is used instead of reading the DOM
# right away: find_elements returns an empty list when nothing has rendered
# yet, which would make the indexing below fail with IndexError.
d = WebDriverWait(driver, 10).until(
    EC.presence_of_all_elements_located((By.CLASS_NAME, 'html-span'))
)

# the first three matches are read as posts, followers and following
posts = d[0].text
followers = d[1].text
following = d[2].text

# display results
print(f"Number of Followers: {followers}")
print(f"Number of Following: {following}")
print(f"Number of Posts: {posts}")

### Scroll Across the Page to Get More Posts

In [None]:
# Gather post links while scrolling; a set keeps each link only once.
urls = set()
scrolls = 7  # how many times the page is scrolled to the bottom

for _ in range(scrolls):
    # every element with this class is expected to wrap one post link
    tiles = driver.find_elements(By.CLASS_NAME, '_al3l')
    urls.update(
        tile.find_element(By.TAG_NAME, 'a').get_attribute('href')
        for tile in tiles
    )

    # jump to the bottom of the page, then pause so more posts can load
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)

# hand the links on as a list
urls = list(urls)

In [None]:
# number of unique post links gathered by the scrolling cell
len(urls)

## Final Script: Collecting All Data from Posts

### Collecting post data and text cleaning with regex

In [None]:
# Initialize lists to store information (one entry per post)
likes = []
dates = []
captions = []
tot_times = []
tot_users = []
tot_comments = []
tot_num_likes = []
hashtags = []

# Time pattern: a number followed by one of the letters h, w, d, m, s.
# Defined once because it does not change from post to post.
pattern = r"\d+[hwdms]"


def _text_at(texts, index):
    """Return texts[index], or '' when index lies outside the list.

    Used for the neighbour lookups around a time stamp: a negative index
    would silently wrap around to the end of the list, and an index past the
    end would raise IndexError.
    """
    if 0 <= index < len(texts):
        return texts[index]
    return ''


# Start looping over the collected post urls
for url in urls:

    # open the post page using its url
    driver.get(url)
    time.sleep(7.5)  # fixed pause so the post page can render

    # get post likes and store them in a list
    like = driver.find_element(By.CLASS_NAME, 'html-span')
    likes.append(like.text)

    # getting post date
    date = driver.find_element(By.TAG_NAME, 'time').get_attribute('title')
    dates.append(date)

    # getting post caption in the comments element as the first comment using its XPATH
    cap = driver.find_element(By.XPATH, '/html/body/div[2]/div/div/div[2]/div/div/div[1]/div[1]/div[2]/section/main/div/div[1]/div/div[2]/div/div[2]/div/div[1]/div/div[2]/div/span/div/span')
    captions.append(cap.text)

    # locate the elements that contain the comments and keep their texts
    com = driver.find_elements(By.CLASS_NAME, "x18hxmgj")
    comm = [element.text for element in com]

    # fresh lists for every post
    times = []
    users = []
    comments = []
    num_likes = []
    hashtag = []
    for i, x in enumerate(comm):
        if re.match(pattern, x):
            times.append(x)  # the text that matches the time pattern
            users.append(_text_at(comm, i - 1))  # user comes right before the time
            comments.append(_text_at(comm, i + 1))  # comment comes right after the time
            num_likes.append(_text_at(comm, i + 2))  # comment likes come after the comment
        # Accumulate hashtags from every text. Assigning the findall result
        # here would keep only the hashtags of the last text in the list.
        hashtag.extend(re.findall(r'#[\w]+', x))
    hashtags.append(hashtag)
    tot_times.append(times)
    tot_users.append(users)
    tot_comments.append(comments)
    tot_num_likes.append(num_likes)

### Further Preprocessing to get final results

In [None]:
# NOTE: this cell edits the collected lists in place, so run it exactly once
# per scrape; running it again drops one more entry from every list.

# Delete the first element of every per-post list because it is already
# collected (the caption is scraped separately).
# `del my_list[:1]` removes the first element in place but, unlike
# `del my_list[0]`, does nothing on an empty list, so a post without any
# matched comment no longer raises IndexError.
#
# When a comment has 0 likes, the word 'Reply' appears where the number of
# likes would be, so 'Reply' is replaced with '0'.
for i, my_list in enumerate(tot_num_likes):
    del my_list[:1]
    tot_num_likes[i] = ['0' if item == "Reply" else item for item in my_list]

# Numeric view of the like texts: every run of digits found in an item becomes
# one integer in the matching sublist.
result = []
for sublist in tot_num_likes:
    new_sublist = []
    for item in sublist:
        # Find all occurrences of one or more digits
        numbers = re.findall(r'\d+', item)
        # Convert each found number to an integer and add to the new sublist
        new_sublist.extend([int(num) for num in numbers])
    # Add the new sublist to the result list
    result.append(new_sublist)

# Delete the first element from each remaining per-post list
for per_post_lists in (tot_comments, tot_users, tot_times):
    for my_list in per_post_lists:
        del my_list[:1]

### Grouping All Together in a DataFrame

In [None]:
# Group everything into one DataFrame, one row per post. The frame is built
# in a single constructor call, so all columns are checked for equal length
# at once.
dc = pd.DataFrame({
    'post likes': likes,
    'post date': dates,
    'post caption': captions,
    'comments': tot_comments,
    'time per comment': tot_times,
    'user per comment': tot_users,
    'likes per comment': tot_num_likes,
    # number of comments kept for each post
    'num of comments': [len(com) for com in tot_comments],
})

dc

In [None]:
# Write the per-post table to a CSV file, leaving out the index column.
# NOTE(review): the file is named "skillshare.csv" while the profile opened
# earlier in this notebook is 'linkedinlearning' — confirm the intended name.
dc.to_csv("skillshare.csv", index=False)

### Collecting Pages Data

In [None]:
# Instagram handles of the profiles to compare
companies = ['udemy', 'coursera', 'datacamp', 'linkedinlearning', 'skillshare', 'udacity', 'edxonline']
posts, followers, following = [], [], []

# one waiter with a 10 s timeout, shared by all profile pages
wait = WebDriverWait(driver, 10)

# visit each profile and record its three header counters
for company in companies:
    driver.get(f"http://www.instagram.com/{company}/")
    # block until the counter elements are present in the DOM
    d = wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, 'html-span')))
    # matches 0, 1 and 2 are stored as posts, followers and following
    for counter_list, position in ((posts, 0), (followers, 1), (following, 2)):
        counter_list.append(d[position].text)

In [None]:
# Assemble the per-company counters into a single table, one row per company
companies_df = pd.DataFrame({
    'name': companies,
    'num of posts': posts,
    'num of followers': followers,
    'num of following': following,
})

In [None]:
# display the per-company table
companies_df

In [None]:
# write the per-company table to a CSV file, leaving out the index column
companies_df.to_csv('companies_info.csv', index=False)

<div style="text-align: center;">
  <span style="font-size: 3.5em; color: #505050;">Thank You!</span>
</div>
