In [1]:
#Twitter
from tweepy import OAuthHandler, Stream
from tweepy.streaming import StreamListener
import tweepy

#YouTube
from googleapiclient.discovery import build

#Instagram
from selenium import webdriver
from bs4 import BeautifulSoup as bs
import time
import re
from urllib.request import urlopen

#Tumblr
import pytumblr

In [2]:
import pandas as pd, numpy as np
from pandas import json_normalize
import json
from decouple import Config, RepositoryEnv
from selenium.webdriver.common.keys import Keys
import requests

In [3]:
# Location of the dotenv file holding the API keys, URLs and output paths that
# the fetch functions in this notebook read through env_config.get(...).
# NOTE(review): the path is relative, so it presumably resolves against the
# kernel's working directory — confirm before running from another folder.
DOTENV_FILE = '.env'
# python-decouple configuration object backed by DOTENV_FILE.
env_config = Config(RepositoryEnv(DOTENV_FILE))

In [4]:
def twitter_fetch(max_items=275):
    """Print the raw JSON of statuses from the authenticated user's home timeline.

    Credentials are read from ``env_config`` (keys ``Twitter_ACCESS_TOKEN``,
    ``Twitter_ACCESS_SECRET``, ``Twitter_CONSUMER_KEY``,
    ``Twitter_CONSUMER_SECRET``).

    Parameters
    ----------
    max_items : int, optional
        Upper bound on the number of statuses pulled through the cursor.
        Defaults to 275, the value that used to be hard-coded.
    """
    access_token = env_config.get('Twitter_ACCESS_TOKEN')
    access_secret = env_config.get('Twitter_ACCESS_SECRET')
    consumer_key = env_config.get('Twitter_CONSUMER_KEY')
    consumer_secret = env_config.get('Twitter_CONSUMER_SECRET')

    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_secret)
    # NOTE(review): wait_on_rate_limit_notify (and the StreamListener import at
    # the top of the notebook) look like tweepy 3.x APIs — confirm the pinned
    # tweepy version before upgrading.
    api = tweepy.API(
        auth,
        wait_on_rate_limit=True,
        wait_on_rate_limit_notify=True,
        compression=True,
    )

    for status in tweepy.Cursor(api.home_timeline).items(max_items):
        print(status._json)

In [5]:
def youtube_fetch(channelId):
    """Export per-video statistics of a YouTube channel's uploads to a CSV file.

    Reads ``youTubeApiKey``, ``youTubeFileLocation`` and ``youtubeURL`` from
    ``env_config``, walks the channel's "uploads" playlist page by page,
    fetches the ``statistics`` part for every video and overwrites the CSV at
    ``youTubeFileLocation`` (no header row, index column included) with the
    columns title, url, liked, disliked, views, comment.

    Parameters
    ----------
    channelId : str
        Id of the channel whose uploads are exported.

    Raises
    ------
    ValueError
        If the API returns no channel for ``channelId``.
    """
    youTubeApiKey = env_config.get('youTubeApiKey')
    youTubeFileLocation = env_config.get('youTubeFileLocation')
    youtubeURL = env_config.get('youtubeURL')
    youtube = build('youtube', 'v3', developerKey=youTubeApiKey)

    contentdata = youtube.channels().list(id=channelId, part='contentDetails').execute()
    channel_items = contentdata.get('items') or []
    if not channel_items:
        raise ValueError('No YouTube channel found for id {!r}'.format(channelId))
    playlist_id = channel_items[0]['contentDetails']['relatedPlaylists']['uploads']

    # Collect every entry of the uploads playlist, following nextPageToken.
    videos = []
    next_page_token = None
    while True:
        res = youtube.playlistItems().list(
            playlistId=playlist_id,
            part='snippet',
            maxResults=50,
            pageToken=next_page_token,
        ).execute()
        videos += res.get('items', [])
        next_page_token = res.get('nextPageToken')
        if next_page_token is None:
            break

    video_ids = [video['snippet']['resourceId']['videoId'] for video in videos]

    # Key the statistics by video id instead of relying on list position: the
    # videos endpoint may return fewer items than requested (e.g. for videos
    # that are no longer available), which would shift every later row.
    stats_by_id = {}
    for start in range(0, len(video_ids), 40):
        res = youtube.videos().list(
            id=','.join(video_ids[start:start + 40]),
            part='statistics',
        ).execute()
        for item in res.get('items', []):
            stats_by_id[item['id']] = item.get('statistics', {})

    columns = ['title', 'url', 'liked', 'disliked', 'views', 'comment']
    rows = []
    for video in videos:
        snippet = video['snippet']
        video_id = snippet['resourceId']['videoId']
        statistics = stats_by_id.get(video_id, {})
        # Any counter can be absent (hidden ratings, disabled comments, private
        # dislike counts), so fall back to 0 rather than raising KeyError.
        rows.append({
            'title': snippet['title'],
            'url': youtubeURL + video_id,
            'liked': int(statistics.get('likeCount', 0)),
            'disliked': int(statistics.get('dislikeCount', 0)),
            'views': int(statistics.get('viewCount', 0)),
            'comment': int(statistics.get('commentCount', 0)),
        })

    # Passing columns explicitly keeps the column order stable even when the
    # channel has no uploads.
    df = pd.DataFrame(rows, columns=columns)
    df.to_csv(youTubeFileLocation, header=False, mode='w')

In [6]:
def _instagram_shared_data(markup):
    """Parse the ``window._sharedData`` JSON object embedded in an Instagram page.

    Raises ValueError when the page has no such script or its payload is not
    valid JSON.
    """
    soup = bs(markup, 'html.parser')
    # Scripts without inline text hand None to the matcher, so guard before
    # calling string methods on it.
    script = soup.find(
        'script',
        text=lambda t: bool(t) and t.strip().startswith('window._sharedData'),
    )
    if script is None or not script.string:
        raise ValueError('window._sharedData script not found in page')
    # Keep everything after the assignment's '=' and drop only the trailing
    # ';' — semicolons inside JSON strings (e.g. captions) must survive.
    payload = script.string.strip().partition('=')[2].strip().rstrip(';')
    return json.loads(payload)


def instagram_fetch(hashtag_list):
    """Scrape the posts listed on each hashtag page and append them to a CSV.

    Reads ``chromeDriverLocation``, ``instagramFileLocation``,
    ``instagramTagURL`` and ``instagramURL`` from ``env_config``. For every
    hashtag the tag page is opened in Chrome, the post links found in its
    embedded JSON are downloaded one by one, each post's ``graphql`` payload is
    flattened into a row, rows are de-duplicated on ``shortcode`` and appended
    (no header) to ``instagramFileLocation``.

    Scraping is best-effort: a hashtag page or a post that cannot be read is
    reported on stdout and skipped. The browser is always closed, even when an
    unexpected error occurs.

    Parameters
    ----------
    hashtag_list : iterable of str
        Hashtags without the leading ``#``.
    """
    chromeDriverLocation = env_config.get('chromeDriverLocation')
    instagramFileLocation = env_config.get('instagramFileLocation')
    instagramTagURL = env_config.get('instagramTagURL')
    instagramURL = env_config.get('instagramURL')

    browser = webdriver.Chrome(chromeDriverLocation)
    try:
        for hashtag in hashtag_list:
            browser.get(instagramTagURL + hashtag)
            browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")

            try:
                tag_data = _instagram_shared_data(browser.page_source)
                edges = tag_data['entry_data']['TagPage'][0]['graphql']['hashtag']['edge_hashtag_to_media']['edges']
                links = [instagramURL + edge['node']['shortcode'] + '/' for edge in edges]
            except (ValueError, KeyError, IndexError, TypeError) as exc:
                print("Skipping hashtag {!r}: could not read tag page ({!r})".format(hashtag, exc))
                continue

            frames = []
            for link in links:
                try:
                    with urlopen(link) as response:
                        page = response.read()
                    post = _instagram_shared_data(page)['entry_data']['PostPage'][0]['graphql']
                    frame = json_normalize(post)
                    # Literal prefix removal; as a regex the '.' would match any character.
                    frame.columns = frame.columns.str.replace("shortcode_media.", "", regex=False)
                except Exception as exc:
                    # Best-effort per post, but unlike a bare except this lets
                    # KeyboardInterrupt / SystemExit through.
                    print("Skipping post {}: {!r}".format(link, exc))
                    continue
                frames.append(frame)

            if not frames:
                # Nothing scraped for this hashtag; there is nothing to write.
                continue

            # Concatenate once instead of growing a DataFrame row by row.
            result = pd.concat(frames, ignore_index=True, sort=False)
            if 'shortcode' in result.columns:
                result = result.drop_duplicates(subset='shortcode')
            result = result.reset_index(drop=True)
            result.to_csv(instagramFileLocation, mode='a', header=False)
    finally:
        browser.quit()

In [7]:
def tumblr_fetch(tagList):
    """Append the Tumblr posts tagged with each entry of ``tagList`` to a CSV.

    The API key (``Tumblr_API_KEY``) and the output path
    (``tumblrFileLocation``) come from ``env_config``. For each tag the
    response of ``client.tagged`` is flattened with ``json_normalize`` and
    appended to the file without a header row.
    """
    api_key = env_config.get('Tumblr_API_KEY')
    csv_path = env_config.get('tumblrFileLocation')
    tumblr = pytumblr.TumblrRestClient(api_key)

    for current_tag in tagList:
        tagged_posts = tumblr.tagged(current_tag, limit=200)
        flattened = json_normalize(tagged_posts)
        frame = pd.DataFrame.from_dict(flattened, orient='columns')
        frame.to_csv(csv_path, mode='a', header=False)

In [8]:
def FetchHashtag():
    """Discover popular hashtags on the web and scrape them on Instagram and Tumblr.

    Opens ``googleURL`` (from ``env_config``) in Chrome, searches for
    "popular hashtags", takes the first result link whose href does not
    contain "google", downloads that page and collects every ``#hashtag``
    found in ``<p>`` elements nested in ``<ul>`` lists. The de-duplicated
    hashtags (order preserved, leading ``#`` removed) are then passed to
    ``instagram_fetch`` and ``tumblr_fetch``.

    Raises
    ------
    RuntimeError
        If the search results contain no non-Google link.
    """
    chromeDriverLocation = env_config.get('chromeDriverLocation')
    googleURL = env_config.get('googleURL')

    url = None
    browser = webdriver.Chrome(chromeDriverLocation)
    try:
        browser.get(googleURL)
        search_query = browser.find_element_by_name('q')
        search_query.send_keys('popular hashtags')
        search_query.send_keys(Keys.ENTER)

        for anchor in browser.find_elements_by_tag_name('a'):
            href = anchor.get_attribute('href')
            if href and 'google' not in href:
                url = href
                break
    finally:
        # Close the browser even if the search itself fails.
        browser.quit()

    if url is None:
        raise RuntimeError('No non-Google link found in the search results for "popular hashtags"')

    # A timeout keeps the notebook from hanging forever on an unresponsive site.
    resp = requests.get(url, timeout=30)
    soup = bs(resp.text, 'html.parser')

    found = []
    for ul in soup.find_all("ul"):
        for paragraph in ul.find_all("p"):
            text = paragraph.string
            if text and '#' in text:
                # One paragraph may hold several hashtags; extract each one
                # separately instead of producing a single tag with spaces.
                found.extend(re.findall(r'#(\w+)', text))

    # Nested <ul> elements yield the same <p> more than once; drop repeats
    # while keeping first-seen order.
    hashtag_list = list(dict.fromkeys(found))
    if not hashtag_list:
        print("No hashtags found at {}".format(url))
        return

    instagram_fetch(hashtag_list)
    tumblr_fetch(hashtag_list)

In [None]:
# Interactive entry point: ask which platform to scrape and dispatch to the
# matching fetch function.
x = input(
    "Which platform do you want to use? \n"
    "1. Instagram\n"
    "2. Twitter\n"
    "3. Youtube\n"
    "4. Tumblr\n"
)
if x in ("1", "4"):
    # Instagram and Tumblr share one entry point: FetchHashtag() gathers the
    # hashtags and hands them to both fetchers.
    FetchHashtag()
elif x == "2":
    twitter_fetch()
elif x == "3":
    channelId = input("Enter channel Id whose data you want to fetch : ")
    youtube_fetch(channelId)
else:
    print("Wrong Value Entered, try again!!")