# Collect Data

In this notebook, we collect data about the [12 candidates in the 2022 French presidential election](https://fr.wikipedia.org/wiki/Candidats_%C3%A0_l%27%C3%A9lection_pr%C3%A9sidentielle_fran%C3%A7aise_de_2022):
- Nathalie Arthaud  
- Nicolas Dupont-Aignan 
- Anne Hidalgo 
- Yannick Jadot 
- Jean Lassalle 
- Marine Le Pen 
- Emmanuel Macron 
- Jean-Luc Mélenchon 
- Valérie Pécresse 
- Philippe Poutou 
- Fabien Roussel 
- Éric Zemmour 

We explore the following social networks:
- Twitter
- YouTube
- Instagram

We plan to study textual data, so we collect tweets, YouTube video descriptions and the text of Instagram posts. The criteria below are required for this collection to respect the [laws](https://www.cnil.fr/fr/communication-politique-quelles-regles-pour-la-collecte-de-donnees-sur-les-reseaux-sociaux):

- I, Mathilde Boltenhagen, will collect, use and delete the data;
- Raw data will not be shared, and it will be deleted before the 30th of April;
- The data will be used to extract information; the entire analytic process will be shared in this folder;
- Any request for correction, access, deletion or any other action granted by [les droits Informatique et Libertés](https://www.cnil.fr/fr/les-droits-pour-maitriser-vos-donnees-personnelles) will be considered; please contact [Mathilde Boltenhagen](https://www.linkedin.com/in/mathilde-boltenhagen/);
- You have the right to lodge a complaint with the CNIL.


The main question is:
*How do the candidates communicate through the social networks?*



In [None]:
import pandas as pd
from dateutil import parser
import datetime
import os
import json
import tweepy
from instagramy import InstagramUser 
import requests
import time
import random
import pytz
from apiclient.discovery import build 

Identifiers of each candidate on the different platforms (Twitter, YouTube, Instagram).

In [None]:
# One record per candidate: display name plus the identifier used on each
# platform. "youtubeId" is None when no YouTube identifier is available.
candidatesId = [
    {
        "name": "Nathalie Arthaud",
        "twitterId": "n_arthaud",
        "youtubeId": "UUZsh-MrJftAOP_-ZgRgLScw",
        "instagramId": "nathalie_arthaud_lo",
    },
    {
        "name": "Nicolas Dupont-Aignan",
        "twitterId": "dupontaignan",
        "youtubeId": "UUfA5DnCDX3Ixy5QOAMGtBlA",
        "instagramId": "dupontaignan",
    },
    {
        "name": "Anne Hidalgo",
        "twitterId": "Anne_Hidalgo",
        "youtubeId": "UUcvK-yrz2_dSJUSNDJfq7JA",
        "instagramId": "annehidalgo",
    },
    {
        "name": "Yannick Jadot",
        "twitterId": "yjadot",
        "youtubeId": "UUw7v4zI01GNLUO6jYgIHx0w",
        "instagramId": "yannickjadot",
    },
    {
        "name": "Jean Lassalle",
        "twitterId": "jeanlassalle",
        "youtubeId": "UUdUat4f2yol7iMpCYoFUmNg",
        "instagramId": "jeanlassalleoff",
    },
    {
        "name": "Marine Le Pen",
        "twitterId": "MLP_officiel",
        "youtubeId": "UUU3z3px1_RCqYBwrs8LJVWg",
        "instagramId": "marine_lepen",
    },
    {
        "name": "Emmanuel Macron",
        "twitterId": "EmmanuelMacron",
        "youtubeId": "UUFqGa9uitcB-fWyNZK2xImw",
        "instagramId": "emmanuelmacron",
    },
    {
        "name": "Jean-Luc Mélenchon",
        "twitterId": "JLMelenchon",
        "youtubeId": "UUk-_PEY3iC6DIGJKuoEe9bw",
        "instagramId": "jlmelenchon",
    },
    {
        "name": "Valérie Pécresse",
        "twitterId": "vpecresse",
        "youtubeId": "UUXAKlEXGwoavQuOMaNBeaXw",
        "instagramId": "vpecresse",
    },
    {
        "name": "Philippe Poutou",
        "twitterId": "PhilippePoutou",
        "youtubeId": None,
        "instagramId": "philippepoutou_officiel",
    },
    {
        "name": "Fabien Roussel",
        "twitterId": "Fabien_Roussel",
        "youtubeId": None,
        "instagramId": "fabien_roussel",
    },
    {
        "name": "Éric Zemmour",
        "twitterId": "ZemmourEric",
        "youtubeId": "UUjTbZBXEw-gplUAnMXLYHpg",
        "instagramId": "ericzemmour_",
    },
]

## TWITTER

In [None]:
# get last date of the previous collect
def getLastDate(file_name, column_name = "created_at"):
    """Return the most recent date stored in a previously saved CSV.

    Args:
        file_name: path of the ';'-separated CSV written by an earlier run.
        column_name: name of the column holding the dates.

    Returns:
        A timezone-aware ``datetime.datetime`` (UTC): the latest value of
        ``column_name``, or 2022-01-01 00:01:01 UTC when the file is missing,
        empty, or holds no date.

    Raises:
        KeyError: if the file exists but has no ``column_name`` column.
    """
    default_date = datetime.datetime(2022, 1, 1, 0, 1, 1, tzinfo=datetime.timezone.utc)
    try:
        df = pd.read_csv(file_name, sep=";")
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # nothing collected yet (no file, or a zero-byte file)
        return default_date
    # Parse before taking the maximum: comparing the raw strings would order
    # them alphabetically, which is only chronological when every row shares
    # the same format and UTC offset.
    latest = pd.to_datetime(df[column_name], utc=True).max()
    if pd.isna(latest):
        # header-only file, or a column with no usable value
        return default_date
    return latest.to_pydatetime()

# Latest tweet date already stored in twitter.csv (or the default start date
# returned by getLastDate when there is no usable file).
lastupdate = getLastDate("twitter.csv")
# echo the value as the cell output
lastupdate

In [None]:
# Load the Twitter API credentials from the local JSON file.
with open('twitter.json', 'r') as f:
    twitter_info = json.load(f)

# Pull the four credential values out of the file, in the order they are used.
consumer_key, consumer_secret, access_token, access_token_secret = (
    twitter_info[field]
    for field in ('consumer_key', 'consumer_secret', 'access_token', 'access_token_secret')
)

# Build the authenticated API client used by the collection loop.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)

In [None]:
# Rows collected during this run, one list per tweet, in the column order used
# when saving: ['id', 'text', 'retweet_count', 'favorite_count', 'created_at']
# (the 'id' column holds the candidate's name).
collectedTwitterData = []

# for all candidates, get new tweets
for candidate_id in candidatesId:
    print(candidate_id["name"])
    # Look at up to 200 timeline items. The stored identifiers are Twitter
    # handles, so they are passed as `screen_name`.
    cursor = tweepy.Cursor(api.user_timeline, screen_name = candidate_id["twitterId"]).items(200)
    for tweet in cursor:
        # The loop relies on the timeline being returned newest first: once a
        # tweet is not newer than the last saved date, everything after it was
        # handled by a previous run. Using `<=` also stops the tweet dated
        # exactly `lastupdate` from being appended again on every run.
        if tweet.created_at <= lastupdate:
            print(tweet.created_at)
            break
        # skip retweets ("RT...") and replies ("@...")
        if tweet.text.startswith(("RT", "@")):
            continue
        # NOTE(review): `tweet.text` may be the truncated form of long tweets;
        # confirm whether the extended tweet mode is wanted for text analysis.
        collectedTwitterData.append([candidate_id["name"], tweet.text, tweet.retweet_count,
                                     tweet.favorite_count, tweet.created_at])

In [None]:
# Peek at the last collected row before saving (None when this run collected
# nothing, instead of failing with an IndexError on the empty list).
# NOTE(review): the original comment ended with an unfinished
# "note that you should not" -- the intended warning is unknown.
collectedTwitterData[-1] if collectedTwitterData else None

Please make sure you have the right to save the data: [Developer Agreement and Policy](https://developer.twitter.com/en/developer-terms/agreement-and-policy), [More about restricted uses of the Twitter APIs](https://developer.twitter.com/en/developer-terms/more-on-restricted-use-cases). 

In [None]:
# put into a df and save
df = pd.DataFrame(collectedTwitterData, columns = ['id', 'text', 'retweet_count','favorite_count', 'created_at'])

# Append the new rows to the CSV. The header row is written only when the file
# is being created (or is still empty): getLastDate reads the file back and
# looks the date column up by name, so the first line must hold the column names.
write_header = not os.path.exists('twitter.csv') or os.path.getsize('twitter.csv') == 0
df.to_csv('twitter.csv', index=False, sep=";", mode='a', header=write_header)

## YOUTUBE


In [None]:
# Load the JSON file holding the YouTube API key (read later as youtube_key["apikey"]).
# NOTE(review): this opens 'twitter.json', the same file as the Twitter
# credentials -- confirm that the "apikey" entry really lives in that file and
# that a separate YouTube file was not intended.
with open('twitter.json', 'r') as f:
    youtube_key = json.load(f)

In [None]:
# launches requests per channel and per videos to get stats
def getAllDescriptionsOfAnAccount(accountId, name, output):
    """Append one row per new video of a playlist to ``output``.

    Pages through the playlist 50 items at a time and, for every item published
    after the module-level ``lastupdate``, fetches the video statistics and
    appends ``[name, publishedAt, description, likeCount, commentCount]``.
    Reads the API key from the module-level ``youtube_key["apikey"]``.

    Args:
        accountId: playlist id passed as ``playlistId`` (presumably the
            channel's uploads playlist -- verify against the stored ids).
        name: candidate name stored in the first column of every row.
        output: list the rows are appended to (modified in place).

    Returns:
        The same ``output`` list.
    """
    # The client does not depend on the page, so build it once.
    youtube = build('youtube', 'v3', developerKey = youtube_key["apikey"])
    nextPageToken = None
    while True:
        # request 50 videos of channel accountId
        response = youtube.playlistItems().list(part = "snippet", playlistId = accountId,
                                                maxResults = 50, pageToken = nextPageToken).execute()
        for item in response["items"]:
            snippet = item["snippet"]
            dateOfPost = parser.parse(snippet["publishedAt"])
            if dateOfPost <= lastupdate:
                # Assumes items come newest first (as the original early exit
                # did): this item and everything after it were handled by a
                # previous run, so stop instead of paging through the rest.
                return output
            # get stats
            rating = youtube.videos().list(part = "statistics",
                                           id = snippet["resourceId"]["videoId"]).execute()
            # The statistics entry, or single counters in it, can be absent;
            # fall back to 0 as was already done for commentCount.
            statistics = rating["items"][0]["statistics"] if rating["items"] else {}
            output.append([name, snippet["publishedAt"],
                           snippet["description"],
                           statistics.get("likeCount", 0),
                           statistics.get("commentCount", 0)])
        # every item of this page was new: go on while there is another page
        nextPageToken = response.get('nextPageToken')
        if nextPageToken is None:
            return output

In [None]:
# Collect the new video descriptions of every candidate that has a YouTube id.
lastupdate = getLastDate("youtube.csv")

output = []
for c in candidatesId:
    # candidates whose "youtubeId" is None (or empty) are skipped
    if not c["youtubeId"]:
        continue
    output = getAllDescriptionsOfAnAccount(c["youtubeId"], c["name"], output)

In [None]:
# Peek at the last collected row before saving (None when this run collected
# nothing, instead of failing with an IndexError on the empty list).
output[-1] if output else None

Please make sure you have the right to save the data: [Policies](https://developers.google.com/youtube/terms/developer-policies). 

In [None]:
# put into a df and save
df = pd.DataFrame(output, columns = ['id', 'created_at', 'description','like_count', 'comment_count'])

# Append the new rows to the CSV. The header row is written only when the file
# is being created (or is still empty): getLastDate reads the file back and
# looks the date column up by name, so the first line must hold the column names.
write_header = not os.path.exists('youtube.csv') or os.path.getsize('youtube.csv') == 0
df.to_csv('youtube.csv', index=False, sep=";", mode='a', header=write_header)

## Instagram

In [None]:
# Load the Instagram credentials from the local JSON file.
with open('instagram.json', 'r') as f:
    insta_info = json.load(f)
    
# session id, sent as the "sessionid" cookie by the Instagram requests below
sessionid = insta_info["sessionid" ]

In [None]:
def appendNextText(name, dateOfPost, e, output):
    """Append the row of one Instagram media edge to ``output`` if it is recent enough.

    The edge is kept when ``dateOfPost`` is None or not older than the
    module-level ``lastupdate``. The appended row is
    ``[name, caption text, like count, comment count, dateOfPost]``; the
    caption is "" when the post has none.

    Args:
        name: candidate name stored in the first column.
        dateOfPost: date of the post, or None to skip the date check.
        e: one media edge (nested dicts with a "node" entry).
        output: list the row is appended to (modified in place).

    Returns:
        The same ``output`` list.
    """
    # per element, we check if the date is ok; the whole extraction is nested
    # under this check so that posts that are too old are not appended
    if dateOfPost is None or dateOfPost >= lastupdate:
        node = e["node"]
        captions = node["edge_media_to_caption"]["edges"]
        # a post without caption is kept with an empty text
        text = captions[0]["node"]["text"] if captions else ""
        output.append([name, text, node["edge_media_preview_liked_by"]["count"],
                       node["edge_media_to_comment"]["count"],
                       dateOfPost])
    return output

In [None]:
def _appendEdges(name, edges, output):
    """Pass every media edge to appendNextText; return (output, date of the last edge or None)."""
    lastDate = None
    for e in edges:
        # The timestamp is converted directly to an aware UTC datetime.
        # Converting to local time first and then relabelling it as UTC would
        # shift the date by the machine's UTC offset.
        lastDate = datetime.datetime.fromtimestamp(e["node"]["taken_at_timestamp"], tz=pytz.UTC)
        # check the date and description and append to output
        output = appendNextText(name, lastDate, e, output)
    return output, lastDate


def getPhotoDescriptions(name, json, lastupdate, output):
    """Collect the post rows of one Instagram profile, following the pagination.

    The posts embedded in the profile data are processed first; further pages
    are then requested while a page cursor is available and the last post seen
    is not older than ``lastupdate``.

    Note: the per-post filtering is done by ``appendNextText``, which compares
    against the module-level ``lastupdate``; the ``lastupdate`` parameter only
    controls when paging stops.

    Args:
        name: candidate name stored in every row.
        json: profile data (nested dicts) as passed by the caller; the name
            shadows the ``json`` module inside this function.
        lastupdate: aware datetime; paging stops once posts are older than it.
        output: list the rows are appended to.

    Returns:
        The ``output`` list with the new rows appended.
    """
    user = json['entry_data']['ProfilePage'][0]["graphql"]["user"]
    idUser = user["id"]
    timeline = user["edge_owner_to_timeline_media"]

    # for all media photo of the first page
    output, dateOfPost = _appendEdges(name, timeline["edges"], output)

    # iterate with TOKEN; an empty or missing cursor means there is no further
    # page, so the loop also ends for accounts whose posts are all recent
    nextindex = timeline['page_info']['end_cursor']
    while nextindex and (dateOfPost is None or dateOfPost >= lastupdate):
        # get next media
        rrr = requests.get("https://www.instagram.com/graphql/query/?query_id=17888483320059182&id="+idUser+"&first=12&after="+nextindex,
                           headers= {"Cookie":"sessionid=" + sessionid})
        # parse the response once and reuse it for both the edges and the cursor
        timeline = rrr.json()["data"]["user"]["edge_owner_to_timeline_media"]
        # same loop as before
        output, pageDate = _appendEdges(name, timeline["edges"], output)
        if pageDate is not None:
            dateOfPost = pageDate
        nextindex = timeline['page_info']['end_cursor']
    return output

In [None]:
# accumulator for the Instagram rows of all candidates
listOfDescription = []

In [None]:
# Fetch every candidate's Instagram profile and collect the rows of its posts.
# NOTE(review): `lastupdate` is whatever was assigned last in the notebook
# (the YouTube cell sets it from youtube.csv) -- confirm this is the intended
# start date for Instagram.
for idC in candidatesId:
    print(idC["instagramId"])
    user = InstagramUser(idC["instagramId"], sessionid = sessionid)

    # profile data, indexed as nested dicts by getPhotoDescriptions
    info = user.get_json()
    listOfDescription.extend(getPhotoDescriptions(idC["name"], info, lastupdate, []))

In [None]:
# Display the collected rows. Note that you should not keep Instagram data.
listOfDescription

Please note that you should not use this Instagram script; see the [Policies](https://developers.facebook.com/docs/instagram-api/).