# CBS

In [5]:
# all imports
import pandas as pd
import os
import instaloader
from instaloader import Post
import sys
import re

In [6]:
# functions    
def outputCSV(dataset, filename):
    dataset.to_csv("C:/Users/Jandie/PyCharmProjects/CBS/data/" + filename, sep=';')
    
def cprint(text):
    sys.stdout.write("\r" + text)
    sys.stdout.flush()

In [7]:
# retrieve data
insta = pd.read_csv('C:/Users/Jandie/PyCharmProjects/CBS/data/Coosto_berichten.csv', delimiter=';')

# remove empty columns
insta = insta.drop(['zoekopdracht', 'type', 'volgers', 'invloed', 'titel', 'type bron'], axis=1)

# check with count()
insta.count()

datum               22
url                 22
sentiment            6
discussielengte     19
views                5
auteur              22
GPS breedtegraad     2
GPS lengtegraad      2
bericht tekst       22
dtype: int64

In [13]:
# Download all posts from instagram using an array of urls
def get_posts(urls):
    posts_dict = {}
    total_length = len(urls)
    
    for index, url in enumerate(urls):
        shortcode = url.split("/")[-2]
        
        try:
            L = instaloader.Instaloader()
            post = Post.from_shortcode(L.context, url.split("/")[-2])
            posts_dict[shortcode] = post
        except:
            pass
        
        cprint("Getting posts " + str(round((index / total_length) * 100)) + "% completed")
        
    return posts_dict


# Get the indexes of the posts which do not exist anymore
def get_non_exsisting_posts(dataset, posts_dict):
    indexes_to_drop = []
    
    for index, row in dataset.iterrows():
        shortcode = row['url'].split("/")[-2]
        if not shortcode in posts_dict:
            indexes_to_drop.append(index)
    
    return indexes_to_drop
            

# Delete posts from the dataset based on an array of indexes
def del_posts(data, indexes_to_drop):
    for index in indexes_to_drop:
        data = data.drop(index=index, axis=0)
    return data


# Enrich dataset with like count
def add_like_count_to_dataset(dataset, posts_dict):
    for index, row in dataset.iterrows():
        shortcode = row['url'].split("/")[-2]
        if shortcode in posts_dict:
            dataset.at[index, 'likes count'] = posts_dict[shortcode].likes
            
    return dataset


# Adds utc date to the dataset
def add_date_utc(dataset, posts_dict):
    for index, row in dataset.iterrows():
        shortcode = row['url'].split('/')[-2]
        if shortcode in posts_dict:
            dataset.at[index, 'datum utc'] = posts_dict[shortcode].date_utc
            
    return dataset


# Refreshes comment count
def refresh_comment_count(dataset, posts_dict):
    for index, row in dataset.iterrows():
        shortcode = row['url'].split('/')[-2]
        if shortcode in posts_dict:
            dataset.at[index, 'discussielengte'] = posts_dict[shortcode].comments
            
    return dataset


# Cleans invalid urls and enriches with like count, data utc and comment count
def clean_und_enrich(dataset):
    posts_dict = get_posts(dataset['url'])
    indexes_to_drop = get_non_exsisting_posts(dataset, posts_dict)
    dataset = del_posts(dataset, indexes_to_drop)
    dataset = add_like_count_to_dataset(dataset, posts_dict)
    dataset = add_date_utc(dataset, posts_dict)
    dataset = refresh_comment_count(dataset, posts_dict)
    
    cprint('\nInvalid urls found: ' + str(len(indexes_to_drop)))
    
    return dataset


insta = clean_und_enrich(insta)

# Resets index
insta.index = range(len(insta))

Getting posts 0% completed

Getting posts 5% completed

Getting posts 11% completed

Getting posts 16% completed

Getting posts 21% completed

Getting posts 26% completed

Getting posts 32% completed

Getting posts 37% completed

Getting posts 42% completed

Getting posts 47% completed

Getting posts 53% completed

Getting posts 58% completed

Getting posts 63% completed

Getting posts 68% completed

Getting posts 74% completed

Getting posts 79% completed

Getting posts 84% completed

Getting posts 89% completed

Getting posts 95% completed


Invalid urls found: 0

In [14]:
def remove_emoji(data):  
    indexes_to_drop = []
    for index, row in data.iterrows():
        a = row['bericht tekst']

        # todo: vul aan met meer emoji's
        emoji_pattern = re.compile("["
                          u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
        newValue = emoji_pattern.sub(r'', a)
        if newValue == '' : 
            indexes_to_drop.append(newValue)
        else:
            data.at[index, 'bericht tekst'] = newValue
            
    return data

insta = remove_emoji(insta)

In [15]:
import numpy as np


def improve_sentiment(dataset):
    dataset['sentiment'] = dataset['sentiment'].replace(np.nan, '0')
    
    return dataset


insta = improve_sentiment(insta)

In [16]:
# output new cleaned dataset
data = insta
outputCSV(data, "cleaned.csv")

In [17]:
insta.head()

Unnamed: 0,datum,url,sentiment,discussielengte,views,auteur,GPS breedtegraad,GPS lengtegraad,bericht tekst,likes count,datum utc
0,2017-06-20 17:06,https://instagram.com/p/BVjjp3Cl4uK/,0,8.0,,marusinalavka,,,"@_korobo_ –∫—Å—Ç–∞—Ç–∏ —Ö–æ—á—É —Å–∫–∞–∑–∞—Ç—å, —á—Ç–æ –∫–æ—Ä–æ–±–æ—á–∫–∏ ...",31.0,2017-06-20 09:00:29
1,2017-10-10 21:43,https://instagram.com/p/BaFFd0fBbcw/,0,62.0,,artofobservance,,,So chic,3639.0,2017-10-10 19:36:27
2,2017-04-28 16:47,https://instagram.com/p/BTbF7AEDgHg/,0,1370.0,,amigabali,,,Incredible,534004.0,2017-04-28 09:03:58
3,2017-08-30 17:43,https://instagram.com/p/BYa-vCWnF-U/,+,19.0,,healthy.sneakers,,,üòÑüòÑüëç,305.0,2017-08-30 14:38:09
4,2017-09-04 22:22,https://instagram.com/p/BYoVU-GFypi/,0,43.0,,gpazminoyepez,,,@divaribas.82 aloe y fibern plus. Escribame 09...,2224.0,2017-09-04 19:05:41
