# CBS

In [9]:
# all imports
from instaloader import Post
import pandas as pd
import os
import instaloader
import sys
import re
import numpy as np

In [10]:
# functions    
def outputCSV(dataset, filename):
    dataset.to_csv("data/" + filename, sep=';')
    
def cprint(text):
    sys.stdout.write("\r" + text)
    sys.stdout.flush()

In [11]:
# retrieve data
insta = pd.read_csv('data/chunk3.csv', delimiter=',')

# remove empty columns
insta = insta.drop(['zoekopdracht', 'type', 'volgers', 'invloed', 'titel', 'type bron'], axis=1)

# check with count()
insta.count()

Unnamed: 0          2487
datum               2487
url                 2487
sentiment            967
discussielengte      930
views               2264
auteur              2487
GPS breedtegraad    1348
GPS lengtegraad     1348
bericht tekst       2487
dtype: int64

In [12]:
# Download all posts from instagram using an array of urls
def get_posts(urls):
    posts_dict = {}
    total_length = len(urls)
    
    for index, url in enumerate(urls):
        shortcode = url.split("/")[-2]
        
        try:
            L = instaloader.Instaloader()
            post = Post.from_shortcode(L.context, url.split("/")[-2])
            posts_dict[shortcode] = post
        except:
            pass
        
        cprint("Getting posts " + str(round((index / total_length) * 100)) + "% completed")
        
    return posts_dict


# Get the indexes of the posts which do not exist anymore
def get_non_exsisting_posts(dataset, posts_dict):
    indexes_to_drop = []
    
    for index, row in dataset.iterrows():
        shortcode = row['url'].split("/")[-2]
        if not shortcode in posts_dict:
            indexes_to_drop.append(index)
    
    return indexes_to_drop
            

# Delete posts from the dataset based on an array of indexes
def del_posts(data, indexes_to_drop):
    for index in indexes_to_drop:
        data = data.drop(index=index, axis=0)
    return data


# Enrich dataset with like count
def add_like_count_to_dataset(dataset, posts_dict):
    for index, row in dataset.iterrows():
        shortcode = row['url'].split("/")[-2]
        if shortcode in posts_dict:
            dataset.at[index, 'likes count'] = posts_dict[shortcode].likes
    return dataset


# Adds utc date to the dataset
def add_date_utc(dataset, posts_dict):
    for index, row in dataset.iterrows():
        shortcode = row['url'].split('/')[-2]
        if shortcode in posts_dict:
            dataset.at[index, 'datum utc'] = posts_dict[shortcode].date_utc
            
    return dataset


# Refreshes comment count
def refresh_comment_count(dataset, posts_dict):
    for index, row in dataset.iterrows():
        shortcode = row['url'].split('/')[-2]
        if shortcode in posts_dict:
            dataset.at[index, 'discussielengte'] = posts_dict[shortcode].comments
            
    return dataset


# Refreshes view count
def refresh_views(dataset, posts_dict):
    for index, row in dataset.iterrows():
        shortcode = row['url'].split('/')[-2]
        if shortcode in posts_dict:
            dataset.at[index, 'views'] = posts_dict[shortcode].likes
            
    return dataset

# Cleans invalid urls and enriches with like count, data utc, comment count and view count
def clean_und_enrich(dataset):
    posts_dict = get_posts(dataset['url'])
    indexes_to_drop = get_non_exsisting_posts(dataset, posts_dict)
    dataset = del_posts(dataset, indexes_to_drop)
    dataset = add_like_count_to_dataset(dataset, posts_dict)
    dataset = add_date_utc(dataset, posts_dict)
    dataset = refresh_comment_count(dataset, posts_dict)
    dataset = refresh_views(dataset, posts_dict)
    
    cprint('\nInvalid urls found: ' + str(len(indexes_to_drop)))
    
    return dataset

def improve_sentiment(dataset):
    dataset['sentiment'] = dataset['sentiment'].replace(np.nan, '0')
    
    return dataset

def isolate_hashtag(data):  
    total_hashtags = []
    for index, row in data.iterrows():
        text = row['bericht tekst'] 
        
        # find all hashtags in text and isolate them in new column
        total_hashtags.append(re.findall(r"#(\w+)", text))
        
        #remove hashtags from text
        pattern = re.compile("#(\w+)")
        newText = pattern.sub(r'', text)
        data.at[index, 'bericht tekst'] = newText
        
    data['hashtags'] = total_hashtags
    return data

def remove_emoji(data):  
    indexes_to_drop = []
        
    for index, row in data.iterrows():
        a = row['bericht tekst']

        # todo: vul aan met meer emoji's
        emoji_pattern = re.compile("["
                          u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
        newValue = emoji_pattern.sub(r'', a)
        newValue = newValue.replace('🥗', '')
        if newValue == '' : 
            indexes_to_drop.append(index)
            
        else:
            data.at[index, 'bericht tekst'] = newValue
    
    pass
    data = del_posts(data, indexes_to_drop)
    return data

In [13]:
insta = improve_sentiment(insta)
insta = remove_emoji(insta)
insta = clean_und_enrich(insta)
insta = isolate_hashtag(insta)

# Resets index
insta.index = range(len(insta))

Getting posts 39% completed
HTTP redirect from https://www.instagram.com/p/BeusRxKhQMy1f1i97VL6ZG4g8-Md11OsFGwgAA0/ to https://www.instagram.com/marywn89
Getting posts 100% completed
Invalid urls found: 365

In [14]:
insta.head(20)

Unnamed: 0.1,Unnamed: 0,datum,url,sentiment,discussielengte,views,auteur,GPS breedtegraad,GPS lengtegraad,bericht tekst,likes count,datum utc,hashtags
0,7500,2017-08-20 20:01,https://instagram.com/p/BYBmExPhOg5/,0,0.0,7.0,draculasbedroom,51.769127,5.528576,Bijna zonder hints. Dracula heeft weer een koe...,7.0,2017-08-20 18:01:39,"[dracula, escaperoom, escape, centrumoss, oss]"
1,7501,2017-08-16 14:04,https://instagram.com/p/BX2p_rjhxt5/,0,0.0,15.0,parnassoscultuurcentrum,52.087502,5.141111,Er wordt flink getekend en geschilderd! @uit...,15.0,2017-08-16 12:04:15,"[parnassos, cultuurcentrum, utrecht, utrechtun..."
2,7502,2017-01-06 23:25,https://instagram.com/p/BO8IkP0D-nS/,0,1.0,14.0,riannehaegens,,,Naar de Wooning 6daagse Rotterdam 2017 wezen k...,14.0,2017-01-06 22:25:03,"[samenmet, Inspiratie, AndereSportenBekijken]"
3,7503,2018-04-15 18:41,https://instagram.com/p/BhmSGT_goU-/,0,0.0,6.0,rsbos51,52.033333,5.95,Water drops,6.0,2018-04-15 16:41:32,"[natuurmonumenten, nature, naturephotography, ..."
4,7505,2017-12-29 16:56,https://instagram.com/p/BdSr3pTjKRO/,0,0.0,30.0,obelink_de,51.959881,6.686708,Herzlichen Dank allen Besucherinnen und Besuch...,30.0,2017-12-29 15:56:20,"[freizeitmesse, obelink, messe, freizeit, mega..."
5,7507,2017-10-19 15:44,https://instagram.com/p/BaboTHVFMFZ/,+,4.0,44.0,about_cosmo,,,Amsterdam,44.0,2017-10-19 13:44:07,"[iloveamsterdam, grachten, fietsen, autumn, mi..."
6,7508,2017-09-17 12:01,https://instagram.com/p/BZI1WBShcWC/,+,0.0,59.0,cutkillavince,51.44812,5.453393,Eindhoven street art,59.0,2017-09-17 10:01:09,"[art, design, eindhoven, street, photo, graffi..."
7,7509,2018-01-20 11:34,https://instagram.com/p/BeKwg92jnvb/,0,1.0,174.0,instascrum,51.700001,5.316667,"Dear rugbysaturday, beware of the prop 🦍 . @ru...",174.0,2018-01-20 10:34:24,"[ereklasse, rugbysaterday, rugbyday, propstyle..."
8,7511,2017-03-28 11:22,https://instagram.com/p/BSLTbodgBAI/,0,2.0,25.0,heerlijk.nl,,,Feest bij restaurant 't Ambachthuys in Hillege...,25.0,2017-03-28 09:22:47,"[culinair, gastronomie, heerlijknl]"
9,7512,2017-04-07 15:56,https://instagram.com/p/BSlivSnDJUA/,+,0.0,31.0,tessabezuurpronk,52.660168,4.848006,"Zie je jezelf al staan? In de natuur, niets da...",31.0,2017-04-07 13:56:48,"[heerhugowaard, waarderhout, relax, ontspan, y..."


In [15]:
# output new cleaned dataset
data = insta
outputCSV(data, "cleaned.csv")