# CBS

In [74]:
# all imports
import pandas as pd
import numpy as np
import os
import instaloader
from instaloader import Post
import sys
import re

In [61]:
# functions    
def importCSV(filename):
    return pd.read_csv(filename, delimiter=';')

def outputCSV(dataset, filename):
    dataset.to_csv("./../data/" + filename, sep=';')
    
def cprint(text):
    sys.stdout.write("\r" + text)
    sys.stdout.flush()

In [63]:
# retrieve data
insta = importCSV('D:/Data/School/S7/Proftaak/Code Base/CBS/data/Coosto_berichten.csv')

# remove empty columns
insta = insta.drop(['zoekopdracht', 'type', 'volgers', 'invloed', 'titel', 'type bron'], axis=1)

# check with count()
insta.count()

datum               22
url                 22
sentiment            6
discussielengte     19
views                5
auteur              22
GPS breedtegraad     2
GPS lengtegraad      2
bericht tekst       22
dtype: int64

In [64]:
# Download all posts from instagram using an array of urls
def get_posts(urls):
    posts_dict = {}
    total_length = len(urls)
    
    for index, url in enumerate(urls):
        shortcode = url.split("/")[-2]
        
        try:
            L = instaloader.Instaloader()
            post = Post.from_shortcode(L.context, url.split("/")[-2])
            posts_dict[shortcode] = post
        except:
            pass
        
        cprint("Getting posts " + str(round((index / total_length) * 100)) + "% completed")
        
    return posts_dict


# Get the indexes of the posts which do not exist anymore
def get_non_exsisting_posts(dataset, posts_dict):
    indexes_to_drop = []
    
    for index, row in dataset.iterrows():
        shortcode = row['url'].split("/")[-2]
        if not shortcode in posts_dict:
            indexes_to_drop.append(index)
    
    return indexes_to_drop
            

# Delete posts from the dataset based on an array of indexes
def del_posts(data, indexes_to_drop):
    for index in indexes_to_drop:
        data = data.drop(index=index, axis=0)
    return data


# Enrich dataset with like count
def add_like_count_to_dataset(dataset, posts_dict):
    for index, row in dataset.iterrows():
        shortcode = row['url'].split("/")[-2]
        if shortcode in posts_dict:
            dataset.at[index, 'likes count'] = posts_dict[shortcode].likes
            
    return dataset


# Adds utc date to the dataset
def add_date_utc(dataset, posts_dict):
    for index, row in dataset.iterrows():
        shortcode = row['url'].split('/')[-2]
        if shortcode in posts_dict:
            dataset.at[index, 'datum utc'] = posts_dict[shortcode].date_utc
            
    return dataset


# Cleans invalid urls and enriches with like count
def clean_und_enrich(dataset):
    posts_dict = get_posts(dataset['url'])
    indexes_to_drop = get_non_exsisting_posts(dataset, posts_dict)
    dataset = del_posts(dataset, indexes_to_drop)
    dataset = add_like_count_to_dataset(dataset, posts_dict)
    dataset = add_date_utc(dataset, posts_dict)
    
    cprint('\nInvalid urls found: ' + str(len(indexes_to_drop)))
    
    return dataset


insta = clean_und_enrich(insta)

# Resets index
insta.index = range(len(insta))

Getting posts 0% completed

Getting posts 5% completed

Getting posts 9% completed

Getting posts 14% completed

Getting posts 18% completed

Getting posts 23% completed

Getting posts 27% completed

Getting posts 32% completed

Getting posts 36% completed

Getting posts 41% completed

Getting posts 45% completed

Getting posts 50% completed

Getting posts 55% completed

Getting posts 59% completed

Getting posts 64% completed

Getting posts 68% completed

Getting posts 73% completed

Getting posts 77% completed

Getting posts 82% completed

Getting posts 86% completed

Getting posts 91% completed

Getting posts 95% completed


Invalid urls found: 3

In [54]:
def remove_emoji(data):  
    indexes_to_drop = []
    for index, row in data.iterrows():
        a = row['bericht tekst']

        # todo: vul aan met meer emoji's
        emoji_pattern = re.compile("["
                          u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
        newValue = emoji_pattern.sub(r'', a)
        if newValue == '' : 
            indexes_to_drop.append(newValue)
        else:
            data.at[index, 'bericht tekst'] = newValue
            
    return data

insta = remove_emoji(insta)
insta

Unnamed: 0,datum,url,sentiment,discussielengte,views,auteur,GPS breedtegraad,GPS lengtegraad,bericht tekst,likes count,hashtags
0,2017-06-20 17:06,https://instagram.com/p/BVjjp3Cl4uK/,,9.0,,marusinalavka,,,"@_korobo_ кстати хочу сказать, что коробочки ...",31.0,
1,2017-10-10 21:43,https://instagram.com/p/BaFFd0fBbcw/,,64.0,,artofobservance,,,So chic,3639.0,
2,2017-04-28 16:47,https://instagram.com/p/BTbF7AEDgHg/,,617.0,,amigabali,,,Incredible,534004.0,
3,2017-08-30 17:43,https://instagram.com/p/BYa-vCWnF-U/,+,20.0,,healthy.sneakers,,,😄😄👍,305.0,
4,2017-09-04 22:22,https://instagram.com/p/BYoVU-GFypi/,,20.0,,gpazminoyepez,,,@divaribas.82 aloe y fibern plus. Escribame 09...,2224.0,
5,2017-11-08 10:29,https://instagram.com/p/BbOnSBlFwmo/,+,10.0,,laurienblomphotography,,,"@remkevet oooh wat goed haha, heerlijk!:)",46.0,
6,2017-07-20 08:02,https://instagram.com/p/BWwfFxkFoHW/,,,10.0,breaking_glass,,,Lovely Japanese copy of Pin Ups. One of my fav...,63.0,
7,2018-02-10 22:27,https://instagram.com/p/Bdp4B0Pj6TL/,,3124.0,,belmacurkicc,,,@amanda.carlberg,223473.0,
8,2018-02-21 18:01,https://instagram.com/p/BfdzxFYlj-h/,+,596.0,,_lilysmith97_,,,@nobbsy91 look at this!,103847.0,
9,2017-08-28 17:40,https://instagram.com/p/BYVy0NRBGdr/,,2.0,,heimint,,,.......#,17.0,


In [84]:
def isolate_hashtag(data):  
    total_hashtags = []
    for index, row in data.iterrows():
        text = row['bericht tekst'] 
        print(text)
        
        # find all hashtags in text and isolate them in new column
        total_hashtags.append(re.findall(r"#(\w+)", text))
        
        #remove hashtags from text
        pattern = re.compile("#(\w+)")
        newText = pattern.sub(r'', text)
        data.at[index, 'bericht tekst'] = newText
        
    data['hashtags'] = total_hashtags
    return data

insta = importCSV('D:/Data/School/S7/Proftaak/Code Base/CBS/data/Coosto_berichten.csv')
insta = insta.drop(['zoekopdracht', 'type', 'volgers', 'invloed', 'titel', 'type bron'], axis=1)

insta = insta = clean_und_enrich(insta)
insta = remove_emoji(insta)
result = isolate_hashtag(insta)
result

@_korobo_  кстати хочу сказать, что коробочки снимать гораздо сложней.  Я пробовала свои отснять. Не просто, так сказать....
So chic
Incredible
😄😄👍
@divaribas.82 aloe y fibern plus. Escribame 0994895191
#itsbeginningtolookalotlikexmas #gelnails #gelnagels #nails #nagels #glitter #glitternails #nailart #nailstagram #instanails #nailitdaily #nailfie #nailsdesign #naildesigns #prettynails #nailpassion #xmasnails #christmasnails #happyholidays #shertogenbosch #debeauty_brows_lashes_nails #glitzandglam
@remkevet oooh wat goed haha, heerlijk!:)
Show ‘em you doing better
Lovely Japanese copy of Pin Ups. One of my favourites - soon after dark Emily cry's. #vinyljunkie #vinyladdict #welltempered #davidbowie #nowspinning
@amanda.carlberg
@nobbsy91  look at this!
     .......#
I can't wait to spend this weekend at #wondercon - - #tbt to Wondercon last year when I ran into @brucewayne626's amazing Armored Batman.
Schönen Abend für Euch 
Het programma word steeds slechter!!! Het nieuwe format is al

Unnamed: 0,datum,url,sentiment,discussielengte,views,auteur,GPS breedtegraad,GPS lengtegraad,bericht tekst,hashtags
0,2017-06-20 17:06,https://instagram.com/p/BVjjp3Cl4uK/,,9.0,,marusinalavka,,,"@_korobo_ кстати хочу сказать, что коробочки ...",[]
1,2017-10-10 21:43,https://instagram.com/p/BaFFd0fBbcw/,,64.0,,artofobservance,,,So chic,[]
2,2017-04-28 16:47,https://instagram.com/p/BTbF7AEDgHg/,,617.0,,amigabali,,,Incredible,[]
3,2017-08-30 17:43,https://instagram.com/p/BYa-vCWnF-U/,+,20.0,,healthy.sneakers,,,😄😄👍,[]
4,2017-09-04 22:22,https://instagram.com/p/BYoVU-GFypi/,,20.0,,gpazminoyepez,,,@divaribas.82 aloe y fibern plus. Escribame 09...,[]
5,2017-12-13 16:55,https://instagram.com/p/BcpfDpUDJYW/,,3.0,105.0,debeauty_brows_lashes_nails,,,,"[itsbeginningtolookalotlikexmas, gelnails, gel..."
6,2017-11-08 10:29,https://instagram.com/p/BbOnSBlFwmo/,+,10.0,,laurienblomphotography,,,"@remkevet oooh wat goed haha, heerlijk!:)",[]
7,2017-10-16 19:46,https://instagram.com/p/BaUVm3LA6dk/,,,401.0,floor.vincke,51.5667,4.8,Show ‘em you doing better,[]
8,2017-07-20 08:02,https://instagram.com/p/BWwfFxkFoHW/,,,10.0,breaking_glass,,,Lovely Japanese copy of Pin Ups. One of my fav...,"[vinyljunkie, vinyladdict, welltempered, david..."
9,2018-02-10 22:27,https://instagram.com/p/Bdp4B0Pj6TL/,,3124.0,,belmacurkicc,,,@amanda.carlberg,[]


In [41]:
# output new cleaned dataset
data = insta
outputCSV(data, "cleaned.csv")