# CBS

In [2]:
# all imports
from instaloader import Post
import pandas as pd
import sklearn as sk
import os
import instaloader
import sys
import re
import numpy as np

  readline_hook.enable(use_pyreadline=use_pyreadline)


In [8]:
# functions    
def outputCSV(dataset, filename):
    dataset.to_csv("../../data/" + filename, sep=';')
    
def cprint(text):
    sys.stdout.write("\r" + text)
    sys.stdout.flush()

In [3]:
# retrieve data
insta = pd.read_csv('../../data/Coosto_berichten.csv', delimiter=';')

# remove empty columns
insta = insta.drop(['zoekopdracht', 'type', 'titel'], axis=1)

# check with count()
insta.count()

datum               100
url                 100
sentiment            44
discussielengte      36
views                91
auteur              100
volgers             100
invloed             100
GPS breedtegraad     58
GPS lengtegraad      58
bericht tekst       100
type bron           100
dtype: int64

In [4]:
# Download all posts from instagram using an array of urls
def get_posts(urls):
    posts_dict = {}
    total_length = len(urls)
    
    for index, url in enumerate(urls):
        shortcode = url.split("/")[-2]
        
        try:
            L = instaloader.Instaloader()
            post = Post.from_shortcode(L.context, url.split("/")[-2])
            posts_dict[shortcode] = post
        except:
            pass
        
        cprint("Getting posts " + str(round((index / total_length) * 100)) + "% completed")
        
    return posts_dict


# Get the indexes of the posts which do not exist anymore
def get_non_exsisting_posts(dataset, posts_dict):
    indexes_to_drop = []
    
    for index, row in dataset.iterrows():
        shortcode = row['url'].split("/")[-2]
        if not shortcode in posts_dict:
            indexes_to_drop.append(index)
    
    return indexes_to_drop
            

# Delete posts from the dataset based on an array of indexes
def del_posts(data, indexes_to_drop):
    for index in indexes_to_drop:
        data = data.drop(index=index, axis=0)
    return data


# Enrich dataset with like count
def add_like_count_to_dataset(dataset, posts_dict):
    for index, row in dataset.iterrows():
        shortcode = row['url'].split("/")[-2]
        if shortcode in posts_dict:
            dataset.at[index, 'likes count'] = posts_dict[shortcode].likes
    return dataset


# Adds utc date to the dataset
def add_date_utc(dataset, posts_dict):
    for index, row in dataset.iterrows():
        shortcode = row['url'].split('/')[-2]
        if shortcode in posts_dict:
            dataset.at[index, 'datum utc'] = posts_dict[shortcode].date_utc
            
    return dataset


# Refreshes comment count
def refresh_comment_count(dataset, posts_dict):
    for index, row in dataset.iterrows():
        shortcode = row['url'].split('/')[-2]
        if shortcode in posts_dict:
            dataset.at[index, 'discussielengte'] = posts_dict[shortcode].comments
            
    return dataset


# Refreshes view count
def refresh_views(dataset, posts_dict):
    for index, row in dataset.iterrows():
        shortcode = row['url'].split('/')[-2]
        if shortcode in posts_dict:
            dataset.at[index, 'views'] = posts_dict[shortcode].likes
            
    return dataset

# Cleans invalid urls and enriches with like count, data utc, comment count and view count
def clean_und_enrich(dataset):
    posts_dict = get_posts(dataset['url'])
    indexes_to_drop = get_non_exsisting_posts(dataset, posts_dict)
    dataset = del_posts(dataset, indexes_to_drop)
    dataset = add_like_count_to_dataset(dataset, posts_dict)
    dataset = add_date_utc(dataset, posts_dict)
    dataset = refresh_comment_count(dataset, posts_dict)
    dataset = refresh_views(dataset, posts_dict)
    
    cprint('\nInvalid urls found: ' + str(len(indexes_to_drop)))
    
    return dataset

def improve_sentiment(dataset):
    dataset['sentiment'] = dataset['sentiment'].replace(np.nan, '0')
    
    return dataset

def isolate_hashtag(data):  
    total_hashtags = []
    for index, row in data.iterrows():
        text = row['bericht tekst'] 
        
        # find all hashtags in text and isolate them in new column
        total_hashtags.append(re.findall(r"#(\w+)", text))
        
        #remove hashtags from text
        pattern = re.compile("#(\w+)")
        newText = pattern.sub(r'', text)
        data.at[index, 'bericht tekst'] = newText
        
    data['hashtags'] = total_hashtags
    return data

def remove_emoji(data):  
    indexes_to_drop = []
        
    for index, row in data.iterrows():
        a = row['bericht tekst']

        # todo: vul aan met meer emoji's
        emoji_pattern = re.compile("["
                          u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
        newValue = emoji_pattern.sub(r'', a)
        newValue = newValue.replace('🥗', '')
        if newValue == '' : 
            indexes_to_drop.append(index)
            
        else:
            data.at[index, 'bericht tekst'] = newValue
    
    pass
    data = del_posts(data, indexes_to_drop)
    return data

In [5]:
insta = improve_sentiment(insta)
insta = remove_emoji(insta)
insta = clean_und_enrich(insta)
insta = isolate_hashtag(insta)

# Resets index
insta.index = range(len(insta))

Getting posts 99% completed
Invalid urls found: 10New Life..New Interior Love the Windowseat. Thanx to Studio #daphnalaurens  #interior #interiordesigner #interiordesign #daphnalaurens #dutch #dutchdesign #eindhoven #holland #ontwerpers  www.daphnalaurens.nl  #interioradvice #interieur #inspiration #inspiratie #bench #windowseat #bankje #raamdecoratie #color #frame #licht #light #letloverule 
Zo eindelijk mijn hoofd leeg maken en relaxen met mijn leukerds op de bank met netflix een chaotische dag met veel afspraken en moeilijke/emotionele cliënten gesprekken zorgt voor een onrustig gevoel die ik gelukkig bij thuis komst meestal wel van mij af kan schudden! #happy #healthy #day #dutch #girl #weert #dog #goldenretriever #puppy #dogsofinsta #love #lifestyle #cute #home #inrichting #dietician #nutrition #dietist
Texel wat ben je gaaf!#beachlife #beachday #strand #beachgirl #island #waddeneiland #texel #meivakantie2017 #springbreak #holiday #strandpaal #zand #summeriscoming #boat #waterspor

In [6]:
insta.head(20)

Unnamed: 0,datum,url,sentiment,discussielengte,views,auteur,volgers,invloed,GPS breedtegraad,GPS lengtegraad,bericht tekst,type bron,likes count,datum utc,hashtags
0,2017-10-04 23:39,https://instagram.com/p/BZ12v7anQYg/,+,3.0,63.0,snapshots_by_eve,0,0,51.450001,5.46667,New Life..New Interior Love the Windowseat. Th...,instagram,63.0,2017-10-04 21:39:15,"[daphnalaurens, interior, interiordesigner, in..."
1,2017-01-10 20:29,https://instagram.com/p/BPGHmmvgHka/,+,1.0,64.0,daantjj94,0,0,,,Zo eindelijk mijn hoofd leeg maken en relaxen ...,instagram,64.0,2017-01-10 19:29:02,"[happy, healthy, day, dutch, girl, weert, dog,..."
2,2017-04-30 19:03,https://instagram.com/p/BThGWOTgj5Y/,+,6.0,64.0,123hamamdoek,0,0,53.066666,4.8,Texel wat ben je gaaf!,instagram,64.0,2017-04-30 17:03:08,"[beachlife, beachday, strand, beachgirl, islan..."
3,2018-01-04 17:31,https://instagram.com/p/BdiMqJQhqXG/,+,1.0,36.0,martes_den_haag,0,0,52.084721,4.31372,"Dear friends, due to unforeseen circumstances ...",instagram,36.0,2018-01-04 16:31:28,"[hooistraat, buurtschap2005, lunchroom, thehag..."
4,2017-11-13 21:36,https://instagram.com/p/BbcvV3Vl4Qd/,0,1.0,148.0,lincolnreis2016,0,0,-23.938519,-46.326088,"Nesta segunda-feira (13/11), as crianças da V...",instagram,148.0,2017-11-13 20:36:18,"[Breda, JuntosSomosMaisFortes, AquiÉTrabalho, ..."
5,2017-11-10 17:56,https://instagram.com/p/BbUn2YQAgXg/,+,1.0,28.0,telecombinatierotterdamnoord,0,0,51.935833,4.477778,Mevrouw Vermeulen is dolgelukkig met haar nieu...,instagram,28.0,2017-11-10 16:56:55,"[telecombinatie, Rotterdam, zwartjanstraat, no..."
6,2018-03-06 23:20,https://instagram.com/p/Bf_5FKoHSiC/,0,1.0,13.0,didirire,0,0,51.916698,4.5,,instagram,13.0,2018-03-06 22:20:24,"[sunset, rotterdam, erasmusbrug]"
7,2017-12-26 20:46,https://instagram.com/p/BdLXxCSBTz0/,0,1.0,25.0,henk.vanloon,0,0,,,,instagram,25.0,2017-12-26 19:46:00,"[11dec2017, oirschot, NL, winter, wintertime, ..."
8,2017-02-19 17:57,https://instagram.com/p/BQs2CPeFJgK/,0,0.0,28.0,hablemosdetenis,0,0,51.916698,4.5,"Gran remontada de Tsonga por 4-6, 6-4 y 6-1 a...",instagram,28.0,2017-02-19 16:57:18,"[ABNAMROWTT, ATP500, ATP, HablemosDeTenis, Ten..."
9,2018-01-12 20:55,https://instagram.com/p/Bd3Kat2hZ-i/,+,0.0,9.0,wilco_nl,0,0,51.925831,4.00389,Zooooo mooi,instagram,9.0,2018-01-12 19:55:56,"[texel, slufter]"


In [9]:
# output new cleaned dataset
data = insta
outputCSV(data, "cleaned.csv")

In [3]:
# text analyse d.m.v. machine_learning
posts = pd.read_csv("../../data/cleaned.csv")

ParserError: Error tokenizing data. C error: Expected 23 fields in line 17, saw 31
