# CBS

In [5]:
# all imports
from instaloader import Post
import pandas as pd
import os
import instaloader
import sys
import re
import numpy as np

filepath = '../data/'
filename = 'Coosto_berichten.csv'

In [6]:
# functions    
def outputCSV(dataset, filename):
    dataset.to_csv(filepath + filename, sep=';')
    
def cprint(text):
    sys.stdout.write("\r" + text)
    sys.stdout.flush()

In [7]:
# retrieve data
insta = pd.read_csv(filepath + filename, delimiter=';')

# remove empty columns
insta = insta.drop(['zoekopdracht', 'type', 'volgers', 'invloed', 'titel', 'type bron'], axis=1)

# check with count()
insta.count()

datum               22
url                 22
sentiment            6
discussielengte     19
views                5
auteur              22
GPS breedtegraad     2
GPS lengtegraad      2
bericht tekst       22
dtype: int64

In [8]:
# Download all posts from instagram using an array of urls
def get_posts(urls):
    posts_dict = {}
    total_length = len(urls)
    
    for index, url in enumerate(urls):
        shortcode = url.split("/")[-2]
        
        try:
            L = instaloader.Instaloader()
            post = Post.from_shortcode(L.context, url.split("/")[-2])
            posts_dict[shortcode] = post
        except:
            pass
        
        cprint("Getting posts " + str(round((index / total_length) * 100)) + "% completed")
        
    return posts_dict


# Get the indexes of the posts which do not exist anymore
def get_non_exsisting_posts(dataset, posts_dict):
    indexes_to_drop = []
    
    for index, row in dataset.iterrows():
        shortcode = row['url'].split("/")[-2]
        if not shortcode in posts_dict:
            indexes_to_drop.append(index)
    
    return indexes_to_drop
            

# Delete posts from the dataset based on an array of indexes
def del_posts(data, indexes_to_drop):
    for index in indexes_to_drop:
        data = data.drop(index=index, axis=0)
    return data


# Enrich dataset with like count
def add_like_count_to_dataset(dataset, posts_dict):
    for index, row in dataset.iterrows():
        shortcode = row['url'].split("/")[-2]
        if shortcode in posts_dict:
            dataset.at[index, 'likes count'] = posts_dict[shortcode].likes
    return dataset


# Adds utc date to the dataset
def add_date_utc(dataset, posts_dict):
    for index, row in dataset.iterrows():
        shortcode = row['url'].split('/')[-2]
        if shortcode in posts_dict:
            dataset.at[index, 'datum utc'] = posts_dict[shortcode].date_utc
            
    return dataset


# Refreshes comment count
def refresh_comment_count(dataset, posts_dict):
    for index, row in dataset.iterrows():
        shortcode = row['url'].split('/')[-2]
        if shortcode in posts_dict:
            dataset.at[index, 'discussielengte'] = posts_dict[shortcode].comments
            
    return dataset


# Refreshes view count
def refresh_views(dataset, posts_dict):
    for index, row in dataset.iterrows():
        shortcode = row['url'].split('/')[-2]
        if shortcode in posts_dict:
            dataset.at[index, 'views'] = posts_dict[shortcode].likes
            
    return dataset

# Cleans invalid urls and enriches with like count, data utc, comment count and view count
def clean_und_enrich(dataset):
    posts_dict = get_posts(dataset['url'])
    indexes_to_drop = get_non_exsisting_posts(dataset, posts_dict)
    dataset = del_posts(dataset, indexes_to_drop)
    dataset = add_like_count_to_dataset(dataset, posts_dict)
    dataset = add_date_utc(dataset, posts_dict)
    dataset = refresh_comment_count(dataset, posts_dict)
    dataset = refresh_views(dataset, posts_dict)
    
    #cprint('\nInvalid urls found: ' + str(len(indexes_to_drop)))
    
    return dataset

def improve_sentiment(dataset):
    dataset['sentiment'] = dataset['sentiment'].replace(np.nan, '0')
    
    return dataset

def isolate_hashtag(data):  
    total_hashtags = []
    for index, row in data.iterrows():
        text = row['bericht tekst'] 
        #print(text)
        
        # find all hashtags in text and isolate them in new column
        total_hashtags.append(' '.join(re.findall(r"#(\w+)", text)))
        
        #remove hashtags from text
        pattern = re.compile("#(\w+)")
        newText = pattern.sub(r'', text)
        data.at[index, 'bericht tekst'] = newText
        
    
    data['hashtags'] = total_hashtags
    return data

def remove_emoji(data):  
    indexes_to_drop = []
        
    for index, row in data.iterrows():
        a = row['bericht tekst']

        # todo: vul aan met meer emoji's
        emoji_pattern = re.compile("["
                          u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
        newValue = emoji_pattern.sub(r'', a)
        newValue = newValue.replace('🥗', '')
        if newValue == '' : 
            indexes_to_drop.append(index)
            
        else:
            data.at[index, 'bericht tekst'] = newValue
    
    pass
    data = del_posts(data, indexes_to_drop)
    return data

In [9]:
#insta = improve_sentiment(insta)
#insta = remove_emoji(insta)
#insta = clean_und_enrich(insta)
#insta = isolate_hashtag(insta)

# Resets index
#insta.index = range(len(insta))

In [10]:
#insta.head(20)

In [11]:
# output new cleaned dataset
#data = insta
#outputCSV(data, "cleaned.csv")

In [19]:
chunksize = 2500
made_files = 0

def split_file(made_files):
    for i,chunk in enumerate(pd.read_csv(filepath + filename, delimiter=';', chunksize=chunksize)):
        chunk = chunk.drop(['zoekopdracht', 'type', 'volgers', 'invloed', 'titel', 'type bron'], axis=1)
        chunk.to_csv(filepath + 'set{}.csv'.format(i), sep=';')
        made_files +=1
    return made_files

made_files = split_file(made_files)
print(made_files)
    

1


In [20]:
#Cleaning al data files after another
def clean_splitted_files():
    for num in range(0, made_files):
        setfile = pd.read_csv(filepath + "set"+ str(num) + '.csv', delimiter=';')

        setfile = improve_sentiment(setfile)
        setfile = remove_emoji(setfile)
        setfile = clean_und_enrich(setfile)
        setfile = isolate_hashtag(setfile)

        # Resets index
        outputCSV(setfile,"set"+ str(num) + '.csv')
        print( "\n" + str(num + 1) + " file done")

clean_splitted_files()

Getting posts 95% completed
1files done


In [11]:
#Mergin all all datafile in one file
def merge_splitted_files_to_one():
    merged_filename = "merged_file.csv"
    try:
        os.remove(filepath + merged_filename)
    except OSError:
        pass

    outputfile = open(filepath + merged_filename, "a", encoding="utf8")
    for line in open(filepath + "set0.csv", encoding="utf8"):
        outputfile.write(line)
    # now the rest:    
    for num in range(1,made_files):
        setfile = open(filepath +"set"+ str(num) +".csv", encoding="utf8")
        setfile.__next__() # skip the header
        for line in setfile:
             outputfile.write(line)
        setfile.close() # not really needed
    outputfile.close()
    
merge_splitted_files_to_one()
print("Done! :P")

Done! :P
