In [56]:
import functools
import json
import pickle
import re
import string
import time
from collections import defaultdict, Counter
from datetime import datetime, timezone

import pandas
from textblob import TextBlob

In [57]:
# Parse the raw B0 JSON export into plain Python objects.
# Forward slashes keep the relative path valid on Windows, Linux and macOS;
# the previous "\\" separators only resolved on Windows.
RAW_JSON_PATH = '../RawData/B0/B0.json'
with open(RAW_JSON_PATH, encoding='utf8') as access_json:
    read_content = json.load(access_json)

In [58]:
def cut_mediaId(urlStr):
    """Extract the media id (file stem) from a media URL.

    The id starts at the first "/" that is directly followed by a digit and
    ends just before the last literal ".jpg" or ".mp4" in the same
    whitespace-free run.

    Args:
        urlStr: URL string of an image or video file.

    Returns:
        The captured id string, without the leading "/" and the extension.

    Raises:
        ValueError: if ``urlStr`` contains no such "/<digit>....jpg|.mp4" part.
    """
    # Raw string avoids invalid-escape warnings; the dots are escaped so that
    # only a real ".jpg"/".mp4" extension terminates the id.
    pattern = re.compile(r"/(\d\S*)(\.jpg|\.mp4)")
    match = pattern.search(urlStr)
    if match is None:
        raise ValueError("no media id found in URL: %r" % (urlStr,))
    return match.group(1)

def timestampConv(timestamp):
    """Format a POSIX timestamp as a UTC 'HH:MM dd-mm-YYYY ' string.

    Args:
        timestamp: seconds since the Unix epoch.

    Returns:
        The formatted UTC time. The trailing space of the format string is
        kept so that existing outputs stay byte-identical.
    """
    # datetime.utcfromtimestamp() is deprecated since Python 3.12; an aware
    # UTC datetime yields the same formatted text.
    moment = datetime.fromtimestamp(timestamp, tz=timezone.utc)
    return moment.strftime('%H:%M %d-%m-%Y ')

def getCaption(media_data):
    """Return the caption text of one media record, or '' if it has none.

    Reads media_data['edge_media_to_caption']['edges'] and returns the 'text'
    of the first edge's 'node'. An empty edge list yields the empty string.
    """
    caption_edges = media_data['edge_media_to_caption']['edges']
    try:
        return caption_edges[0]['node']["text"]
    except IndexError:
        # No caption edge present for this record.
        return ''


def texPrep(text):
    """Lower-case ``text`` and reduce it to space-separated letter runs.

    Steps, in order:
      1. lower-case the text;
      2. delete ASCII punctuation plus "–" and "•";
      3. replace every run of characters other than a-z, ü, ö, ä (digits,
         whitespace, emojis, ...) and every "http..." run up to the next
         whitespace with a single space;
      4. strip leading/trailing whitespace.

    Punctuation is removed before step 3, so a URL has already collapsed to
    e.g. "httpexamplecom" when it is dropped. Adjacent replacements can leave
    several consecutive spaces inside the result.

    Args:
        text: input string.

    Returns:
        The cleaned string.
    """
    symFilter = string.punctuation + "–•"
    res = text.lower().translate(str.maketrans('', '', symFilter))
    # Raw string: "\S" in a normal literal is an invalid escape sequence.
    # Newlines and tabs are already covered by the negated character class,
    # so the former "[\n]" / "[\t]" alternatives were unreachable.
    res = re.sub(r"[^A-Za-züöä]+|http\S*", " ", res)
    return res.strip()

def wordComp(text, eList):
    """Count how often the words listed in ``eList`` occur in ``text``.

    ``text`` is split on whitespace and compared word by word (exact match).

    Args:
        text: string to scan.
        eList: iterable of (hashable) words to look for.

    Returns:
        A plain dict mapping each matched word to its count, in order of
        first appearance in ``text``. A word listed k times in ``eList`` is
        counted k times per occurrence, as before.
    """
    # Pre-count the reference words once so that every word of the text needs
    # a single lookup instead of a full scan of eList.
    multiplicity = {}
    for entry in eList:
        multiplicity[entry] = multiplicity.get(entry, 0) + 1

    counts = {}
    for word in text.split():
        weight = multiplicity.get(word, 0)
        if weight:
            counts[word] = counts.get(word, 0) + weight
    return counts

def getConceptsListFromJson(path, keyIdList):
    """Load concept predictions from a JSON file and align them to ``keyIdList``.

    The file is read as a list of dicts; each dict is looked up by its first
    key, whose value is a list of concept dicts with 'name' and 'value'.

    Args:
        path: path of the UTF-8 encoded JSON file.
        keyIdList: ids to look up, in the desired output order.

    Returns:
        A list with one entry per id in ``keyIdList``: a list of
        ``[name, value]`` pairs, or ``None`` when the file has no record for
        that id. If several records share an id, the first one is used.
    """
    with open(path, encoding='utf8') as access_json:
        read_concepts = json.load(access_json)

    # Index the records once instead of rescanning the whole file per id.
    # setdefault keeps the first record of a duplicated id.
    conceptsById = {}
    for item in read_concepts:
        firstKey = next(iter(item.keys()), None)
        if firstKey is not None:
            conceptsById.setdefault(firstKey, item[firstKey])

    mediaConceptList = []
    for key in keyIdList:
        if key in conceptsById:
            mediaConceptList.append(
                [[concept['name'], concept['value']] for concept in conceptsById[key]]
            )
        else:
            # Also reached for an empty file, which previously crashed on an
            # unassigned flag variable.
            mediaConceptList.append(None)
    return mediaConceptList

def Average(lst):
    """Return the arithmetic mean of ``lst``, or 0 when it cannot be computed.

    Args:
        lst: sized collection of numbers.

    Returns:
        ``sum(lst) / len(lst)``; 0 for an empty collection, for input without
        a length (e.g. ``None``) and for elements that cannot be summed.
    """
    try:
        count = len(lst)
        # Explicit empty check: the fallback to 0 used to depend on reduce()
        # raising TypeError for an empty sequence.
        if count == 0:
            return 0
        return sum(lst) / count
    except TypeError:
        return 0

In [59]:
# GraphProfileInfo section of the loaded JSON; the following cell reads its
# 'info' and 'created_time' entries.
profile_access = read_content['GraphProfileInfo']

In [60]:
# Profile-level values: display name, formatted creation time and the three
# counters taken from the 'info' sub-dict of GraphProfileInfo.
pinfo = profile_access['info']
profileDict = dict(
    name=pinfo['full_name'],
    time=timestampConv(profile_access['created_time']),
    followers=pinfo['followers_count'],
    following=pinfo['following_count'],
    posts=pinfo['posts_count'],
)

In [61]:
# GraphImages section of the loaded JSON; the later loops iterate over it and
# treat each element as the record of one media post.
media_access = read_content['GraphImages']

In [62]:
#Comment and PostCaption Extraction/ preparation
# All four lists are aligned with media_access (one entry per post).
mediaCaptionList = []            # cleaned caption text
mediaCaptionWordCount = []       # number of words in the cleaned caption
mediaCommentList = []            # 2D: list of cleaned comment texts per post
mediaCommentAvgWordCount = []    # mean words per comment, rounded to 2 decimals

for media_data in media_access:
    # Clean the caption once and reuse it; texPrep lower-cases on its own,
    # so no extra .lower() is needed.
    caption = texPrep(getCaption(media_data))
    mediaCaptionList.append(caption)
    mediaCaptionWordCount.append(len(caption.split()))

    postComments = [texPrep(data['text']) for data in media_data['comments']['data']]
    mediaCommentList.append(postComments)

    mediaCommentWordCount = [len(comment.split()) for comment in postComments]
    # Average() returns 0 for a post without comment data.
    mediaCommentAvgWordCount.append(round(Average(mediaCommentWordCount), 2))
    

In [63]:
# Caption sentiment per post, aligned with mediaCaptionList.
# The cleaned caption is scored as-is (no translation step).
sentCa = []   # full sentiment result
caPol = []    # polarity component
caSub = []    # subjectivity component

for text in mediaCaptionList:
    sentiment = TextBlob(text).sentiment
    sentCa.append(sentiment)
    caPol.append(sentiment.polarity)
    caSub.append(sentiment.subjectivity)
    

In [64]:
# Comment sentiment per post, aligned with mediaCommentList.
# All cleaned comments of a post are joined with '. ' and scored as one text
# (no translation step); a post without comments is scored on ''.
sentCo = []   # full sentiment result
coPol = []    # polarity component
coSub = []    # subjectivity component

for text in mediaCommentList:
    temptext = '. '.join(map(str, text))
    sentiment = TextBlob(temptext).sentiment
    sentCo.append(sentiment)
    coPol.append(sentiment.polarity)
    coSub.append(sentiment.subjectivity)


In [65]:
# Per-post metadata, one entry per element of media_access.
timestamp = []
likeCount = []
commentCount = []
mediaId = []
commentsDisabled = []
isVideo = []
shortCode = []

for media_data in media_access:
    timestamp.append(timestampConv(media_data['taken_at_timestamp']))
    likeCount.append(media_data['edge_media_preview_like']['count'])
    commentCount.append(media_data['edge_media_to_comment']['count'])
    # The media id is derived from the first URL of the post.
    mediaId.append(cut_mediaId(media_data['urls'][0]))
    # NOTE: despite its name this list stores the negated flag, i.e. True
    # means comments are enabled; the variable name is kept because later
    # cells refer to it.
    commentsDisabled.append(not media_data['comments_disabled'])
    isVideo.append(media_data['is_video'])
    shortCode.append(media_data['shortcode'])

# Check for uniqueness: the media id is used as the row key downstream.
assert len(set(mediaId)) == len(mediaId), "duplicate media ids found"

In [66]:
# Get Clarifai Concepts from Data, aligned with mediaId (None where the
# prediction file has no record for an id).
# Forward slashes keep the relative path valid on every operating system.
mediaConceptList = getConceptsListFromJson('../ProcessedData/Clarifai/B0Pred.json', mediaId)

In [67]:
#Profile variables
# Single-row frame built from the profile dict. DataFrame.from_dict is meant
# for dict input; a list of records belongs to the DataFrame constructor.
df_pf = pandas.DataFrame([profileDict])
df_pf

Unnamed: 0,name,time,followers,following,posts
0,UBS,00:00 06-10-2010,69744,39,746


In [68]:
# Response variables: engagement figures and comment sentiment per post,
# keyed by media id. All columns are lists aligned with media_access.
dic_rv = {
    "MediaId": mediaId,
    "Likes": likeCount,
    "Comments": commentCount,
    "CommentAvgWordCount": mediaCommentAvgWordCount,
    "totalCommentPolarity": coPol,
    "totalCommentSubjectivity": coSub,
}
df_rv = pandas.DataFrame(dic_rv)
df_rv

Unnamed: 0,MediaId,Likes,Comments,CommentAvgWordCount,totalCommentPolarity,totalCommentSubjectivity
0,119458461_1650353565143111_6795079425087612831_n,607,2,0.00,0.000000,0.000000
1,119134046_150786313386065_4228200184618243600_n,437,2,2.00,0.000000,0.000000
2,118975878_2813522342212269_2416149592951258226_n,610,3,2.33,0.000000,0.000000
3,118992100_236573151086751_1362399829598950182_n,785,6,11.17,0.435714,0.668571
4,119048820_2462786327352377_2491683754171384026_n,713,4,7.50,-0.062500,0.437500
...,...,...,...,...,...,...
741,40101910_284643735481674_7019445138957719532_n,337,2,3.00,0.000000,0.125000
742,40284383_1914458538855142_7980413325711471563_n,215,1,0.00,0.000000,0.000000
743,40471505_2361904550703552_4199470685947927075_n,256,2,7.50,1.000000,1.000000
744,40326544_2174914742724736_1531757175100145664_n,227,6,20.00,0.104040,0.333838


In [69]:
# Explanatory variables: post metadata, caption statistics and image concepts
# per post, keyed by media id.
# "CommentsEnabled" is fed from commentsDisabled, which was inverted in the
# metadata cell and therefore already holds "enabled" flags.
dic_ev = {
    "MediaId": mediaId,
    "Timestamp": timestamp,
    "CommentsEnabled": commentsDisabled,
    "Video": isVideo,
    "CaptionPolarity": caPol,
    "CaptionSubjectivity": caSub,
    "CaptionWordCount": mediaCaptionWordCount,
    "ICG": mediaConceptList,
}
df_ev = pandas.DataFrame(dic_ev)
df_ev

Unnamed: 0,MediaId,Timestamp,CommentsEnabled,Video,CaptionPolarity,CaptionSubjectivity,CaptionWordCount,ICG
0,119458461_1650353565143111_6795079425087612831_n,14:41 13-09-2020,True,False,0.002500,0.512500,143,"[[landscape, 0.9857326], [desert, 0.96861947],..."
1,119134046_150786313386065_4228200184618243600_n,13:33 12-09-2020,True,False,0.115909,0.319223,70,"[[pencil, 0.9938507], [college, 0.988171], [sc..."
2,118975878_2813522342212269_2416149592951258226_n,15:05 11-09-2020,True,False,0.116667,0.395833,61,"[[desktop, 0.98611414], [nature, 0.9781365], [..."
3,118992100_236573151086751_1362399829598950182_n,13:12 10-09-2020,True,False,0.166667,0.378704,67,"[[iceberg, 0.9970612], [ice, 0.99633443], [gla..."
4,119048820_2462786327352377_2491683754171384026_n,15:26 09-09-2020,True,False,0.196154,0.370513,125,"[[mountain, 0.99544024], [landscape, 0.9938926..."
...,...,...,...,...,...,...,...,...
741,40101910_284643735481674_7019445138957719532_n,15:07 08-09-2018,True,False,-0.047817,0.310615,68,"[[car, 0.9953246], [vehicle, 0.9947982], [tran..."
742,40284383_1914458538855142_7980413325711471563_n,14:44 07-09-2018,True,False,-0.045192,0.315385,84,"[[shopping, 0.9988129], [cart, 0.99792236], [b..."
743,40471505_2361904550703552_4199470685947927075_n,15:07 06-09-2018,True,False,0.308889,0.480556,83,"[[time, 0.9625405], [business, 0.94777244], [r..."
744,40326544_2174914742724736_1531757175100145664_n,13:37 05-09-2018,True,False,0.234226,0.588988,50,"[[container, 0.9966943], [jar, 0.99496186], [f..."


In [70]:
pickling_on = open("..\\ProcessedData\\B0.pickle","wb")
pickle.dump(df_ev, pickling_on)
pickle.dump(df_rv, pickling_on)
pickle.dump(df_pf, pickling_on)
pickling_on.close()