In [23]:
import json
import pandas
import re
import string
import time
import pickle
import numpy
from textblob import TextBlob
import functools

from collections import defaultdict, Counter
from datetime import datetime
from itertools import chain, islice
from nltk.tokenize import sent_tokenize, word_tokenize

In [24]:
#JSON to python Object
with open('..\\RawData\\B1\\B1.json', encoding='utf8') as access_json:
    read_content = json.load(access_json)

In [25]:
def cut_mediaId(urlStr):
    p = re.compile("\/(\d\S*)(.jpg|.mp4)")
    result = p.search(urlStr)
    idString = result.group(1)
    return idString

def timestampConv(timestamp):
    return(datetime.utcfromtimestamp(timestamp).strftime('%H:%M %d-%m-%Y '))

def getCaption(media_data):
        caption_access = media_data['edge_media_to_caption']
        edges_access = caption_access['edges']
        try:
            node_access = edges_access[0]
            text_access = node_access['node']
            caption = text_access["text"]
            return caption
        except IndexError:
            return ''

# Return first n items of the iterable as a list
def take(n, iterable):
    return list(islice(iterable, n))

#Removes specified symbols and lowers texts
#Could also include Emojis
def texPrep(text):
    res = text.lower()
    symFilter = string.punctuation+"–•"
    res = res.translate(str.maketrans('', '', symFilter))
    res = re.sub("[^A-Za-züöä]+|[\n]|[h][t][t][p]\S*|[\t]"," ",res)
    res = res.strip()
    return res

#List to string
def wordComp(text, eList):
    temp = dict()
    words = text.split()
    for word in words:
        for e in eList:
            if word == e:
                if word not in temp:
                    temp[word] = 1
                else:
                    temp[word] = temp[word] + 1
    a = temp.copy()
    temp.clear()    
    return a

def getConceptsListFromJson(path, keyIdList):
    with open(path, encoding='utf8') as access_json:
        read_concepts = json.load(access_json)
    mediaConceptList = []
    for key in keyIdList:
        for item in read_concepts:
            if key == list(item.keys())[0]:
                access = item[key]
                tempList = []
                for concept in access:
                    conceptTemp = []
                    conceptTemp.append(concept['name'])
                    conceptTemp.append(concept['value'])
                    tempList.append(conceptTemp)      
                mediaConceptList.append(tempList)
                keyNotIn = True
                break;
            else:
                keyNotIn = False
        if keyNotIn == False:
            mediaConceptList.append(None) 
    return mediaConceptList
        
def getConceptsListFromJsonV(path, keyIdList):
    with open(path, encoding='utf8') as access_json:
        read_concepts = json.load(access_json)
    mediaConceptList = []
    for key in keyIdList:
        for frame in read_concepts:
            if key == list(frame.keys())[0]:
                tempList = []
                for item in frame[key]:
                    for concept in item['data']['concepts']:
#                         tempList.append(concept['name'])
                        conceptTemp = []
                        conceptTemp.append(concept['name'])
                        conceptTemp.append(concept['value'])
                        tempList.append(conceptTemp)
                keyNotIn = True
                break;
            else:
                keyNotIn = False
        if keyNotIn == True:
            mediaConceptList.append(tempList)
        if keyNotIn == False:
            mediaConceptList.append(None) 
    return mediaConceptList

def flattenList(alist):
    flat_list = []
    for sublist in mediaConceptListV:
        if sublist is None:
            flat_list.append(None)
        else: 
            tempList = []
            for item in sublist:    
                tempList.append(item[0])
            flat_list.append(tempList)
    return flat_list


def getMostCommonFromVideo(conceptList):
    newList = []
#     flatConceptList = flattenList(conceptList)
    for video in conceptList:
        if video is not None:
            tagList = []
            valueList = {}
            for concept in video:
                if concept[0] not in valueList:
                    valueList[concept[0]] = []
                valueList[concept[0]].append(concept[1])
                tagList.append(concept[0])     
            
            occ = Counter(tagList)  
            allConceptList = []
            for i in occ.most_common(20):
                tempMConceptList = []
                tempMConceptList.append(i[0])
                tempMConceptList.append(numpy.average(valueList[i[0]]))
                allConceptList.append(tempMConceptList)
            newList.append(allConceptList)
        else:
            newList.append(None)
            
    return newList



    
def mergeList(list1, list2):
    copy1 = list1.copy()
    copy2 = list2.copy()
    copy1 = ['None' if v is None else v for v in copy1]
    copy2 = ['None' if v is None else v for v in copy2]
    output = []
    for x in copy1:
        if x == 'None':
            for y in copy2: 
                if y == 'None':
                    copy2.remove(y)
                else:
                    output.append(y)
                    copy2.remove(y)
        else:
            output.append(x)
    return output

def Average(lst): 
    try:
        return functools.reduce(lambda a, b: a + b, lst) / len(lst) 
    except TypeError:
        return 0

In [26]:
#Accessing GraphProfileInfo
profile_access = read_content['GraphProfileInfo']

In [27]:
#GraphImageValues
pinfo = profile_access['info']
profileDict = {'name' : pinfo['full_name'],
               'time' : timestampConv(profile_access['created_time']),
               'followers' : pinfo['followers_count'],
               'following' : pinfo['following_count'],
               'posts' : pinfo['posts_count']}

In [28]:
#Accessing GraphImages
media_access = read_content['GraphImages']

In [29]:
#Comment and PostCaption Extraction/ preparation
mediaCaptionList = []
mediaCaptionWordCount = []


#2D CommentList
mediaCommentList = []
mediaCommentAvgWordCount = []
for media_data in media_access:
    mediaCommentList.append([])

pointerx = 0
for media_data in media_access:
    mediaCaptionList.append(texPrep(getCaption(media_data).lower()))
    mediaCaptionWordCount.append(len(texPrep(getCaption(media_data)).split()))
    comments_access = media_data['comments']
    data_access = comments_access['data']
    mediaCommentWordCount = []                             
    for data in data_access:
        mediaCommentList[pointerx].append(texPrep(data['text']))
        mediaCommentWordCount.append(len(texPrep(data['text']).split()))
        
    mediaCommentAvgWordCount.append(round(Average(mediaCommentWordCount), 2))
    pointerx += 1
    

In [30]:
sentCa = []
caPol = []
caSub = []

for text in mediaCaptionList:
    listToStr = ''.join([str(elem) for elem in text]) 
    blob = TextBlob(text)
#     try:
#         blob = blob.translate(to='en')
#     except:
#         pass
    sentCa.append(blob.sentiment)
    caPol.append(blob.sentiment.polarity)
    caSub.append(blob.sentiment.subjectivity)
    

In [31]:
sentCo = []
coPol = []
coSub = []

for text in mediaCommentList:
    listToStr = ''.join([str(elem) for elem in text]) 
    temptext = '. '.join(map(str, text)) 
    blob = TextBlob(temptext)
#     try:
#         blob = blob.translate(to='en')
#     except:
#         pass
    sentCo.append(blob.sentiment)
    coPol.append(blob.sentiment.polarity)
    coSub.append(blob.sentiment.subjectivity)


In [32]:
timestamp = []
likeCount = []
commentCount = []
mediaId = []
commentsDisabled = []
isVideo = []
shortCode = []

for media_data in media_access:
    timestamp.append(timestampConv(media_data['taken_at_timestamp']))
    likeCount_access = media_data['edge_media_preview_like']
    likeCount.append(likeCount_access['count'])
    commentCount_access = media_data['edge_media_to_comment']
    commentCount.append(commentCount_access['count'])
    accessId = media_data['urls']
    mediaId.append(cut_mediaId(accessId[0]))
    commentsDisabled.append(media_data['comments_disabled'])
    isVideo.append(media_data['is_video'])
    shortCode.append(media_data['shortcode'])
    
#Flipp List
commentsDisabled = [not i for i in commentsDisabled]

# Check for uniqueness
flag = 0
flag = len(set(mediaId)) == len(mediaId)
assert(flag == True)

In [33]:
# Get Clarifai Image Concepts from Data
mediaConceptListI = getConceptsListFromJson('..\\ProcessedData\\Clarifai\\B1Pred.json', mediaId)

In [34]:
# Get Clarifai Video Concepts from Data
mediaConceptListV = getConceptsListFromJsonV('..\\ProcessedData\\Clarifai\\B1PredV.json', mediaId)

In [35]:
mostCommon = getMostCommonFromVideo(mediaConceptListV)

In [36]:
mediaConceptList = mergeList(mediaConceptListI, mostCommon)

In [37]:
#Profile variables
df_pf = pandas.DataFrame.from_dict([profileDict])
# df_pf.to_excel('C:\\Users\\Daniel\\Desktop\\Bachelorthesisdata\\DataExtraction\\wellsfargopro.xlsx', index = False)
df_pf

Unnamed: 0,name,time,followers,following,posts
0,Wells Fargo,00:00 06-10-2010,89662,54,272


In [38]:
#Response variables
dic_rv = {"MediaId" : mediaId, "Likes" : likeCount, "Comments" : commentCount, "totalCommentPolarity" : coPol, "CommentAvgWordCount": mediaCommentAvgWordCount, "totalCommentSubjectivity" : coSub}
df_rv = pandas.DataFrame(dic_rv)
df_rv

Unnamed: 0,MediaId,Likes,Comments,totalCommentPolarity,CommentAvgWordCount,totalCommentSubjectivity
0,121956880_171467364611628_4986685135118973822_n,204,59,0.095430,15.74,0.559315
1,121530632_375214783529972_2186779159543342931_n,391,30,0.171087,13.30,0.454010
2,120829749_341972467232372_3719255655593396633_n,500,107,0.093808,28.35,0.484094
3,119855190_452853985630952_5979221108272615688_n,538,1356,0.027823,19.79,0.478211
4,119222906_120450356467709_5290671636298070816_n,264,146,0.039081,23.27,0.480358
...,...,...,...,...,...,...
267,10693560_1510081459240155_587168077_n,107,27,0.084746,18.92,0.475079
268,1389793_913072325386764_255645387_n,103,4,1.000000,3.50,1.000000
269,10522839_1551211985091680_1444434316_n,116,5,0.291667,6.00,0.450000
270,1388858_346794825487750_1327851000_n,101,7,0.131250,44.80,0.279167


In [39]:
#Explanatory variables
dic_ev = {"MediaId" : mediaId, "Timestamp" : timestamp, "CommentsEnabled" : commentsDisabled, "Video" : isVideo, "CaptionPolarity" : caPol, "CaptionSubjectivity" : caSub,"CaptionWordCount": mediaCaptionWordCount, "ICG": mediaConceptList}
df_ev = pandas.DataFrame(dic_ev)
df_ev

Unnamed: 0,MediaId,Timestamp,CommentsEnabled,Video,CaptionPolarity,CaptionSubjectivity,CaptionWordCount,ICG
0,121956880_171467364611628_4986685135118973822_n,14:01 15-10-2020,True,True,0.500000,0.500000,38,"[[people, 0.9598162616249999], [horizontal, 0...."
1,121530632_375214783529972_2186779159543342931_n,13:26 15-10-2020,True,False,0.000000,0.000000,29,"[[people, 0.9588461235483872], [woman, 0.96106..."
2,120829749_341972467232372_3719255655593396633_n,14:07 05-10-2020,True,False,0.000000,0.000000,29,"[[woman, 0.9858032434146342], [people, 0.96870..."
3,119855190_452853985630952_5979221108272615688_n,16:00 21-09-2020,True,False,0.500000,0.500000,26,"[[man, 0.956414969512195], [indoors, 0.9765654..."
4,119222906_120450356467709_5290671636298070816_n,16:14 15-09-2020,True,True,-0.025568,0.477273,31,"[[fun, 0.9493503839999999], [family, 0.9363821..."
...,...,...,...,...,...,...,...,...
267,10693560_1510081459240155_587168077_n,21:51 06-10-2014,True,False,0.266136,0.446288,124,"[[man, 0.9927246], [people, 0.9895527], [popco..."
268,1389793_913072325386764_255645387_n,16:21 06-10-2014,True,False,0.259949,0.536735,100,"[[people, 0.9933882], [artisan, 0.97386533], [..."
269,10522839_1551211985091680_1444434316_n,17:37 03-10-2014,True,False,0.177083,0.477083,95,"[[mammal, 0.99661726], [animal, 0.99302787], [..."
270,1388858_346794825487750_1327851000_n,20:59 02-10-2014,True,False,0.446875,0.856250,116,"[[elderly, 0.99939466], [elder, 0.99899286], [..."


In [40]:
pickling_on = open("..\\ProcessedData\\B1.pickle","wb")
pickle.dump(df_ev, pickling_on)
pickle.dump(df_rv, pickling_on)
pickle.dump(df_pf, pickling_on)
pickling_on.close()