In [1]:
import pandas as pd
import json 
import glob
import re
from datetime import datetime

In [2]:
# Lift the column-count limit so wide DataFrames are displayed without
# eliding columns.
pd.set_option('display.max_columns', None)

In [3]:
def transform_datetime(df, column, fmt="%Y-%m-%dT%H:%M:%S.000Z"):
    """Parse a column of timestamp strings into ``datetime`` objects.

    The default format matches strings such as ``2021-12-01T10:20:30.000Z``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column. It is modified in place.
    column : str
        Name of the column to convert.
    fmt : str, optional
        ``datetime.strptime`` format used for parsing.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``column`` replaced by the parsed values.
        Values that are not strings, or strings of 10 characters or fewer,
        are mapped to ``None`` (pandas may show these as ``NaT``).

    Raises
    ------
    ValueError
        If a string longer than 10 characters does not match ``fmt``.
    """
    def _parse(value):
        # Missing cells are None/NaN rather than strings; calling len() on
        # them would raise TypeError, so check the type first.
        if isinstance(value, str) and len(value) > 10:
            return datetime.strptime(value, fmt)
        return None

    df[column] = df[column].apply(_parse)
    return df

In [4]:
# Build one record per "<account>_Follower.json" export: the account name
# (taken from the file name) and the list of follower objects it contains.
records = []
for path in glob.glob("Data/Twitter/*_Follower.json"):
    # Capture the file-name part before "_Follower.json". Both "/" and "\"
    # are treated as path separators so the cell runs on POSIX and Windows
    # (the previous pattern only matched Windows-style "Data/Twitter\..." paths
    # and raised AttributeError elsewhere).
    match = re.search(r'([^\\/]*)_Follower\.json$', path)
    name = match.group(1)
    with open(path, 'r', encoding='utf8') as f:
        records.append({'account': name, 'follower': json.load(f)})
follower = pd.DataFrame(records)

# One row per follower, then promote the fields of each follower object and
# of its nested "public_metrics" object to columns.
df = follower.explode('follower')
df = pd.concat([df, df["follower"].apply(pd.Series)], axis=1)
df = pd.concat([df, df["public_metrics"].apply(pd.Series)], axis=1)
df = df.drop(columns=['follower', 'public_metrics'])

df.to_csv('twitter_follower.csv', index=False)

In [5]:
# Build one record per "<account>_Following.json" export: the account name
# (taken from the file name) and the list of followed-account objects.
records = []
for path in glob.glob("Data/Twitter/*_Following.json"):
    # Capture the file-name part before "_Following.json". Both "/" and "\"
    # are treated as path separators so the cell runs on POSIX and Windows
    # (the previous pattern only matched Windows-style "Data/Twitter\..." paths
    # and raised AttributeError elsewhere).
    match = re.search(r'([^\\/]*)_Following\.json$', path)
    name = match.group(1)
    with open(path, 'r', encoding='utf8') as f:
        records.append({'account': name, 'following': json.load(f)})
# Store the result under `following`; it was previously assigned to
# `follower`, which left `following` as None and overwrote the follower data.
following = pd.DataFrame(records)

# One row per followed account, then promote the fields of each object and
# of its nested "public_metrics" object to columns.
df = following.explode('following')
df = pd.concat([df, df["following"].apply(pd.Series)], axis=1)
df = pd.concat([df, df["public_metrics"].apply(pd.Series)], axis=1)
df = df.drop(columns=['following', 'public_metrics'])

df.to_csv('twitter_following.csv', index=False)

In [6]:
# Build one record per "<account>_Tweets.json" export: the account name
# (taken from the file name) and the list of tweet objects it contains.
records = []
for path in glob.glob("Data/Twitter/*_Tweets.json"):
    # Capture the file-name part before "_Tweets.json". Both "/" and "\"
    # are treated as path separators so the cell runs on POSIX and Windows
    # (the previous pattern only matched Windows-style "Data/Twitter\..." paths
    # and raised AttributeError elsewhere).
    match = re.search(r'([^\\/]*)_Tweets\.json$', path)
    name = match.group(1)
    with open(path, 'r', encoding='utf8') as f:
        records.append({'account': name, 'tweets': json.load(f)})
tweets = pd.DataFrame(records)

# One row per tweet, then promote the fields of each tweet object and of its
# nested "public_metrics" object to columns.
df = tweets.explode('tweets')
df = pd.concat([df, df["tweets"].apply(pd.Series)], axis=1)
df = pd.concat([df, df["public_metrics"].apply(pd.Series)], axis=1)
# Keep only the id of the first referenced tweet; rows without a (non-empty)
# list of references get None.
df["referenced_tweets"] = df["referenced_tweets"].apply(
    lambda refs: refs[0]['id'] if isinstance(refs, list) and refs else None
)
# NOTE: this relies on the transform_datetime variant that parses the
# ".000Z" millisecond format; run this cell before the later redefinition
# of transform_datetime.
df = transform_datetime(df, 'created_at')
df = df.drop(columns=['tweets', 'public_metrics'])

df.to_csv('twitter_tweets.csv', index=False)

In [7]:
def transform_datetime(df, column, fmt="%Y-%m-%dT%H:%M:%SZ"):
    """Parse a column of timestamp strings into ``datetime`` objects.

    The default format matches strings such as ``2021-12-01T10:20:30Z``.

    NOTE: this definition replaces the earlier ``transform_datetime`` (which
    parses the ``.000Z`` millisecond format), so every cell executed after
    this one uses the format below unless ``fmt`` is passed explicitly.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column. It is modified in place.
    column : str
        Name of the column to convert.
    fmt : str, optional
        ``datetime.strptime`` format used for parsing.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``column`` replaced by the parsed values.
        Values that are not strings, or strings of 10 characters or fewer,
        are mapped to ``None`` (pandas may show these as ``NaT``).

    Raises
    ------
    ValueError
        If a string longer than 10 characters does not match ``fmt``.
    """
    def _parse(value):
        # Only sufficiently long strings are parsed; everything else
        # (None, NaN, short strings) is treated as missing.
        if isinstance(value, str) and len(value) > 10:
            return datetime.strptime(value, fmt)
        return None

    df[column] = df[column].apply(_parse)
    return df

In [8]:
# Load the Prodigy order export and promote the fields of each nested
# "order" object to columns, discarding the ones that are not exported.
with open('Data/Prodigy/Prodigy_Orders.json', 'r', encoding='utf8') as f:
    orders = pd.DataFrame(json.load(f))

unused_order_columns = ['download_items', 'order_custom_checkout_fields', 'downloads', 'gateway_transaction_ids']
orders = orders["order"].apply(pd.Series).drop(columns=unused_order_columns)

# Cart table: the order id next to the flattened "cart" object, then one row
# per entry of "cart_items" with the entry and its "cart_item" object expanded.
cart = pd.concat([orders['id'], orders["cart"].apply(pd.Series)], axis=1).rename(columns={"id": "order_id"})
cart = cart.explode('cart_items')
for nested in ('cart_items', 'cart_item'):
    cart = pd.concat([cart, cart[nested].apply(pd.Series)], axis=1)
cart = transform_datetime(cart.drop(columns=['cart_items', 'cart_item']), 'started_checkout_at')

# The cart now lives in its own table, so remove it from the orders table
# before converting the order timestamps.
orders = orders.drop(columns=['cart'])
for column in ('updated_at', 'created_at'):
    orders = transform_datetime(orders, column)

cart.to_csv('prodigy_cart.csv', index=False)
orders.to_csv('prodigy_orders.csv', index=False)

In [9]:
# Load the Prodigy product export, flatten each nested "product" object into
# columns, convert its timestamps and write the table to CSV.
with open('Data/Prodigy/Prodigy_Products.json', 'r', encoding='utf8') as f:
    products = pd.DataFrame(json.load(f))

products = products["product"].apply(pd.Series).drop(columns=['attachment'])
for column in ('updated_at', 'created_at'):
    products = transform_datetime(products, column)
products.to_csv('prodigy_products.csv', index=False)

In [23]:
# Load the Prodigy package export, flatten each nested "package" object into
# columns, convert its timestamps and write the table to CSV.
with open('Data/Prodigy/Prodigy_Packages.json', 'r', encoding='utf8') as f:
    packages = pd.DataFrame(json.load(f))

packages = packages["package"].apply(pd.Series)
# Replace each "components" object by just its "product_ids" entry.
packages['components'] = packages['components'].apply(lambda component: component['product_ids'])
for column in ('updated_at', 'created_at'):
    packages = transform_datetime(packages, column)
packages.to_csv('prodigy_packages.csv', index=False)

In [11]:
# Load the GitHub repository export and split it into four CSV tables:
# repositories, issues, pull requests and commits.
with open('Data/GitHub/EXPLOSION_Repositories.json', 'r', encoding='utf8') as f:
    repositories = pd.DataFrame(json.load(f))

repositories = repositories.drop(columns=['owner', 'permissions', 'license', 'topics'])

# --- Issues: one row per issue, with the issue object and its "user" object
# promoted to columns.
issues = repositories[['id', 'name', 'issues']].explode('issues')
issues = pd.concat([issues, issues["issues"].apply(pd.Series)], axis=1)
# A column labelled 0 only appears when some rows hold a scalar (e.g. NaN from
# exploding an empty list) instead of a dict, so its presence depends on the
# data. Drop it when present rather than failing with KeyError when absent.
issues = issues.drop(columns=['issues']).drop(columns=[0], errors='ignore')
issues = pd.concat([issues, issues["user"].apply(pd.Series)], axis=1)
for column in ('closed_at', 'created_at', 'updated_at'):
    issues = transform_datetime(issues, column)

# --- Pull requests: one row per pull request, with the pull-request object
# and its "user" object promoted to columns.
pulls = repositories[['id', 'name', 'pulls']].explode('pulls')
pulls = pd.concat([pulls, pulls["pulls"].apply(pd.Series)], axis=1)
pulls = pd.concat([pulls, pulls["user"].apply(pd.Series)], axis=1)
# Same data-dependent placeholder column 0 as for issues.
pulls = pulls.drop(columns=['user', '_links', 'base']).drop(columns=[0], errors='ignore')
for column in ('closed_at', 'created_at', 'merged_at', 'updated_at'):
    pulls = transform_datetime(pulls, column)

# --- Commits: one row per commit. The top-level "author" column is dropped
# first so that the "author" object nested inside "commit" can be expanded
# without a name clash.
commits = repositories[['id', 'name', 'commits']].explode('commits')
commits = pd.concat([commits, commits["commits"].apply(pd.Series)], axis=1)
commits = commits.drop(columns=['author'])
commits = pd.concat([commits, commits["commit"].apply(pd.Series)], axis=1)
commits = pd.concat([commits, commits["author"].apply(pd.Series)], axis=1)
commits = commits.drop(columns=['verification', 'committer', 'author', 'parents', 'tree', 'commit'])
commits = transform_datetime(commits, 'date')

repositories.to_csv('github_repositories.csv', index=False)
issues.to_csv('github_issues.csv', index=False)
pulls.to_csv('github_pulls.csv', index=False)
commits.to_csv('github_commits.csv', index=False)

In [12]:
# Load the GitHub counter progression export into a DataFrame and write it
# to CSV unchanged.
with open('Data/GitHub/EXPLOSION_GitHub_Counters_Progression.json', 'r', encoding='utf8') as f:
    counter_progression = pd.DataFrame(json.load(f))

# NOTE(review): the output name is spelled "gihub" (not "github"); it is kept
# as-is because other tooling may read this exact file name - confirm and fix.
counter_progression.to_csv('gihub_counter.csv', index=False)

In [13]:
# Load the GitHub traffic progression export into a DataFrame and write it
# to CSV unchanged.
with open('Data/GitHub/EXPLOSION_GitHub_Traffic_Progression.json', 'r', encoding='utf8') as f:
    traffic_progression = pd.DataFrame(json.load(f))

# NOTE(review): the output name is spelled "gihub" (not "github"); it is kept
# as-is because other tooling may read this exact file name - confirm and fix.
traffic_progression.to_csv('gihub_traffic.csv', index=False)


In [14]:
# Keep only the traffic rows whose repository name is exactly 'spaCy'.
is_spacy = traffic_progression['name'] == 'spaCy'
df = traffic_progression.loc[is_spacy]

In [15]:
# Display the current contents of `df`.
df

Unnamed: 0,timestamp,id,name,views,clones,popular_paths,referrers
15,1639575585,21467110,spaCy,"[{'timestamp': '2021-12-01T00:00:00Z', 'count'...","[{'timestamp': '2021-12-01T00:00:00Z', 'count'...","[{'path': '/explosion/spaCy', 'title': 'GitHub...","[{'referrer': 'Google', 'count': 19548, 'uniqu..."
58,1639652681,21467110,spaCy,"[{'timestamp': '2021-12-02T00:00:00Z', 'count'...","[{'timestamp': '2021-12-02T00:00:00Z', 'count'...","[{'path': '/explosion/spaCy', 'title': 'GitHub...","[{'referrer': 'Google', 'count': 19677, 'uniqu..."
101,1640016116,21467110,spaCy,"[{'timestamp': '2021-12-06T00:00:00Z', 'count'...","[{'timestamp': '2021-12-06T00:00:00Z', 'count'...","[{'path': '/explosion/spaCy', 'title': 'GitHub...","[{'referrer': 'Google', 'count': 19336, 'uniqu..."
144,1640096900,21467110,spaCy,"[{'timestamp': '2021-12-07T00:00:00Z', 'count'...","[{'timestamp': '2021-12-07T00:00:00Z', 'count'...","[{'path': '/explosion/spaCy', 'title': 'GitHub...","[{'referrer': 'Google', 'count': 19070, 'uniqu..."


In [16]:
# Narrow the frame to the timestamp and the nested popular-path records,
# then display it.
view = df.loc[:, ['timestamp', 'popular_paths']]
view

Unnamed: 0,timestamp,popular_paths
15,1639575585,"[{'path': '/explosion/spaCy', 'title': 'GitHub..."
58,1639652681,"[{'path': '/explosion/spaCy', 'title': 'GitHub..."
101,1640016116,"[{'path': '/explosion/spaCy', 'title': 'GitHub..."
144,1640096900,"[{'path': '/explosion/spaCy', 'title': 'GitHub..."


In [17]:
# One row per popular-path record, with the fields of each record promoted
# to columns (the timestamp column is not carried over).
# The table is derived from `df` rather than from the previous value of
# `view`, so this cell can be re-run without failing on a missing
# 'popular_paths' column.
view = (
    df[['timestamp', 'popular_paths']]
    .explode('popular_paths')['popular_paths']
    .apply(pd.Series)
)
view

Unnamed: 0,path,title,count,uniques
15,/explosion/spaCy,GitHub - explosion/spaCy: 💫 Industrial-strengt...,3611,2388
15,/explosion/spaCy/issues/4577,OSError: [E050] Can't find model 'en_core_web_...,1770,1568
15,/explosion/spaCy/issues,Issues · explosion/spaCy · GitHub,1269,351
15,/explosion/spaCy/tree/master/spacy,spaCy/spacy at master · explosion/spaCy · GitHub,838,471
15,/explosion/spaCy/discussions,New Discussion · explosion/spaCy,804,204
15,/explosion/spaCy/search,Search · token,780,216
15,/explosion/spaCy/discussions/9571,UserWarning: User provided device_type of 'cud...,579,463
15,/explosion/spaCy/releases,Releases · explosion/spaCy · GitHub,367,228
15,/explosion/spaCy/tree/master/spacy/lang,spaCy/spacy/lang at master · explosion/spaCy ·...,322,191
15,/explosion/spaCy/issues/7453,[E053] Could not read config.cfg · Issue #7453...,287,241


In [18]:
# Display the counter progression table loaded earlier in the notebook.
counter_progression

Unnamed: 0,timestamp,id,name,size,stargazers_count,watchers_count,forks_count,open_issues_count,forks,watchers,contributor,pulls,issues,commits
0,1639575585,433368801,.github,0,0,0,0,0,0,0,1,0,0,1
1,1639575585,218997674,catalogue,103,101,101,18,4,18,101,8,20,26,96
2,1639575585,24161806,cymem,148,408,408,30,3,30,408,7,17,33,151
3,1639575585,107000608,cython-blis,5144,182,182,29,7,29,182,10,35,62,532
4,1639575585,69860607,displacy,131,325,325,77,8,77,325,7,9,9,24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
166,1640096900,318726532,thinc-apple-ops,25,55,55,5,0,5,55,5,9,10,21
167,1640096900,152978391,thinc_gpu_ops,89,9,9,2,0,2,9,3,1,1,13
168,1640096900,230892092,tokenizations,2350,120,120,13,0,13,120,4,65,75,291
169,1640096900,430534914,vscode-prodigy,27112,21,21,0,0,0,21,1,4,4,17


In [21]:
# Load the Twitter follower progression export into a DataFrame and write it
# to CSV.
with open('Data/Twitter/Twitter_Follower_Progression.json', 'r', encoding='utf8') as f:
    test = pd.DataFrame(json.load(f))

# NOTE(review): unlike the other exports in this notebook, the DataFrame index
# is written to the file here (no index=False) - confirm this is intended.
test.to_csv('twitter_follower_progression.csv')

In [22]:
# Load the Twitter following progression export into a DataFrame and write it
# to CSV.
with open('Data/Twitter/Twitter_Following_Progression.json', 'r', encoding='utf8') as f:
    test = pd.DataFrame(json.load(f))

# NOTE(review): unlike the other exports in this notebook, the DataFrame index
# is written to the file here (no index=False) - confirm this is intended.
test.to_csv('twitter_following_progression.csv')