Quantify the number of savers and spenders

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import re
from tqdm import tqdm
import os

In [2]:
# Read the three sampled groups (asking for money / no money / some money).
df_money, df_no_money, df_some_money = map(pd.read_csv, [
    '../../data/asking_money_sample.csv',
    '../../data/asking_no_money_sample.csv',
    '../../data/asking_some_money_sample.csv',
])

# Stack the groups into one frame. ignore_index=False keeps each sample's own
# row labels, so labels in the combined index may repeat.
df = pd.concat(
    [df_money, df_some_money, df_no_money],
    ignore_index=False,
)

In [3]:
def convert_datetime(x):
    """Parse a date string into a ``datetime``; pass non-strings through.

    Strings that contain both 'T' and 'Z' (for example
    ``2019-05-23T10:11:12Z``) are cut at the first 'T' before parsing, so the
    time of day and the zone marker are discarded and only the calendar date
    is kept.

    Parameters
    ----------
    x : object
        Value to convert. Anything that is not a ``str`` (e.g. NaN or an
        already-parsed datetime) is returned unchanged.

    Returns
    -------
    datetime.datetime or object
        The parsed datetime for string input, otherwise ``x`` itself.

    Raises
    ------
    ValueError
        If ``x`` is a string that matches none of the supported formats.
    """
    if not isinstance(x, str):
        return x

    # Ignore time zone info (this also drops the time-of-day component).
    if 'T' in x and 'Z' in x:
        x = x.split('T')[0]

    # Tried in order; the first format that parses wins.
    formats = (
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d',
        '%m/%d/%Y',
        '%m/%d/%Y %H:%M:%S',
        '%m/%d/%Y %H:%M',
    )
    for fmt in formats:
        try:
            return datetime.strptime(x, fmt)
        except ValueError:
            # Only a format mismatch is expected here; anything else
            # (e.g. KeyboardInterrupt) must propagate.
            continue

    raise ValueError(
        'time data {!r} does not match any supported format'.format(x)
    )

In [4]:
# Returns 0 for neither saver nor spender, 1 for saver, and 2 for spender
def analyze_oc_date(url, path='../data_mining/files/20190719_OpenCollective CSV',
                    cutoff='2019-05-23', saver_ratio=0.25, spender_ratio=0.75):
    """Classify an Open Collective project as saver, spender, or neither.

    The project slug is taken from the last path segment of ``url`` (after
    dropping any ``#fragment`` and anything following a space) and
    lower-cased. The first file in ``path`` whose name, up to the first
    ``'--'``, equals that slug is read as a CSV of transactions. Positive
    'Transaction Amount' values dated on or before ``cutoff`` are summed as
    earnings; negative ones are summed (sign flipped) as expenses.

    Parameters
    ----------
    url : str
        Open Collective URL of the project.
    path : str, optional
        Directory holding the per-project transaction CSV files.
    cutoff : str or datetime-like, optional
        Only transactions with 'Transaction Date' <= ``cutoff`` are counted.
    saver_ratio : float, optional
        A project is a saver when expenses < ``saver_ratio`` * earnings.
    spender_ratio : float, optional
        A project is a spender when expenses > ``spender_ratio`` * earnings.

    Returns
    -------
    int
        1 for saver, 2 for spender, 0 for neither. 0 is also returned when no
        matching CSV file is found in ``path``.
    """
    # Reduce the URL to its slug: drop the fragment, keep the last path
    # segment, and discard anything after the first space.
    slug = url.split('#')[0].rstrip()
    slug = slug.split('/')[-1]
    slug = slug.split(' ')[0].lower()

    for filename in os.listdir(path):
        # The part of the file name before the first '--' is compared
        # with the lower-cased slug.
        if filename.split('--')[0] != slug:
            continue

        df_oc = pd.read_csv(os.path.join(path, filename))
        dates = df_oc['Transaction Date'].apply(convert_datetime)
        amounts = df_oc['Transaction Amount']

        # Restrict both totals to the same observation window.
        in_window = dates <= cutoff
        total_earning = amounts[(amounts > 0) & in_window].sum()
        total_expense = -amounts[(amounts < 0) & in_window].sum()

        if total_expense < saver_ratio * total_earning:
            return 1
        if total_expense > spender_ratio * total_earning:
            return 2
        return 0

    # No transaction file matched this project.
    return 0

In [5]:
# Tally savers / spenders / neither over every project that lists an
# Open Collective URL; rows without one are not counted at all.
dicts = df.to_dict('records')
results = {'saver':0, 'spender':0, 'neither':0}
labels = {1: 'saver', 2: 'spender'}  # codes returned by analyze_oc_date
for row in dicts:
    oc_url = row['opencollective_url']
    if pd.isna(oc_url):
        continue
    label = labels.get(analyze_oc_date(oc_url))
    if label is None:
        # Any other return code counts as neither saver nor spender.
        results['neither'] += 1
        continue
    print(label + ':', row['slug'])
    results[label] += 1

spender: wangfangning/webpack-cli
saver: trollianspace/trollian
saver: feathersjs/feathers
saver: facebook/docusaurus
saver: ryandharper/ryanharper.co.uk
saver: burtonator/polar-bookshelf
saver: sinonjs/samsam
saver: slaveuser/bootstrap20190320
saver: lukehelminiak/Alpha
saver: Actinium-project/ln-plugin-pony
saver: goby-lang/goby
saver: sous-chefs/ossec
saver: Andreezw/Tablero
saver: micaeloliveira/gatsby-starter-lumen
spender: discourse/wp-discourse


In [6]:
# Show the final counts of savers, spenders, and projects that are neither.
results

{'saver': 13, 'spender': 2, 'neither': 12}