In [1]:
# https://www.dropbox.com/developers/apps/info/pu644k33a199294#settings

import pandas as pd
import numpy as np
import dropbox
import camelot
import tempfile
import fitz
import os
import re

In [3]:
dbx = dropbox.Dropbox(DB_TOKEN)


In [4]:
def contains(string:str, contains:list) -> bool:
    for cont in contains:
        if cont in string:
            return True
    
    return False

## Classify

In [5]:
def classify_file(path, file_bytes, verbose=False):
    try:
        extension = os.path.splitext(path)[1]

        if extension == ".pdf":
                reader = fitz.open(stream=file_bytes)
                content = reader.load_page(0).get_text()
        elif extension == ".xlsx":
                content = pd.read_excel(file_bytes).to_string()
        elif extension == ".xlsb":
                content = pd.read_excel(file_bytes, engine='pyxlsb').to_string()
        else:
                return "OTHER"

        content = content.lower()
        if "wrapbook" in content:
            return "OTHER"
        elif contains(content, ["cost summary", "hot budget", "film production cost summary"]):
            return "CS"
        elif "purchase order" in content:
            return "PO"
        elif "payroll" in content:
            return "PR"
        else:
            return "OTHER"
    except Exception as e:
        print(e) if verbose else print("classification error at ", path)
        return "OTHER"

## Department Getter

In [6]:
def get_dept_from_line(ln:int) -> str:
    try:
        ln = int(ln)
    except ValueError:
        return ln

    if ln in range(51):
        return "PRE-PRODUCTION | WRAP LABOR"
    elif ln in range(51, 101):
        return "SHOOTING LABOR"
    elif ln in range(101, 114):
        return "PRE-PRODUCTION | WRAP EXPENSES"
    elif ln in range(114, 140):
        return "LOCATION AND TRAVEL"
    elif ln in range(140,151):
        return "MAKEUP, WARDROBE, AND ANIMALS"
    elif ln in range(151, 168):
        return "STUDIO | STAGE RENTAL / EXPENSES"
    elif ln in range(168,181):
        return "ART DEPARTMENT LABOR"
    elif ln in range(181, 193):
        return "ART DEPARTMENT EXPENSES"
    elif ln in range(193, 211):
        return "EQUIPMENT COSTS"
    elif ln in range(211, 217):
        return "FILMSTOCK, DEVELOP AND PRINT"
    elif ln in range(217,227):
        return "MISCELLANEOUS"
    elif ln in range(227, 234):
        return "DIRECTOR | CREATIVE FEES"
    elif ln in range(234, 271):
        return "TALENT LABOR"
    elif ln in range(271, 277):
        return "TALENT EXPENSES"
    elif ln in range(277, 282):
        return "POST PRODUCTION LABOR"
    elif ln in range(282, 330):
        return "EDITORIAL | FINISHING | POST PRODUCTION"
    else:
        return "OTHER"

## Cost Summary Reader

In [7]:
HB_CS_COLS = ["SECTION", "drop", "BID TOTALS", "ACTUAL", "VARIANCE"]

def read_hot_budget_cs(file_bytes) -> pd.DataFrame:
    _df = camelot.read_pdf(file_bytes)._tables[1].df.copy()
    
    _df.drop(12, inplace=True)

    _df.columns = HB_CS_COLS
    _df.drop(columns=["drop"], inplace=True)
    _df = _df.loc[1:]

    _df = _df.replace([r"CS\d+\b ", r".*\n", "\)", ","], "", regex=True).replace("\(", "-", regex=True)

    _df[_df.columns[1:]] = _df.iloc[:, 1:].replace("", np.nan).astype(float)

    _df = _df.dropna(thresh=2).fillna(0.0) # if drop empty rows

    return _df.reset_index(drop=True)


def read_GetActual_cs(file_bytes) -> pd.DataFrame:
    reader = fitz.open(stream=file_bytes)
    content = reader.load_page(0).get_text()

    start = re.search(r"\b[A-Z]\s", content[2:]).start()
    content = re.sub(r"\b[A-Z]\s|Bid Actual|\,|\)", "", content.replace("(", "-"))
    content = content[start:content.find("\nGRAND TOTAL")].split("\n")
    _df = pd.DataFrame(columns=["SECTION", "BID TOTALS", "ACTUAL"])
    
    for line in content:
        vals = line.split("$")
        if len(vals) > 1:
            _df.loc[len(_df)] = vals[:3]

    _df[["BID TOTALS", "ACTUAL"]] = _df[["BID TOTALS", "ACTUAL"]].astype(float)
    _df = _df.drop(_df[_df.SECTION.str.contains("SUB TOTAL")].index)

    _df["VARIANCE"] = _df.ACTUAL - _df["BID TOTALS"]
    _df.SECTION = _df.SECTION.apply(str.strip)

    return _df


def read_cost_summary(file_obj, extension) -> pd.DataFrame:
    reader = fitz.open(stream=file_obj)
    content = reader.load_page(0).get_text()
    
    try:
        if "HOT BUDGET" in content:
            return read_hot_budget_cs(file_obj)
        elif "Film Production Cost Summary" in content:
            return read_GetActual_cs(file_obj)
    except:
        return pd.DataFrame()

In [8]:
# HB_CS_COLS = ["SECTION", "drop", "BID TOTALS", "ACTUAL", "VARIANCE"]

# def read_hot_budget_cs(path) -> pd.DataFrame:
#     _df = camelot.read_pdf(path)._tables[1].df.copy()
    
#     _df.drop(12, inplace=True)

#     _df.columns = HB_CS_COLS
#     _df.drop(columns=["drop"], inplace=True)
#     _df = _df.loc[1:]

#     _df = _df.replace([r"CS\d+\b ", r".*\n", "\)", ","], "", regex=True).replace("\(", "-", regex=True)

#     _df[_df.columns[1:]] = _df.iloc[:, 1:].replace("", np.nan).astype(float)

#     _df = _df.dropna(thresh=2).fillna(0.0) # if drop empty rows

#     return _df.reset_index(drop=True)


# def read_GetActual_cs(path) -> pd.DataFrame:
#     with open(path, "rb") as file:
#         reader = PyPDF2.PdfReader(file)
#         content = reader.pages[0].extract_text()

#     start = re.search(r"\b[A-Z]\s", content[2:]).start()
#     content = re.sub(r"\b[A-Z]\s|Bid Actual|\,|\)", "", content.replace("(", "-"))
#     content = content[start:content.find("\nGRAND TOTAL")].split("\n")
#     _df = pd.DataFrame(columns=["SECTION", "BID TOTALS", "ACTUAL"])
#     print(content)
#     for line in content:
#         vals = line.split("$")
#         if len(vals) > 1:
#             _df.loc[len(_df)] = vals[:3]

#     _df[["BID TOTALS", "ACTUAL"]] = _df[["BID TOTALS", "ACTUAL"]].astype(float)
#     _df = _df.drop(_df[_df.SECTION.str.contains("SUB TOTAL")].index)

#     _df["VARIANCE"] = _df.ACTUAL - _df["BID TOTALS"]
#     _df.SECTION = _df.SECTION.apply(str.strip)

#     return _df


# def read_cost_summary(path) -> pd.DataFrame:
#     with open(path, "rb") as file:
#         reader = PyPDF2.PdfReader(file)
#         content = reader.pages[0].extract_text()
#     try:
#         if "HOT BUDGET" in content:
#             return read_hot_budget_cs(path)
#         elif "Film Production Cost Summary" in content:
#             return read_GetActual_cs(path)
#     except:
#         return pd.DataFrame()

In [9]:
def read_sheet(file_obj, extension) -> pd.DataFrame:
    if extension == ".xlsx":
        _df = pd.read_excel(file_obj, header=4)
    if extension == ".xlsb":
        _df = pd.read_excel(file_obj, engine='pyxlsb', header=4)
    
    _df = _df.replace(["\)", ","], "", regex=True).replace("\(", "-", regex=True)
    _df.ACTUAL = _df.ACTUAL.astype(float)
    if "RATE" in _df.columns:
        _df.RATE = _df.RATE.astype(float)
    
    return _df

## Payroll Reader

In [10]:
PR_COLS = ['LINE', 'PAYEE', 'PO', 'F1', 'F2', 'DAYS', 'RATE', 'BASE', '1.5', '2', '3', 'TAXABLE', 'NON-TAX', 'TOTAL ST', 'TOTAL OT', 'ACTUAL', 'FRINGE 1', 'FRINGE 2', 'LINE DESCRIPTION']

def read_pdf_payroll(path) -> pd.DataFrame:
    _df = camelot.read_pdf(path)._tables[0].df.copy()
    
    _df.columns = PR_COLS
    _df = _df.iloc[1:].reset_index(drop=True).replace("", np.nan).dropna(how="all")

    _df.LINE.fillna(_df.PAYEE, inplace=True)
    _df[['LINE', 'PAYEE']] = _df.LINE.str.split(" ", n=1, expand=True)

    _df = _df.replace(["\)", ","], "", regex=True).replace("\(", "-", regex=True)
    _df.ACTUAL = _df.ACTUAL.astype(float)

    return _df

In [11]:
def read_payroll(file_obj, extension) -> pd.DataFrame:
    if extension == ".pdf":
        return read_pdf_payroll(path)
    else:
        return read_sheet(path, extension)

## Purchase Order Log Reader

In [12]:
PO_COLS = ["LINE", "PAYEE", "PO", "DATE", "PAYID", "ACTUAL", "LINE DESCRIPTION"]


def read_pdf_purchase_order(file_obj) -> pd.DataFrame:
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_pdf:
        temp_pdf.write(file_obj.read())
        _df = camelot.read_pdf(temp_pdf.name)._tables[0].df.copy()
    
    _df.columns = PO_COLS
    _df = _df.iloc[1:].reset_index(drop=True).replace("", np.nan).dropna(how="all")

    _df.LINE.fillna(_df.PAYEE, inplace=True)
    _df[['LINE', 'PAYEE']] = _df.LINE.str.split(" ", n=1, expand=True)

    _df.ACTUAL.fillna(_df["LINE DESCRIPTION"], inplace=True)
    _df[['ACTUAL', 'LINE DESCRIPTION']] = _df.ACTUAL.str.split(" ", n=1, expand=True)

    _df = _df.replace(["\)", ","], "", regex=True).replace("\(", "-", regex=True)
    _df.ACTUAL = _df.ACTUAL.astype(float)

    return _df

In [13]:
def read_purchase_order(file_obj, extension) -> pd.DataFrame:
    if extension == ".pdf":
        return read_pdf_purchase_order(file_obj)
    else:
        return read_sheet(file_obj, extension)

In [14]:
def file_to_df(_type:str, path:str, file_obj:bytes) -> pd.DataFrame:
    extension = os.path.splitext(path)[1]

    if _type == "CS":
        return read_cost_summary(file_obj, extension)
    elif _type == "PR":
        return read_payroll(file_obj, extension)
    elif _type == "PO":
        return read_purchase_order(file_obj, extension)
    else:
        return pd.DataFrame()

## Dropbox

In [15]:
TEST_LINK = "https://www.dropbox.com/home/_JOB_ACTUALS"

In [16]:
class DbxDataRetriever:
    datasets = {
        "CS" : [],
        "PR" : [],
        "PO" : []
    }

    def __init__(self, link, dbx) -> None:
        self.path = self.path_from_link(link)
        self.dbx = dbx

    def path_from_link(self, path):
        start_key = "sh/"

        if start_key in path:
            end = path.find("?")
        else:
            start_key = "home/"
            end = len(path)
        
        start = path.find(start_key) + len(start_key) - 1
        return path[start : end]
    
    def get_content_and_type(self, dbx_path):
        _meta, res = dbx.files_download(dbx_path)
        file_obj = res.content
        _type = classify_file(dbx_path, file_obj, verbose=True)

        return _type, file_obj

    def gen_data(self):
        res = dbx.files_list_folder(self.path)

        for entry in res.entries:
            current_path = entry.path_display

            if isinstance(entry, dropbox.files.FileMetadata):
                project_name = current_path.split("/")[-3]
                _type, file_obj = self.get_content_and_type(current_path)
                # self.datasets[_type].append(file_obj)
                df = file_to_df(_type, current_path, file_obj)
                print(df)
                
            elif isinstance(entry, dropbox.files.FileMetadata):
                pass




In [17]:
link = "https://www.dropbox.com/home/_JOB_ACTUALS/22003_EA/Budget"

dbx = dropbox.Dropbox(DB_TOKEN)


retr = DbxDataRetriever(link, dbx)

retr.gen_data()

Empty DataFrame
Columns: []
Index: []


FileDataError: cannot open broken document

In [None]:
# payroll_dfs = []
# cs_dfs = []

# dbx = dropbox.Dropbox(DB_TOKEN)


# def download_folder_contents(directory_path):
#     result = dbx.files_list_folder(directory_path, recursive=True)
#     dir_name = os.path.basename(directory_path)
#     dir_files = {}
    
#     for entry in result.entries:
#         current_path = entry.path_display

#         if isinstance(entry, dropbox.files.FileMetadata):
#             # Download the file
#             file_name = os.path.basename(current_path)
#             meta, res = dbx.files_download(current_path)
#             content = res.content
#         elif isinstance(entry, dropbox.files.FolderMetadata):
#             # Recursively download contents of subfolder
#             download_folder_contents(current_path)



# def download(dbx, folder, subfolder, name):
#     """Download a file.
#     Return the bytes of the file, or None if it doesn't exist.
#     """
#     path = '/%s/%s/%s' % (folder, subfolder.replace(os.path.sep, '/'), name)
#     while '//' in path:
#         path = path.replace('//', '/')

#     try:
#         md, res = dbx.files_download(path)
#     except dropbox.exceptions.HttpError as err:
#         print('*** HTTP error', err)
#         return None
    
#     data = res.content
#     print(len(data), 'bytes; md:', md)
#     return data

In [None]:
# start_dir = "data/JOB_ACTUALS"

# payroll_dfs = []
# cs_dfs = []


# for directory in os.listdir(start_dir):
#     dir_path = os.path.join(start_dir, directory)
#     good = True
#     dir_files = {}

#     if os.path.isdir(dir_path):
#         for file in os.listdir(dir_path):
#             file_path = os.path.join(dir_path, file)
#             _type = classify_file(file_path)
#             if not _type in dir_files:
#                 dir_files[_type] = []
            
#             dir_files[_type].append(file_path)
    
#     for _type in ["PO", "PR", "CS"]:
#         if not _type in dir_files:
#             good = False
    
#     if dir_files.get("PR"):
#         for pr_path in dir_files.get("PR"):
#             df = read_payroll(pr_path)
#             df["PROJECT_NAME"] = directory
#             payroll_dfs.append(df)
    
    # if dir_files.get("CS"):
    #     for path in dir_files.get("CS"):
    #         df = read_cost_summary(path)
    #         df["PROJECT_NAME"] = directory
    #         cs_dfs.append(df)

#     if good:
#         print(directory)
        
    


In [None]:
# pr_combined = pd.concat(payroll_dfs)
# pr_combined.RATE = pr_combined.RATE.astype(float)
# pr_combined.DAYS = pr_combined.DAYS.astype(float)

# pr_combined["EST"] = pr_combined.RATE * pr_combined.DAYS
# pr_combined["VARIANCE"] = pr_combined.ACTUAL - pr_combined.EST
# pr_combined["VAR_PCT"] = pr_combined.VARIANCE / pr_combined.EST * 100
# pr_combined["SECTION"] = pr_combined.LINE.apply(get_dept_from_line)


In [None]:
# pr_combined.PROJECT_NAME.unique()

In [None]:
# # fig, ax = plt.subplots(figsize=(10, 10))

# BY = "SECTION"
# FOR = "VARIANCE"

# pr_grouped = pr_combined.groupby(BY).mean(numeric_only=True).sort_values(FOR, ascending=True).query("%s > 0" % FOR)[FOR]

# # pr_grouped.plot(kind="barh", ax=ax)

# # ax.set_title("Payee's Going Over Budget")
# # ax.set_ylabel("Payee")
# # ax.set_xlabel("Amount Over Budget ($)")
# pr_grouped

In [None]:
# cs_combined = pd.concat(cs_dfs)

# cs_grouped = cs_combined.groupby("SECTION").mean(numeric_only=True).sort_values("VARIANCE", ascending=False).round(2)
# cs_grouped["VAR_PCT"] = (cs_grouped.VARIANCE / (cs_grouped["BID TOTALS"] + 1e-3)).round(2) * 100

# cs_grouped