In [1]:
# Import the data set via dask => runs a lot faster than with pandas
import dask.dataframe as dd

org = dd.read_csv('data/travistorrent_11_1_2017.csv', blocksize=25000000, usecols=['tr_build_id', "tr_prev_build", "tr_status"])
org.head()

Unnamed: 0,tr_build_id,tr_prev_build,tr_status
0,106060,105310.0,passed
1,106060,105310.0,passed
2,106060,105310.0,passed
3,106060,105310.0,passed
4,106060,105310.0,passed


In [21]:
# Load data, set index 
df = dd.read_csv('data/travistorrent_11_1_2017.csv', blocksize=25000000, usecols=['tr_build_id', "tr_prev_build", "tr_status", "git_prev_built_commit"])
df.set_index('tr_build_id')
df = df.head(10000000)
df = df[df.tr_build_id.notnull()] # To be sure
# Find all passed builds ids (=> check whether all jobs really passed)
# Map the failed status to 1 and all other to 0
def check_if_passed(df):
    if df.tr_status == 'failed':
        df.tr_status = 1
    else:
        df.tr_status = 0
    return df

maped_status = df.apply(check_if_passed, axis=1)

# Get all passed builds
grouped_build_id = maped_status.groupby(['tr_build_id']).tr_status.sum()
grouped_build_id = grouped_build_id.where(lambda x: x ==  0 ).dropna()

passed_builds = df.where(lambda x: x.tr_build_id.isin(grouped_build_id.keys().values))
passed_builds = passed_builds[passed_builds.tr_prev_build.notnull()]
passed_builds.count()



tr_build_id              16781
git_prev_built_commit    16781
tr_prev_build            16781
tr_status                16781
dtype: int64

In [16]:
# Get all passed builds where the prev build failed
passed_whith_prev_failed = passed_builds.where(lambda x: x.tr_prev_build.isin(grouped_build_id.keys().values) == False)
passed_whith_prev_failed.count()

tr_build_id              3373
git_prev_built_commit    3373
tr_prev_build            3373
tr_status                3373
dtype: int64

In [47]:
# Determine all failed prev for each starting point (passed with prev failed)

# Helper to find relevant data
def get_build_rows_by_build_id(tr_build_id):
    return df.where(lambda x: x.tr_build_id==tr_build_id)

# Result arr with custom data structure, contains the build id and the
result_dict = dict()
def search_for_prev_builds(row):# Determine all failed preds for a specific starting pint
    if (row.tr_build_id in result_dict.keys()):
        #print("Wrong state for build id {0}".format(row.tr_build_id))
        return
    if not row.tr_build_id > 0 :
        return 

    result_dict[row.tr_build_id] = [] # List of previous builds
    
    current_prev_build_id = row.tr_prev_build
    while(current_prev_build_id > 0):
        prev_build_rows = get_build_rows_by_build_id(current_prev_build_id)
        prev_build_rows.to_csv("test_{0}.csv".format(current_prev_build_id))
        #print("buildid {0}".format(current_prev_build_id))
        #print(prev_build_rows.head())
        if(len(prev_build_rows.tr_prev_build) > 0 ):
            result_dict[row.tr_build_id].append({"build_id": current_prev_build_id, "builds": prev_build_rows})
            current_prev_build_id = prev_build_rows.head(1).tr_prev_build[0]
        else:
            current_prev_build_id = 0
    return row

result_data = passed_whith_prev_failed[passed_whith_prev_failed.tr_prev_build.notnull()]
result_data = passed_whith_prev_failed.apply(search_for_prev_builds, axis=1)


In [57]:

for key, value in result_dict.items():
    print("Build:id {0} len {1}".format(key,len(value)))
   #for frame in value: 
      # print(frame['builds'].dropna())

Build:id 284341.0 len 1
Build:id 133121.0 len 1
Build:id 109571.0 len 1
Build:id 281305.0 len 1
Build:id 120839.0 len 1
Build:id 291850.0 len 1
Build:id 267277.0 len 1
Build:id 283801.0 len 1
Build:id 242861.0 len 1
Build:id 221699.0 len 1
Build:id 284692.0 len 1
Build:id 296481.0 len 1
Build:id 181271.0 len 1
Build:id 142360.0 len 1
Build:id 184345.0 len 1
Build:id 276534.0 len 1
Build:id 188449.0 len 1
Build:id 158043.0 len 1
Build:id 121893.0 len 1
Build:id 135206.0 len 1
Build:id 248871.0 len 1
Build:id 143400.0 len 1
Build:id 261162.0 len 1
Build:id 212999.0 len 1
Build:id 220204.0 len 1
Build:id 215901.0 len 1
Build:id 172082.0 len 1
Build:id 230451.0 len 1
Build:id 205878.0 len 1
Build:id 160824.0 len 1
Build:id 129082.0 len 1
Build:id 134207.0 len 1
Build:id 304651.0 len 1
Build:id 224325.0 len 1
Build:id 145478.0 len 1
Build:id 247137.0 len 1
Build:id 182344.0 len 1
Build:id 226377.0 len 1
Build:id 254027.0 len 1
Build:id 286083.0 len 1
Build:id 165965.0 len 1
Build:id 208976.

In [40]:
df.where(lambda x: x.tr_build_id==311729.0).dropna().head()

Unnamed: 0,tr_build_id,git_prev_built_commit,tr_prev_build,tr_status
43405,311729.0,3cc15a8837425108bc7f935cddde2a60cf4f95f9,309896.0,passed
43406,311729.0,3cc15a8837425108bc7f935cddde2a60cf4f95f9,309896.0,passed
43407,311729.0,3cc15a8837425108bc7f935cddde2a60cf4f95f9,309896.0,passed
43408,311729.0,3cc15a8837425108bc7f935cddde2a60cf4f95f9,309896.0,passed
43409,311729.0,3cc15a8837425108bc7f935cddde2a60cf4f95f9,309896.0,passed


In [37]:
df.tail()

Unnamed: 0,tr_build_id,git_prev_built_commit,tr_prev_build,tr_status
43460,311925,793600ef63817c0bafeb162321aed0f977b9cc1d,311729.0,passed
43461,311968,e45f20808cbe6d348ea65f094e0104f82626cefd,310812.0,errored
43462,311968,e45f20808cbe6d348ea65f094e0104f82626cefd,310812.0,errored
43463,311968,e45f20808cbe6d348ea65f094e0104f82626cefd,310812.0,errored
43464,311968,e45f20808cbe6d348ea65f094e0104f82626cefd,310812.0,errored


TypeError: __init__() missing 3 required positional arguments: 'name', 'meta', and 'divisions'