In [1]:
import pandas as pd

## Przygotowanie `DataFrame`'u z projektami

In [2]:
def prepare_project_time_series(projects):
    """
    Build the monthly time series of projects and persist it as a pickle.

    The input is expanded to one row per (project, year, month), the
    `months_from_create` column is added, rows where that value is negative
    are removed, and the result is written to
    '../../data/02_prepared_data/projects_time_series.pkl' before being
    returned.
    """
    by_year = split_projects_by_years(projects)
    time_series = split_projects_by_months(by_year)

    # Both helpers below modify `time_series` in place and return nothing.
    add_months_from_create_col(time_series)
    drop_invalid_rows(time_series)

    time_series.to_pickle('../../data/02_prepared_data/projects_time_series.pkl')
    return time_series

In [3]:
def split_projects_by_years(projects):
    """
    Return a new DataFrame with every project repeated once per calendar year.

    The years run from the earliest to the latest `created_at` year found in
    `projects` (both inclusive) and are stored in a new `year` column.
    The input frame is left unmodified.
    """

    first_year = projects.created_at.min().year
    last_year = projects.created_at.max().year
    years = pd.DataFrame({'year': list(range(first_year, last_year + 1))})

    # Cross join through a constant helper column: each project row is paired
    # with each year. The helper column is removed with `drop(columns=...)`
    # because the positional form `drop('key', 1)` is rejected by pandas 2.x.
    return projects.assign(key=1) \
        .merge(years.assign(key=1), on='key') \
        .drop(columns='key')

In [4]:
def split_projects_by_months(projects):
    """
    Return a new DataFrame with every input row repeated once per month.

    A `month` column holding the values 1 through 12 is added, so each input
    row yields twelve output rows. The input frame is left unmodified.
    """

    months = pd.DataFrame({'month': list(range(1, 13))})

    # Cross join through a constant helper column: each row is paired with
    # each month. The helper column is removed with `drop(columns=...)`
    # because the positional form `drop('key', 1)` is rejected by pandas 2.x.
    return projects.assign(key=1) \
        .merge(months.assign(key=1), on='key') \
        .drop(columns='key')

In [5]:
def add_months_from_create_col(projects):
    """
    Add a `months_from_create` column to `projects`, in place.

    The value is the number of months between the month of `created_at` and
    the row's (`year`, `month`) pair; it is negative for rows that precede
    the creation month. Nothing is returned.
    """

    created = projects['created_at'].dt
    whole_years = projects['year'] - created.year
    extra_months = projects['month'] - created.month
    projects['months_from_create'] = whole_years * 12 + extra_months

In [6]:
def drop_invalid_rows(projects):
    """
    Remove, in place, the rows whose `months_from_create` is negative.

    Rows are dropped by index label. Nothing is returned.
    """

    before_creation = projects['months_from_create'] < 0
    labels_to_remove = projects.loc[before_creation].index
    projects.drop(index=labels_to_remove, inplace=True)

In [7]:
# Load the raw projects pickle and build the monthly time series;
# prepare_project_time_series also writes the result to disk.
projects = prepare_project_time_series(pd.read_pickle('../../data/01_data_from_db/projects.pkl'))
# Bare expression so the notebook renders the resulting DataFrame.
projects

Unnamed: 0,project_id,name,language,created_at,year,month,months_from_create
13,1,akka,Scala,2009-02-16 12:51:54,2009,2,0
14,1,akka,Scala,2009-02-16 12:51:54,2009,3,1
15,1,akka,Scala,2009-02-16 12:51:54,2009,4,2
16,1,akka,Scala,2009-02-16 12:51:54,2009,5,3
17,1,akka,Scala,2009-02-16 12:51:54,2009,6,4
...,...,...,...,...,...,...,...
7820279,108739,rails,Ruby,2013-09-27 17:53:25,2013,12,3
7820348,108740,rails,Ruby,2013-09-26 14:38:42,2013,9,0
7820349,108740,rails,Ruby,2013-09-26 14:38:42,2013,10,1
7820350,108740,rails,Ruby,2013-09-26 14:38:42,2013,11,2


## Commity

In [8]:
def add_year_and_month_to_df(df):
    """
    Add `year` and `month` columns derived from `created_at`, in place.

    `created_at` must support the `.dt` accessor. Nothing is returned.
    """
    created = df['created_at'].dt
    for part in ('year', 'month'):
        df[part] = getattr(created, part)

In [9]:
def prepare_commits(commits):
    """
    Aggregate raw commits into monthly counts per project and persist them.

    `year` and `month` columns are added to the passed-in `commits` frame
    (in place). The returned frame has one row per (project_id, year, month)
    with `new_commits` and `total_commits`, and is also written to
    '../../data/02_prepared_data/new_commits.pkl'.
    """
    add_year_and_month_to_df(commits)

    monthly = add_new_commits_col(commits)
    # Adds the running total to `monthly` in place.
    add_total_commits_col(monthly)

    monthly.to_pickle('../../data/02_prepared_data/new_commits.pkl')
    return monthly

In [10]:
def add_new_commits_col(commits):
    """
    Return a new frame with one row per (project_id, year, month).

    The `new_commits` column holds the number of distinct `created_at`
    values among the commits of that project and month.
    """
    period_keys = ['project_id', 'year', 'month']
    per_period = commits.groupby(by=period_keys, as_index=False)
    distinct_stamps = per_period.agg({'created_at': pd.Series.nunique})
    return distinct_stamps.rename(columns={'created_at': 'new_commits'}).copy()

In [11]:
def add_total_commits_col(commits):
    """
    Add a `total_commits` column to `commits`, in place.

    For every row the value is the sum of `new_commits` over all rows of the
    same `project_id` whose (`year`, `month`) is not later than the row's own,
    i.e. a running total in chronological order. Rows that share the same
    project and period all receive the same total. Nothing is returned.

    The totals are computed with grouped cumulative sums instead of rescanning
    the whole frame for every row, and are written back by position, so the
    result does not depend on the frame's row order or index labels.
    """
    period_keys = ['project_id', 'year', 'month']

    # groupby sorts by the keys, so within a project the periods come out in
    # chronological order and the cumulative sum is the running total.
    per_period = commits.groupby(period_keys)['new_commits'].sum()
    running_total = per_period.groupby(level='project_id').cumsum()

    # Look up each row's period total; `to_numpy()` drops the index so the
    # assignment is positional rather than index-aligned.
    row_periods = pd.MultiIndex.from_frame(commits[period_keys])
    commits['total_commits'] = running_total.reindex(row_periods).to_numpy()

In [12]:
# Load the raw commits pickle and aggregate it into monthly commit counts;
# prepare_commits also writes the result to disk.
commits = prepare_commits(pd.read_pickle('../../data/01_data_from_db/commits.pkl'))
# Bare expression so the notebook renders the resulting DataFrame.
commits

Unnamed: 0,project_id,year,month,new_commits,total_commits
0,2,2010,5,43,43
1,2,2010,6,4,47
2,2,2010,7,3,50
3,2,2010,8,1,51
4,2,2010,9,1,52
...,...,...,...,...,...
18757,90343,2013,9,2,2
18758,90366,2013,9,13,13
18759,91413,2013,9,1,1
18760,91417,2013,9,1,1


## Commitujący

In [13]:
def prepare_committers(commits):
    """
    Aggregate raw commits into monthly committer counts per project and
    persist them.

    `year` and `month` columns are added to the passed-in `commits` frame
    (in place). The returned frame has one row per (project_id, year, month)
    with `unique_committers` and `total_unique_committers`, and is also
    written to '../../data/02_prepared_data/unique_committers.pkl'.
    """
    add_year_and_month_to_df(commits)

    monthly = add_new_committers_col(commits)
    # Adds the cumulative distinct-committer count to `monthly` in place.
    add_total_committers_col(commits, monthly)

    monthly.to_pickle('../../data/02_prepared_data/unique_committers.pkl')
    return monthly

In [14]:
def add_new_committers_col(commits):
    """
    Return a new frame with one row per (project_id, year, month).

    The `unique_committers` column holds the number of distinct
    `committer_id` values among the commits of that project and month.
    """
    period_keys = ['project_id', 'year', 'month']
    per_period = commits.groupby(by=period_keys, as_index=False)
    distinct_committers = per_period.agg({'committer_id': pd.Series.nunique})
    return distinct_committers.rename(columns={'committer_id': 'unique_committers'}).copy()

In [15]:
def add_total_committers_col(commits, committers):
    """
    Add a `total_unique_committers` column to `committers`, in place.

    For every row of `committers` the value is the number of distinct
    `committer_id` values among the rows of `commits` that belong to the same
    `project_id` and whose (`year`, `month`) is not later than the row's own.
    Projects without any row in `commits` get 0. Nothing is returned.

    `commits` is split per project once, so each row only scans its own
    project's commits, and the result is assigned as a plain list, so it is
    placed by position and does not depend on the index of `committers`.
    """
    commits_by_project = {
        project_id: project_commits
        for project_id, project_commits in commits.groupby('project_id')
    }

    totals = []
    for row in committers[['project_id', 'year', 'month']].itertuples(index=False):
        project_commits = commits_by_project.get(row.project_id)
        if project_commits is None:
            totals.append(0)
            continue

        # Commits made in an earlier year, or in the same year up to and
        # including the row's month.
        up_to_period = (
            (project_commits['year'] < row.year)
            | ((project_commits['year'] == row.year)
               & (project_commits['month'] <= row.month))
        )
        totals.append(project_commits.loc[up_to_period, 'committer_id'].nunique())

    # Assigning a list (not a Series) avoids index alignment, which would
    # misplace the values whenever `committers` lacks a default RangeIndex.
    committers['total_unique_committers'] = totals

In [16]:
# Load the raw commits pickle again and aggregate it into monthly committer
# counts; prepare_committers also writes the result to disk.
committers = prepare_committers(pd.read_pickle('../../data/01_data_from_db/commits.pkl'))
# Bare expression so the notebook renders the resulting DataFrame.
committers

Unnamed: 0,project_id,year,month,unique_committers,total_unique_committers
0,2,2010,5,1,1
1,2,2010,6,1,1
2,2,2010,7,1,1
3,2,2010,8,1,1
4,2,2010,9,1,1
...,...,...,...,...,...
18757,90343,2013,9,1,1
18758,90366,2013,9,1,1
18759,91413,2013,9,1,1
18760,91417,2013,9,1,1
