In [1]:
import pandas as pd

## Przygotowanie `DataFrame`'u z projektami

In [2]:
def prepare_project_time_series(projects):
    """
    Expand the project table into one row per project and calendar month.

    The input is crossed with every year and every month by the two split
    helpers, gains a ``months_from_create`` column, loses the rows dated
    before the project's creation month, and is finally pickled and returned.
    """

    expanded = projects.pipe(split_projects_by_years).pipe(split_projects_by_months)
    # The two helpers below modify ``expanded`` in place and return nothing.
    add_months_from_create_col(expanded)
    drop_invalid_rows(expanded)
    expanded.to_pickle('../../data/02_prepared_data/projects_time_series.pkl')
    return expanded

In [3]:
def split_projects_by_years(projects):
    """
    Return a new DataFrame with every project repeated once per calendar year.

    The years run from the earliest to the latest ``created_at`` year found in
    ``projects`` (both inclusive) and are stored in a new ``year`` column.
    The input frame is left untouched.
    """

    first_year = projects.created_at.min().year
    last_year = projects.created_at.max().year
    years = pd.DataFrame({'year': list(range(first_year, last_year + 1)), 'key': 1})
    # Cross join through a constant helper column. The helper is removed with
    # the keyword form of ``drop``: the positional ``drop('key', 1)`` spelling
    # is no longer accepted by pandas 2.x.
    return projects.assign(key=1).merge(years, on='key').drop(columns='key')

In [4]:
def split_projects_by_months(projects):
    """
    Return a new DataFrame with every input row repeated once per month.

    A ``month`` column holding the values 1 through 12 is added; the input
    frame is left untouched.
    """

    months = pd.DataFrame({'month': list(range(1, 13)), 'key': 1})
    # Cross join through a constant helper column. The helper is removed with
    # the keyword form of ``drop``: the positional ``drop('key', 1)`` spelling
    # is no longer accepted by pandas 2.x.
    return projects.assign(key=1).merge(months, on='key').drop(columns='key')

In [5]:
def add_months_from_create_col(projects):
    """
    Add a ``months_from_create`` column to ``projects`` in place.

    The value is the number of calendar months between the month of
    ``created_at`` and the row's (``year``, ``month``); it is negative for
    rows dated before the creation month. Nothing is returned.
    """

    created = projects['created_at'].dt
    year_offset = (projects['year'] - created.year) * 12
    month_offset = projects['month'] - created.month
    projects['months_from_create'] = year_offset + month_offset

In [6]:
def drop_invalid_rows(projects):
    """
    Remove, in place, every row whose ``months_from_create`` is negative.

    Rows are removed by index label and nothing is returned.
    """

    before_creation = projects['months_from_create'] < 0
    stale_labels = projects.loc[before_creation].index
    projects.drop(index=stale_labels, inplace=True)

In [7]:
# Build the project time series from the raw project pickle; the bare name on
# the last line displays the resulting frame.
projects = prepare_project_time_series(pd.read_pickle('../../data/01_data_from_db/projects.pkl'))
projects

Unnamed: 0,project_id,name,language,created_at,year,month,months_from_create
13,1,akka,Scala,2009-02-16 12:51:54,2009,2,0
14,1,akka,Scala,2009-02-16 12:51:54,2009,3,1
15,1,akka,Scala,2009-02-16 12:51:54,2009,4,2
16,1,akka,Scala,2009-02-16 12:51:54,2009,5,3
17,1,akka,Scala,2009-02-16 12:51:54,2009,6,4
...,...,...,...,...,...,...,...
7820279,108739,rails,Ruby,2013-09-27 17:53:25,2013,12,3
7820348,108740,rails,Ruby,2013-09-26 14:38:42,2013,9,0
7820349,108740,rails,Ruby,2013-09-26 14:38:42,2013,10,1
7820350,108740,rails,Ruby,2013-09-26 14:38:42,2013,11,2


In [8]:
def add_year_and_month_to_df(df):
    """
    Add ``year`` and ``month`` columns to ``df`` in place.

    Both are read from the datetime column ``created_at``. Nothing is returned.
    """

    timestamps = df['created_at']
    for part in ('year', 'month'):
        df[part] = getattr(timestamps.dt, part)

## Commity

In [9]:
def prepare_commits(commits):
    """
    Return monthly commit counts per project, ready to join onto the training set.

    ``commits`` gains ``year`` and ``month`` columns in place. The aggregated
    frame is also pickled to ``new_commits.pkl``.
    """

    add_year_and_month_to_df(commits)
    monthly_commits = add_new_commits_col(commits)
    # Adds the running total to ``monthly_commits`` in place.
    add_total_commits_col(monthly_commits)
    monthly_commits.to_pickle('../../data/02_prepared_data/new_commits.pkl')
    return monthly_commits

In [10]:
def add_new_commits_col(commits):
    """
    Aggregate commits into one row per project, year and month.

    The non-null count of ``created_at`` in each group becomes the
    ``new_commits`` column; the commit and committer ID columns are dropped.
    """

    per_month = commits.groupby(['project_id', 'year', 'month']).count()
    per_month = per_month.reset_index()
    per_month = per_month.rename(columns={'created_at': 'new_commits'})
    return per_month.drop(columns=['commit_id', 'committer_id'])

In [11]:
def add_total_commits_col(commits):
    """
    Add, in place, a ``total_commits`` column: the running sum of
    ``new_commits`` within each project up to and including the row's month.

    Rows are matched by (``project_id``, ``year``, ``month``), so the result
    depends neither on row order nor on the frame's index. Nothing is returned.
    """

    keys = ['project_id', 'year', 'month']
    # groupby sorts its keys, so the cumulative sum inside each project runs
    # in chronological order. This replaces a full-frame scan per row.
    monthly = commits.groupby(keys)['new_commits'].sum()
    running = monthly.groupby(level='project_id').cumsum()
    # Look the total up by key and assign positionally; assigning a freshly
    # built Series would align on index labels and produce NaN whenever the
    # frame does not carry a default 0..n-1 index.
    row_keys = pd.MultiIndex.from_frame(commits[keys])
    commits['total_commits'] = running.reindex(row_keys).to_numpy()

In [12]:
# Load the raw commits pickle and aggregate it to monthly counts per project;
# the bare name on the last line displays the result.
commits = prepare_commits(pd.read_pickle('../../data/01_data_from_db/commits.pkl'))
commits

Unnamed: 0,project_id,year,month,new_commits,total_commits
0,2,2010,5,43,43
1,2,2010,6,4,47
2,2,2010,7,3,50
3,2,2010,8,1,51
4,2,2010,9,1,52
...,...,...,...,...,...
18757,90343,2013,9,2,2
18758,90366,2013,9,13,13
18759,91413,2013,9,1,1
18760,91417,2013,9,1,1


## Komentarze commitów

In [13]:
def prepare_commit_comments(commit_comments):
    """
    Return monthly commit-comment counts per project, ready to join onto the
    training set.

    The aggregated frame is also pickled to ``new_commit_comments.pkl``.
    """

    with_projects = merge_projects_into_commit_comments(commit_comments)
    add_year_and_month_to_df(with_projects)
    monthly_comments = add_new_commit_comments_col(with_projects)
    # Adds the running total to ``monthly_comments`` in place.
    add_total_commit_comments(monthly_comments)
    monthly_comments.to_pickle('../../data/02_prepared_data/new_commit_comments.pkl')
    return monthly_comments

In [14]:
def merge_projects_into_commit_comments(commit_comments):
    """
    Attach the project ID to every commit comment.

    Reads the raw commits pickle, inner-joins it with ``commit_comments`` on
    ``commit_id`` and keeps the comment's own timestamp as ``created_at``.
    """

    commits = pd.read_pickle('../../data/01_data_from_db/commits.pkl')
    joined = commits.merge(commit_comments, on=['commit_id'], sort=False)
    # Both inputs carry ``created_at``; after the merge ``_x`` is the commit's
    # timestamp and ``_y`` the comment's.
    joined = joined.drop(columns=['commit_id', 'committer_id', 'created_at_x'])
    return joined.rename(columns={'created_at_y': 'created_at'})

In [15]:
def add_new_commit_comments_col(commit_comments):
    """
    Aggregate commit comments into one row per project, year and month.

    The non-null count of ``created_at`` in each group becomes the
    ``new_commit_comments`` column; the comment ID column is dropped.
    """

    per_month = commit_comments.groupby(['project_id', 'year', 'month']).count()
    per_month = per_month.reset_index()
    per_month = per_month.drop(columns=['commit_comment_id'])
    return per_month.rename(columns={'created_at': 'new_commit_comments'})

In [16]:
def add_total_commit_comments(commit_comments):
    """
    Add, in place, a ``total_commit_comments`` column: the running sum of
    ``new_commit_comments`` within each project up to and including the
    row's month.

    Rows are matched by (``project_id``, ``year``, ``month``), so the result
    depends neither on row order nor on the frame's index. Nothing is returned.
    """

    keys = ['project_id', 'year', 'month']
    # groupby sorts its keys, so the cumulative sum inside each project runs
    # in chronological order. This replaces a full-frame scan per row.
    monthly = commit_comments.groupby(keys)['new_commit_comments'].sum()
    running = monthly.groupby(level='project_id').cumsum()
    # Positional assignment: a freshly built Series would align on index
    # labels and produce NaN on any non-default index.
    row_keys = pd.MultiIndex.from_frame(commit_comments[keys])
    commit_comments['total_commit_comments'] = running.reindex(row_keys).to_numpy()

In [17]:
# Load the raw commit comments pickle and aggregate it to monthly counts per
# project; the bare name on the last line displays the result.
commit_comments = prepare_commit_comments(pd.read_pickle('../../data/01_data_from_db/commit_comments.pkl'))
commit_comments

Unnamed: 0,project_id,year,month,new_commit_comments,total_commit_comments
0,2,2011,7,4,4
1,2,2012,2,1,5
2,2,2012,8,12,17
3,2,2012,9,3,20
4,2,2012,12,4,24
...,...,...,...,...,...
2900,86821,2013,9,4,4
2901,86826,2013,9,1,1
2902,87147,2013,8,1,1
2903,87648,2013,8,1,1


## Commitujący

In [18]:
def prepare_committers(commits):
    """
    Return monthly unique-committer counts per project, ready to join onto
    the training set.

    ``commits`` gains ``year`` and ``month`` columns in place. The aggregated
    frame is also pickled to ``unique_committers.pkl``.
    """

    add_year_and_month_to_df(commits)
    monthly_committers = add_new_committers_col(commits)
    # Adds the running distinct count to ``monthly_committers`` in place.
    add_total_committers_col(commits, monthly_committers)
    monthly_committers.to_pickle('../../data/02_prepared_data/unique_committers.pkl')
    return monthly_committers

In [19]:
def add_new_committers_col(commits):
    """
    Aggregate commits into one row per project, year and month.

    The number of distinct ``committer_id`` values in each group is returned
    in the ``unique_committers`` column.
    """

    grouped = commits.groupby(by=['project_id', 'year', 'month'], as_index=False)
    distinct = grouped.agg({'committer_id': pd.Series.nunique})
    return distinct.rename(columns={'committer_id': 'unique_committers'})

In [20]:
def add_total_committers_col(commits, committers):
    """
    Add, in place, a ``total_unique_committers`` column to ``committers``.

    For each row it holds the number of distinct ``committer_id`` values in
    ``commits`` for the same project, up to and including the row's month.
    Nothing is returned.
    """

    def committers_so_far(row):
        # Commits of this project dated no later than the row's month.
        same_project = commits['project_id'] == row['project_id']
        earlier_year = commits['year'] < row['year']
        same_year_not_later = (commits['year'] == row['year']) & (commits['month'] <= row['month'])
        selected = commits[same_project & (earlier_year | same_year_not_later)]
        return selected['committer_id'].nunique()

    running_counts = [committers_so_far(row) for _, row in committers.iterrows()]
    committers['total_unique_committers'] = pd.Series(running_counts)

In [21]:
# Load the raw commits pickle again and derive the monthly unique-committer
# counts; the bare name on the last line displays the result.
committers = prepare_committers(pd.read_pickle('../../data/01_data_from_db/commits.pkl'))
committers

Unnamed: 0,project_id,year,month,unique_committers,total_unique_committers
0,2,2010,5,1,1
1,2,2010,6,1,1
2,2,2010,7,1,1
3,2,2010,8,1,1
4,2,2010,9,1,1
...,...,...,...,...,...
18757,90343,2013,9,1,1
18758,90366,2013,9,1,1
18759,91413,2013,9,1,1
18760,91417,2013,9,1,1


## Issues

In [22]:
def prepare_issues(issues):
    """
    Return monthly issue counts per project, ready to join onto the training set.

    ``issues`` gains ``year`` and ``month`` columns in place. The aggregated
    frame is also pickled to ``new_issues.pkl``.
    """

    add_year_and_month_to_df(issues)
    monthly_issues = add_new_issues_col(issues)
    # Adds the running total to ``monthly_issues`` in place.
    add_total_issues_col(monthly_issues)
    monthly_issues.to_pickle('../../data/02_prepared_data/new_issues.pkl')
    return monthly_issues

In [23]:
def add_new_issues_col(issues):
    """
    Aggregate issues into one row per project, year and month.

    The non-null count of ``created_at`` in each group becomes the
    ``new_issues`` column; the issue ID column is dropped.
    """

    per_month = issues.groupby(['project_id', 'year', 'month']).count()
    per_month = per_month.reset_index()
    per_month = per_month.rename(columns={'created_at': 'new_issues'})
    return per_month.drop(columns=['issue_id'])

In [24]:
def add_total_issues_col(issues):
    """
    Add, in place, a ``total_issues`` column: the running sum of
    ``new_issues`` within each project up to and including the row's month.

    Rows are matched by (``project_id``, ``year``, ``month``), so the result
    depends neither on row order nor on the frame's index. Nothing is returned.
    """

    keys = ['project_id', 'year', 'month']
    # groupby sorts its keys, so the cumulative sum inside each project runs
    # in chronological order. This replaces a full-frame scan per row.
    monthly = issues.groupby(keys)['new_issues'].sum()
    running = monthly.groupby(level='project_id').cumsum()
    # Positional assignment: a freshly built Series would align on index
    # labels and produce NaN on any non-default index.
    row_keys = pd.MultiIndex.from_frame(issues[keys])
    issues['total_issues'] = running.reindex(row_keys).to_numpy()

In [25]:
# Load the raw issues pickle and aggregate it to monthly counts per project;
# the bare name on the last line displays the result.
issues = prepare_issues(pd.read_pickle('../../data/01_data_from_db/issues.pkl'))
issues

Unnamed: 0,project_id,year,month,new_issues,total_issues
0,1,2009,7,46,46
1,1,2009,8,9,55
2,1,2009,9,1,56
3,1,2009,10,3,59
4,1,2010,9,6,65
...,...,...,...,...,...
2964,107672,2013,6,19,273
2965,107672,2013,7,15,288
2966,107672,2013,8,10,298
2967,107672,2013,9,3,301


## Komentarze issues

In [26]:
def prepare_issue_comments(issue_comments):
    """
    Return monthly issue-comment counts per project, ready to join onto the
    training set.

    The aggregated frame is also pickled to ``new_issue_comments.pkl``.
    """

    with_projects = merge_projects_into_issue_comments(issue_comments)
    add_year_and_month_to_df(with_projects)
    monthly_comments = add_new_issue_comments_col(with_projects)
    # Adds the running total to ``monthly_comments`` in place.
    add_total_issue_comments(monthly_comments)
    monthly_comments.to_pickle('../../data/02_prepared_data/new_issue_comments.pkl')
    return monthly_comments

In [27]:
def merge_projects_into_issue_comments(issue_comments):
    """
    Attach the project ID to every issue comment.

    Reads the raw issues pickle, inner-joins it with ``issue_comments`` on
    ``issue_id`` and keeps the comment's own timestamp as ``created_at``.
    """

    issues = pd.read_pickle('../../data/01_data_from_db/issues.pkl')
    joined = issues.merge(issue_comments, on=['issue_id'], sort=False)
    # Both inputs carry ``created_at``; after the merge ``_x`` is the issue's
    # timestamp and ``_y`` the comment's.
    joined = joined.drop(columns=['created_at_x'])
    return joined.rename(columns={'created_at_y': 'created_at'})

In [28]:
def add_new_issue_comments_col(issue_comments):
    """
    Aggregate issue comments into one row per project, year and month.

    The non-null count of ``created_at`` in each group becomes the
    ``new_issue_comments`` column; the issue and comment ID columns are dropped.
    """

    per_month = issue_comments.groupby(['project_id', 'year', 'month']).count()
    per_month = per_month.reset_index()
    per_month = per_month.drop(columns=['issue_id', 'comment_id'])
    return per_month.rename(columns={'created_at': 'new_issue_comments'})

In [29]:
def add_total_issue_comments(issue_comments):
    """
    Add, in place, a ``total_issue_comments`` column: the running sum of
    ``new_issue_comments`` within each project up to and including the
    row's month.

    Rows are matched by (``project_id``, ``year``, ``month``), so the result
    depends neither on row order nor on the frame's index. Nothing is returned.
    """

    keys = ['project_id', 'year', 'month']
    # groupby sorts its keys, so the cumulative sum inside each project runs
    # in chronological order. This replaces a full-frame scan per row.
    monthly = issue_comments.groupby(keys)['new_issue_comments'].sum()
    running = monthly.groupby(level='project_id').cumsum()
    # Positional assignment: a freshly built Series would align on index
    # labels and produce NaN on any non-default index.
    row_keys = pd.MultiIndex.from_frame(issue_comments[keys])
    issue_comments['total_issue_comments'] = running.reindex(row_keys).to_numpy()

In [30]:
# Load the raw issue comments pickle and aggregate it to monthly counts per
# project; the bare name on the last line displays the result.
issue_comments = prepare_issue_comments(pd.read_pickle('../../data/01_data_from_db/issue_comments.pkl'))
issue_comments

Unnamed: 0,project_id,year,month,new_issue_comments,total_issue_comments
0,1,2009,7,14,14
1,1,2009,8,12,26
2,1,2009,9,1,27
3,1,2009,10,5,32
4,1,2010,11,3,35
...,...,...,...,...,...
2880,107672,2013,5,30,460
2881,107672,2013,6,45,505
2882,107672,2013,7,51,556
2883,107672,2013,8,25,581


## Pull requesty

In [31]:
def pre_prepare_pull_requests(pull_requests):
    """
    First preparation step for pull requests.

    Adds ``year`` and ``month`` columns to ``pull_requests`` in place and
    returns the grouped monthly counts.
    """

    add_year_and_month_to_df(pull_requests)
    return grouped_pull_requests(pull_requests)

In [32]:
def grouped_pull_requests(pull_requests):
    """
    Count pull requests per project, year, month, action and ``merged`` flag.

    The count is returned as an ``int64`` column named ``new_pull_requests``;
    the history ID and timestamp columns are dropped.
    """

    group_keys = ['project_id', 'year', 'month', 'action', 'merged']
    counts = pull_requests.groupby(group_keys).count().reset_index().dropna()
    counts = counts.rename(columns={'pull_request_id': 'new_pull_requests'})
    counts = counts.drop(columns=['pull_request_history_id', 'created_at']).copy()
    counts['new_pull_requests'] = counts['new_pull_requests'].astype('int64')
    return counts

In [33]:
def opened_pull_requests_to_merge(pull_requests):
    """
    Select the monthly counts of opened pull requests flagged ``merged == 1``.

    The count column is renamed to ``new_opened_pull_requests_to_merge``; the
    result is pickled and returned.
    """

    is_opened = pull_requests['action'] == 'opened'
    is_merged = pull_requests['merged'] == 1
    selected = pull_requests.loc[is_opened & is_merged]
    result = selected.rename(columns={'new_pull_requests': 'new_opened_pull_requests_to_merge'})
    result = result.drop(columns=['action', 'merged']).copy()
    result.to_pickle('../../data/02_prepared_data/new_opened_pull_requests_to_merge.pkl')
    return result

In [34]:
def merged_pull_requests(pull_requests):
    """
    Select the monthly counts of pull requests with action ``merged`` and
    ``merged == 1``.

    The count column is renamed to ``new_merged_pull_requests``; the result
    is pickled and returned.
    """

    is_merge_action = pull_requests['action'] == 'merged'
    is_merged = pull_requests['merged'] == 1
    selected = pull_requests.loc[is_merge_action & is_merged]
    result = selected.rename(columns={'new_pull_requests': 'new_merged_pull_requests'})
    result = result.drop(columns=['action', 'merged']).copy()
    result.to_pickle('../../data/02_prepared_data/new_merged_pull_requests.pkl')
    return result

In [35]:
def closed_merged_pull_requests(pull_requests):
    """
    Select the monthly counts of closed pull requests flagged ``merged == 1``.

    The count column is renamed to ``new_closed_merged_pull_requests``; the
    result is pickled and returned.
    """

    is_closed = pull_requests['action'] == 'closed'
    is_merged = pull_requests['merged'] == 1
    selected = pull_requests.loc[is_closed & is_merged]
    result = selected.rename(columns={'new_pull_requests': 'new_closed_merged_pull_requests'})
    result = result.drop(columns=['action', 'merged']).copy()
    result.to_pickle('../../data/02_prepared_data/new_closed_merged_pull_requests.pkl')
    return result

In [36]:
def opened_pull_requests_to_discard(pull_requests):
    """
    Select the monthly counts of opened pull requests flagged ``merged == 0``.

    The count column is renamed to ``new_opened_pull_requests_to_discard``;
    the result is pickled and returned.
    """

    is_opened = pull_requests['action'] == 'opened'
    is_unmerged = pull_requests['merged'] == 0
    selected = pull_requests.loc[is_opened & is_unmerged]
    result = selected.rename(columns={'new_pull_requests': 'new_opened_pull_requests_to_discard'})
    result = result.drop(columns=['action', 'merged']).copy()
    result.to_pickle('../../data/02_prepared_data/new_opened_pull_requests_to_discard.pkl')
    return result

In [37]:
def closed_unmerged_pull_requests(pull_requests):
    """
    Select the monthly counts of closed pull requests flagged ``merged == 0``.

    The count column is renamed to ``new_closed_unmerged_pull_requests``; the
    result is pickled and returned.
    """

    is_closed = pull_requests['action'] == 'closed'
    is_unmerged = pull_requests['merged'] == 0
    selected = pull_requests.loc[is_closed & is_unmerged]
    result = selected.rename(columns={'new_pull_requests': 'new_closed_unmerged_pull_requests'})
    result = result.drop(columns=['action', 'merged']).copy()
    result.to_pickle('../../data/02_prepared_data/new_closed_unmerged_pull_requests.pkl')
    return result

In [38]:
def total_merged_pull_requests(pull_requests):
    """
    Return the running total of merged pull requests per project and month.

    Only rows with action ``merged`` and ``merged == 1`` are used. For each
    of them ``total_merged_pull_requests`` is the sum of ``new_pull_requests``
    for the same project up to and including that month. The result keeps the
    original index labels of the selected rows, is pickled, and is returned.
    """

    is_merge_event = (pull_requests['merged'] == 1) & (pull_requests['action'] == 'merged')
    df = pull_requests[is_merge_event].drop(columns=['merged', 'action']).copy()

    keys = ['project_id', 'year', 'month']
    # groupby sorts its keys, so the cumulative sum inside each project runs
    # in chronological order. This replaces a full-frame scan per row.
    monthly = df.groupby(keys)['new_pull_requests'].sum()
    running = monthly.groupby(level='project_id').cumsum()
    # Look the total up by key and assign positionally so the filtered
    # frame's non-contiguous index plays no role.
    row_keys = pd.MultiIndex.from_frame(df[keys])
    df['total_merged_pull_requests'] = running.reindex(row_keys).to_numpy()

    df = df.drop(columns=['new_pull_requests'])
    df.to_pickle('../../data/02_prepared_data/total_merged_pull_requests.pkl')
    return df

In [39]:
def total_unmerged_pull_requests(pull_requests):
    """
    Return the running total of closed, unmerged pull requests per project
    and month.

    Only rows with action ``closed`` and ``merged == 0`` are used. For each
    of them ``total_unmerged_pull_requests`` is the sum of
    ``new_pull_requests`` for the same project up to and including that
    month. The result keeps the original index labels of the selected rows,
    is pickled, and is returned.
    """

    is_discard_event = (pull_requests['merged'] == 0) & (pull_requests['action'] == 'closed')
    df = pull_requests[is_discard_event].drop(columns=['merged', 'action']).copy()

    keys = ['project_id', 'year', 'month']
    # groupby sorts its keys, so the cumulative sum inside each project runs
    # in chronological order. This replaces a full-frame scan per row.
    monthly = df.groupby(keys)['new_pull_requests'].sum()
    running = monthly.groupby(level='project_id').cumsum()
    # Look the total up by key and assign positionally so the filtered
    # frame's non-contiguous index plays no role.
    row_keys = pd.MultiIndex.from_frame(df[keys])
    df['total_unmerged_pull_requests'] = running.reindex(row_keys).to_numpy()

    df = df.drop(columns=['new_pull_requests'])
    df.to_pickle('../../data/02_prepared_data/total_unmerged_pull_requests.pkl')
    return df

In [40]:
# Load the raw pull request history pickle and reduce it to monthly counts per
# project, action and merged flag; the bare name on the last line displays
# the result.
pull_requests = pre_prepare_pull_requests(pd.read_pickle('../../data/01_data_from_db/pull_requests_with_history.pkl'))
pull_requests

Unnamed: 0,project_id,year,month,action,merged,new_pull_requests
48,1,2010,9,closed,0,4
49,1,2010,9,closed,1,2
51,1,2010,9,merged,1,2
52,1,2010,9,opened,0,4
53,1,2010,9,opened,1,2
...,...,...,...,...,...,...
25320,107672,2013,9,closed,0,2
25324,107672,2013,9,opened,0,2
25327,107672,2013,10,closed,1,1
25329,107672,2013,10,merged,1,1


In [41]:
# NOTE: this assignment rebinds the name ``opened_pull_requests_to_merge``
# from the function defined earlier to the DataFrame it returns, so the cell
# cannot be run a second time without first re-running the defining cell.
opened_pull_requests_to_merge = opened_pull_requests_to_merge(pull_requests)
opened_pull_requests_to_merge

Unnamed: 0,project_id,year,month,new_opened_pull_requests_to_merge
53,1,2010,9,2
59,1,2010,10,2
65,1,2010,11,4
95,1,2011,4,1
113,1,2011,7,3
...,...,...,...,...
25301,107672,2013,5,4
25307,107672,2013,6,4
25313,107672,2013,7,4
25319,107672,2013,8,2


In [42]:
# NOTE: this assignment rebinds the name ``merged_pull_requests`` from the
# function defined earlier to the DataFrame it returns, so the cell cannot be
# run a second time without first re-running the defining cell.
merged_pull_requests = merged_pull_requests(pull_requests)
merged_pull_requests

Unnamed: 0,project_id,year,month,new_merged_pull_requests
51,1,2010,9,2
57,1,2010,10,2
63,1,2010,11,4
93,1,2011,4,1
111,1,2011,7,2
...,...,...,...,...
25299,107672,2013,5,5
25305,107672,2013,6,2
25311,107672,2013,7,6
25317,107672,2013,8,2


In [43]:
# NOTE: this assignment rebinds the name ``closed_merged_pull_requests`` from
# the function defined earlier to the DataFrame it returns, so the cell cannot
# be run a second time without first re-running the defining cell.
closed_merged_pull_requests = closed_merged_pull_requests(pull_requests)
closed_merged_pull_requests

Unnamed: 0,project_id,year,month,new_closed_merged_pull_requests
49,1,2010,9,2
55,1,2010,10,2
61,1,2010,11,4
91,1,2011,4,1
109,1,2011,7,2
...,...,...,...,...
25297,107672,2013,5,5
25303,107672,2013,6,2
25309,107672,2013,7,6
25315,107672,2013,8,2


In [44]:
# NOTE: this assignment rebinds the name ``opened_pull_requests_to_discard``
# from the function defined earlier to the DataFrame it returns, so the cell
# cannot be run a second time without first re-running the defining cell.
opened_pull_requests_to_discard = opened_pull_requests_to_discard(pull_requests)
opened_pull_requests_to_discard

Unnamed: 0,project_id,year,month,new_opened_pull_requests_to_discard
52,1,2010,9,4
64,1,2010,11,2
76,1,2011,1,2
88,1,2011,3,3
100,1,2011,5,1
...,...,...,...,...
25300,107672,2013,5,5
25306,107672,2013,6,8
25312,107672,2013,7,8
25318,107672,2013,8,5


In [45]:
# NOTE: this assignment rebinds the name ``closed_unmerged_pull_requests``
# from the function defined earlier to the DataFrame it returns, so the cell
# cannot be run a second time without first re-running the defining cell.
closed_unmerged_pull_requests = closed_unmerged_pull_requests(pull_requests)
closed_unmerged_pull_requests

Unnamed: 0,project_id,year,month,new_closed_unmerged_pull_requests
48,1,2010,9,4
72,1,2011,1,1
84,1,2011,3,2
90,1,2011,4,4
102,1,2011,6,3
...,...,...,...,...
25296,107672,2013,5,3
25302,107672,2013,6,8
25308,107672,2013,7,6
25314,107672,2013,8,9


In [46]:
# NOTE: this assignment rebinds the name ``total_merged_pull_requests`` from
# the function defined earlier to the DataFrame it returns, so the cell cannot
# be run a second time without first re-running the defining cell.
total_merged_pull_requests = total_merged_pull_requests(pull_requests)
total_merged_pull_requests

Unnamed: 0,project_id,year,month,total_merged_pull_requests
51,1,2010,9,2
57,1,2010,10,4
63,1,2010,11,8
93,1,2011,4,9
111,1,2011,7,11
...,...,...,...,...
25299,107672,2013,5,12
25305,107672,2013,6,14
25311,107672,2013,7,20
25317,107672,2013,8,22


In [47]:
# NOTE: this assignment rebinds the name ``total_unmerged_pull_requests`` from
# the function defined earlier to the DataFrame it returns, so the cell cannot
# be run a second time without first re-running the defining cell.
total_unmerged_pull_requests = total_unmerged_pull_requests(pull_requests)
total_unmerged_pull_requests

Unnamed: 0,project_id,year,month,total_unmerged_pull_requests
48,1,2010,9,4
72,1,2011,1,5
84,1,2011,3,7
90,1,2011,4,11
102,1,2011,6,14
...,...,...,...,...
25296,107672,2013,5,190
25302,107672,2013,6,198
25308,107672,2013,7,204
25314,107672,2013,8,213


## Komentarze pull requestów

In [48]:
def prepare_pull_request_comments(pull_request_comments):
    """
    Return monthly pull-request-comment counts per project, ready to join
    onto the training set.

    The aggregated frame is also pickled to ``new_pull_request_comments.pkl``.
    """

    with_projects = merge_projects_into_pull_request_comments(pull_request_comments)
    add_year_and_month_to_df(with_projects)
    monthly_comments = add_new_pull_request_comments_col(with_projects)
    # Adds the running total to ``monthly_comments`` in place.
    add_total_pull_request_comments(monthly_comments)
    monthly_comments.to_pickle('../../data/02_prepared_data/new_pull_request_comments.pkl')
    return monthly_comments

In [49]:
def merge_projects_into_pull_request_comments(pull_request_comments):
    """
    Attach the project ID to every pull request comment.

    Reads the raw pull requests pickle, inner-joins it with
    ``pull_request_comments`` on ``pull_request_id`` and drops the
    ``pull_request_id``, ``merged`` and ``comment_id`` columns.
    """

    pull_requests = pd.read_pickle('../../data/01_data_from_db/pull_requests.pkl')
    joined = pull_requests.merge(pull_request_comments, on=['pull_request_id'], sort=False)
    return joined.drop(columns=['pull_request_id', 'merged', 'comment_id'])

In [50]:
def add_new_pull_request_comments_col(pull_request_comments):
    """
    Aggregate pull request comments into one row per project, year and month.

    The non-null count of ``created_at`` in each group becomes the
    ``new_pull_request_comments`` column.
    """

    per_month = pull_request_comments.groupby(['project_id', 'year', 'month']).count()
    per_month = per_month.reset_index()
    return per_month.rename(columns={'created_at': 'new_pull_request_comments'})

In [51]:
def add_total_pull_request_comments(pull_request_comments):
    """
    Add, in place, a ``total_pull_request_comments`` column: the running sum
    of ``new_pull_request_comments`` within each project up to and including
    the row's month.

    Rows are matched by (``project_id``, ``year``, ``month``), so the result
    depends neither on row order nor on the frame's index. Nothing is returned.
    """

    keys = ['project_id', 'year', 'month']
    # groupby sorts its keys, so the cumulative sum inside each project runs
    # in chronological order. This replaces a full-frame scan per row.
    monthly = pull_request_comments.groupby(keys)['new_pull_request_comments'].sum()
    running = monthly.groupby(level='project_id').cumsum()
    # Positional assignment: a freshly built Series would align on index
    # labels and produce NaN on any non-default index.
    row_keys = pd.MultiIndex.from_frame(pull_request_comments[keys])
    pull_request_comments['total_pull_request_comments'] = running.reindex(row_keys).to_numpy()

In [52]:
# Load the raw pull request comments pickle and aggregate it to monthly counts
# per project; the bare name on the last line displays the result.
pull_request_comments = prepare_pull_request_comments(pd.read_pickle('../../data/01_data_from_db/pull_request_comments.pkl'))
pull_request_comments

Unnamed: 0,project_id,year,month,new_pull_request_comments,total_pull_request_comments
0,1,2011,4,5,5
1,1,2011,6,8,13
2,1,2011,10,134,147
3,1,2011,11,158,305
4,1,2011,12,688,993
...,...,...,...,...,...
1150,107672,2013,5,54,324
1151,107672,2013,6,37,361
1152,107672,2013,7,131,492
1153,107672,2013,8,22,514


## Obserwujący

In [53]:
def prepare_watchers(watchers):
    """
    Return monthly watcher counts per project, ready to join onto the training set.

    ``watchers`` gains ``year`` and ``month`` columns in place. The aggregated
    frame is also pickled to ``new_watchers.pkl``.
    """

    add_year_and_month_to_df(watchers)
    monthly_watchers = add_new_watchers_col(watchers)
    # Adds the running total to ``monthly_watchers`` in place.
    add_total_watchers_col(monthly_watchers)
    monthly_watchers.to_pickle('../../data/02_prepared_data/new_watchers.pkl')
    return monthly_watchers

In [54]:
def add_new_watchers_col(watchers):
    """
    Aggregate watchers into one row per project, year and month.

    The non-null count of ``created_at`` in each group becomes the
    ``new_watchers`` column.
    """

    per_month = watchers.groupby(['project_id', 'year', 'month']).count()
    per_month = per_month.reset_index()
    return per_month.rename(columns={'created_at': 'new_watchers'})

In [55]:
def add_total_watchers_col(watchers):
    """
    Add, in place, a ``total_watchers`` column: the sum of ``new_watchers``
    within each project over all months strictly before the row's month
    (the row's own month is not included).

    Rows are matched by (``project_id``, ``year``, ``month``), so the result
    depends neither on row order nor on the frame's index. Nothing is returned.

    NOTE(review): ``full_merge`` forward-fills this column into months that
    have no watcher row. Because the value leaves out the row's own month,
    those gap months repeat a total that omits the most recent active month.
    Confirm this is intended.
    """

    keys = ['project_id', 'year', 'month']
    # groupby sorts its keys, so the cumulative sum inside each project runs
    # in chronological order. This replaces a full-frame scan per row.
    monthly = watchers.groupby(keys)['new_watchers'].sum()
    # Subtracting the month's own count turns the inclusive running sum into
    # "watchers gained before this month".
    before_month = monthly.groupby(level='project_id').cumsum() - monthly
    # Positional assignment: a freshly built Series would align on index
    # labels and produce NaN on any non-default index.
    row_keys = pd.MultiIndex.from_frame(watchers[keys])
    watchers['total_watchers'] = before_month.reindex(row_keys).to_numpy()

In [56]:
# Load the raw watchers pickle and aggregate it to monthly counts per project;
# the bare name on the last line displays the result.
watchers = prepare_watchers(pd.read_pickle('../../data/01_data_from_db/watchers.pkl'))
watchers

Unnamed: 0,project_id,year,month,new_watchers,total_watchers
0,1,2009,2,323,0
1,1,2009,3,56,323
2,1,2009,4,29,379
3,1,2009,5,24,408
4,1,2009,6,35,432
...,...,...,...,...,...
2847,107672,2013,4,6,1466
2848,107672,2013,5,3,1472
2849,107672,2013,6,3,1475
2850,107672,2013,7,6,1478


## Zbiór uczący

In [57]:
# Renumber the rows and drop the ``name`` and ``created_at`` columns.
# ``reset_index(drop=True)`` discards the old index directly instead of first
# turning it into an ``index`` column, and ``errors='ignore'`` makes the cell
# a no-op when it is run again on the already-trimmed frame.
projects = projects.reset_index(drop=True).drop(columns=['name', 'created_at'], errors='ignore')

In [58]:
def merge_into_projects_time_series(df1, df2):
    """
    Left-join ``df2`` onto ``df1`` by project ID, year and month.

    Every row of ``df1`` is kept; rows without a match in ``df2`` receive NaN
    in the joined columns.
    """

    join_keys = ['project_id', 'year', 'month']
    return df1.merge(df2, how='left', on=join_keys, sort=False)

In [59]:
def fill_NaN_for_monthly_values(df, col_name):
    """
    Return column ``col_name`` of ``df`` with missing values replaced by zero.

    Used for per-month counts, where a missing value means no activity.
    """

    monthly_values = df[col_name]
    return monthly_values.fillna(value=0)

In [60]:
def fill_NaN_for_summary_values(df, col_name):
    """
    Return column ``col_name`` of ``df`` with gaps filled for running totals.

    Within each project the last known value is carried forward in row order;
    values still missing afterwards (before the first known value) become zero.
    """

    per_project = df.groupby('project_id')[col_name]
    carried_forward = per_project.ffill()
    return carried_forward.fillna(0)

In [61]:
def full_merge(df1, df2, month_val, summ_val):
    """
    Join ``df2`` onto ``df1`` and clean both a monthly and a running-total column.

    ``month_val`` names the per-month count column (gaps become zero) and
    ``summ_val`` the running-total column (gaps are carried forward within
    each project). Both columns are cast to ``int64``.
    """

    merged = merge_into_projects_time_series(df1, df2)
    merged[month_val] = fill_NaN_for_monthly_values(merged, month_val).astype('int64')
    merged[summ_val] = fill_NaN_for_summary_values(merged, summ_val).astype('int64')
    return merged

In [62]:
def merge_monthly_value(df1, df2, month_val):
    """
    Join ``df2`` onto ``df1`` and clean a per-month count column.

    Gaps in ``month_val`` become zero and the column is cast to ``int64``.
    """

    merged = merge_into_projects_time_series(df1, df2)
    merged[month_val] = fill_NaN_for_monthly_values(merged, month_val).astype('int64')
    return merged

In [63]:
def merge_summary_value(df1, df2, summ_val):
    """
    Join ``df2`` onto ``df1`` and clean a running-total column.

    Gaps in ``summ_val`` are carried forward within each project (zero before
    the first known value) and the column is cast to ``int64``.
    """

    merged = merge_into_projects_time_series(df1, df2)
    merged[summ_val] = fill_NaN_for_summary_values(merged, summ_val).astype('int64')
    return merged

In [64]:
def dummies_from_languages(projects):
    """
    Replace the ``language`` column by one indicator column per language.

    The ``project_id``, ``year`` and ``month`` columns are removed from the
    returned frame.
    """

    encoded = pd.get_dummies(projects, columns=['language'])
    return encoded.drop(columns=['project_id', 'year', 'month'])

In [65]:
# Each step left-joins one prepared table onto ``projects`` and rebinds the
# name, so running this cell again on the already-merged frame would join the
# same columns a second time.
# commits
projects = full_merge(projects, commits, 'new_commits', 'total_commits')
projects = full_merge(projects, committers, 'unique_committers', 'total_unique_committers')
# commit comments
projects = full_merge(projects, commit_comments, 'new_commit_comments', 'total_commit_comments')
# issues
projects = full_merge(projects, issues, 'new_issues', 'total_issues')
projects = full_merge(projects, issue_comments, 'new_issue_comments', 'total_issue_comments')
# merged pull requests
projects = merge_monthly_value(projects, opened_pull_requests_to_merge, 'new_opened_pull_requests_to_merge')
projects = merge_monthly_value(projects, merged_pull_requests, 'new_merged_pull_requests')
projects = merge_monthly_value(projects, closed_merged_pull_requests, 'new_closed_merged_pull_requests')
projects = merge_summary_value(projects, total_merged_pull_requests, 'total_merged_pull_requests')
# unmerged pull requests
projects = merge_monthly_value(projects, opened_pull_requests_to_discard, 'new_opened_pull_requests_to_discard')
projects = merge_monthly_value(projects, closed_unmerged_pull_requests, 'new_closed_unmerged_pull_requests')
projects = merge_summary_value(projects, total_unmerged_pull_requests, 'total_unmerged_pull_requests')
# pull request comments
projects = full_merge(projects, pull_request_comments, 'new_pull_request_comments', 'total_pull_request_comments')
# number of watchers
projects = full_merge(projects, watchers, 'new_watchers', 'total_watchers')

In [66]:
# One-hot encode the language column and drop the join keys to obtain the
# final training set; the bare name on the last line displays it.
training_set = dummies_from_languages(projects)
training_set

Unnamed: 0,months_from_create,new_commits,total_commits,unique_committers,total_unique_committers,new_commit_comments,total_commit_comments,new_issues,total_issues,new_issue_comments,...,language_Lua,language_Objective-C,language_PHP,language_Perl,language_Python,language_R,language_Ruby,language_Scala,language_Shell,language_TypeScript
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1810596,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1810597,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1810598,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1810599,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [67]:
# Persist the assembled training set as a pickle file.
training_set.to_pickle('../../data/03_training_set/training_set.pkl')