In [1]:
import pandas as pd

In [2]:
# Load the prepared per-project monthly time series and preview its first rows.
time_series_path = '../../data/02_prepared_data/projects_time_series.pkl'
projects_time_series = pd.read_pickle(time_series_path)
projects_time_series.head()

Unnamed: 0,project_id,name,language,created_at,year,month,months_from_create
13,1,akka,Scala,2009-02-16 12:51:54,2009,2,0
14,1,akka,Scala,2009-02-16 12:51:54,2009,3,1
15,1,akka,Scala,2009-02-16 12:51:54,2009,4,2
16,1,akka,Scala,2009-02-16 12:51:54,2009,5,3
17,1,akka,Scala,2009-02-16 12:51:54,2009,6,4


Usunięcie zbędnych kolumn i zresetowanie indeksów

In [3]:
# Renumber the rows from 0 and drop the columns that are not needed later.
# reset_index(drop=True) discards the old row labels directly instead of first
# turning them into an 'index' column that then has to be dropped by name
# (which fails when the stored index carries a name other than None).
# The columns to drop are given as a list, the documented list-like form.
projects_time_series = (
    projects_time_series
    .reset_index(drop=True)
    .drop(columns=['name', 'created_at'])
)
projects_time_series.head()

Unnamed: 0,project_id,language,year,month,months_from_create
0,1,Scala,2009,2,0
1,1,Scala,2009,3,1
2,1,Scala,2009,4,2
3,1,Scala,2009,5,3
4,1,Scala,2009,6,4


In [4]:
def merge_into_projects_time_series(df1, df2):
    """Left-join ``df2`` onto ``df1`` by project ID and calendar month.

    Rows are matched on the ``project_id``, ``year`` and ``month`` columns.
    Every row of ``df1`` is kept; rows without a match in ``df2`` get NaN in
    the columns coming from ``df2``. The result is not re-sorted by the keys.
    """
    join_keys = ['project_id', 'year', 'month']
    return df1.merge(df2, how='left', on=join_keys, sort=False)

In [5]:
def fill_NaN_for_monthly_values(df, col_name):
    """Return column ``col_name`` of ``df`` with missing values replaced by 0.

    Intended for per-month values, where a missing entry is treated as zero.
    ``df`` itself is not modified; a new Series is returned.
    """
    monthly_values = df[col_name]
    return monthly_values.fillna(value=0)

In [6]:
def fill_NaN_for_summary_values(df, col_name):
    """Fill gaps in a running-total column, separately for each project.

    Within each ``project_id`` group a missing value takes the last known
    value from the preceding rows (forward fill). Values that are still
    missing afterwards, i.e. those before the first known value of a
    project, become 0. Returns a new Series; ``df`` is not modified.
    """
    per_project = df.groupby('project_id')[col_name]
    carried_forward = per_project.ffill()
    return carried_forward.fillna(0)

In [7]:
def full_merge(df1, df2, month_val, summ_val):
    """Attach ``df2`` to ``df1`` and clean both a monthly and a running-total column.

    ``month_val`` names the per-month column (gaps become 0) and ``summ_val``
    names the running-total column (gaps are forward-filled per project,
    remaining gaps become 0). Both columns are cast to ``int64``.
    Returns the merged frame.
    """
    merged = merge_into_projects_time_series(df1, df2)

    monthly_filled = fill_NaN_for_monthly_values(merged, month_val)
    merged[month_val] = monthly_filled.astype('int64')

    summary_filled = fill_NaN_for_summary_values(merged, summ_val)
    merged[summ_val] = summary_filled.astype('int64')

    return merged

In [8]:
def merge_monthly_value(df1, df2, month_val):
    """Attach ``df2`` to ``df1`` and clean a single per-month column.

    Gaps in column ``month_val`` are replaced by 0 and the column is cast to
    ``int64``. Returns the merged frame.
    """
    merged = merge_into_projects_time_series(df1, df2)
    monthly_filled = fill_NaN_for_monthly_values(merged, month_val)
    merged[month_val] = monthly_filled.astype('int64')
    return merged

In [9]:
def merge_summary_value(df1, df2, summ_val):
    """Attach ``df2`` to ``df1`` and clean a single running-total column.

    Gaps in column ``summ_val`` are forward-filled within each project, any
    remaining gaps become 0, and the column is cast to ``int64``.
    Returns the merged frame.
    """
    merged = merge_into_projects_time_series(df1, df2)
    summary_filled = fill_NaN_for_summary_values(merged, summ_val)
    merged[summ_val] = summary_filled.astype('int64')
    return merged

In [10]:
# Directory holding the prepared tables; kept in one place so it can be changed once.
PREPARED_DATA_DIR = '../../data/02_prepared_data/'


def read_prepared(table_name):
    """Load the pickled table ``<PREPARED_DATA_DIR><table_name>.pkl``.

    NOTE: unpickling can execute arbitrary code, so only files produced by
    this project's own preparation step should be read this way.
    """
    return pd.read_pickle(PREPARED_DATA_DIR + table_name + '.pkl')


# commits
new_commits = read_prepared('new_commits')
unique_committers = read_prepared('unique_committers')
# commit comments
new_commit_comments = read_prepared('new_commit_comments')
# issues
new_issues = read_prepared('new_issues')
new_issue_comments = read_prepared('new_issue_comments')
# merged pull requests
new_opened_pull_requests_to_merge = read_prepared('new_opened_pull_requests_to_merge')
new_merged_pull_requests = read_prepared('new_merged_pull_requests')
new_closed_merged_pull_requests = read_prepared('new_closed_merged_pull_requests')
total_merged_pull_requests = read_prepared('total_merged_pull_requests')
# unmerged pull requests
new_opened_pull_requests_to_discard = read_prepared('new_opened_pull_requests_to_discard')
new_closed_unmerged_pull_requests = read_prepared('new_closed_unmerged_pull_requests')
total_unmerged_pull_requests = read_prepared('total_unmerged_pull_requests')
# pull request comments
new_pull_request_comments = read_prepared('new_pull_request_comments')
# number of watchers
new_watchers = read_prepared('new_watchers')

In [11]:
# Each entry is (merge helper, table to attach, names of the columns that helper cleans).
# The entries are applied top to bottom, so the resulting column order follows this list.
merge_steps = [
    # commits
    (full_merge, new_commits, ('new_commits', 'total_commits')),
    (full_merge, unique_committers, ('unique_committers', 'total_unique_committers')),
    # commit comments
    (full_merge, new_commit_comments, ('new_commit_comments', 'total_commit_comments')),
    # issues
    (full_merge, new_issues, ('new_issues', 'total_issues')),
    (full_merge, new_issue_comments, ('new_issue_comments', 'total_issue_comments')),
    # merged pull requests
    (merge_monthly_value, new_opened_pull_requests_to_merge, ('new_opened_pull_requests_to_merge',)),
    (merge_monthly_value, new_merged_pull_requests, ('new_merged_pull_requests',)),
    (merge_monthly_value, new_closed_merged_pull_requests, ('new_closed_merged_pull_requests',)),
    (merge_summary_value, total_merged_pull_requests, ('total_merged_pull_requests',)),
    # unmerged pull requests
    (merge_monthly_value, new_opened_pull_requests_to_discard, ('new_opened_pull_requests_to_discard',)),
    (merge_monthly_value, new_closed_unmerged_pull_requests, ('new_closed_unmerged_pull_requests',)),
    (merge_summary_value, total_unmerged_pull_requests, ('total_unmerged_pull_requests',)),
    # pull request comments
    (full_merge, new_pull_request_comments, ('new_pull_request_comments', 'total_pull_request_comments')),
    # number of watchers
    (full_merge, new_watchers, ('new_watchers', 'total_watchers')),
]

for merge_step, source_table, column_names in merge_steps:
    projects_time_series = merge_step(projects_time_series, source_table, *column_names)

projects_time_series.head()

Unnamed: 0,project_id,language,year,month,months_from_create,new_commits,total_commits,unique_committers,total_unique_committers,new_commit_comments,...,new_merged_pull_requests,new_closed_merged_pull_requests,total_merged_pull_requests,new_opened_pull_requests_to_discard,new_closed_unmerged_pull_requests,total_unmerged_pull_requests,new_pull_request_comments,total_pull_request_comments,new_watchers,total_watchers
0,1,Scala,2009,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,323,0
1,1,Scala,2009,3,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,56,323
2,1,Scala,2009,4,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,29,379
3,1,Scala,2009,5,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,24,408
4,1,Scala,2009,6,4,0,0,0,0,0,...,0,0,0,0,0,0,0,0,35,432


In [12]:
# Display the column labels of projects_time_series.
projects_time_series.columns

Index(['project_id', 'language', 'year', 'month', 'months_from_create',
       'new_commits', 'total_commits', 'unique_committers',
       'total_unique_committers', 'new_commit_comments',
       'total_commit_comments', 'new_issues', 'total_issues',
       'new_issue_comments', 'total_issue_comments',
       'new_opened_pull_requests_to_merge', 'new_merged_pull_requests',
       'new_closed_merged_pull_requests', 'total_merged_pull_requests',
       'new_opened_pull_requests_to_discard',
       'new_closed_unmerged_pull_requests', 'total_unmerged_pull_requests',
       'new_pull_request_comments', 'total_pull_request_comments',
       'new_watchers', 'total_watchers'],
      dtype='object')

Przykład, dla którego widać niezerowe kolumny

In [13]:
# Show only the rows whose `unique_committers` value exceeds 30.
# Original author note: "dummy columns from the language" - presumably a
# reminder for the encoding step that follows; confirm.
projects_time_series.loc[projects_time_series['unique_committers'].gt(30)]

Unnamed: 0,project_id,language,year,month,months_from_create,new_commits,total_commits,unique_committers,total_unique_committers,new_commit_comments,...,new_merged_pull_requests,new_closed_merged_pull_requests,total_merged_pull_requests,new_opened_pull_requests_to_discard,new_closed_unmerged_pull_requests,total_unmerged_pull_requests,new_pull_request_comments,total_pull_request_comments,new_watchers,total_watchers
409,12,C++,2011,8,8,336,11143,41,78,646,...,28,28,43,28,23,71,3,3,67,1180
410,12,C++,2011,9,9,384,11527,32,85,470,...,30,30,73,24,27,98,7,10,83,1247
411,12,C++,2011,10,10,376,11903,39,101,447,...,29,29,102,35,32,130,11,21,66,1330
412,12,C++,2011,11,11,303,12206,31,110,291,...,25,25,127,15,18,148,19,40,60,1396
413,12,C++,2011,12,12,422,12628,39,122,381,...,40,40,167,20,20,168,9,49,52,1456
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1081895,69158,Python,2013,5,13,308,1788,53,220,20,...,64,64,297,139,172,778,215,1035,49,7249
1081897,69158,Python,2013,7,15,162,2087,31,238,13,...,19,19,340,82,96,949,304,1448,51,7339
1081899,69158,Python,2013,9,17,139,2370,32,253,5,...,31,31,390,129,131,1198,467,2385,9,7425
1234334,78852,Ruby,2013,8,64,148,588,53,115,10,...,145,145,4704,133,69,2510,338,2409,0,0


W ostatnim etapie język programowania zostaje rozbity na osobne kolumny, tzw. *dummy variables*.
Oprócz tego kolumny służące jako łącznik pomiędzy wszystkimi danymi (ID projektu, miesiąc i rok) nie są już potrzebne, więc zostają usunięte.

In [22]:
# One-hot encode `language` into one `language_<value>` column per value and
# drop the join keys (project ID, year, month), which are not needed any more.
# The dtype is pinned explicitly: the get_dummies default changed from uint8
# to bool in pandas 2.0, and the indicator columns are meant to be 0/1 integers.
# The columns to drop are given as a list, the documented list-like form.
training_set = (
    pd.get_dummies(projects_time_series, columns=['language'], dtype='uint8')
    .drop(columns=['project_id', 'year', 'month'])
)
training_set

Unnamed: 0,months_from_create,new_commits,total_commits,unique_committers,total_unique_committers,new_commit_comments,total_commit_comments,new_issues,total_issues,new_issue_comments,...,language_Lua,language_Objective-C,language_PHP,language_Perl,language_Python,language_R,language_Ruby,language_Scala,language_Shell,language_TypeScript
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1810596,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1810597,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1810598,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1810599,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [23]:
# Persist the finished training set as a pickle file.
training_set_path = '../../data/03_training_set/training_set.pkl'
training_set.to_pickle(training_set_path)