## Oczyszczenie danych

Skrypt ten odpowiada za oczyszczenie danych pobranych z bazy danych i zapisanie ich do plików `.pkl`.

Szczegóły na temat procesu oczyszczania danych znajdują się w rozdziale 4.2.3 pracy.

In [1]:
import pandas as pd

# Directory with the raw tables pulled from the database; each table is read from '<table>.pkl'.
data_from_db = '../data/from_db/'
# Directory where the cleaned tables are written, again as '<table>.pkl'.
cleaned_data_path = '../data/cleaned/'

In [2]:
def print_summary(name, df):
    """
    Print a short overview of a DataFrame.

    The overview consists of a banner with the table name, the first rows
    (``df.head()``), the shape, and the ``df.info`` report with deep memory usage.

    Parameters
    ----------
    name : str
        name of the data frame, shown in the banner
    df : pandas.DataFrame
        data frame whose information should be printed
    """

    banner = f'\n\n=============={name}==============\n\n'
    shape_line = f'\nWymiary df: {df.shape}'

    # One print call with a newline separator emits the same text as four separate prints.
    print(banner, df.head(), shape_line, 'Rozmiar danych:', sep='\n')
    df.info(memory_usage='deep')

In [3]:
def _keep_rows_with_parent(df, column, parent_table, parent_column):
    """
    Keep only the rows of ``df`` whose ``column`` value occurs in a cleaned parent table.

    The parent table is loaded from ``cleaned_data_path + parent_table + '.pkl'``, so it
    must have been cleaned and saved earlier. An independent copy is returned so that
    later operations never act on a slice of another frame.
    """
    parent = pd.read_pickle(cleaned_data_path + parent_table + '.pkl')
    return df[df[column].isin(parent[parent_column])].copy()


def data_mining(name, df):
    """
    Clean a table pulled straight from the database, save it to a file and return it.

    Depending on the table, the cleaning performs (where needed):
        - removal of unneeded columns
        - removal of rows with missing data
        - setting suitable dtypes on columns
        - keeping only rows that have the required link to another table
        - renaming columns for standardisation

    The frame passed in is not modified: every step works on a new frame, so the
    function can be called again with the same raw table. Tables that reference another
    table read that table's cleaned pickle from ``cleaned_data_path``, which means the
    referenced table has to be processed first. A ``name`` without dedicated rules is
    saved and returned unchanged.

    Parameters
    ----------
    name : str
        name of the table being cleaned
    df : pandas.DataFrame
        raw table to clean

    Returns
    -------
    pandas.DataFrame
        cleaned data frame (also written to ``cleaned_data_path + name + '.pkl'``)
    """

    print(f'\n\n==============OCZYSZCZANIE TABELI {name}==============\n\n')
    if name == 'projects':

        df = df.drop(columns=['deleted', 'ext_ref_id', 'url', 'owner_id', 'description',
                              'forked_from'])
        # Copy after the row filter so the dtype assignments below act on an independent frame.
        df = df.dropna(subset=['language'], how='any').copy()
        df['language'] = df['language'].astype('category')
        df['created_at'] = df['created_at'].astype('datetime64[ns]')
        df = df.rename(columns={'id': 'project_id'})

    elif name == 'commits':

        df = df.drop(columns=['sha', 'author_id', 'ext_ref_id'])
        df = _keep_rows_with_parent(df, 'project_id', 'projects', 'project_id')
        df = df.rename(columns={'id': 'commit_id'})

    elif name == 'commit_comments':

        df = df.drop(columns=['user_id', 'line', 'position', 'ext_ref_id', 'comment_id', 'body'])
        df = _keep_rows_with_parent(df, 'commit_id', 'commits', 'commit_id')
        df = df.rename(columns={'id': 'commit_comment_id'})

    elif name == 'issues':

        # The raw 'issue_id' column is dropped here; 'id' takes over that name below.
        df = df.drop(columns=['reporter_id', 'assignee_id', 'issue_id', 'pull_request',
                              'pull_request_id', 'ext_ref_id'])
        # Unparseable timestamps become NaT and are removed together with missing ones.
        df['created_at'] = pd.to_datetime(df['created_at'], errors='coerce')
        df = df.dropna(subset=['created_at'], how='any')
        df = _keep_rows_with_parent(df, 'repo_id', 'projects', 'project_id')
        df = df.rename(columns={'id': 'issue_id', 'repo_id': 'project_id'})

    elif name == 'issue_comments':

        df = df.drop(columns=['user_id', 'ext_ref_id'])
        df['comment_id'] = df['comment_id'].astype('int64')
        df = _keep_rows_with_parent(df, 'issue_id', 'issues', 'issue_id')

    elif name == 'pull_requests':

        df = df.drop(columns=['head_repo_id', 'head_commit_id', 'base_commit_id',
                              'user_id', 'intra_branch', 'pullreq_id'])
        df = _keep_rows_with_parent(df, 'base_repo_id', 'projects', 'project_id')
        df = df.rename(columns={'id': 'pull_request_id', 'base_repo_id': 'project_id'})

    elif name == 'pull_request_history':

        df = df.drop(columns=['ext_ref_id', 'actor_id'])
        df['action'] = df['action'].astype('category')
        df = _keep_rows_with_parent(df, 'pull_request_id', 'pull_requests', 'pull_request_id')
        df = df.rename(columns={'id': 'pull_request_history_id'})

    elif name == 'pull_request_comments':

        df = df.drop(columns=['user_id', 'position', 'commit_id', 'ext_ref_id', 'body'])
        df['comment_id'] = df['comment_id'].astype('int64')
        df = _keep_rows_with_parent(df, 'pull_request_id', 'pull_requests', 'pull_request_id')

    elif name == 'watchers':

        df = df.drop(columns=['user_id', 'ext_ref_id'])
        df = _keep_rows_with_parent(df, 'repo_id', 'projects', 'project_id')
        df = df.rename(columns={'repo_id': 'project_id'})

    df.to_pickle(cleaned_data_path + name + '.pkl')
    print('Zapisano wynik do pliku.')
    return df

In [4]:
def prepare_pull_requests_with_history():
    """
    Combine the cleaned pull requests with their history into one DataFrame and save it.

    Both inputs are read from ``cleaned_data_path``. The result is written to
    ``pull_requests_with_history.pkl`` in the same directory.

    Returns
    -------
    pandas.DataFrame
        the combined data frame
    """

    print('\n\n==============TWORZENIE TABELI pull_requests_with_history==============\n\n')
    pull_requests = pd.read_pickle(f'{cleaned_data_path}pull_requests.pkl')
    history = pd.read_pickle(f'{cleaned_data_path}pull_request_history.pkl')

    combined = (
        pull_requests
        .merge(history, on=['pull_request_id'], how='left', sort=False)
        # Rows containing any missing value are discarded.
        .dropna(how='any')
        # The history id is cast to int64 once the rows with missing values are gone.
        .astype({'pull_request_history_id': 'int64'})
    )

    combined.to_pickle(f'{cleaned_data_path}pull_requests_with_history.pkl')
    return combined

In [5]:
# Tables to process. Referenced tables come before the tables that filter against them,
# because cleaning a table reads the already saved cleaned pickle of its parent.
names = [
    'projects', 'commits', 'commit_comments',
    'issues', 'issue_comments',
    'pull_requests', 'pull_request_history', 'pull_request_comments',
    'watchers',
]

# Raw tables in the same order as `names`, and the list that collects the cleaned frames.
dfs = [pd.read_pickle(data_from_db + table_name + '.pkl') for table_name in names]
new_dfs = []

In [6]:
# Print a preview, the shape and the memory report of every raw table.
for name, df in zip(names, dfs):
    print_summary(name, df)





   id                                                url  owner_id  \
0   1             https://api.github.com/repos/akka/akka         1   
1   2       https://api.github.com/repos/hadley/devtools         2   
2   3  https://api.github.com/repos/johnmyleswhite/Pr...         3   
3   4   https://api.github.com/repos/mavam/stat-cookbook         6   
4   5   https://api.github.com/repos/facebook/hiphop-php         8   

              name                                        description  \
0             akka                                       Akka Project   
1         devtools         Tools to make an R developer's life easier   
2  ProjectTemplate  A template utility for R projects that provide...   
3    stat-cookbook            The probability and statistics cookbook   
4       hiphop-php          Virtual Machine, Runtime, and JIT for PHP   

  language          created_at                ext_ref_id  forked_from  deleted  
0    Scala 2009-02-16 12:51:54  52343e2ebd3543bb7f00000

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188603 entries, 0 to 188602
Data columns (total 6 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   id               188603 non-null  int64         
 1   pull_request_id  188603 non-null  int64         
 2   created_at       188603 non-null  datetime64[ns]
 3   ext_ref_id       188603 non-null  object        
 4   action           188603 non-null  object        
 5   actor_id         188603 non-null  int64         
dtypes: datetime64[ns](1), int64(3), object(2)
memory usage: 31.7 MB




   pull_request_id  user_id comment_id  position  \
0                5        3    2717312        11   
1                5       58    2722940        11   
2                5       58    2723020        11   
3                5       58    2724044        11   
4               14     1106    4776654        46   

                                                body  commit_id  \
0  I'

In [7]:
# Empty the result list first so that re-running this cell does not stack a second
# set of cleaned frames on top of the previous one.
new_dfs.clear()
# zip stops at the shorter sequence, so an extra entry in `names` left over from an
# earlier run of this cell is ignored here.
for name, df in zip(names, dfs):
    new_df = data_mining(name, df)
    new_dfs.append(new_df)

# The merged table is built from the cleaned pickles written by the loop above.
new_df = prepare_pull_requests_with_history()
new_dfs.append(new_df)
# Register the extra table name only once, keeping `names` aligned with `new_dfs`.
if 'pull_requests_with_history' not in names:
    names.append('pull_requests_with_history')





Zapisano wynik do pliku.




Zapisano wynik do pliku.




Zapisano wynik do pliku.




Zapisano wynik do pliku.




Zapisano wynik do pliku.




Zapisano wynik do pliku.




Zapisano wynik do pliku.




Zapisano wynik do pliku.




Zapisano wynik do pliku.






In [8]:
# Print a preview, the shape and the memory report of every frame collected in new_dfs.
for name, df in zip(names, new_dfs):
    print_summary(name, df)





   project_id             name language          created_at
0           1             akka    Scala 2009-02-16 12:51:54
1           2         devtools        R 2010-05-03 04:08:49
2           3  ProjectTemplate        R 2010-08-24 17:22:36
3           4    stat-cookbook        R 2012-04-23 20:24:37
4           5       hiphop-php      C++ 2010-01-02 01:17:06

Wymiary df: (108616, 4)
Rozmiar danych:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 108616 entries, 0 to 108717
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   project_id  108616 non-null  int64         
 1   name        108616 non-null  object        
 2   language    108616 non-null  category      
 3   created_at  108616 non-null  datetime64[ns]
dtypes: category(1), datetime64[ns](1), int64(1), object(1)
memory usage: 9.3 MB




   commit_id  committer_id  project_id          created_at
0          1            17          10 2012-04-