In [1]:
import pandas as pd

from sqlalchemy import create_engine

import sys
# Insert the directory two levels above the working directory at index 1 of
# the module search path so the project-local `src` package can be imported.
sys.path.insert(1, '../../.')
from src import config

In [2]:
# Create the SQLAlchemy engine from the connection string stored under
# `db_con_str` in `config.database`.
db_connection = create_engine(config.database['db_con_str'])

## Pobranie i oczyszczenie danych komentarzy pull requestów

In [3]:
# Read the whole `pull_request_comments` table into a DataFrame.
pr_comments = pd.read_sql(
    sql='pull_request_comments',
    con=db_connection,
)

### Informacje o zbiorze

In [4]:
# (rows, columns) of the table as loaded from the database.
pr_comments.shape

(54892, 8)

In [5]:
# Column dtypes as inferred by pandas when reading the table.
pr_comments.dtypes

pull_request_id             int64
user_id                     int64
comment_id                 object
position                    int64
body                       object
commit_id                   int64
created_at         datetime64[ns]
ext_ref_id                 object
dtype: object

In [6]:
# Per-column non-null counts and dtypes, plus the default memory estimate.
pr_comments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54892 entries, 0 to 54891
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   pull_request_id  54892 non-null  int64         
 1   user_id          54892 non-null  int64         
 2   comment_id       54892 non-null  object        
 3   position         54892 non-null  int64         
 4   body             54892 non-null  object        
 5   commit_id        54892 non-null  int64         
 6   created_at       54892 non-null  datetime64[ns]
 7   ext_ref_id       54892 non-null  object        
dtypes: datetime64[ns](1), int64(4), object(3)
memory usage: 3.4+ MB


In [7]:
# Same summary, but `memory_usage='deep'` also measures the contents of
# object columns, giving the real memory footprint.
pr_comments.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54892 entries, 0 to 54891
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   pull_request_id  54892 non-null  int64         
 1   user_id          54892 non-null  int64         
 2   comment_id       54892 non-null  object        
 3   position         54892 non-null  int64         
 4   body             54892 non-null  object        
 5   commit_id        54892 non-null  int64         
 6   created_at       54892 non-null  datetime64[ns]
 7   ext_ref_id       54892 non-null  object        
dtypes: datetime64[ns](1), int64(4), object(3)
memory usage: 18.0 MB


In [8]:
# Preview the first rows of the raw comments.
pr_comments.head()

Unnamed: 0,pull_request_id,user_id,comment_id,position,body,commit_id,created_at,ext_ref_id
0,5,3,2717312,11,I'm a little confused by this combination. Cou...,102,2013-01-21 19:38:15,52343f49bd35436de80000c0
1,5,58,2722940,11,The idea here is that the settings specified i...,102,2013-01-22 08:38:46,52343f49bd35436de80000c1
2,5,58,2723020,11,...I just test-ran this code again and realize...,102,2013-01-22 08:49:02,52343f49bd35436de80000c2
3,5,58,2724044,11,"Ok, I am pretty new to R so this was an intere...",102,2013-01-22 10:46:03,52343f49bd35436de80000c3
4,14,1106,4776654,46,You might want to check `eventExpr == 0` just ...,2520,2013-06-19 16:32:14,52344137bd35436de1000459


### Usuwanie zbędnych danych

In [9]:
# Drop the columns that are not needed downstream.
# The result is rebound instead of mutated in place, and `errors='ignore'`
# makes the cell safe to re-run: a second run no longer raises KeyError
# because the columns are already gone.
pr_comments = pr_comments.drop(
    columns=['position', 'ext_ref_id'],
    errors='ignore',
)

In [10]:
# Confirm the column count after dropping the unneeded columns.
pr_comments.shape

(54892, 6)

In [11]:
# Count missing values per column.
pr_comments.isnull().sum()

pull_request_id    0
user_id            0
comment_id         0
body               0
commit_id          0
created_at         0
dtype: int64

### Zmiana typów

In [12]:
# Dtypes before conversion.
pr_comments.dtypes

pull_request_id             int64
user_id                     int64
comment_id                 object
body                       object
commit_id                   int64
created_at         datetime64[ns]
dtype: object

In [13]:
# Convert `comment_id` to a 64-bit integer column.
pr_comments['comment_id'] = pr_comments['comment_id'].astype('int64')

In [14]:
# Verify that `comment_id` now has an integer dtype.
pr_comments.dtypes

pull_request_id             int64
user_id                     int64
comment_id                  int64
body                       object
commit_id                   int64
created_at         datetime64[ns]
dtype: object

In [15]:
# Re-check missing values per column after the type conversion.
pr_comments.isnull().sum()

pull_request_id    0
user_id            0
comment_id         0
body               0
commit_id          0
created_at         0
dtype: int64

### Filtrowanie

Filtrowanie tylko tych komentarzy, które dotyczą analizowanych pull requestów

In [16]:
# Load the pull requests table from a pickle file under the project's data
# directory and preview it.
# NOTE(review): unpickling executes arbitrary code, so this file must come
# from a trusted source.
pull_requests = pd.read_pickle('../../data/01_data_from_db/pull_requests.pkl')
pull_requests.head()

Unnamed: 0,id,base_repo_id,pullreq_id,merged
0,1,4,2,1
1,2,4,1,0
2,3,3,12,0
3,4,3,10,1
4,5,3,8,1


In [17]:
# Row count before filtering, used as the baseline for the checks below.
pr_comments.shape

(54892, 6)

In [18]:
# Shape of the comments whose `pull_request_id` appears in `pull_requests.id`.
pr_comments.loc[
    pr_comments['pull_request_id'].isin(pull_requests['id'])
].shape

(54892, 6)

In [19]:
# Same check against `pull_requests.pullreq_id`; presumably a sanity check of
# which column `pull_request_id` refers to (confirm against the schema).
pr_comments.loc[
    pr_comments['pull_request_id'].isin(pull_requests['pullreq_id'])
].shape

(10236, 6)

In [20]:
# Keep only comments whose `pull_request_id` appears in `pull_requests.id`.
pr_comments = pr_comments.loc[
    pr_comments['pull_request_id'].isin(pull_requests['id'])
]

Ponowne sprawdzenie informacji o zbiorze

In [21]:
# Shape after filtering.
pr_comments.shape

(54892, 6)

In [22]:
# Final summary of the cleaned frame, including deep memory usage.
pr_comments.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 54892 entries, 0 to 54891
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   pull_request_id  54892 non-null  int64         
 1   user_id          54892 non-null  int64         
 2   comment_id       54892 non-null  int64         
 3   body             54892 non-null  object        
 4   commit_id        54892 non-null  int64         
 5   created_at       54892 non-null  datetime64[ns]
dtypes: datetime64[ns](1), int64(4), object(1)
memory usage: 10.8 MB


### Eksport do `.pkl`

In [23]:
# Save the cleaned comments as a pickle file under the project's data directory.
pr_comments.to_pickle('../../data/01_data_from_db/pull_requests_comments.pkl')