In [1]:
import pandas as pd

from sqlalchemy import create_engine

import sys
sys.path.insert(1, '../../.')
from src import config

In [2]:
# Create the SQLAlchemy engine from the connection string stored in the
# project config under database['db_con_str'].
db_connection = create_engine(config.database['db_con_str'])

## Pobranie i oczyszczenie danych issues

In [3]:
# Read the 'issues' source through the engine created above into a DataFrame.
issues = pd.read_sql(sql='issues', con=db_connection)

### Informacje o zbiorze

In [4]:
# (rows, columns) of the freshly loaded table.
issues.shape

(150362, 9)

In [5]:
# Column dtypes as inferred when the table was loaded.
issues.dtypes

id                          int64
repo_id                     int64
reporter_id               float64
assignee_id               float64
issue_id                   object
pull_request                int64
pull_request_id           float64
created_at         datetime64[ns]
ext_ref_id                 object
dtype: object

In [6]:
# Per-column non-null counts and dtypes, plus the default memory estimate.
issues.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150362 entries, 0 to 150361
Data columns (total 9 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   id               150362 non-null  int64         
 1   repo_id          150362 non-null  int64         
 2   reporter_id      71407 non-null   float64       
 3   assignee_id      6257 non-null    float64       
 4   issue_id         150362 non-null  object        
 5   pull_request     150362 non-null  int64         
 6   pull_request_id  80729 non-null   float64       
 7   created_at       150361 non-null  datetime64[ns]
 8   ext_ref_id       150362 non-null  object        
dtypes: datetime64[ns](1), float64(3), int64(3), object(2)
memory usage: 10.3+ MB


In [7]:
# Same summary, but memory_usage='deep' asks pandas to inspect object columns
# for their real memory footprint instead of reporting a lower bound.
issues.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150362 entries, 0 to 150361
Data columns (total 9 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   id               150362 non-null  int64         
 1   repo_id          150362 non-null  int64         
 2   reporter_id      71407 non-null   float64       
 3   assignee_id      6257 non-null    float64       
 4   issue_id         150362 non-null  object        
 5   pull_request     150362 non-null  int64         
 6   pull_request_id  80729 non-null   float64       
 7   created_at       150361 non-null  datetime64[ns]
 8   ext_ref_id       150362 non-null  object        
dtypes: datetime64[ns](1), float64(3), int64(3), object(2)
memory usage: 28.3 MB


In [8]:
# Preview the first five rows of the raw table.
issues.head(n=5)

Unnamed: 0,id,repo_id,reporter_id,assignee_id,issue_id,pull_request,pull_request_id,created_at,ext_ref_id
0,1,4,,,2,1,1.0,2012-10-10 07:53:00,52343f39bd35436de30000b6
1,2,4,,,1,1,2.0,2012-06-20 15:23:42,52343f39bd35436de30000b7
2,3,3,,,12,1,3.0,2013-05-15 12:48:49,52343f45bd35436de80000b3
3,4,3,,,10,1,4.0,2013-01-29 17:08:05,52343f45bd35436de80000b4
4,5,3,,,8,1,5.0,2013-01-21 17:37:35,52343f45bd35436de80000b5


### Usuwanie zbędnych danych

In [9]:
# Keep only id, repo_id and created_at; the other columns are not used in the
# rest of this notebook.
# The result is rebound instead of using inplace=True, and errors='ignore'
# skips columns that are already gone, so re-running this cell does not raise
# a KeyError.
issues = issues.drop(
    columns=[
        'reporter_id', 'assignee_id', 'issue_id',
        'pull_request', 'pull_request_id', 'ext_ref_id',
    ],
    errors='ignore',
)

In [10]:
# Check the column count after dropping the unused columns.
issues.shape

(150362, 3)

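W kolumnach `id` i `repo_id` nie ma brakujących danych; w kolumnie `created_at` brakuje jednej wartości (150361 z 150362 wierszy), która zostanie usunięta w sekcji „Usunięcie `NaN`”.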

### Zmiana typów

In [11]:
# Dtypes of the remaining columns before any conversion.
issues.dtypes

id                     int64
repo_id                int64
created_at    datetime64[ns]
dtype: object

In [12]:
# Bytes used by the index and by each column (deep introspection enabled).
issues.memory_usage(index=True, deep=True)

Index             128
id            1202896
repo_id       1202896
created_at    1202896
dtype: int64

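Ponieważ jeden wiersz zawiera niepoprawnie sformatowaną datę, do konwersji używam `pd.to_datetime` z `errors='coerce'`, które zamienia niepoprawne wartości na `NaT`. Funkcja ta nie ma argumentu `inplace`, więc wynik przypisuję z powrotem do kolumny.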

In [13]:
# Parse created_at as datetimes; errors='coerce' turns values that cannot be
# parsed into NaT instead of raising.
issues['created_at'] = pd.to_datetime(issues['created_at'], errors='coerce')

In [14]:
# Confirm the dtypes after the datetime conversion.
issues.dtypes

id                     int64
repo_id                int64
created_at    datetime64[ns]
dtype: object

### Usunięcie `NaN`

In [15]:
# Count missing values in each column.
issues.isna().sum()

id            0
repo_id       0
created_at    1
dtype: int64

In [16]:
# Discard every row whose created_at is missing.
# The result is rebound rather than mutated with inplace=True, and the
# redundant how='any' (the default) is omitted.
issues = issues.dropna(subset=['created_at'])

In [17]:
# Row count after removing rows without created_at.
issues.shape

(150361, 3)

In [18]:
# Re-count missing values per column to verify none are left.
issues.isna().sum()

id            0
repo_id       0
created_at    0
dtype: int64

### Filtrowanie

Pozostawiam tylko te issues, które są przypisane do analizowanych przeze mnie projektów.

In [19]:
# Load the pickled projects table from the data directory and preview its
# first five rows.
projects = pd.read_pickle('../../data/01_data_from_db/projects.pkl')
projects.head(n=5)

Unnamed: 0,id,name,language,created_at
0,1,akka,Scala,2009-02-16 12:51:54
1,2,devtools,R,2010-05-03 04:08:49
2,3,ProjectTemplate,R,2010-08-24 17:22:36
3,4,stat-cookbook,R,2012-04-23 20:24:37
4,5,hiphop-php,C++,2010-01-02 01:17:06


In [20]:
# Keep only the issues whose repo_id matches an id in the projects table.
issues = issues.loc[issues['repo_id'].isin(projects['id'])]

### Ponowne sprawdzenie informacji o zbiorze

In [21]:
# Shape after restricting the issues to the analysed projects.
issues.shape

(150361, 3)

In [22]:
# Final summary of the cleaned frame, with deep memory introspection.
issues.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 150361 entries, 0 to 150361
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   id          150361 non-null  int64         
 1   repo_id     150361 non-null  int64         
 2   created_at  150361 non-null  datetime64[ns]
dtypes: datetime64[ns](1), int64(2)
memory usage: 4.6 MB


### Eksport do '.pkl'

In [23]:
# Persist the cleaned issues frame as a pickle in the data directory.
issues.to_pickle(path='../../data/01_data_from_db/issues.pkl')