## Task 1

* `username` – developer's username
* `commits` – number of commits by the developer
* `changed_lines` – number of changed lines
* `new_files` – number of new files added by dev

Sorted by username

### pandas

In [1]:
import pandas as pd
from pandas.io.json import json_normalize
import json

In [2]:
with open('commits.json', 'r') as f:
    data = json.load(f)

In [3]:
data[0]

{'username': 'bober38',
 'commit_time': '2021/03/12 05:45:02',
 'files': [{'name': 'utils/net.py', 'changed_lines': 85}]}

In [4]:
df = json_normalize(data, 'files', ['username', 'commit_time'])
df.head()

  df = json_normalize(data, 'files', ['username', 'commit_time'])


Unnamed: 0,name,changed_lines,username,commit_time
0,utils/net.py,85,bober38,2021/03/12 05:45:02
1,library/utils.py,36,coder007,2021/03/02 15:01:24
2,quality/utils.py,38,vanessic,2021/01/22 10:14:08
3,core/library.py,17,john_snow,2021/01/16 15:21:07
4,frontend/tools.py,6,john_snow,2021/01/16 15:21:07


In [5]:
df['commit_time'] = pd.to_datetime(df['commit_time'])

In [6]:
df.groupby('username')['changed_lines'].sum()

username
bober38         885
coder007        396
ivan_ivanov    1473
john_snow      1211
pet            1896
vanessic       1622
yegor          1341
zeno           1211
Name: changed_lines, dtype: int64

In [7]:
df.groupby('username')['commit_time'].nunique()

username
bober38        12
coder007        7
ivan_ivanov    15
john_snow      12
pet            14
vanessic       14
yegor          14
zeno           12
Name: commit_time, dtype: int64

In [8]:
agg = df.groupby(['name', 'username'])[['commit_time']].min().sort_values(['name', 'commit_time'])
agg

Unnamed: 0_level_0,Unnamed: 1_level_0,commit_time
name,username,Unnamed: 2_level_1
backend/library.py,vanessic,2021-01-16 05:58:01
backend/library.py,john_snow,2021-01-18 00:13:13
backend/library.py,yegor,2021-03-13 15:34:49
backend/library.py,ivan_ivanov,2021-03-14 03:24:59
backend/library.py,coder007,2021-03-17 05:24:34
...,...,...
utils/tools.py,vanessic,2021-03-15 07:39:18
utils/utils.py,john_snow,2021-01-03 06:01:48
utils/utils.py,coder007,2021-01-16 20:52:16
utils/utils.py,yegor,2021-02-27 10:56:31


In [9]:
d = {}
for file in agg.reset_index()['name'].unique():
    d[file] = agg.loc[file].iloc[0].name

In [10]:
d = pd.DataFrame([d]).T.reset_index().groupby(0).count().to_dict()['index']
d

{'bober38': 2,
 'coder007': 1,
 'ivan_ivanov': 4,
 'john_snow': 4,
 'pet': 2,
 'vanessic': 7,
 'yegor': 7,
 'zeno': 1}

In [11]:
df_ = pd.DataFrame([d]).T.reset_index()

In [12]:
df_.rename(columns={'index':'username', 0:'new_files'}, inplace=True)

In [15]:
df_['changed_lines'] = df.groupby('username')['changed_lines'].sum().values

In [16]:
df_['commits'] = df.groupby('username')['commit_time'].nunique().values

In [17]:
df_

Unnamed: 0,username,new_files,changed_lines,commits
0,bober38,2,885,12
1,coder007,1,396,7
2,ivan_ivanov,4,1473,15
3,john_snow,4,1211,12
4,pet,2,1896,14
5,vanessic,7,1622,14
6,yegor,7,1341,14
7,zeno,1,1211,12


## Dictionaries

In [3]:
from collections import defaultdict
import json

In [4]:
with open('commits.json', 'r') as f:
    data = json.load(f)

In [6]:
data[0]

{'username': 'bober38',
 'commit_time': '2021/03/12 05:45:02',
 'files': [{'name': 'utils/net.py', 'changed_lines': 85}]}

In [9]:
data[0]['files'][0]['changed_lines']

85

In [10]:
data = sorted(data, key=lambda x: pd.to_datetime(x['commit_time']))

In [12]:
d = defaultdict(list)

In [13]:
d

defaultdict(list, {})

In [14]:
d[999]

[]

In [15]:
d1 = {}

In [16]:
d1[999]

KeyError: 999

In [17]:
d

defaultdict(list, {999: []})

In [18]:
d[1].append(12)

In [19]:
d

defaultdict(list, {999: [], 1: [12]})

In [30]:
table = defaultdict(lambda: {'commits':0, 'changed_lines':0, 'new_files':0})

In [31]:
table['yegor']

{'commits': 0, 'changed_lines': 0, 'new_files': 0}

In [32]:
table['ivan']['new_files']

0

In [26]:
table

defaultdict(<function __main__.<lambda>()>,
            {'yegor': {'commits': 0, 'changed_lines': 0, 'new_files': 0}})

In [34]:
data[0]

{'username': 'yegor',
 'commit_time': '2021/01/01 03:14:55',
 'files': [{'name': 'utils/tools.py', 'changed_lines': 51},
  {'name': 'balancer/tools.py', 'changed_lines': 22},
  {'name': 'library/utils.py', 'changed_lines': 75}]}

In [29]:
data[0]['username']

'yegor'

In [35]:
data[0]['files']

[{'name': 'utils/tools.py', 'changed_lines': 51},
 {'name': 'balancer/tools.py', 'changed_lines': 22},
 {'name': 'library/utils.py', 'changed_lines': 75}]

In [36]:
new_files = set()
table = defaultdict(lambda: {'commits':0, 'changed_lines':0, 'new_files':0})

for commit in data:
    user = commit['username']
    table[user]['commits'] += 1
    for file in commit['files']:
        table[user]['changed_lines'] += file['changed_lines']
        if file['name'] not in new_files: 
            new_files.add(file['name'])
            table[user]['new_files'] += 1
table

defaultdict(<function __main__.<lambda>()>,
            {'yegor': {'commits': 14, 'changed_lines': 1341, 'new_files': 7},
             'coder007': {'commits': 7, 'changed_lines': 396, 'new_files': 1},
             'bober38': {'commits': 12, 'changed_lines': 885, 'new_files': 2},
             'ivan_ivanov': {'commits': 15,
              'changed_lines': 1473,
              'new_files': 4},
             'john_snow': {'commits': 12,
              'changed_lines': 1211,
              'new_files': 4},
             'vanessic': {'commits': 14,
              'changed_lines': 1622,
              'new_files': 7},
             'zeno': {'commits': 12, 'changed_lines': 1211, 'new_files': 1},
             'pet': {'commits': 14, 'changed_lines': 1896, 'new_files': 2}})

In [37]:
pd.DataFrame(table).T.reset_index().sort_values(by='index')

Unnamed: 0,index,commits,changed_lines,new_files
2,bober38,12,885,2
1,coder007,7,396,1
3,ivan_ivanov,15,1473,4
4,john_snow,12,1211,4
7,pet,14,1896,2
5,vanessic,14,1622,7
0,yegor,14,1341,7
6,zeno,12,1211,1
