# Imports

In [137]:
from datetime import datetime
import json
import pandas as pd

# Helpers

In [138]:
def to_datetime(ts):
    return datetime.fromtimestamp(ts)

In [139]:
def group_time(user):
    """
        `dict_time` have '0', '1' ... as a key. In the keys have a list
        with `ts` values. The `ts` values are pull from `user.ts` and
        these values will be use for compare the `dict_time` with the
        `DataFrame` and update the `group_m` column.
    """
    global dict_time

    if len(dict_time.keys()) > 0: 
        group_count = sorted(dict_time.keys())[-1]
        
        last_ts = max(dict_time[group_count])
        
        last_datetime = to_datetime(last_ts)
        user_datetime = to_datetime(user.ts)
        
        dif = (last_datetime - user_datetime).total_seconds()
        
        if abs(dif) < 120:
            dict_time.get(group_count).append(user.ts)
        else:
            group_count = str(int(group_count) + 1)

            dict_time[group_count] = [user.ts]

        user.group_m = group_count
    else: 
        dict_time['0'] = [user.ts]
        user.group_m = '0'
    
    return user

In [140]:
def change_id(value):
    """
        Change group int id (like '0') to the first `timestamp`
        value in the group.
    """
    value.group_men = str(min(value.ts))

    return value

In [141]:
def correct_json(user_json):
    for key in user_json.keys():
        json_correct = json.loads(user_json.get(key))
        
        for i, element in enumerate(json_correct):
            del element['group_men']
            json_correct[i] = {k: v for k, v in element.items() if v is not None}
        
        user_json[key] = json_correct
        
    return user_json

# Group Messages

## Read json

In [142]:
users_df = pd.read_json('U1ZQR43RB.json')
users_df.shape

(214, 13)

In [143]:
users_df.head()

Unnamed: 0,user,type,subtype,ts,text,inviter,purpose,files,upload,display_as_bot,edited,item_type,item
0,U0MFNAG05,message,channel_join,1471111000.0,<@U0MFNAG05> has joined the channel,,,,,,,,
1,U0KK0T3CG,message,channel_join,1471111000.0,<@U0KK0T3CG> has joined the channel,U0MFNAG05,,,,,,,
2,U0MFNAG05,message,channel_purpose,1471111000.0,<@U0MFNAG05> set the channel purpose: Discussi...,,Discussion about all physics project related s...,,,,,,
3,U1ZQR43RB,message,channel_join,1471111000.0,<@U1ZQR43RB> has joined the channel,U0MFNAG05,,,,,,,
4,U0MFNAG05,message,,1471111000.0,"Hong, let’s move our discussion here",,,,,,,,


In [144]:
users_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 214 entries, 0 to 213
Data columns (total 13 columns):
user              214 non-null object
type              214 non-null object
subtype           5 non-null object
ts                214 non-null float64
text              214 non-null object
inviter           2 non-null object
purpose           1 non-null object
files             26 non-null object
upload            26 non-null float64
display_as_bot    26 non-null float64
edited            1 non-null object
item_type         1 non-null object
item              1 non-null object
dtypes: float64(3), object(10)
memory usage: 21.9+ KB


## Group by users

In [145]:
total_users = users_df['user'].unique()

In [146]:
total_users

array(['U0MFNAG05', 'U0KK0T3CG', 'U1ZQR43RB', 'USLACKBOT'], dtype=object)

Example:

In [147]:
test = users_df.groupby(['user']).get_group('U0MFNAG05')
test.head()

Unnamed: 0,user,type,subtype,ts,text,inviter,purpose,files,upload,display_as_bot,edited,item_type,item
0,U0MFNAG05,message,channel_join,1471111000.0,<@U0MFNAG05> has joined the channel,,,,,,,,
2,U0MFNAG05,message,channel_purpose,1471111000.0,<@U0MFNAG05> set the channel purpose: Discussi...,,Discussion about all physics project related s...,,,,,,
4,U0MFNAG05,message,,1471111000.0,"Hong, let’s move our discussion here",,,,,,,,
9,U0MFNAG05,message,,1471111000.0,just post it to this channel,,,,,,,,
11,U0MFNAG05,message,,1471111000.0,we try to avoid emails,,,,,,,,


## Create subgroups for all users

In [148]:
dict_time = {}

for user in total_users:
    user_df = users_df.groupby(['user']).get_group(user)
    
    """
        Group user with the function `group_time`
        and put the value of the group in collum `group_m`
    """
    user_df['group_m'] = 'NaN'
    dict_time = {}
    user_df = user_df.apply(group_time, axis=1)
    
    """
        Rename `group_m` to `group_men` and changes
        `group_men` value from `int` to `timestemp`
    """
    user_df['group_men'] = 'NaN'
    user_df = user_df.groupby('group_m').apply(change_id)
    del user_df['group_m']
    
    user_json = user_df.groupby('group_men').apply(
                    lambda x: x.to_json(orient='records'))
    user_json = json.loads(user_json.to_json(orient='index'))
    user_json = correct_json(user_json)
    
    with open('output/{}.json'.format(user), 'w') as outfile:
        json.dump(user_json, outfile)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
