# Generate data for generators section

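Each monthly consumption file is a CSV with one row per user. The columns hold the minutes of TV and VOD consumption for that month, split across three parts of the day (suffixes `M`, `A`, `N`). For example, the header for November 2017 is: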

`USER_ID,TV_201711_M,TV_201711_A,TV_201711_N,VOD_201711_M,VOD_201711_A,VOD_201711_N`


In [2]:
import numpy as np
import pandas as pd

## TV / VOD consumption

In [2]:
def generate_data_for_user(user_id, year, month, rng=None):
    """Generate one month of random TV and VOD consumption for a single user.

    Parameters
    ----------
    user_id : int
        Identifier stored in the ``USER_ID`` entry of the returned row.
    year : int
        Year used in the column labels.
    month : int
        Month used in the column labels; it is zero-padded to two digits so
        the labels always follow the ``YYYYMM`` pattern (e.g. ``TV_201703_M``).
    rng : numpy.random.RandomState, optional
        Random generator to draw from. When omitted, the module-level
        ``random_state`` is used, which must be defined before the call.

    Returns
    -------
    pandas.Series
        Seven entries: ``USER_ID`` followed by the TV and then the VOD minutes
        for the parts of day ``M``, ``A`` and ``N``.
    """
    if rng is None:
        # Backward-compatible default: draw from the shared notebook generator.
        rng = random_state

    # Totals are a whole number of 30-minute slots (randint's upper bound is
    # exclusive), then split over the three parts of the day by multinomial
    # sampling with fixed probabilities.
    tv_minutes = rng.randint(10, 360) * 30
    tv_minutes_part_of_day = rng.multinomial(tv_minutes, [0.2, 0.3, 0.5])

    vod_minutes = rng.randint(10, 240) * 30
    vod_minutes_part_of_day = rng.multinomial(vod_minutes, [0.2, 0.2, 0.6])

    # Zero-pad the month so single-digit months do not yield ambiguous labels
    # such as 'TV_20173_M'.
    period = '{year}{month:02d}'.format(year=year, month=int(month))
    parts_of_day = ['M', 'A', 'N']
    columns = (
        ['USER_ID']
        + ['TV_{period}_{pod}'.format(period=period, pod=pod) for pod in parts_of_day]
        + ['VOD_{period}_{pod}'.format(period=period, pod=pod) for pod in parts_of_day]
    )
    data = [user_id, *tv_minutes_part_of_day, *vod_minutes_part_of_day]

    return pd.Series(data=data, index=columns)


def generate_data_for_month(n_users, year, month):
    """Build the consumption table of one month for users ``0 .. n_users - 1``.

    One row is generated per user with ``generate_data_for_user``; the rows
    are stacked into a DataFrame which is returned indexed by ``USER_ID``.
    """
    user_rows = []
    for user_id in range(n_users):
        user_rows.append(generate_data_for_user(user_id, year, month))

    monthly_table = pd.DataFrame(user_rows)
    return monthly_table.set_index('USER_ID')

In [3]:
# Fixed seed so that re-running the cell draws the same random numbers.
random_state = np.random.RandomState(8349)

year = 2017
n_users = 100
for month in (10, 11, 12):
    # One CSV per month, named after the year and month it covers.
    filename = 'consumption_{year}{month}.csv'.format(year=year, month=month)
    df = generate_data_for_month(n_users, year, month)
    df.to_csv(filename)

## CSV file with comments

In [19]:
comment_prefixes = ['#', '--', 'REM']
comments = ['Ignore this line', 'Skip me', 'Do not bother']


def data_or_comment_generator(n_rows, n_cols, fraction_comments=0.2, random_state=np.random):
    """Yield ``n_rows`` text lines, each either a comment or a CSV data row.

    For every line a uniform random number is drawn; if it is below
    ``fraction_comments`` the line is a comment made of a random entry of
    ``comment_prefixes``, a space and a random entry of ``comments``.
    Otherwise the line holds ``n_cols`` comma-separated random integers drawn
    from ``[0, 1000)``. Every yielded line ends with a newline character.

    ``random_state`` may be any object offering ``uniform``, ``choice`` and
    ``randint`` (the ``numpy.random`` module by default).
    """
    for _ in range(n_rows):
        is_comment = random_state.uniform() < fraction_comments

        if not is_comment:
            # Data line: random integers rendered as comma-separated text.
            values = random_state.randint(0, 1000, size=(n_cols,))
            yield ','.join(values.astype(str)) + '\n'
            continue

        # Comment line: the prefix is drawn first, then the comment text.
        marker = random_state.choice(comment_prefixes)
        text = random_state.choice(comments)
        yield ' '.join((marker, text)) + '\n'


In [24]:
# Fixed seed; the same generator is shared across all three files, so each
# file continues the random stream where the previous one stopped.
random_state = np.random.RandomState(69699)

filenames = ['first_commented_data.csv', 'second_commented_data.csv', 'third_commented_data.csv']
for filename in filenames:
    # The generator is lazy: lines are only produced while being written.
    lines = data_or_comment_generator(
        n_rows=200,
        n_cols=4,
        fraction_comments=0.2,
        random_state=random_state,
    )
    with open(filename, 'w') as f:
        f.writelines(lines)