# Settings

In [74]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import math

# Loading the data

In [2]:
sample_submission = pd.read_csv("C:/Users/odeli/Desktop/dataCamp_data/sample_submission.csv")
test = pd.read_csv("C:/Users/odeli/Desktop/dataCamp_data/test.csv")
specs = pd.read_csv("C:/Users/odeli/Desktop/dataCamp_data/specs.csv")
train_labels = pd.read_csv("C:/Users/odeli/Desktop/dataCamp_data/train_labels.csv")
train = pd.read_csv("C:/Users/odeli/Desktop/dataCamp_data/train.csv")

In [3]:
print('Shapes:\n - train: ', np.shape(train), '\n - train_labels: ',np.shape(train_labels), '\n - test: ',np.shape(test), '\n - specs: ',np.shape(specs), '\n - sample_submission: ',np.shape(sample_submission))

Shapes:
 - train:  (11341042, 11) 
 - train_labels:  (17690, 7) 
 - test:  (1156414, 11) 
 - specs:  (386, 3) 
 - sample_submission:  (1000, 2)


## Making labels

The first very important thing to do is to create the labels.
As we said the right information is contained in the event_data column.
This column is a dictionary with a variable size for each line.

Organisation of the information recorded by the application:
    * When a child downloads the app an id is created: id_installation
    * When he starts playing a game an other id is created: game_session
    * Inside a game_session many things can be done by the gamer. Every event has its own id: event_id. The different possible events are caracterized by a code contained in the event_code column.
The only useful events to build the labels are the last ones in a game session: they correspond to the succes or the failure of the game in a given game_session. The corrresponding event_code is 4100 for 4 of the 5 games and 4110 for the last one (Bird Measurer).

In [332]:
def get_accuracy(data):
    """
    input: data
    output: data_labels for each of the 5 games
    """
    
    df = pd.DataFrame()
    
    games = ['Bird Measurer', 'Cart Balancer', 'Cauldron Filler', 'Chest Sorter', 'Mushroom Sorter']
    
    # Loop on the 5 games
    for game in games:
        tmp = data[data['title'].str.contains(game)]
        
        # Filter the last event : 4110/4100 (code)
        if game == 'Bird Measurer':
            tmp = tmp[(tmp['event_code'] == 4110)]#| (tmp['event_code'] == 4100)]
        else:
            tmp = tmp[tmp['event_code'] == 4100]
    
        # num_correct and num_incorrect
        correct = ["NA" for i in range(np.shape(tmp)[0])]
        incorrect = ["NA" for i in range(np.shape(tmp)[0])]
        for i in range(np.shape(tmp)[0]):
            if ('correct":false' in tmp.loc[tmp.index[i], 'event_data']):
                correct[i] = 0
                incorrect[i] = 1
            elif ('correct":true' in tmp.loc[tmp.index[i], 'event_data']):
                correct[i] = 1
                incorrect[i] = 0
            else:
                correct[i] = 'NA'
                incorrect[i] = 'NA'
        tmp['num_correct'] = correct
        tmp['num_incorrect'] = incorrect
        tmp = pd.DataFrame(tmp.groupby(('installation_id','game_session','title')).sum())
            
        # accuracy
        accuracy = tmp['num_correct'] / (tmp['num_correct'] + tmp['num_incorrect'])
        tmp['accuracy'] = accuracy

        # accuracy_group
        tmp["accuracy_group"] = tmp["accuracy"].apply(lambda x: 0 if x==0 else (1 if x<0.5 else (2 if x<0.9 else 3)))
        df = pd.concat([df, tmp])
        
    df = df.reset_index()[['game_session','installation_id','title','num_correct','num_incorrect','accuracy','accuracy_group']]
    return(df)

In [333]:
my_train_labels = get_accuracy(train)



In [337]:
np.shape(train_labels), np.shape(my_train_labels)

((17690, 7), (17690, 7))

In [131]:
np.shape(my_train_labels), np.shape(train_labels)

((17692, 7), (17690, 7))

## Trucation of the train data

In this section we wrote a function that troncate data from the train in rder to reproduce the stucture of the test to be able to make predictions.
To truncate the data (train) we used 4 methods:
    * 'random : For each installation_id, randomly choose an assessment and truncate the data after the begining of this assessement.
    * 'first' : For each installation_id truncate after one assessment
    * 'last' : For each installation_id truncate right before the last assessment
    * 'proportion' : Fix a proportion of assessment to keep and troncate the rest of the data 

### Removing the id without assessment

In [None]:
no_assess_id = []
unique_id = np.unique(train['installation_id'])
for id in unique_id:
        test_id = train[train['installation_id'] == id]
        n = test_id[(test_id['type']=='Assessment') & (test_id['event_code'] == 2000)].count()[0] #nombre d'assessment commencés
        if n==0: no_assess_id.append(id)
        if(len(no_assess_id)%1000 == 0): print(len(no_assess_id))


In [374]:
def truncate_data(data, how='random'):
    """
    Inputs:
        data : DataFrame in the shape of train.
        how : string. Method of truncation used (default:'random', other choices: 'first', 'last', 'proportion').
    Output:
        new_data : DataFrame. data where data is truncated from a given assessment.
    """
    num_col_type = 9
    num_col_ecode = 6
    
    print('Selecting unique installation_id in the data')
    unique_id = np.unique(data['installation_id'])
    print('Done !')
    
    new_data = pd.DataFrame(columns = data.columns)
    
    
    print('Truncation of the data')
    
    for id in unique_id:
        print('installation_id', id)
        test_id = data[data['installation_id'] == id]
        n = test_id[(test_id['type']=='Assessment') & (test_id['event_code'] == 2000)].count()[0] #nombre d'assessment commencés
        if n!=0:
            if how == 'random': ass_trunc = randint(1,n)
            elif how == 'first': ass_trunc = 1
            elif how == 'last': ass_trunc = n
            idx = 0 # nombre d'assessment commencés
            idx_tmp = 0 # numéro de la ligne courante
            while(idx < ass_trunc):
                if((test_id.iloc[idx_tmp, num_col_type]== 'Assessment') & (test_id.iloc[idx_tmp, num_col_ecode] == 2000)):
                    idx += 1
                idx_tmp += 1
            new_data = pd.concat([new_data, test_id[0:idx_tmp]])
            print('current data id shape', np.shape(test_id[0:idx_tmp]))
            print('new data shape', np.shape(new_data))
    return(new_data)

In [373]:
import time

start_time = time.time()

new_data = truncate_data(train)

print("Temps d execution : %s secondes ---" % (time.time() - start_time))

Selecting unique installation_id in the data
Done !
Truncation of the data
installation_id 0001e90f
installation_id 000447c4
installation_id 0006a69f
current data id shape (648, 11)
new data shape (648, 11)
installation_id 0006c192
current data id shape (1899, 11)
new data shape (2547, 11)
installation_id 0009a5a9
installation_id 0011edc8
installation_id 00129856
current data id shape (4, 11)
new data shape (2551, 11)
installation_id 0016b7cc
installation_id 00195df7
installation_id 001d0ed0
current data id shape (7, 11)
new data shape (2558, 11)
installation_id 002114ae
installation_id 00219589
installation_id 00225f67
current data id shape (605, 11)
new data shape (3163, 11)
installation_id 00279ac5
current data id shape (2501, 11)
new data shape (5664, 11)
installation_id 002c8bae
installation_id 002db7e3
current data id shape (2536, 11)
new data shape (8200, 11)
installation_id 003372b0
current data id shape (190, 11)
new data shape (8390, 11)
installation_id 0034471d
installation_

KeyboardInterrupt: 

In [124]:
def trucate(data, prop):
    """
    input: train
            the ratio of the assessments to keep
    output: train trucated from the starrt of the assesment n
    """
    df = pd.DataFrame()
    
    games = ['Bird Measurer', 'Cart Balancer', 'Cauldron Filler', 'Chest Sorter', 'Mushroom Sorter']
    
    # Loop on the 5 games
    for game in games:
        tmp = data[data['title'].str.contains(game)]
        
        # Filter the last event : 4110/4100 (code)
        if game == 'Bird Measurer':
            tmp = tmp[(tmp['event_code'] == 4110) | (tmp['event_code'] == 4100)]
        else:
            tmp = tmp[tmp['event_code'] == 4100]
    
        # num_correct and num_incorrect
        correct = ["NA" for i in range(np.shape(tmp)[0])]
        incorrect = ["NA" for i in range(np.shape(tmp)[0])]
        for i in range(np.shape(tmp)[0]):
            if ('correct":false' in tmp.loc[tmp.index[i], 'event_data']):
                correct[i] = 0
                incorrect[i] = 1
            elif ('correct":true' in tmp.loc[tmp.index[i], 'event_data']):
                correct[i] = 1
                incorrect[i] = 0
            else:
                correct[i] = 'NA'
                incorrect[i] = 'NA'
        tmp['num_correct'] = correct
        tmp['num_incorrect'] = incorrect
        tmp = pd.DataFrame(tmp.groupby(('installation_id')).sum())
        
        df = pd.concat([df, tmp])
        
    df = df.reset_index()[['installation_id','num_correct','num_incorrect']]
    
    df['num_attempt'] = df['num_correct'] + df['num_incorrect']
    
    df['threshold'] = (df['num_attempt']*prop).astype(int)
    return(df)