1. Load the data set
2. Display the first rows of the data set

In [54]:
import pandas as pd

def load_data_frame(path):
    """
    Load data set from file path
    :param path: the path of the file to turn into a data set
    :return: a created data set based on the header of the file
    """
    return pd.read_csv(path)

df = load_data_frame('../data/X_train.csv')
print(df.head())

          t       x_1       y_1     v_x_1     v_y_1       x_2       y_2  \
0  0.000000  1.000000  0.000000  0.000000  0.000000 -0.266467  0.859196   
1  0.039062  0.999548  0.000092 -0.023159  0.004731 -0.266261  0.858781   
2  0.078125  0.998190  0.000370 -0.046362  0.009474 -0.265641  0.857535   
3  0.117188  0.995925  0.000833 -0.069654  0.014239 -0.264606  0.855456   
4  0.156250  0.992747  0.001483 -0.093080  0.019040 -0.263154  0.852540   

      v_x_2     v_y_2       x_3       y_3     v_x_3     v_y_3  Id  
0  0.000000  0.000000 -0.733533 -0.859196  0.000000  0.000000   0  
1  0.010574 -0.021257 -0.733287 -0.858874  0.012584  0.016526   1  
2  0.021172 -0.042552 -0.732549 -0.857905  0.025189  0.033078   2  
3  0.031817 -0.063924 -0.731318 -0.856289  0.037837  0.049685   3  
4  0.042533 -0.085412 -0.729592 -0.854022  0.050548  0.066372   4  


Then we sanitize the data set by removing the columns that are not useful for the model. We also remove the rows with missing values.

In [55]:
import numpy as np

def sanitize_data_set(data_frame):
    """
    Sanitize the data set by removing the rows that are not useful for the model (Nan values, etc.)
    :param data_frame: the data set to sanitize
    :return: the sanitized data set
    """
    ## data_set = data_set.drop(['Id'], axis=1) # we might wanna merge train and test data
    data_frame = data_frame.replace(0, np.nan)
    data_frame = data_frame.dropna(axis=0, how='all')
    data_frame = data_frame.replace(np.nan, 0)
    return data_frame

df = sanitize_data_set(df)
print(df.head())

          t       x_1       y_1     v_x_1     v_y_1       x_2       y_2  \
0  0.000000  1.000000  0.000000  0.000000  0.000000 -0.266467  0.859196   
1  0.039062  0.999548  0.000092 -0.023159  0.004731 -0.266261  0.858781   
2  0.078125  0.998190  0.000370 -0.046362  0.009474 -0.265641  0.857535   
3  0.117188  0.995925  0.000833 -0.069654  0.014239 -0.264606  0.855456   
4  0.156250  0.992747  0.001483 -0.093080  0.019040 -0.263154  0.852540   

      v_x_2     v_y_2       x_3       y_3     v_x_3     v_y_3   Id  
0  0.000000  0.000000 -0.733533 -0.859196  0.000000  0.000000  0.0  
1  0.010574 -0.021257 -0.733287 -0.858874  0.012584  0.016526  1.0  
2  0.021172 -0.042552 -0.732549 -0.857905  0.025189  0.033078  2.0  
3  0.031817 -0.063924 -0.731318 -0.856289  0.037837  0.049685  3.0  
4  0.042533 -0.085412 -0.729592 -0.854022  0.050548  0.066372  4.0  


Now we do the same for the target data set.

In [56]:
target_df = load_data_frame('../data/X_test.csv')
target_df = sanitize_data_set(target_df)
print(target_df.head())

    Id         t  x0_1  y0_1      x0_2      y0_2      x0_3      y0_3
0  0.0  0.000000   1.0   0.0 -0.179617  0.730085 -0.820383 -0.730085
1  1.0  0.039062   1.0   0.0 -0.179617  0.730085 -0.820383 -0.730085
2  2.0  0.078125   1.0   0.0 -0.179617  0.730085 -0.820383 -0.730085
3  3.0  0.117188   1.0   0.0 -0.179617  0.730085 -0.820383 -0.730085
4  4.0  0.156250   1.0   0.0 -0.179617  0.730085 -0.820383 -0.730085


Now we merge the data frames by their 'Id' column and drop all columns with only zeros, or if the 'Id's are not in both data frames.

In [57]:
def merge_data_frames(data_frame, target_data_frame):
    """
    Merge two data frames by their 'Id' column
    :param data_frame: the first data frame
    :param target_data_frame: the second data frame
    :return: the merged data frame
    """
    merged_data_frame = pd.merge(data_frame, target_data_frame, how='inner', on='Id')
    return merged_data_frame

merged_df = merge_data_frames(df, target_df)
merged_df = sanitize_data_set(merged_df)
print(merged_df.head())

        t_x       x_1       y_1     v_x_1     v_y_1       x_2       y_2  \
0  0.000000  1.000000  0.000000  0.000000  0.000000 -0.266467  0.859196   
1  0.039062  0.999548  0.000092 -0.023159  0.004731 -0.266261  0.858781   
2  0.078125  0.998190  0.000370 -0.046362  0.009474 -0.265641  0.857535   
3  0.117188  0.995925  0.000833 -0.069654  0.014239 -0.264606  0.855456   
4  0.156250  0.992747  0.001483 -0.093080  0.019040 -0.263154  0.852540   

      v_x_2     v_y_2       x_3  ...     v_x_3     v_y_3   Id       t_y  x0_1  \
0  0.000000  0.000000 -0.733533  ...  0.000000  0.000000  0.0  0.000000   1.0   
1  0.010574 -0.021257 -0.733287  ...  0.012584  0.016526  1.0  0.039062   1.0   
2  0.021172 -0.042552 -0.732549  ...  0.025189  0.033078  2.0  0.078125   1.0   
3  0.031817 -0.063924 -0.731318  ...  0.037837  0.049685  3.0  0.117188   1.0   
4  0.042533 -0.085412 -0.729592  ...  0.050548  0.066372  4.0  0.156250   1.0   

   y0_1      x0_2      y0_2      x0_3      y0_3  
0   0.0 -0.1

Now we split the data set into the features and the target, by adding also the 'Id' column to both data frames, as it was before.

In [58]:
def split_data_frame(data_frame, target_columns):
    """
    Split the data frame into the features and the target
    :param target_columns: the columns of the target data frame
    :param data_frame: the data frame to split
    :return: the features and the target
    """
    features = data_frame.drop(columns=target_columns[1:], axis=1)
    features['Id'] = features.index
    target = data_frame[target_df.columns]
    target['Id'] = target.index
    return features, target
print(merged_df.columns)
target_column_names = target_df.columns[1:]
df_features, df_target = split_data_frame(merged_df, target_column_names)

Index(['t_x', 'x_1', 'y_1', 'v_x_1', 'v_y_1', 'x_2', 'y_2', 'v_x_2', 'v_y_2',
       'x_3', 'y_3', 'v_x_3', 'v_y_3', 'Id', 't_y', 'x0_1', 'y0_1', 'x0_2',
       'y0_2', 'x0_3', 'y0_3'],
      dtype='object')


KeyError: "['t'] not in index"

TODO: Somehow i lost the column t that represented the time. to be fixed.