1. Load the data set
2. Display the first rows of the data set

In [31]:
import pandas as pd

def load_data_frame(path):
    """
    Load data set from file path
    :param path: the path of the file to turn into a data set
    :return: a created data set based on the header of the file
    """
    return pd.read_csv(path)

df_train = load_data_frame('../data/X_train.csv')
print(df_train.head())

          t       x_1       y_1     v_x_1     v_y_1       x_2       y_2  \
0  0.000000  1.000000  0.000000  0.000000  0.000000 -0.266467  0.859196   
1  0.039062  0.999548  0.000092 -0.023159  0.004731 -0.266261  0.858781   
2  0.078125  0.998190  0.000370 -0.046362  0.009474 -0.265641  0.857535   
3  0.117188  0.995925  0.000833 -0.069654  0.014239 -0.264606  0.855456   
4  0.156250  0.992747  0.001483 -0.093080  0.019040 -0.263154  0.852540   

      v_x_2     v_y_2       x_3       y_3     v_x_3     v_y_3  Id  
0  0.000000  0.000000 -0.733533 -0.859196  0.000000  0.000000   0  
1  0.010574 -0.021257 -0.733287 -0.858874  0.012584  0.016526   1  
2  0.021172 -0.042552 -0.732549 -0.857905  0.025189  0.033078   2  
3  0.031817 -0.063924 -0.731318 -0.856289  0.037837  0.049685   3  
4  0.042533 -0.085412 -0.729592 -0.854022  0.050548  0.066372   4  


Then we sanitize the data set by removing the columns that are not useful for the model. We also remove the rows with missing values.

In [32]:
import numpy as np

def sanitize_data_set(data_frame):
    """
    Sanitize the data set by removing the rows that are not useful for the model (Nan values, etc.)
    :param data_frame: the data set to sanitize
    :return: the sanitized data set
    """
    ## data_set = data_set.drop(['Id'], axis=1) # we might wanna merge train and test data
    data_frame = data_frame.replace(0, np.nan)
    data_frame = data_frame.dropna(axis=0, how='all')
    data_frame = data_frame.replace(np.nan, 0)
    return data_frame

df_train = sanitize_data_set(df_train)
print(df_train.head())

          t       x_1       y_1     v_x_1     v_y_1       x_2       y_2  \
0  0.000000  1.000000  0.000000  0.000000  0.000000 -0.266467  0.859196   
1  0.039062  0.999548  0.000092 -0.023159  0.004731 -0.266261  0.858781   
2  0.078125  0.998190  0.000370 -0.046362  0.009474 -0.265641  0.857535   
3  0.117188  0.995925  0.000833 -0.069654  0.014239 -0.264606  0.855456   
4  0.156250  0.992747  0.001483 -0.093080  0.019040 -0.263154  0.852540   

      v_x_2     v_y_2       x_3       y_3     v_x_3     v_y_3   Id  
0  0.000000  0.000000 -0.733533 -0.859196  0.000000  0.000000  0.0  
1  0.010574 -0.021257 -0.733287 -0.858874  0.012584  0.016526  1.0  
2  0.021172 -0.042552 -0.732549 -0.857905  0.025189  0.033078  2.0  
3  0.031817 -0.063924 -0.731318 -0.856289  0.037837  0.049685  3.0  
4  0.042533 -0.085412 -0.729592 -0.854022  0.050548  0.066372  4.0  


Now we do the same for the target data set.

In [33]:
df_test = load_data_frame('../data/X_test.csv')
df_test = sanitize_data_set(df_test)
print(df_test.head())

    Id         t  x0_1  y0_1      x0_2      y0_2      x0_3      y0_3
0  0.0  0.000000   1.0   0.0 -0.179617  0.730085 -0.820383 -0.730085
1  1.0  0.039062   1.0   0.0 -0.179617  0.730085 -0.820383 -0.730085
2  2.0  0.078125   1.0   0.0 -0.179617  0.730085 -0.820383 -0.730085
3  3.0  0.117188   1.0   0.0 -0.179617  0.730085 -0.820383 -0.730085
4  4.0  0.156250   1.0   0.0 -0.179617  0.730085 -0.820383 -0.730085


Now we merge the data frames by their 'Id' column and drop all columns with only zeros, or if the 'Id's are not in both data frames.

In [34]:
def merge_data_frames(data_frame, target_data_frame):
    """
    Merge two data frames by their 'Id' column
    :param data_frame: the first data frame
    :param target_data_frame: the second data frame
    :return: the merged data frame
    """
    merged_data_frame = pd.merge(data_frame, target_data_frame, how='inner', on=['Id','t'])
    return merged_data_frame

merged_df = merge_data_frames(df_train, df_test)
merged_df = sanitize_data_set(merged_df)
print(merged_df.head())

          t       x_1       y_1     v_x_1     v_y_1       x_2       y_2  \
0  0.000000  1.000000  0.000000  0.000000  0.000000 -0.266467  0.859196   
1  0.039062  0.999548  0.000092 -0.023159  0.004731 -0.266261  0.858781   
2  0.078125  0.998190  0.000370 -0.046362  0.009474 -0.265641  0.857535   
3  0.117188  0.995925  0.000833 -0.069654  0.014239 -0.264606  0.855456   
4  0.156250  0.992747  0.001483 -0.093080  0.019040 -0.263154  0.852540   

      v_x_2     v_y_2       x_3       y_3     v_x_3     v_y_3   Id  x0_1  \
0  0.000000  0.000000 -0.733533 -0.859196  0.000000  0.000000  0.0   1.0   
1  0.010574 -0.021257 -0.733287 -0.858874  0.012584  0.016526  1.0   1.0   
2  0.021172 -0.042552 -0.732549 -0.857905  0.025189  0.033078  2.0   1.0   
3  0.031817 -0.063924 -0.731318 -0.856289  0.037837  0.049685  3.0   1.0   
4  0.042533 -0.085412 -0.729592 -0.854022  0.050548  0.066372  4.0   1.0   

   y0_1      x0_2      y0_2      x0_3      y0_3  
0   0.0 -0.179617  0.730085 -0.820383 -0.7

Now we split the data set into the features and the target, by adding also the 'Id' column to both data frames, as it was before.

In [35]:
def split_data_frame(data_frame, target_columns):
    features = data_frame.drop(columns=target_columns[1:], axis=1)
    features['Id'] = features.index  # Add the Id column

    # Create a full copy of the target data to avoid the warning
    target = data_frame[target_columns].copy()
    target['Id'] = target.index  # Add the Id column to the target data

    return features, target

# Assuming `merged_df` is the data frame you're splitting
df_train, df_test = split_data_frame(merged_df, ['t','x0_1', 'y0_1', 'x0_2', 'y0_2', 'x0_3', 'y0_3'])

print(df_train.columns)
print(df_test.columns)

Index(['t', 'x_1', 'y_1', 'v_x_1', 'v_y_1', 'x_2', 'y_2', 'v_x_2', 'v_y_2',
       'x_3', 'y_3', 'v_x_3', 'v_y_3', 'Id'],
      dtype='object')
Index(['t', 'x0_1', 'y0_1', 'x0_2', 'y0_2', 'x0_3', 'y0_3', 'Id'], dtype='object')


Learn a baseline model that can predict the position of each of the 3 bodies at a given time t, given a set of initial conditions. Your baseline will be a Linear Regression model.
For the baseline model, make a pipeline and add a StandardScaler instance before the regressor. See the pipeline tutorial on the Tutorials document for the course.

In [44]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

# Features: Include initial positions, velocities, and time 't'
train_columns = ['t', 
                       'x_1', 'y_1', 'v_x_1', 'v_y_1', 
                       'x_2', 'y_2', 'v_x_2', 'v_y_2', 
                       'x_3', 'y_3', 'v_x_3', 'v_y_3'
                       ]

# Targets: Predict future positions (x0_1, y0_1, x0_2, y0_2, x0_3, y0_3)
test_columns = ['Id', 't', 'x_1', 'y_1', 'x_2', 'y_2', 'x_3', 'y_3']

def linear_predict_positions():
    
    # Split your dataset into features (X) and targets (y)
    # x = df_train[train_columns]  # Using positions, velocities, and time 't' as features
    # y = df_test[test_columns]     # Predicting future positions for a given time
    
    # Split the data into training and test sets
    x_train, x_validate, y_train, y_validate = train_test_split(df_train, df_test, test_size=0.2)
    
    # Create a pipeline with scaling and linear regression
    pipeline = Pipeline([
        ('scaler', StandardScaler()),  # Standardize the features
        ('regressor', LinearRegression())  # Apply Linear Regression
    ])
    
    # Train the model on the training data
    pipeline.fit(x_train, y_train)
    
    # Predict positions (x0_1, y0_1, x0_2, y0_2, x0_3, y0_3) on the test data
    y_pred = pipeline.predict(x_validate)
    
    #validate model
    score = pipeline.score(x_validate, y_validate)
    print("Model score: ", score)
    
    # Create a DataFrame with the predicted values
    predictions_df = pd.DataFrame(y_pred, columns=test_columns)
    
    # Add an 'Id' column if required by the competition (use the index or some unique identifier)
    predictions_df['Id'] = np.arange(1, len(predictions_df) + 1)  # Assuming Ids start from 1
    predictions_df = predictions_df.drop(columns=['t'])

    # Save the predictions to a CSV file in the required format
    predictions_df.to_csv('baseline-model-positions-with-time.csv', index=False)
    print("Predictions for the positions saved successfully with time!")
    return predictions_df

# Call the function
predictions_df = linear_predict_positions()


Model score:  0.5009397124826979
Predictions for the positions saved successfully with time!


In [34]:
import matplotlib.pyplot as plt

import matplotlib.pyplot as plt

def plot_y_yhat(y_test, y_pred, plot_title="plot"):
    # Ensure y_test and y_pred are NumPy arrays
    y_test = np.array(y_test)
    y_pred = np.array(y_pred)

    # Column labels for positions
    labels = ['x_1', 'y_1', 'x_2', 'y_2', 'x_3', 'y_3']
    
    # Maximum number of points to plot
    MAX = min(500, len(y_test))  # Ensure MAX is not larger than y_test length
    
    # Select random indices, ensuring they are within bounds
    idx = np.random.choice(len(y_test), MAX, replace=False)

    # Create a 10x10 figure
    plt.figure(figsize=(10, 10))
    
    # Plot each of the 6 position variables (x_1, y_1, x_2, y_2, x_3, y_3)
    for i in range(6):
        x0 = np.min([np.min(y_test[idx, i]), np.min(y_pred[idx, i])])  # Min value for the diagonal
        x1 = np.max([np.max(y_test[idx, i]), np.max(y_pred[idx, i])])  # Max value for the diagonal
        
        plt.subplot(3, 2, i+1)
        plt.scatter(y_test[idx, i], y_pred[idx, i], alpha=0.5)  # Scatter plot
        plt.xlabel('True ' + labels[i])
        plt.ylabel('Predicted ' + labels[i])
        plt.plot([x0, x1], [x0, x1], color='red')  # Plot diagonal for reference
        plt.axis('square')
    
    # Save the figure
    plt.savefig(plot_title + '.pdf')
    plt.show()

# Assuming df_target is the true values, and predictions_df is the predicted values
plot_y_yhat(df_test, predictions_df)

IndexError: index 876334 is out of bounds for axis 0 with size 176968

<Figure size 1000x1000 with 0 Axes>