In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# Input CSV locations, relative to the notebook's working directory.
file_path_train, file_path_labels = 'train.csv', 'train_labels.csv'

In [3]:
# NOTE: despite the `test_` prefix, both frames are read from the training files
# (`file_path_train` / `file_path_labels`); the names are kept because later
# cells refer to them.
test_data, test_labels = (
    pd.read_csv(path) for path in (file_path_train, file_path_labels)
)
# Join labels and readings on `sequence`; the label columns come first.
full_data = test_labels.merge(test_data, on='sequence')

In [4]:
# Select the sensor columns by name rather than by position, so the selection
# does not depend on how many key/label columns precede them after the merge.
# The result is still a pandas Index, like the original positional slice.
sensor_columns = full_data.columns[full_data.columns.str.startswith('sensor_')]

In [5]:
# Preview the first rows of the merged frame (labels plus per-step sensor readings).
full_data.head()

Unnamed: 0,sequence,state,subject,step,sensor_00,sensor_01,sensor_02,sensor_03,sensor_04,sensor_05,sensor_06,sensor_07,sensor_08,sensor_09,sensor_10,sensor_11,sensor_12
0,0,0,47,0,-0.196291,0.112395,1.0,0.329204,-1.00466,-0.131638,-0.127505,0.368702,-0.1,-0.963873,-0.985069,0.531893,4.751492
1,0,0,47,1,-0.44745,0.134454,1.0,-0.658407,0.162495,0.340314,-0.209472,-0.867176,0.2,-0.301301,0.082733,-0.231481,0.45439
2,0,0,47,2,0.326893,-0.694328,1.0,0.330088,0.473678,1.280479,-0.094718,0.535878,1.4,1.002168,0.449221,-0.58642,-4.736147
3,0,0,47,3,0.523184,0.75105,1.0,0.976991,-0.563287,-0.720269,0.79326,0.951145,-0.3,-0.995665,-0.43429,1.34465,0.429241
4,0,0,47,4,0.272025,1.07458,1.0,-0.136283,0.398579,0.044877,0.560109,-0.541985,-0.9,1.055636,0.812631,0.123457,-0.223359


In [6]:
# Previous-step value of each sensor within its own sequence; the first step of
# a sequence has no predecessor and is therefore NaN.
# NOTE(review): the `_chg` suffix suggests a change/delta, but `shift(1)` yields
# the lagged value itself - confirm whether `.diff()` was intended.
chg_data = (
    full_data
    .groupby('sequence')[sensor_columns]
    .shift(1)
    .add_suffix('_chg')
)

In [7]:
# Put the shifted sensor columns next to the original ones. `chg_data` keeps the
# row index of `full_data`, so the index-aligned join pairs rows one-to-one.
combined_data = full_data.join(chg_data)

In [8]:
# Inspect the result: the first step of a sequence shows NaN in the `_chg` columns.
combined_data.head()

Unnamed: 0,sequence,state,subject,step,sensor_00,sensor_01,sensor_02,sensor_03,sensor_04,sensor_05,...,sensor_03_chg,sensor_04_chg,sensor_05_chg,sensor_06_chg,sensor_07_chg,sensor_08_chg,sensor_09_chg,sensor_10_chg,sensor_11_chg,sensor_12_chg
0,0,0,47,0,-0.196291,0.112395,1.0,0.329204,-1.00466,-0.131638,...,,,,,,,,,,
1,0,0,47,1,-0.44745,0.134454,1.0,-0.658407,0.162495,0.340314,...,0.329204,-1.00466,-0.131638,-0.127505,0.368702,-0.1,-0.963873,-0.985069,0.531893,4.751492
2,0,0,47,2,0.326893,-0.694328,1.0,0.330088,0.473678,1.280479,...,-0.658407,0.162495,0.340314,-0.209472,-0.867176,0.2,-0.301301,0.082733,-0.231481,0.45439
3,0,0,47,3,0.523184,0.75105,1.0,0.976991,-0.563287,-0.720269,...,0.330088,0.473678,1.280479,-0.094718,0.535878,1.4,1.002168,0.449221,-0.58642,-4.736147
4,0,0,47,4,0.272025,1.07458,1.0,-0.136283,0.398579,0.044877,...,0.976991,-0.563287,-0.720269,0.79326,0.951145,-0.3,-0.995665,-0.43429,1.34465,0.429241


# The labels are per sequence, so we group by sequence and compute aggregate features from there

In [9]:
# Summary statistics computed for every sensor within a sequence.
agg_names = ['mean', 'median', 'var', 'min', 'max']

# Flat feature names in sensor-major order:
# sensor_00_mean, sensor_00_median, ..., sensor_00_max, sensor_01_mean, ...
ml_columns = [
    f'{sensor}_{agg}'
    for sensor in sensor_columns
    for agg in agg_names
]

In [10]:
# Aggregate every sensor per sequence in a single groupby pass (previously the
# full frame was re-grouped once per sensor and the pieces concatenated).
# The result has one row per sequence and (sensor, statistic) MultiIndex
# columns in sensor-major order.
sequence_df = full_data.groupby('sequence')[sensor_columns].agg(agg_names)
# Flatten to `<sensor>_<statistic>` names derived from the actual column pairs,
# so the labels cannot drift out of step with the data they describe.
sequence_df.columns = [f'{sensor}_{agg}' for sensor, agg in sequence_df.columns]
sequence_df.head()

Unnamed: 0_level_0,sensor_00_mean,sensor_00_median,sensor_00_var,sensor_00_min,sensor_00_max,sensor_01_mean,sensor_01_median,sensor_01_var,sensor_01_min,sensor_01_max,...,sensor_11_mean,sensor_11_median,sensor_11_var,sensor_11_min,sensor_11_max,sensor_12_mean,sensor_12_median,sensor_12_var,sensor_12_min,sensor_12_max
sequence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.041744,-0.148764,2.049649,-5.634467,7.534003,-0.013025,0.191702,2.277749,-3.002101,3.62395,...,0.010391,0.193416,1.58185,-2.972222,2.989712,-1.286985,-0.447997,67.352879,-50.658994,6.674766
1,-0.069294,-0.031298,2.540538,-6.657651,5.870943,-0.019958,0.001576,0.901366,-2.745798,2.52521,...,0.199914,0.053498,3.150006,-3.573045,8.044239,-1.604085,-0.009591,2529.213697,-160.234442,144.598039
2,-0.001546,0.058733,1.316816,-2.797527,3.402628,0.025,0.120273,0.646229,-1.733193,2.321429,...,-0.002932,0.083848,0.37349,-1.705761,1.475309,0.000945,-0.004902,0.13036,-1.044331,1.116795
3,0.029869,-0.119784,3.131857,-3.64374,4.85626,-0.049177,0.051471,3.666719,-5.860294,6.722689,...,-0.02668,-0.175926,1.958582,-3.837449,3.593621,-0.003708,-0.012361,0.058423,-0.83717,0.59335
4,0.059132,0.00541,4.059335,-5.33153,5.455951,0.014916,0.060399,1.918921,-5.898109,3.532563,...,-0.099657,-0.115741,8.37886,-7.4393,6.989712,2.8e-05,0.002344,0.002663,-0.098465,0.135976


In [11]:
# Attach the label of each sequence to its aggregated features. `sequence` is a
# column of `test_labels` and the index name of `sequence_df`.
full_df = test_labels.merge(sequence_df, on='sequence')
full_df.head()

Unnamed: 0,sequence,state,sensor_00_mean,sensor_00_median,sensor_00_var,sensor_00_min,sensor_00_max,sensor_01_mean,sensor_01_median,sensor_01_var,...,sensor_11_mean,sensor_11_median,sensor_11_var,sensor_11_min,sensor_11_max,sensor_12_mean,sensor_12_median,sensor_12_var,sensor_12_min,sensor_12_max
0,0,0,0.041744,-0.148764,2.049649,-5.634467,7.534003,-0.013025,0.191702,2.277749,...,0.010391,0.193416,1.58185,-2.972222,2.989712,-1.286985,-0.447997,67.352879,-50.658994,6.674766
1,1,1,-0.069294,-0.031298,2.540538,-6.657651,5.870943,-0.019958,0.001576,0.901366,...,0.199914,0.053498,3.150006,-3.573045,8.044239,-1.604085,-0.009591,2529.213697,-160.234442,144.598039
2,2,1,-0.001546,0.058733,1.316816,-2.797527,3.402628,0.025,0.120273,0.646229,...,-0.002932,0.083848,0.37349,-1.705761,1.475309,0.000945,-0.004902,0.13036,-1.044331,1.116795
3,3,1,0.029869,-0.119784,3.131857,-3.64374,4.85626,-0.049177,0.051471,3.666719,...,-0.02668,-0.175926,1.958582,-3.837449,3.593621,-0.003708,-0.012361,0.058423,-0.83717,0.59335
4,4,1,0.059132,0.00541,4.059335,-5.33153,5.455951,0.014916,0.060399,1.918921,...,-0.099657,-0.115741,8.37886,-7.4393,6.989712,2.8e-05,0.002344,0.002663,-0.098465,0.135976


In [12]:
# Feature matrix: the aggregated sensor statistics, without identifier and target.
# `drop` already returns a new frame, so `full_df` itself is left untouched.
X = full_df.drop(columns=['sequence', 'state'])
# Target: the `state` label of each sequence.
y = full_df['state']

# Modelling: fitting a random forest on the per-sequence features

In [13]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA, SparsePCA, TruncatedSVD
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OneHotEncoder, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

In [14]:
# Preprocessing applied to the features before the model: fill missing values
# with the column median, then scale with RobustScaler.
numeric_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", RobustScaler()),
    ]
)

In [None]:
# Fixed seed so the scores below are reproducible after Restart & Run All.
RANDOM_SEED = 42

# Preprocessing followed by a random forest; the forest is seeded so repeated
# fits on the same split give the same model.
rf_pipe = Pipeline(
    steps=[
        ("preprocessor", numeric_transformer),
        ("regressor", RandomForestRegressor(random_state=RANDOM_SEED)),
    ]
)

# Hold-out score of each repetition. For a regressor, `Pipeline.score` reports R^2.
# NOTE(review): `state` looks binary in the preview of `full_df` - a classifier
# may be a better fit than a regressor; confirm.
average = []

for i in range(1, 6):
    # A different, but fixed, 80/20 split for every repetition.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=RANDOM_SEED + i
    )

    rf_pipe.fit(X_train, y_train)
    score = rf_pipe.score(X_test, y_test)

    average.append(score)

    print(f"Run {i} of Random Forest model score: {score:.3f}")

# Mean score over all repetitions.
print(np.mean(average))