In [111]:
import pandas
import seaborn as sns
import numpy
import matplotlib.pyplot as plt
import sklearn.impute 
import sklearn.preprocessing


from sklearn.impute import SimpleImputer
%matplotlib inline

In [112]:
# Load the train / test / live CSVs in one pass, each indexed by its 'id' column.
data_train, data_test, data_live = (
    pandas.read_csv(csv_path).set_index('id')
    for csv_path in (
        './Data/salary.train.csv',
        './Data/salary.test.csv',
        './Data/high_salary.live.csv',
    )
)

### Step 1: Explore the Training Data

In [113]:
# Quick orientation on the training set: sample rows, schema, and gaps.

# Sample of the first five rows, to see what each column holds
print("----------- HEAD -----------")
print(data_train.head(5))

# Per-column dtype and non-null count (info() prints its own report)
print("\n----------- INFO -----------")
data_train.info()

# Number of missing entries in every column
print("\n----------- MISSING VALUES COUNT -----------")
print(data_train.isna().sum())

----------- HEAD -----------
       social-security-number  house-number  age-group  workclass    fnlwgt  \
id                                                                            
26890             987463818.0        5066.0        3.0  state-gov  327886.0   
933               884327274.0        6423.0        3.0    private   99736.0   
25596             397372044.0        9074.0        2.0    private  398575.0   
12949             580119132.0         413.0        2.0    private  682947.0   
6681              836161922.0        1790.0        1.0    private  236861.0   

          education  education-num      marital-status         occupation  \
id                                                                          
26890     doctorate           16.0            divorced     prof-specialty   
933         masters           14.0            divorced     prof-specialty   
25596  some-college           10.0       never-married       tech-support   
12949     bachelors           13

### Step 2: Drop Irrelevant & Redundant Columns

In [114]:
# Columns removed before modelling.
# 'education' is included because 'education-num' is a better version of it.
cols_to_drop = ['social-security-number', 'house-number', 'native-country-code', 'education']

# Apply the same drop to train, test and live; errors='ignore' skips any
# listed column that a particular frame does not have.
data_train, data_test, data_live = (
    frame.drop(columns=cols_to_drop, errors='ignore')
    for frame in (data_train, data_test, data_live)
)

# Show which columns remain in the training data
print("Columns dropped. New training data columns:")
print(data_train.columns)

Columns dropped. New training data columns:
Index(['age-group', 'workclass', 'fnlwgt', 'education-num', 'marital-status',
       'occupation', 'relationship', 'race', 'sex', 'capitalgain',
       'capitalloss', 'hoursperweek', 'native-country', 'label'],
      dtype='object')


### Step 3: Identify Numeric and Categorical Columns

In [115]:
# Split each labelled dataset into the target (y) and the features (X).
# Removing 'label' up front keeps it out of the feature-name lists below.
y_train = data_train['label']
X_train = data_train.drop(columns='label')

# Same split for the test set; y_test is kept for later evaluation.
y_test = data_test['label']
X_test = data_test.drop(columns='label')

# The live data is taken as-is (no 'label' column is removed here);
# .copy() gives an independent frame.
X_live = data_live.copy()


# Derive the column-name lists from X_train by dtype
numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X_train.select_dtypes(include=['object']).columns

print("----------- NUMERIC FEATURES -----------")
print(numeric_features)
print("\n----------- CATEGORICAL FEATURES -----------")
print(categorical_features)

----------- NUMERIC FEATURES -----------
Index(['age-group', 'fnlwgt', 'education-num', 'capitalgain', 'capitalloss',
       'hoursperweek'],
      dtype='object')

----------- CATEGORICAL FEATURES -----------
Index(['workclass', 'marital-status', 'occupation', 'relationship', 'race',
       'sex', 'native-country'],
      dtype='object')


### Step 4: Impute (Fill) Missing Values

In [116]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Numeric columns: fill gaps with the column median.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
])

# Categorical columns: fill gaps with the most frequent value.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
])

# Route each group of columns to its own imputation pipeline.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Learn the medians/modes from the training features only...
X_train_imputed = preprocessor.fit_transform(X_train)

# ...then reuse those learned values on the test and live features.
X_test_imputed, X_live_imputed = (
    preprocessor.transform(features) for features in (X_test, X_live)
)


# Report the resulting shapes
print("Imputation successful!")
print("Shape of X_train after imputation:", X_train_imputed.shape)
print("Shape of X_test after imputation:", X_test_imputed.shape)

Imputation successful!
Shape of X_train after imputation: (16720, 13)
Shape of X_test after imputation: (4180, 13)


- Checking

In [117]:
# pandas is used to count the nulls
import pandas

# Each imputed array is wrapped in a DataFrame; .isnull().sum() gives the
# per-column counts and the second .sum() totals them across all columns.
for _caption, _matrix in (
    ("Checking X_train_imputed:", X_train_imputed),
    ("Checking X_test_imputed:", X_test_imputed),
    ("Checking X_live_imputed:", X_live_imputed),
):
    print(_caption, pandas.DataFrame(_matrix).isnull().sum().sum())

Checking X_train_imputed: 0
Checking X_test_imputed: 0
Checking X_live_imputed: 0


### Step 5: One-Hot Encoding (Handling Text Data)

In [118]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Numeric pipeline: median imputation only (scaling is added in a later step).
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
])

# Categorical pipeline: mode imputation followed by one-hot encoding.
# handle_unknown='ignore' lets transform() accept categories that were not
# present when the encoder was fitted, instead of raising.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns to its pipeline.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Fit on the training features (medians, modes and category lists)...
X_train_processed = preprocessor.fit_transform(X_train)

# ...and apply the fitted preprocessor, unchanged, to test and live.
X_test_processed, X_live_processed = (
    preprocessor.transform(features) for features in (X_test, X_live)
)


# Report the resulting shapes
print("Processing successful!")
print("Shape of X_train after processing:", X_train_processed.shape)
print("Shape of X_test after processing:", X_test_processed.shape)

Processing successful!
Shape of X_train after processing: (16720, 89)
Shape of X_test after processing: (4180, 89)


- Checking

In [119]:
import numpy
import pandas


def count_missing(matrix):
    """Return the number of NaN/null entries in a feature matrix.

    Works for both SciPy sparse matrices and dense array-likes.

    The one-hot-encoded output of the ColumnTransformer may be a SciPy sparse
    matrix. Wrapping a sparse matrix directly in ``pandas.DataFrame(...)`` does
    not expand it into one numeric column per feature, so a subsequent
    ``.isnull().sum().sum()`` can report 0 without ever looking at the values.
    Here the stored values are inspected directly instead.

    Parameters
    ----------
    matrix : scipy.sparse matrix or array-like
        Output of ``preprocessor.fit_transform`` / ``preprocessor.transform``.

    Returns
    -------
    int
        Total count of missing entries.
    """
    if hasattr(matrix, "tocsr"):
        # Sparse case: implicit entries are zeros, so only the explicitly
        # stored values can be NaN.
        return int(numpy.isnan(matrix.tocsr().data).sum())
    # Dense case: pandas.isnull also copes with object-dtype arrays.
    return int(pandas.isnull(numpy.asarray(matrix)).sum())


# Verify that no missing values survived preprocessing in any of the datasets
print("Checking X_train_processed:", count_missing(X_train_processed))
print("Checking X_test_processed:", count_missing(X_test_processed))
print("Checking X_live_processed:", count_missing(X_live_processed))

Checking X_train_processed: 0
Checking X_test_processed: 0
Checking X_live_processed: 0


### Step 6: Feature Scaling (Standardization)

In [120]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Numeric pipeline: median imputation, then standardization (new in this step).
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical pipeline: unchanged — mode imputation, then one-hot encoding.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns to its pipeline.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Fit on the training features only (medians, modes, categories, means, std devs)...
X_train_final = preprocessor.fit_transform(X_train)

# ...and apply the fitted preprocessor, unchanged, to test and live.
X_test_final, X_live_final = (
    preprocessor.transform(features) for features in (X_test, X_live)
)


# Report the outcome
print("Final processing complete!")
print("Shape of X_train_final:", X_train_final.shape)

# Peek at the first five processed rows
print("\n--- Processed Training Data (first 5 rows) ---")
print(X_train_final[:5])

Final processing complete!
Shape of X_train_final: (16720, 89)

--- Processed Training Data (first 5 rows) ---
  (0, 0)	0.8659762433341348
  (0, 1)	1.3269088262142485
  (0, 2)	2.144509146967056
  (0, 3)	-0.3271685157483273
  (0, 4)	-0.23168204779844281
  (0, 5)	1.094412479852149
  (0, 12)	1.0
  (0, 14)	1.0
  (0, 30)	1.0
  (0, 38)	1.0
  (0, 45)	1.0
  (0, 47)	1.0
  (0, 86)	1.0
  (1, 0)	0.8659762433341348
  (1, 1)	-0.8632036338399138
  (1, 2)	1.3737977229063303
  (1, 3)	3.9373082593926054
  (1, 4)	-0.23168204779844281
  (1, 5)	1.094412479852149
  (1, 9)	1.0
  (1, 14)	1.0
  (1, 30)	1.0
  (1, 39)	1.0
  (1, 45)	1.0
  (1, 47)	1.0
  :	:
  (3, 1)	4.7352959407410555
  (3, 2)	0.9884420108759675
  (3, 3)	-0.3271685157483273
  (3, 4)	-0.23168204779844281
  (3, 5)	1.094412479852149
  (3, 9)	1.0
  (3, 14)	1.0
  (3, 24)	1.0
  (3, 36)	1.0
  (3, 45)	1.0
  (3, 47)	1.0
  (3, 86)	1.0
  (4, 0)	-0.7309271192496012
  (4, 1)	0.45311971082336483
  (4, 2)	-0.5529808372454841
  (4, 3)	-0.3271685157483273
  (4, 4)

In [121]:
# --- Column names for the processed matrices ---

# Numeric columns keep their original names.
clean_numeric_names = list(numeric_features)

# The fitted one-hot encoder supplies the expanded names,
# e.g. 'workclass_private', 'sex_male', ...
onehot_step = preprocessor.named_transformers_['cat']['onehot']
clean_categorical_names = list(onehot_step.get_feature_names_out(categorical_features))

# Numeric names first, then categorical — the order the transformers were listed in.
feature_names = clean_numeric_names + clean_categorical_names

# print(feature_names) # You can uncomment this to see the new list


# --- Training set: densify, label the columns, restore the 'id' index ---
X_train_final_df = pandas.DataFrame(
    X_train_final.toarray(),
    index=X_train.index,
    columns=feature_names,
)

# Attach the target column by joining on the shared index
y_train_df = y_train.to_frame()
data_train_reconstructed = X_train_final_df.join(y_train_df)

print("--- Reconstructed Training Data (Clean Names) ---")
print(data_train_reconstructed.head())


# --- Test set: same treatment ---
X_test_final_df = pandas.DataFrame(
    X_test_final.toarray(),
    index=X_test.index,
    columns=feature_names,
)
y_test_df = y_test.to_frame()
data_test_reconstructed = X_test_final_df.join(y_test_df)

print("\n--- Reconstructed Test Data (Clean Names) ---")
print(data_test_reconstructed.head())


# --- Live set: features only, nothing to join ---
X_live_final_df = pandas.DataFrame(
    X_live_final.toarray(),
    index=X_live.index,
    columns=feature_names,
)
data_live_reconstructed = X_live_final_df

print("\n--- Reconstructed Live Data (Clean Names) ---")
print(data_live_reconstructed.head())

--- Reconstructed Training Data (Clean Names) ---
       age-group    fnlwgt  education-num  capitalgain  capitalloss  \
id                                                                    
26890   0.865976  1.326909       2.144509    -0.327169    -0.231682   
933     0.865976 -0.863204       1.373798     3.937308    -0.231682   
25596   0.067525  2.005484      -0.167625    -0.327169    -0.231682   
12949   0.067525  4.735296       0.988442    -0.327169    -0.231682   
6681   -0.730927  0.453120      -0.552981    -0.327169    -0.231682   

       hoursperweek  workclass_federal-gov  workclass_local-gov  \
id                                                                
26890      1.094412                    0.0                  0.0   
933        1.094412                    0.0                  0.0   
25596     -0.051540                    0.0                  0.0   
12949      1.094412                    0.0                  0.0   
6681      -0.051540                    0.0        

In [122]:
# Display the reconstructed test set (processed features joined with 'label').
data_test_reconstructed

Unnamed: 0_level_0,age-group,fnlwgt,education-num,capitalgain,capitalloss,hoursperweek,workclass_federal-gov,workclass_local-gov,workclass_never-worked,workclass_private,...,native-country_puerto-rico,native-country_scotland,native-country_south,native-country_taiwan,native-country_thailand,native-country_trinadad&tobago,native-country_united-states,native-country_vietnam,native-country_yugoslavia,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12849,-0.730927,3.070302,0.988442,1.805070,-0.231682,-0.051540,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1460,-1.529379,0.515583,-3.635827,-0.327169,-0.231682,1.094412,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13594,0.067525,-0.433264,-0.552981,-0.327169,-0.231682,-0.051540,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
14400,0.865976,-0.371338,-0.552981,-0.327169,-0.231682,-0.051540,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
14333,0.067525,-0.511067,-0.552981,-0.327169,-0.231682,1.094412,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21121,0.865976,-1.356404,-0.552981,-0.327169,-0.231682,-0.051540,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
12348,0.067525,-0.253841,0.988442,3.937308,-0.231682,-0.051540,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
22298,0.865976,0.064026,0.988442,-0.327169,-0.231682,-0.051540,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
15636,-0.730927,0.341478,-0.167625,2.871189,-0.231682,1.094412,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [123]:
# Display the reconstructed live set (processed features only; no label was joined).
data_live_reconstructed

Unnamed: 0_level_0,age-group,fnlwgt,education-num,capitalgain,capitalloss,hoursperweek,workclass_federal-gov,workclass_local-gov,workclass_never-worked,workclass_private,...,native-country_portugal,native-country_puerto-rico,native-country_scotland,native-country_south,native-country_taiwan,native-country_thailand,native-country_trinadad&tobago,native-country_united-states,native-country_vietnam,native-country_yugoslavia
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6111,-1.529379,0.937853,-1.323692,-0.327169,-0.231682,-0.051540,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
11214,-0.730927,-0.212150,-0.552981,-0.327169,-0.231682,-1.197493,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5554,1.664428,-1.557733,-0.552981,-0.327169,-0.231682,1.094412,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
25131,0.067525,-0.342530,1.373798,-0.327169,-0.231682,-0.051540,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
14324,0.865976,-1.619448,-0.552981,-0.327169,-0.231682,-0.051540,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25998,1.664428,-0.688859,0.988442,-0.327169,-0.231682,1.094412,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
8375,1.664428,1.045261,-0.552981,-0.327169,-0.231682,2.240365,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
13888,-0.730927,-1.471770,-1.323692,-0.327169,-0.231682,-1.197493,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4159,-0.730927,-0.155370,1.373798,-0.327169,-0.231682,-0.051540,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [124]:
# Save the reconstructed DataFrames to new CSV files.
# The folder name uses the same casing ('./Data/') as the input paths and the
# message below; a lowercase './data/' is a different directory on
# case-sensitive filesystems and the write can fail if it does not exist.
data_train_reconstructed.to_csv('./Data/salary.train.processed.csv')
data_test_reconstructed.to_csv('./Data/salary.test.processed.csv')
data_live_reconstructed.to_csv('./Data/salary.live.processed.csv')

print("Successfully saved all 3 processed files to the './Data/' folder.")

Successfully saved all 3 processed files to the './Data/' folder.


In [125]:
# Display the reconstructed test set again (same frame as shown before the CSV export).
data_test_reconstructed

Unnamed: 0_level_0,age-group,fnlwgt,education-num,capitalgain,capitalloss,hoursperweek,workclass_federal-gov,workclass_local-gov,workclass_never-worked,workclass_private,...,native-country_puerto-rico,native-country_scotland,native-country_south,native-country_taiwan,native-country_thailand,native-country_trinadad&tobago,native-country_united-states,native-country_vietnam,native-country_yugoslavia,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12849,-0.730927,3.070302,0.988442,1.805070,-0.231682,-0.051540,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1460,-1.529379,0.515583,-3.635827,-0.327169,-0.231682,1.094412,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13594,0.067525,-0.433264,-0.552981,-0.327169,-0.231682,-0.051540,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
14400,0.865976,-0.371338,-0.552981,-0.327169,-0.231682,-0.051540,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
14333,0.067525,-0.511067,-0.552981,-0.327169,-0.231682,1.094412,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21121,0.865976,-1.356404,-0.552981,-0.327169,-0.231682,-0.051540,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
12348,0.067525,-0.253841,0.988442,3.937308,-0.231682,-0.051540,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
22298,0.865976,0.064026,0.988442,-0.327169,-0.231682,-0.051540,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
15636,-0.730927,0.341478,-0.167625,2.871189,-0.231682,1.094412,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [126]:
# Display the reconstructed live set again (same frame as shown before the CSV export).
data_live_reconstructed

Unnamed: 0_level_0,age-group,fnlwgt,education-num,capitalgain,capitalloss,hoursperweek,workclass_federal-gov,workclass_local-gov,workclass_never-worked,workclass_private,...,native-country_portugal,native-country_puerto-rico,native-country_scotland,native-country_south,native-country_taiwan,native-country_thailand,native-country_trinadad&tobago,native-country_united-states,native-country_vietnam,native-country_yugoslavia
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6111,-1.529379,0.937853,-1.323692,-0.327169,-0.231682,-0.051540,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
11214,-0.730927,-0.212150,-0.552981,-0.327169,-0.231682,-1.197493,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5554,1.664428,-1.557733,-0.552981,-0.327169,-0.231682,1.094412,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
25131,0.067525,-0.342530,1.373798,-0.327169,-0.231682,-0.051540,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
14324,0.865976,-1.619448,-0.552981,-0.327169,-0.231682,-0.051540,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25998,1.664428,-0.688859,0.988442,-0.327169,-0.231682,1.094412,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
8375,1.664428,1.045261,-0.552981,-0.327169,-0.231682,2.240365,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
13888,-0.730927,-1.471770,-1.323692,-0.327169,-0.231682,-1.197493,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4159,-0.730927,-0.155370,1.373798,-0.327169,-0.231682,-0.051540,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
