In [2]:
import pandas as pd

# Load the training dataset
training_data_path = 'dataGaia_AB_train.csv'
training_data = pd.read_csv(training_data_path)

# Display the first few rows and basic information about the dataset
training_data.head(), training_data.describe(), training_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148538 entries, 0 to 148537
Data columns (total 29 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   ID          148538 non-null  int64  
 1   Unnamed: 0  148538 non-null  int64  
 2   RA_ICRS     148538 non-null  float64
 3   DE_ICRS     148538 non-null  float64
 4   Source      148538 non-null  float64
 5   Plx         148538 non-null  float64
 6   PM          148538 non-null  float64
 7   pmRA        148538 non-null  float64
 8   pmDE        148538 non-null  float64
 9   Gmag        148538 non-null  float64
 10  e_Gmag      148538 non-null  float64
 11  BPmag       148538 non-null  float64
 12  e_BPmag     148538 non-null  float64
 13  RPmag       148538 non-null  float64
 14  e_RPmag     148538 non-null  float64
 15  GRVSmag     84484 non-null   float64
 16  e_GRVSmag   84484 non-null   float64
 17  BP-RP       148538 non-null  float64
 18  BP-G        148538 non-null  float64
 19  G-

(   ID  Unnamed: 0    RA_ICRS   DE_ICRS        Source      Plx      PM    pmRA  \
 0   1           1  44.375187  2.895901  1.400000e+15   1.8937  16.954   7.031   
 1   2           2  42.959304  2.920459  1.550000e+15   2.9732   2.528   1.795   
 2   3           3  44.031079  3.203683  1.610000e+15   2.2337  10.776  -3.151   
 3   4           4  45.463599  3.138095  1.750000e+15   4.0179  10.053   1.828   
 4   5           5  43.784164  3.486814  2.060000e+15  11.6458  32.355  29.688   
 
      pmDE       Gmag  ...      G-RP   pscol    Teff      Dist     Rad  \
 0 -15.427  10.369156  ...  0.163792     NaN  9348.7  679.7121  2.4642   
 1   1.780  10.038978  ...  0.304139     NaN  7520.7  339.8108  1.9321   
 2 -10.305   9.810181  ...  0.338961  1.5956  7052.3  433.9715  2.9735   
 3   9.885   8.163041  ...  0.091842     NaN  9982.8  251.2483  2.2829   
 4 -12.863   6.855547  ...  0.257051     NaN  7192.3   85.7073  2.0194   
 
    Lum-Flame  Mass-Flame  Age-Flame   z-Flame  SpType-ELS  

The training dataset consists of 148,538 entries and 29 columns

# **Data preprocessing**

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Drop unnecessary columns
training_data_cleaned = training_data.drop(columns=['ID', 'Unnamed: 0'])

# Handling missing values: Imputing with mean for numerical columns
for column in training_data_cleaned.columns:
    if training_data_cleaned[column].isnull().any() and training_data_cleaned[column].dtype != 'object':
        training_data_cleaned[column].fillna(training_data_cleaned[column].mean(), inplace=True)

# Encoding categorical target variable
label_encoder = LabelEncoder()
training_data_cleaned['SpType-ELS'] = label_encoder.fit_transform(training_data_cleaned['SpType-ELS'])

# Splitting data into features and target
X = training_data_cleaned.drop('SpType-ELS', axis=1)
y = training_data_cleaned['SpType-ELS']

# Splitting the dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

X_train_scaled.shape, X_val_scaled.shape, y_train.shape, y_val.shape


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  training_data_cleaned[column].fillna(training_data_cleaned[column].mean(), inplace=True)


((118830, 26), (29708, 26), (118830,), (29708,))

Training data: 118,830 samples
Validation data: 29,708 samples
Each sample has 26 features after preprocessing.