In [1]:
import warnings

import matplotlib as mpl
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer

warnings.simplefilter(action='ignore', category=FutureWarning)
mpl.rcParams['figure.facecolor'] = 'white'

In [2]:
# Import data
galaxies_df = pd.read_csv('../data/asu(4).tsv',delimiter=';',skiprows=104)
galaxies_df = galaxies_df.drop([0,1])  # drop useless rows

# List of important predictors
categorical_vars = ['TT','Mcl']
ordinal_vars = ['Tdw']
quantitative_vars = ['FUV','Bmag','Hamag','Kmag','HImag','W50','HRV','Dist','A26','i','vAmp','<Bmu>','M26','MHI','Vlg','Ti5']
all_features = categorical_vars + ordinal_vars + quantitative_vars

# Only get imporant values of dataframe
galaxies_df = galaxies_df[all_features]
galaxies_df.head()

Unnamed: 0,TT,Mcl,Tdw,FUV,Bmag,Hamag,Kmag,HImag,W50,HRV,Dist,A26,i,vAmp,<Bmu>,M26,MHI,Vlg,Ti5
2,10,Ir,L,17.57,16.8,19.91,14.02,15.66,34.0,335,8.47,2.78,33,21.0,25.2,8.17,7.92,619,-1.2
3,9,Im,N,12.8,11.03,15.56,9.0,11.19,53.0,-122,0.97,3.21,90,22.0,24.8,8.27,7.83,-16,0.2
4,-3,Sph,L,23.0,17.0,,12.49,,,-332,1.36,0.63,10,,26.2,,,-44,0.5
5,9,Im,N,16.08,15.15,17.21,12.74,15.28,53.0,726,7.7,2.65,78,23.0,24.1,8.2,7.99,769,-1.2
6,10,Ir,L,20.28,18.9,,16.29,18.21,16.0,258,5.4,0.53,70,3.0,24.9,5.64,6.51,486,-1.3


In [3]:
# Strip the columns of whitespace
for col in quantitative_vars:
    galaxies_df[col] = galaxies_df[col].apply(lambda x: float(x.strip()) if (isinstance(x, str) and x.strip() != '') else np.nan)

# Drop anything without response variables
half_col_count = len(quantitative_vars) * 0.5
dropped_gal_df = galaxies_df.dropna(subset=quantitative_vars, thresh=half_col_count, inplace=False)
print(f'Dropped {len(galaxies_df) - len(dropped_gal_df)} rows from initial {len(galaxies_df)}')

Dropped 5 rows from initial 869


In [4]:
# Impute remaining mising values with KNN
imputer = KNNImputer(n_neighbors=5)
dropped_gal_df[quantitative_vars] = imputer.fit_transform(dropped_gal_df[quantitative_vars])
dropped_gal_df.reset_index(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dropped_gal_df[quantitative_vars] = imputer.fit_transform(dropped_gal_df[quantitative_vars])


In [5]:
dropped_gal_df.head(100)

Unnamed: 0,index,TT,Mcl,Tdw,FUV,Bmag,Hamag,Kmag,HImag,W50,HRV,Dist,A26,i,vAmp,<Bmu>,M26,MHI,Vlg,Ti5
0,2,10,Ir,L,17.570,16.80,19.910,14.02,15.66,34.0,335.0,8.47,2.78,33.0,21.0,25.2,8.170,7.920,619.0,-1.2
1,3,9,Im,N,12.800,11.03,15.560,9.00,11.19,53.0,-122.0,0.97,3.21,90.0,22.0,24.8,8.270,7.830,-16.0,0.2
2,4,-3,Sph,L,23.000,17.00,24.098,12.49,18.32,131.8,-332.0,1.36,0.63,10.0,66.4,26.2,8.436,6.688,-44.0,0.5
3,5,9,Im,N,16.080,15.15,17.210,12.74,15.28,53.0,726.0,7.70,2.65,78.0,23.0,24.1,8.200,7.990,769.0,-1.2
4,6,10,Ir,L,20.280,18.90,24.706,16.29,18.21,16.0,258.0,5.40,0.53,70.0,3.0,24.9,5.640,6.510,486.0,-1.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,98,10,Ir,N,18.300,15.65,19.640,13.11,15.54,52.0,570.0,9.30,4.69,90.0,22.0,25.3,8.410,8.050,765.0,0.4
96,99,8,,,15.900,13.96,17.500,10.98,14.65,36.0,604.0,9.30,7.57,23.0,33.0,24.4,8.970,8.410,785.0,1.2
97,100,8,,,22.616,13.68,18.420,10.86,13.97,48.0,581.0,9.80,7.80,36.0,33.0,24.3,9.000,8.730,780.0,1.0
98,101,10,Ir,L,21.002,17.80,19.790,10.46,15.05,50.0,190.0,3.00,2.25,87.0,21.0,24.3,8.040,7.270,421.0,4.6


In [6]:
dropped_gal_df.to_csv('/Users/adamboesky/Desktop/General/College/CS109/Classifying_Galaxies/data/preprocessed_data.csv')