In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
# Location of the raw Covertype CSV. Set the COVTYPE_CSV environment variable to
# point at a different copy; the fallback is the path this notebook was written with.
DATA_PATH = os.environ.get(
    'COVTYPE_CSV',
    r'C:\Users\smbm2\projects\CAT-Transformer\datasets\covertype\covtype.csv',
)
TARGET_COL = 'Cover_Type'
SEED = 42
# Split sizes based on the FT-Transformer paper: train / validation / test.
N_TRAIN, N_VAL, N_TEST = 371847, 92962, 116203

df = pd.read_csv(DATA_PATH)

# Re-encode the class labels with LabelEncoder (integer codes, as seen in the outputs).
le = LabelEncoder()
df[TARGET_COL] = le.fit_transform(df[TARGET_COL])

# Two-stage split: carve off the training rows first, then divide the remainder
# into validation and test. The same seed is used for both stages.
df_train, df_temp = train_test_split(df, train_size=N_TRAIN, random_state=SEED)
df_val, df_test = train_test_split(df_temp, train_size=N_VAL, random_state=SEED)

assert df_train.shape[0] == N_TRAIN, f"unexpected train size: {df_train.shape[0]}"
assert df_val.shape[0] == N_VAL, f"unexpected validation size: {df_val.shape[0]}"
assert df_test.shape[0] == N_TEST, f"unexpected test size: {df_test.shape[0]}"

# Pull the target out of each split. The index is reset so it lines up with the
# freshly indexed scaled frames built below.
train_target = df_train[TARGET_COL].reset_index(drop=True)
val_target = df_val[TARGET_COL].reset_index(drop=True)
test_target = df_test[TARGET_COL].reset_index(drop=True)

# Feature-only views. drop() without inplace returns new frames, so the split
# frames are not mutated and no SettingWithCopyWarning can be triggered.
train_features = df_train.drop(columns=[TARGET_COL])
val_features = df_val.drop(columns=[TARGET_COL])
test_features = df_test.drop(columns=[TARGET_COL])

# Fit the scaler on the training features only, then apply it to every split so
# validation/test statistics never influence the transform.
# NOTE(review): the scaler is fit on every feature column, including the
# Soil_Type* indicator columns (their scaled values show up in the cell outputs
# below) -- confirm that standardizing the indicators is intended.
stand = StandardScaler().fit(train_features)

# transform() returns plain arrays; wrap them back into DataFrames with the
# original feature names.
df_train = pd.DataFrame(stand.transform(train_features), columns=train_features.columns)
df_val = pd.DataFrame(stand.transform(val_features), columns=val_features.columns)
df_test = pd.DataFrame(stand.transform(test_features), columns=test_features.columns)

# Re-attach the (unscaled) target as the last column of each split.
df_train[TARGET_COL] = train_target
df_val[TARGET_COL] = val_target
df_test[TARGET_COL] = test_target

In [3]:
# Preview the first five rows of the scaled training split.
df_train.head(n=5)

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
0,-0.17899,1.065817,-0.14668,-1.126996,-0.641333,-0.268396,-1.014612,0.996189,1.424228,-0.627081,...,-0.315732,3.447327,-0.053061,-0.057632,-0.014577,-0.02261,-0.16607,-0.156269,-0.123519,1
1,0.805968,1.15513,0.655084,-0.280086,0.491065,-1.408655,-1.837025,0.642279,1.790231,-1.063771,...,-0.315732,-0.29008,-0.053061,-0.057632,-0.014577,-0.02261,-0.16607,-0.156269,-0.123519,0
2,-1.260303,1.503452,1.991358,0.830308,3.013223,-1.095869,-2.883733,-1.380061,1.319656,-0.678367,...,-0.315732,-0.29008,-0.053061,-0.057632,-0.014577,-0.02261,-0.16607,-0.156269,-0.123519,1
3,-1.178223,1.628491,0.521457,-0.562389,0.165071,-0.728602,-1.313672,-0.621683,0.744507,-0.866921,...,-0.315732,-0.29008,-0.053061,-0.057632,-0.014577,-0.02261,-0.16607,-0.156269,-0.123519,1
4,-0.325307,-1.336711,-0.547562,0.185715,-0.263867,-0.835,-0.154817,-0.06554,0.195502,0.566841,...,3.167241,-0.29008,-0.053061,-0.057632,-0.014577,-0.02261,-0.16607,-0.156269,-0.123519,0


In [4]:
# Preview the first five rows of the scaled validation split.
df_val.head(n=5)

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
0,1.027226,-0.550754,0.922339,1.277289,0.679798,-0.695272,1.34048,-1.026151,-1.817516,-0.160976,...,-0.315732,-0.29008,-0.053061,-0.057632,-0.014577,-0.02261,-0.16607,-0.156269,-0.123519,0
1,-2.762721,-0.532891,0.922339,0.284521,0.542537,-1.23752,1.377862,-1.026151,-1.84366,-1.301349,...,-0.315732,-0.29008,-0.053061,-0.057632,-0.014577,-0.02261,-0.16607,-0.156269,-0.123519,5
2,-0.468054,-0.238157,1.323221,-0.868218,-0.349655,1.336554,1.452627,-0.318332,-1.660658,-1.204809,...,-0.315732,-0.29008,-0.053061,-0.057632,-0.014577,-0.02261,-0.16607,-0.156269,-0.123519,1
3,1.062913,-1.149153,-0.814817,1.639578,0.336647,0.811613,0.181625,-0.06554,-0.039786,0.408455,...,-0.315732,-0.29008,-0.053061,-0.057632,-0.014577,-0.02261,6.021563,-0.156269,-0.123519,0
4,0.481217,-1.292054,-1.215699,-0.105999,0.096441,-0.361976,0.10686,0.338928,0.247788,0.096965,...,-0.315732,-0.29008,-0.053061,-0.057632,-0.014577,-0.02261,-0.16607,-0.156269,-0.123519,1


In [5]:
# Preview the first five rows of the scaled test split.
df_test.head(n=5)

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
0,-1.970472,1.003298,0.120575,-0.444763,-0.178079,-0.544648,-1.238907,1.097306,1.60723,-1.078855,...,-0.315732,-0.29008,-0.053061,-0.057632,-0.014577,-0.02261,-0.16607,-0.156269,-0.123519,5
1,0.156466,-0.372127,-0.68119,0.340982,0.353804,-1.366993,0.891891,0.389487,-0.458075,-0.317853,...,-0.315732,-0.29008,-0.053061,-0.057632,-0.014577,-0.02261,-0.16607,-0.156269,-0.123519,0
2,0.395568,-0.711518,-0.413935,-0.985844,-0.761436,-0.986265,0.817126,-0.166657,-0.66722,0.023052,...,-0.315732,-0.29008,-0.053061,-0.057632,-0.014577,-0.02261,-0.16607,-0.156269,-0.123519,0
3,0.09223,0.869328,-0.547562,1.677218,0.696955,2.26017,-0.528641,1.198423,1.162797,2.993148,...,-0.315732,-0.29008,-0.053061,-0.057632,-0.014577,-0.02261,-0.16607,-0.156269,-0.123519,1
4,-1.585054,-0.541822,-0.413935,0.072794,0.439592,-0.952936,0.966656,-0.014981,-0.745649,-0.78622,...,-0.315732,-0.29008,-0.053061,-0.057632,-0.014577,-0.02261,-0.16607,-0.156269,-0.123519,1


In [6]:
# Write each processed split to a CSV in the current working directory,
# omitting the row index from the files.
for split_frame, csv_name in (
    (df_train, 'train.csv'),
    (df_val, 'validation.csv'),
    (df_test, 'test.csv'),
):
    split_frame.to_csv(csv_name, index=False)