# Preprocessing

In [1]:
import numpy as np
import pandas as pd
import os
import pickle
import utm
import random

In [2]:
import prepostprocessing.cleaning as cleaning
import prepostprocessing.pre_processing as preproc

In [53]:
from sklearn.preprocessing import StandardScaler

In [3]:
# Load jupyter extension to reload packages before executing user code.
# https://ipython.readthedocs.io/en/stable/config/extensions/autoreload.html
%load_ext autoreload
# Reload all packages (except those excluded by %aimport) every time before executing the Python code typed.
%autoreload 2

## Mineralogy

**To do**
* ~~Clean last points in Excel file while using "sum" as check~~

In [4]:
mineralogy = pd.read_excel("../_CLEANED/Vistelius_data_cleaned_all.xlsx", index_col=0)

mineralogyP = pd.read_excel("../_CLEANED/Vistelius_data_cleaned_all.xlsx", index_col=0)

### Check for wrong entries

In [5]:
np.isclose(mineralogy.loc[:, :"oth"].sum(axis=1), mineralogy.loc[:, "sum"])

array([ True,  True,  True, ...,  True,  True,  True])

In [6]:
wrong_sum = mineralogy.loc[~np.isclose(mineralogy.loc[:, :"oth"].sum(axis=1), mineralogy.loc[:, "sum"]), :]

In [7]:
# Check to see if any remaining incorrect lines are present
wrong_sum

Unnamed: 0,SiO2,TiO2,Al2O3,Fe2O3,FeO,MnO,MgO,CaO,Na2O,K2O,P2O5,l.i.,oth,sum,hs


In [8]:
mineralogy.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4659 entries, 1 to 4659
Data columns (total 15 columns):
SiO2     4659 non-null float64
TiO2     4626 non-null float64
Al2O3    4659 non-null float64
Fe2O3    4657 non-null float64
FeO      4659 non-null float64
MnO      4544 non-null float64
MgO      4658 non-null float64
CaO      4659 non-null float64
Na2O     4659 non-null float64
K2O      4659 non-null float64
P2O5     3834 non-null float64
l.i.     4659 non-null float64
oth      874 non-null float64
sum      4659 non-null float64
hs       2240 non-null float64
dtypes: float64(15)
memory usage: 582.4 KB


In [9]:
wrong_sum.loc[:, :"oth"].sum(axis=1)

Series([], dtype: float64)

In [10]:
mineralogy = preproc.replace_nan(mineralogy, 0)
mineralogyP = preproc.replace_nan(mineralogyP, 0)

In [11]:
mineralogy['oth'] = mineralogy['P2O5'] + mineralogy['l.i.'] + mineralogy['oth']
mineralogy.drop(["l.i.", "P2O5", ], axis = 1, inplace = True)
mineralogyP.drop(["l.i.", ], axis = 1, inplace = True)

In [12]:
mineralogyP.head()

Unnamed: 0,SiO2,TiO2,Al2O3,Fe2O3,FeO,MnO,MgO,CaO,Na2O,K2O,P2O5,oth,sum,hs
1,80.8,0.04,10.16,0.61,1.72,0.0,0.4,0.55,2.0,3.59,0.0,0.0,100.22,0.0
2,80.0,0.1,10.1,0.17,0.56,0.02,0.4,0.35,2.3,5.1,0.05,0.0,99.65,0.0
3,79.92,0.05,9.89,0.16,1.73,0.02,0.12,0.14,0.75,6.15,0.0,0.08,100.03,0.3
4,79.65,0.04,9.64,1.15,0.75,0.1,0.45,0.67,3.71,4.25,0.0,0.0,100.67,0.01
5,79.18,0.08,10.24,0.64,2.6,0.04,0.05,1.25,1.52,3.08,0.01,0.16,100.6,0.28


### Cleaning
**To do**
* ~~Replace zero values~~


In [13]:
# Would not do this this way since it becomes less clear what the variable means
# You should also replace 'minralogy' in all remaining cells by 'x' if you would want to do this
# x = mineralogy

In [14]:
# Replace zero values
mineralogy = preproc.replace_zero(mineralogy, 0.01)
mineralogyP = preproc.replace_zero(mineralogyP, 0.01)

* ~~Normalize~~

In [15]:
# Normalize specific columns
mineralogy.loc[:, :"oth"] = preproc.normalize(mineralogy.loc[:, :"oth"])# , total=mineralogy['sum'])
mineralogyP.loc[:, :"oth"] = preproc.normalize(mineralogyP.loc[:, :"oth"])# , total=mineralogy['sum'])

In [16]:
# Renew 'sum' column to reflect changes applied during cleaning
mineralogy["sum"] = mineralogy.loc[:, :"oth"].sum(axis=1)
mineralogyP["sum"] = mineralogyP.loc[:, :"oth"].sum(axis=1)

In [17]:
# Check that sum of all variables + 'sum' == 200
assert all(np.isclose(mineralogy.loc[:, :'sum'].sum(axis=1), 200.0))
assert all(np.isclose(mineralogyP.loc[:, :'sum'].sum(axis=1), 200.0))

### centred log-ratio (clr) transformation

In [18]:
mineralogy1 = mineralogy.drop(columns=['oth', 'hs', 'sum'])
mineralogyP1 = mineralogyP.drop(columns=['oth', 'hs', 'sum'])

In [19]:
mineralogy_clr1 = preproc.clr(mineralogy1)
mineralogy_clr1.head()

mineralogy_clrP1 = preproc.clr(mineralogyP1)
mineralogy_clrP1.head()

Unnamed: 0,SiO2,TiO2,Al2O3,Fe2O3,FeO,MnO,MgO,CaO,Na2O,K2O,P2O5
1,4.865939,-2.744914,2.79242,-0.020334,1.016286,-4.131208,-0.442329,-0.123875,1.167109,1.752114,-4.131208
2,4.779437,-1.905174,2.709946,-1.374546,-0.182408,-3.514612,-0.51888,-0.652412,1.23032,2.026651,-2.598322
3,5.170339,-2.206419,3.080837,-1.043269,1.337434,-3.12271,-1.330951,-1.1768,0.501631,2.605765,-3.815857
4,4.566008,-3.03051,2.454287,0.328128,-0.099316,-2.114219,-0.610142,-0.212112,1.499398,1.635285,-4.416805
5,4.76913,-2.128322,2.723708,-0.04888,1.352918,-2.821469,-2.598326,0.62055,0.816117,1.522336,-4.207763


### Principal Component Analysis (PCA)

In [20]:
mineralogy_pca1 = preproc.pca(mineralogy_clr1)
preproc.pca_variance(mineralogy_pca1)

6 PCA components  out of 10 components with variance sum 0.9653845247577469 needed for obtaining sum of variance > 0.95


array([4.36495556e-01, 2.24309391e-01, 1.14379085e-01, 7.92733011e-02,
       6.47910351e-02, 4.61361562e-02, 2.56884815e-02, 7.25584036e-03,
       1.67115334e-03, 2.35922349e-32])

In [21]:
mineralogy_pca_df1 = preproc.create_pca_df(mineralogy_pca1, mineralogy_clr1)

In [22]:
mineralogy_pca_df1

Unnamed: 0,PC01,PC02,PC03,PC04,PC05,PC06,PC07,PC08,PC09,PC10
1,-0.989677,0.063658,-1.077309,0.949276,0.196830,-1.052766,-0.196021,0.313477,0.228518,-3.805685e-16
2,-1.170683,0.906362,-0.457936,0.315585,0.836318,0.547351,-0.165217,0.278892,0.207743,2.205780e-16
3,-1.952305,1.084163,0.126341,-0.059648,0.787068,-1.028997,-0.907513,0.941895,0.141972,-5.419556e-17
4,-1.265500,-0.665278,0.792680,1.554351,0.290969,0.026927,-0.145361,-0.158186,0.272325,-2.268773e-16
5,-1.401423,0.090158,0.667637,-0.203788,-1.529623,-1.127234,-0.414450,0.462700,0.259138,-2.570066e-16
...,...,...,...,...,...,...,...,...,...,...
4655,1.969576,-1.154415,0.734286,0.877843,-0.311676,-0.413926,0.060452,-0.189310,-0.243017,1.875335e-16
4656,2.765959,-0.679754,0.061989,-0.052525,-0.149849,-0.699663,0.034609,0.112326,-0.131631,2.651447e-18
4657,2.610638,-1.211222,0.315811,0.056660,-0.001314,-0.147221,-0.224187,0.059057,-0.149794,-8.143807e-16
4658,2.415960,-0.786335,0.173592,-0.202025,-0.649209,-0.020704,0.652916,0.082545,-0.300729,2.716508e-17


In [23]:
preproc.save_obj(mineralogy_pca_df1, "mineralogy_pca_df1")

In [24]:
metadata = pd.read_excel("../_INTERPOLATION/coordinates_full_data.xlsx", index_col=0, usecols=[0, 23, 24, 25, 26, 27, 28])

In [25]:
preproc.save_obj(metadata, "metadata")

In [26]:
preproc.save_obj(mineralogy_clr1, "mineralogy_clr1")

----

# Feature engineering: Frost

## Fe_Number

In [27]:
mineralogy_clr1

Unnamed: 0,SiO2,TiO2,Al2O3,Fe2O3,FeO,MnO,MgO,CaO,Na2O,K2O
1,4.452818,-3.158035,2.379300,-0.433455,0.603165,-4.544329,-0.855450,-0.536996,0.753988,1.338993
2,4.519605,-2.165007,2.450114,-1.634378,-0.442240,-3.774445,-0.778712,-0.912244,0.970488,1.766819
3,4.788753,-2.588005,2.699251,-1.424854,0.955849,-3.504296,-1.712536,-1.558386,0.120045,2.224179
4,4.124327,-3.472191,2.012606,-0.113553,-0.540997,-2.555900,-1.051822,-0.653792,1.057717,1.193604
5,4.348354,-2.549098,2.302932,-0.469657,0.932142,-3.242245,-3.019102,0.199774,0.395341,1.101560
...,...,...,...,...,...,...,...,...,...,...
4655,2.635045,-2.240153,1.668789,0.110001,0.304876,-2.812672,0.033822,0.783394,0.117309,-0.600410
4656,2.640723,-1.289140,1.587809,-0.329790,0.906973,-3.591725,0.334201,0.846209,-0.310814,-0.794444
4657,2.529955,-1.250274,1.502001,0.047427,0.262462,-3.222617,0.374695,0.744131,-0.404219,-0.583560
4658,2.737015,-1.059568,1.909712,-0.357610,0.248526,-3.484370,-0.177484,1.059988,0.219398,-1.095607


In [28]:
Fe_num = pd.read_excel("../_CLEANED/Vistelius_data_cleaned_all.xlsx", index_col=0, usecols = 0)

  usecols = _maybe_convert_usecols(usecols)


In [29]:
#Fe_num['SiO2'] = mineralogy_clr1['SiO2']
Fe_num['Fe'] = mineralogy_clr1['FeO'] + mineralogy_clr1['Fe2O3'] 
Fe_num['Fe+Mg']= mineralogy_clr1['MgO'] +mineralogy_clr1['FeO'] + mineralogy_clr1['Fe2O3']

In [30]:
preproc.save_obj(Fe_num, "Fe_num")

In [31]:
Fe_num

Unnamed: 0,Fe,Fe+Mg
1,0.169710,-0.685739
2,-2.076618,-2.855331
3,-0.469006,-2.181542
4,-0.654550,-1.706372
5,0.462485,-2.556617
...,...,...
4655,0.414876,0.448698
4656,0.577183,0.911383
4657,0.309888,0.684583
4658,-0.109084,-0.286567


In [32]:
Fe_num_pca = preproc.pca(Fe_num)
preproc.pca_variance(Fe_num_pca)

2 PCA components  out of 2 components with variance sum 1.0 needed for obtaining sum of variance > 0.95


array([0.86126701, 0.13873299])

In [33]:
Fe_num_pca_df = preproc.create_pca_df(Fe_num_pca, Fe_num)

In [34]:
preproc.save_obj(Fe_num_pca_df, "Fe_num_pca_df")

## MALI

In [35]:
MALI = pd.read_excel("../_CLEANED/Vistelius_data_cleaned_all.xlsx", index_col=0, usecols = 0)

  usecols = _maybe_convert_usecols(usecols)


In [36]:
MALI['Na2O'] = mineralogy_clr1['Na2O']
MALI['K2O'] = mineralogy_clr1['K2O']
MALI['CaO'] = mineralogy_clr1['CaO']



In [37]:
MALI

Unnamed: 0,Na2O,K2O,CaO
1,0.753988,1.338993,-0.536996
2,0.970488,1.766819,-0.912244
3,0.120045,2.224179,-1.558386
4,1.057717,1.193604,-0.653792
5,0.395341,1.101560,0.199774
...,...,...,...
4655,0.117309,-0.600410,0.783394
4656,-0.310814,-0.794444,0.846209
4657,-0.404219,-0.583560,0.744131
4658,0.219398,-1.095607,1.059988


In [38]:
preproc.save_obj(MALI, "MALI")

In [39]:
MALI_pca = preproc.pca(MALI)
preproc.pca_variance(MALI_pca)

3 PCA components  out of 3 components with variance sum 1.0 needed for obtaining sum of variance > 0.95


array([0.77116024, 0.15231898, 0.07652078])

In [40]:
MALI_pca_df = preproc.create_pca_df(MALI_pca, MALI)

In [41]:
preproc.save_obj(MALI_pca_df, "MALI_pca_df")

-----

## ASI

In [42]:
ASI = pd.read_excel("../_CLEANED/Vistelius_data_cleaned_all.xlsx", index_col=0, usecols = 0)

  usecols = _maybe_convert_usecols(usecols)


In [43]:
ASI['Al'] = mineralogy_clrP1['Al2O3']
ASI['Ca-P+Na+K'] = mineralogy_clrP1['Al2O3'] - (1.67* mineralogy_clrP1['P2O5']) + mineralogy_clrP1['Na2O'] + mineralogy_clrP1['K2O']


In [44]:
ASI

Unnamed: 0,Al,Ca-P+Na+K
1,2.792420,12.610761
2,2.709946,10.306114
3,3.080837,12.560714
4,2.454287,12.965033
5,2.723708,12.089127
...,...,...
4655,1.834805,4.456205
4656,1.827464,5.203770
4657,1.697006,4.355827
4658,2.087740,4.540664


In [45]:
preproc.save_obj(ASI, "ASI")

In [46]:
ASI_pca = preproc.pca(ASI)
preproc.pca_variance(ASI_pca)

1 PCA components  out of 2 components with variance sum 0.9924299020615133 needed for obtaining sum of variance > 0.95


array([0.9924299, 0.0075701])

In [47]:
ASI_pca_df = preproc.create_pca_df(ASI_pca, ASI)

In [48]:
preproc.save_obj(ASI_pca_df, "ASI_pca_df")

------

# incorporating coordinates and timeclass

In [212]:
mineralogy_inc_pca_df = mineralogy_pca_df1.iloc[:, 0: 7]
time = pd.read_excel("../_RESULTS/time_fix.xlsx", index_col=0, usecols=[0, 9])
coordinates = pd.read_excel("../_INTERPOLATION/coordinates_UTM.xlsx", index_col=0, usecols=[0, 1, 2])

**remark** : make sure to use the right coordinate system

In [213]:
coordinates

Unnamed: 0,Y_UTM,X_UTM
1,3.166850e+06,5.512837e+06
2,1.436111e+06,6.102354e+06
3,2.874853e+06,7.966637e+06
4,2.565651e+06,7.521202e+06
5,2.540731e+06,8.757731e+06
...,...,...
4655,2.759727e+06,8.616851e+06
4656,2.927123e+06,5.721711e+06
4657,2.397980e+06,6.939549e+06
4658,2.684981e+06,7.505261e+06


In [214]:
classification_renaming_dict = {"Tr" : "1",
                                "Tr-J" : "2",
                                "J" : "3",
                                "K" : "4",
                                "Pg" : "5",
                                "Mz" : "6"}                         

In [215]:
time['time'] = time['time'].replace(classification_renaming_dict)

In [216]:
x8 = time

In [217]:
mineralogy_inc_pca_df

Unnamed: 0,PC01,PC02,PC03,PC04,PC05,PC06,PC07
1,-0.989677,0.063658,-1.077309,0.949276,0.196830,-1.052766,-0.196021
2,-1.170683,0.906362,-0.457936,0.315585,0.836318,0.547351,-0.165217
3,-1.952305,1.084163,0.126341,-0.059648,0.787068,-1.028997,-0.907513
4,-1.265500,-0.665278,0.792680,1.554351,0.290969,0.026927,-0.145361
5,-1.401423,0.090158,0.667637,-0.203788,-1.529623,-1.127234,-0.414450
...,...,...,...,...,...,...,...
4655,1.969576,-1.154415,0.734286,0.877843,-0.311676,-0.413926,0.060452
4656,2.765959,-0.679754,0.061989,-0.052525,-0.149849,-0.699663,0.034609
4657,2.610638,-1.211222,0.315811,0.056660,-0.001314,-0.147221,-0.224187
4658,2.415960,-0.786335,0.173592,-0.202025,-0.649209,-0.020704,0.652916


In [218]:
x1 = mineralogy_inc_pca_df.iloc[:,:1]
x2 = mineralogy_inc_pca_df.iloc[:,1:2]
x3 = mineralogy_inc_pca_df.iloc[:,2:3]
x4 = mineralogy_inc_pca_df.iloc[:,3:4]
x5 = mineralogy_inc_pca_df.iloc[:,4:5]
x6 = mineralogy_inc_pca_df.iloc[:,5:6]
x7 = mineralogy_inc_pca_df.iloc[:,6:7]

x9 = coordinates.iloc[:,:1]
x10 = coordinates.iloc[:,1:2]

In [219]:
scaler1 = StandardScaler().fit(x1)
scaler2 = StandardScaler().fit(x2)
scaler3 = StandardScaler().fit(x3)
scaler4 = StandardScaler().fit(x4)
scaler5 = StandardScaler().fit(x5)
scaler6 = StandardScaler().fit(x6)
scaler7 = StandardScaler().fit(x7)
scaler8 = StandardScaler().fit(x8)
scaler9 = StandardScaler().fit(x9)
scaler10 = StandardScaler().fit(x10)

In [220]:
x1_scaled = scaler1.transform(x1)
x2_scaled = scaler2.transform(x2)
x3_scaled = scaler3.transform(x3)
x4_scaled = scaler4.transform(x4)
x5_scaled = scaler5.transform(x5)
x6_scaled = scaler6.transform(x6)
x7_scaled = scaler7.transform(x7)
x8_scaled = scaler8.transform(x8)
x9_scaled = scaler9.transform(x9)
x10_scaled = scaler10.transform(x10)

In [224]:
scaler9.mean_

array([6482553.5916579])

In [225]:
scaled_inc_pca_df = mineralogy_inc_pca_df.iloc[:,:0]

In [227]:
x1_df = pd.DataFrame(data =x1_scaled,columns=['PC01'])
x2_df = pd.DataFrame(data =x2_scaled,columns=['PC02'])
x3_df = pd.DataFrame(data =x3_scaled,columns=['PC03'])
x4_df = pd.DataFrame(data =x4_scaled,columns=['PC04'])
x5_df = pd.DataFrame(data =x5_scaled,columns=['PC05'])
x6_df = pd.DataFrame(data =x6_scaled,columns=['PC06'])
x7_df = pd.DataFrame(data =x7_scaled,columns=['PC07'])
x8_df = pd.DataFrame(data =x8_scaled,columns=['time'])
x9_df = pd.DataFrame(data =x9_scaled,columns=['X_UTM'])
x10_df = pd.DataFrame(data =x10_scaled,columns=['Y_UTM'])

In [228]:
scaled_inc_pca_df = pd.concat([mineralogy_inc_pca_df.iloc[:,:0], x1_df, x2_df, x3_df, x4_df, x5_df, x6_df, x7_df, x8_df, x9_df, x10_df], axis=1, sort=True)
scaled_inc_pca_df.index = np.arange(1, len(scaled_inc_pca_df) + 1)
scaled_inc_pca_df = scaled_inc_pca_df.dropna()

In [229]:
scaled_inc_pca_df

Unnamed: 0,PC01,PC02,PC03,PC04,PC05,PC06,PC07,time,X_UTM,Y_UTM
1,-0.821697,0.073729,-1.747331,1.849427,0.424171,-2.688559,-0.670874,0.467685,-0.148099,0.173109
2,-0.971980,1.049750,-0.742744,0.614839,1.802281,1.397826,-0.565449,-2.133358,-0.225403,0.186731
3,-1.620936,1.255681,0.204918,-0.116210,1.696146,-2.627857,-3.105933,0.467685,-0.161141,0.229809
4,-1.050704,-0.770527,1.285680,3.028266,0.627044,0.068766,-0.497492,0.467685,-0.174952,0.219516
5,-1.163557,0.104422,1.082868,-0.397029,-3.296363,-2.878734,-1.418440,0.467685,-0.176065,0.248088
...,...,...,...,...,...,...,...,...,...,...
4655,1.635276,-1.337046,1.190968,1.710257,-0.671667,-1.057086,0.206893,0.467685,-0.166283,0.244833
4656,2.296487,-0.787293,0.100543,-0.102332,-0.322927,-1.786802,0.118447,0.467685,-0.158806,0.177936
4657,2.167530,-1.402840,0.512226,0.110387,-0.002831,-0.375974,-0.767273,-0.399329,-0.182441,0.206076
4658,2.005894,-0.910735,0.281555,-0.393594,-1.399058,-0.052875,2.234582,0.467685,-0.169622,0.219148
