# Preprocessing

In [1]:
import numpy as np
import pandas as pd
import os
import pickle
import utm
import random

In [2]:
import prepostprocessing.cleaning as cleaning
import prepostprocessing.pre_processing as preproc

from qapf import qapf
from qapf import cipw

In [3]:
from sklearn.preprocessing import StandardScaler

In [4]:
# Load jupyter extension to reload packages before executing user code.
# https://ipython.readthedocs.io/en/stable/config/extensions/autoreload.html
%load_ext autoreload
# Reload all packages (except those excluded by %aimport) every time before executing the Python code typed.
%autoreload 2

## Mineralogy

**To do**
* ~~Clean last points in Excel file while using "sum" as check~~

In [5]:
mineralogy = pd.read_excel("../_CLEANED/Vistelius_data_cleaned_all.xlsx", index_col=0)

mineralogyP = pd.read_excel("../_CLEANED/Vistelius_data_cleaned_all.xlsx", index_col=0)

### Check for wrong entries

In [6]:
np.isclose(mineralogy.loc[:, :"oth"].sum(axis=1), mineralogy.loc[:, "sum"])

array([ True,  True,  True, ...,  True,  True,  True])

In [7]:
wrong_sum = mineralogy.loc[~np.isclose(mineralogy.loc[:, :"oth"].sum(axis=1), mineralogy.loc[:, "sum"]), :]

In [8]:
# Check to see if any remaining incorrect lines are present
wrong_sum

Unnamed: 0,SiO2,TiO2,Al2O3,Fe2O3,FeO,MnO,MgO,CaO,Na2O,K2O,P2O5,l.i.,oth,sum,hs


In [9]:
mineralogy.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4659 entries, 1 to 4659
Data columns (total 15 columns):
SiO2     4659 non-null float64
TiO2     4626 non-null float64
Al2O3    4659 non-null float64
Fe2O3    4657 non-null float64
FeO      4659 non-null float64
MnO      4544 non-null float64
MgO      4658 non-null float64
CaO      4659 non-null float64
Na2O     4659 non-null float64
K2O      4659 non-null float64
P2O5     3834 non-null float64
l.i.     4659 non-null float64
oth      874 non-null float64
sum      4659 non-null float64
hs       2240 non-null float64
dtypes: float64(15)
memory usage: 582.4 KB


In [10]:
wrong_sum.loc[:, :"oth"].sum(axis=1)

Series([], dtype: float64)

In [11]:
mineralogy = preproc.replace_nan(mineralogy, 0)
mineralogyP = preproc.replace_nan(mineralogyP, 0)

In [12]:
mineralogy['oth'] = mineralogy['P2O5'] + mineralogy['l.i.'] + mineralogy['oth']
mineralogy.drop(["l.i.", "P2O5", ], axis = 1, inplace = True)
mineralogyP.drop(["l.i.", ], axis = 1, inplace = True)

In [13]:
mineralogyP.head()

Unnamed: 0,SiO2,TiO2,Al2O3,Fe2O3,FeO,MnO,MgO,CaO,Na2O,K2O,P2O5,oth,sum,hs
1,80.8,0.04,10.16,0.61,1.72,0.0,0.4,0.55,2.0,3.59,0.0,0.0,100.22,0.0
2,80.0,0.1,10.1,0.17,0.56,0.02,0.4,0.35,2.3,5.1,0.05,0.0,99.65,0.0
3,79.92,0.05,9.89,0.16,1.73,0.02,0.12,0.14,0.75,6.15,0.0,0.08,100.03,0.3
4,79.65,0.04,9.64,1.15,0.75,0.1,0.45,0.67,3.71,4.25,0.0,0.0,100.67,0.01
5,79.18,0.08,10.24,0.64,2.6,0.04,0.05,1.25,1.52,3.08,0.01,0.16,100.6,0.28


### Cleaning
**To do**
* ~~Replace zero values~~


In [14]:
# Would not do this this way since it becomes less clear what the variable means
# You should also replace 'minralogy' in all remaining cells by 'x' if you would want to do this
# x = mineralogy

In [15]:
# Replace zero values
mineralogy = preproc.replace_zero(mineralogy, 0.01)
mineralogyP = preproc.replace_zero(mineralogyP, 0.01)

* ~~Normalize~~

In [16]:
# Normalize specific columns
mineralogy.loc[:, :"oth"] = preproc.normalize(mineralogy.loc[:, :"oth"])# , total=mineralogy['sum'])
mineralogyP.loc[:, :"oth"] = preproc.normalize(mineralogyP.loc[:, :"oth"])# , total=mineralogy['sum'])

In [17]:
# Renew 'sum' column to reflect changes applied during cleaning
mineralogy["sum"] = mineralogy.loc[:, :"oth"].sum(axis=1)
mineralogyP["sum"] = mineralogyP.loc[:, :"oth"].sum(axis=1)

In [18]:
# Check that sum of all variables + 'sum' == 200
assert all(np.isclose(mineralogy.loc[:, :'sum'].sum(axis=1), 200.0))
assert all(np.isclose(mineralogyP.loc[:, :'sum'].sum(axis=1), 200.0))

### centred log-ratio (clr) transformation

In [19]:
mineralogy1 = mineralogy.drop(columns=['oth', 'hs', 'sum'])
mineralogyP1 = mineralogyP.drop(columns=['oth', 'hs', 'sum'])

In [20]:
mineralogy_clr1 = preproc.clr(mineralogy1)
mineralogy_clr1.head()

mineralogy_clrP1 = preproc.clr(mineralogyP1)
mineralogy_clrP1.head()

Unnamed: 0,SiO2,TiO2,Al2O3,Fe2O3,FeO,MnO,MgO,CaO,Na2O,K2O,P2O5
1,4.865939,-2.744914,2.79242,-0.020334,1.016286,-4.131208,-0.442329,-0.123875,1.167109,1.752114,-4.131208
2,4.779437,-1.905174,2.709946,-1.374546,-0.182408,-3.514612,-0.51888,-0.652412,1.23032,2.026651,-2.598322
3,5.170339,-2.206419,3.080837,-1.043269,1.337434,-3.12271,-1.330951,-1.1768,0.501631,2.605765,-3.815857
4,4.566008,-3.03051,2.454287,0.328128,-0.099316,-2.114219,-0.610142,-0.212112,1.499398,1.635285,-4.416805
5,4.76913,-2.128322,2.723708,-0.04888,1.352918,-2.821469,-2.598326,0.62055,0.816117,1.522336,-4.207763


### Principal Component Analysis (PCA)

In [21]:
mineralogy_pca1 = preproc.pca(mineralogy_clr1)
preproc.pca_variance(mineralogy_pca1)

6 PCA components  out of 10 components with variance sum 0.9653845247577469 needed for obtaining sum of variance > 0.95


array([4.36495556e-01, 2.24309391e-01, 1.14379085e-01, 7.92733011e-02,
       6.47910351e-02, 4.61361562e-02, 2.56884815e-02, 7.25584036e-03,
       1.67115334e-03, 2.35922349e-32])

In [22]:
mineralogy_pca_df1 = preproc.create_pca_df(mineralogy_pca1, mineralogy_clr1)

In [23]:
mineralogy_pca_df1

Unnamed: 0,PC01,PC02,PC03,PC04,PC05,PC06,PC07,PC08,PC09,PC10
1,-0.989677,0.063658,-1.077309,0.949276,0.196830,-1.052766,-0.196021,0.313477,0.228518,-3.805685e-16
2,-1.170683,0.906362,-0.457936,0.315585,0.836318,0.547351,-0.165217,0.278892,0.207743,2.205780e-16
3,-1.952305,1.084163,0.126341,-0.059648,0.787068,-1.028997,-0.907513,0.941895,0.141972,-5.419556e-17
4,-1.265500,-0.665278,0.792680,1.554351,0.290969,0.026927,-0.145361,-0.158186,0.272325,-2.268773e-16
5,-1.401423,0.090158,0.667637,-0.203788,-1.529623,-1.127234,-0.414450,0.462700,0.259138,-2.570066e-16
...,...,...,...,...,...,...,...,...,...,...
4655,1.969576,-1.154415,0.734286,0.877843,-0.311676,-0.413926,0.060452,-0.189310,-0.243017,1.875335e-16
4656,2.765959,-0.679754,0.061989,-0.052525,-0.149849,-0.699663,0.034609,0.112326,-0.131631,2.651447e-18
4657,2.610638,-1.211222,0.315811,0.056660,-0.001314,-0.147221,-0.224187,0.059057,-0.149794,-8.143807e-16
4658,2.415960,-0.786335,0.173592,-0.202025,-0.649209,-0.020704,0.652916,0.082545,-0.300729,2.716508e-17


In [24]:
preproc.save_obj(mineralogy_pca_df1, "mineralogy_pca_df1")

In [25]:
metadata = pd.read_excel("../_INTERPOLATION/coordinates_full_data.xlsx", index_col=0, usecols=[0, 23, 24, 25, 26, 27, 28])

In [26]:
preproc.save_obj(metadata, "metadata")

In [27]:
preproc.save_obj(mineralogy_clr1, "mineralogy_clr1")

----

### making time and coordinate classes

In [28]:
time = pd.read_excel("../_RESULTS/time_fix.xlsx", index_col=0, usecols=[0, 9])
coordinates = pd.read_excel("../_INTERPOLATION/coordinates_UTM.xlsx", index_col=0, usecols=[0, 1, 2])

In [29]:
classification_renaming_dict = {"Tr" : "1",
                                "Tr-J" : "2",
                                "J" : "3",
                                "K" : "4",
                                "Pg" : "5",
                                "Mz" : "6"}   

In [30]:
time['time'] = time['time'].replace(classification_renaming_dict)

In [31]:
coordinates

Unnamed: 0,Y_UTM,X_UTM
1,3.166850e+06,5.512837e+06
2,1.436111e+06,6.102354e+06
3,2.874853e+06,7.966637e+06
4,2.565651e+06,7.521202e+06
5,2.540731e+06,8.757731e+06
...,...,...
4655,2.759727e+06,8.616851e+06
4656,2.927123e+06,5.721711e+06
4657,2.397980e+06,6.939549e+06
4658,2.684981e+06,7.505261e+06


# Feature engineering: Frost

## Fe_Number

In [32]:
mineralogy_clr1

Unnamed: 0,SiO2,TiO2,Al2O3,Fe2O3,FeO,MnO,MgO,CaO,Na2O,K2O
1,4.452818,-3.158035,2.379300,-0.433455,0.603165,-4.544329,-0.855450,-0.536996,0.753988,1.338993
2,4.519605,-2.165007,2.450114,-1.634378,-0.442240,-3.774445,-0.778712,-0.912244,0.970488,1.766819
3,4.788753,-2.588005,2.699251,-1.424854,0.955849,-3.504296,-1.712536,-1.558386,0.120045,2.224179
4,4.124327,-3.472191,2.012606,-0.113553,-0.540997,-2.555900,-1.051822,-0.653792,1.057717,1.193604
5,4.348354,-2.549098,2.302932,-0.469657,0.932142,-3.242245,-3.019102,0.199774,0.395341,1.101560
...,...,...,...,...,...,...,...,...,...,...
4655,2.635045,-2.240153,1.668789,0.110001,0.304876,-2.812672,0.033822,0.783394,0.117309,-0.600410
4656,2.640723,-1.289140,1.587809,-0.329790,0.906973,-3.591725,0.334201,0.846209,-0.310814,-0.794444
4657,2.529955,-1.250274,1.502001,0.047427,0.262462,-3.222617,0.374695,0.744131,-0.404219,-0.583560
4658,2.737015,-1.059568,1.909712,-0.357610,0.248526,-3.484370,-0.177484,1.059988,0.219398,-1.095607


In [33]:
Fe_num1 = pd.read_excel("../_CLEANED/Vistelius_data_cleaned_all.xlsx", index_col=0, usecols = 0)

  usecols = _maybe_convert_usecols(usecols)


In [34]:
Fe_num1['Fe'] = mineralogy_clr1['FeO'] + mineralogy_clr1['Fe2O3'] 
Fe_num1['Fe+Mg']= mineralogy_clr1['MgO'] +mineralogy_clr1['FeO'] + mineralogy_clr1['Fe2O3']
Fe_num1['SiO2'] = mineralogy_clr1['SiO2']

In [35]:
Fe_x1 = Fe_num1.iloc[:,:1]
Fe_x2 = Fe_num1.iloc[:,1:2]
Fe_x3 = Fe_num1.iloc[:,2:3]

#Fe_x4 = time
#Fe_x5 = coordinates.iloc[:,:1]
#Fe_x6 = coordinates.iloc[:,1:2]

In [36]:
#Fe_x4

In [37]:
Fe_num1

Unnamed: 0,Fe,Fe+Mg,SiO2
1,0.169710,-0.685739,4.452818
2,-2.076618,-2.855331,4.519605
3,-0.469006,-2.181542,4.788753
4,-0.654550,-1.706372,4.124327
5,0.462485,-2.556617,4.348354
...,...,...,...
4655,0.414876,0.448698,2.635045
4656,0.577183,0.911383,2.640723
4657,0.309888,0.684583,2.529955
4658,-0.109084,-0.286567,2.737015


In [38]:
scaler1 = StandardScaler().fit(Fe_x1)
scaler2 = StandardScaler().fit(Fe_x2)
scaler3 = StandardScaler().fit(Fe_x3)

#scaler4 = StandardScaler().fit(Fe_x4)
#scaler5 = StandardScaler().fit(Fe_x5)
#scaler6 = StandardScaler().fit(Fe_x6)

In [39]:
Fe_x1_scaled = scaler1.transform(Fe_x1)
Fe_x2_scaled = scaler2.transform(Fe_x2)
Fe_x3_scaled = scaler3.transform(Fe_x3)

#Fe_x4_scaled = scaler4.transform(Fe_x4)
#Fe_x5_scaled = scaler5.transform(Fe_x5)
#Fe_x6_scaled = scaler6.transform(Fe_x6)

In [40]:
Fe_x1_df = pd.DataFrame(data =Fe_x1_scaled,columns=['Fe'])
Fe_x2_df = pd.DataFrame(data =Fe_x2_scaled,columns=['Fe+Mg'])
Fe_x3_df = pd.DataFrame(data =Fe_x3_scaled,columns=['SiO2'])

#Fe_x4_df = pd.DataFrame(data =Fe_x4_scaled,columns=['time'])
#Fe_x5_df = pd.DataFrame(data =Fe_x5_scaled,columns=['Y_UTM'])
#Fe_x6_df = pd.DataFrame(data =Fe_x6_scaled,columns=['X_UTM'])

In [41]:
Fe_num = pd.concat([Fe_num1.iloc[:,:0], Fe_x1_df, Fe_x2_df], axis=1, sort=True)
Fe_num.index = np.arange(1, len(Fe_num) + 1)
Fe_num = Fe_num.dropna()

In [42]:
Fe_num

Unnamed: 0,Fe,Fe+Mg
1,1.324521,1.004741
2,-1.884229,-1.104543
3,0.412153,-0.449484
4,0.147114,0.012478
5,1.742733,-0.814132
...,...,...
4655,1.674727,2.107645
4656,1.906572,2.557469
4657,1.524758,2.336973
4658,0.926281,1.392818


In [43]:
preproc.save_obj(Fe_num, "Fe_num")

In [44]:
Fe_num

Unnamed: 0,Fe,Fe+Mg
1,1.324521,1.004741
2,-1.884229,-1.104543
3,0.412153,-0.449484
4,0.147114,0.012478
5,1.742733,-0.814132
...,...,...
4655,1.674727,2.107645
4656,1.906572,2.557469
4657,1.524758,2.336973
4658,0.926281,1.392818


In [45]:
Fe_num_pca = preproc.pca(Fe_num)
preproc.pca_variance(Fe_num_pca)

2 PCA components  out of 2 components with variance sum 1.0 needed for obtaining sum of variance > 0.95


array([0.83456435, 0.16543565])

In [46]:
Fe_num_pca_df = preproc.create_pca_df(Fe_num_pca, Fe_num)

In [47]:
preproc.save_obj(Fe_num_pca_df, "Fe_num_pca_df")

## MALI

In [48]:
MALI1 = pd.read_excel("../_CLEANED/Vistelius_data_cleaned_all.xlsx", index_col=0, usecols = 0)

  usecols = _maybe_convert_usecols(usecols)


In [49]:
MALI1['Na2O'] = mineralogy_clr1['Na2O']
MALI1['K2O'] = mineralogy_clr1['K2O']
MALI1['CaO'] = mineralogy_clr1['CaO']
MALI1['SiO2'] = mineralogy_clr1['SiO2']


In [50]:
MALI_x1 = MALI1.iloc[:,:1]
MALI_x2 = MALI1.iloc[:,1:2]
MALI_x3 = MALI1.iloc[:,2:3]
MALI_x4 = MALI1.iloc[:,3:4]

MALI_x5 = time

In [51]:
MALI1

Unnamed: 0,Na2O,K2O,CaO,SiO2
1,0.753988,1.338993,-0.536996,4.452818
2,0.970488,1.766819,-0.912244,4.519605
3,0.120045,2.224179,-1.558386,4.788753
4,1.057717,1.193604,-0.653792,4.124327
5,0.395341,1.101560,0.199774,4.348354
...,...,...,...,...
4655,0.117309,-0.600410,0.783394,2.635045
4656,-0.310814,-0.794444,0.846209,2.640723
4657,-0.404219,-0.583560,0.744131,2.529955
4658,0.219398,-1.095607,1.059988,2.737015


In [52]:
scaler1 = StandardScaler().fit(MALI_x1)
scaler2 = StandardScaler().fit(MALI_x2)
scaler3 = StandardScaler().fit(MALI_x3)
scaler4 = StandardScaler().fit(MALI_x4)

scaler5 = StandardScaler().fit(MALI_x5)

In [53]:
MALI_x1_scaled = scaler1.transform(MALI_x1)
MALI_x2_scaled = scaler2.transform(MALI_x2)
MALI_x3_scaled = scaler3.transform(MALI_x3)
MALI_x4_scaled = scaler4.transform(MALI_x4)

MALI_x5_scaled = scaler5.transform(MALI_x5)

In [54]:
MALI_x1_scaled = pd.DataFrame(data =MALI_x1_scaled,columns=['Na2O'])
MALI_x2_scaled = pd.DataFrame(data =MALI_x2_scaled,columns=['K2O'])
MALI_x3_scaled = pd.DataFrame(data =MALI_x3_scaled,columns=['CaO'])
MALI_x4_scaled = pd.DataFrame(data =MALI_x4_scaled,columns=['SiO2'])

MALI_x5_scaled = pd.DataFrame(data =MALI_x5_scaled,columns=['time'])

In [55]:
MALI = pd.concat([MALI1.iloc[:,:0], MALI_x1_scaled, MALI_x2_scaled, MALI_x3_scaled], axis=1, sort=True)
MALI.index = np.arange(1, len(MALI) + 1)
MALI = MALI.dropna()

In [56]:
MALI

Unnamed: 0,Na2O,K2O,CaO
1,0.122888,0.940794,-0.636242
2,0.628920,1.627214,-1.229407
3,-1.358851,2.361020,-2.250782
4,0.832804,0.707527,-0.820865
5,-0.715393,0.559848,0.528392
...,...,...,...
4655,-1.365247,-2.170856,1.450937
4656,-2.365914,-2.482172,1.550231
4657,-2.584233,-2.143822,1.388872
4658,-1.126630,-2.965370,1.888157


In [57]:
preproc.save_obj(MALI, "MALI")

In [58]:
MALI_pca = preproc.pca(MALI)
preproc.pca_variance(MALI_pca)

3 PCA components  out of 3 components with variance sum 1.0 needed for obtaining sum of variance > 0.95


array([0.75985197, 0.13716298, 0.10298505])

In [59]:
MALI_pca_df = preproc.create_pca_df(MALI_pca, MALI)

In [60]:
preproc.save_obj(MALI_pca_df, "MALI_pca_df")

-----

## ASI

In [61]:
ASI = pd.read_excel("../_CLEANED/Vistelius_data_cleaned_all.xlsx", index_col=0, usecols = 0)

  usecols = _maybe_convert_usecols(usecols)


In [62]:
ASI['Al'] = mineralogy_clrP1['Al2O3']
ASI['Ca-P+Na+K'] = mineralogy_clrP1['Al2O3'] - (1.67* mineralogy_clrP1['P2O5']) + mineralogy_clrP1['Na2O'] + mineralogy_clrP1['K2O']


In [63]:
ASI

Unnamed: 0,Al,Ca-P+Na+K
1,2.792420,12.610761
2,2.709946,10.306114
3,3.080837,12.560714
4,2.454287,12.965033
5,2.723708,12.089127
...,...,...
4655,1.834805,4.456205
4656,1.827464,5.203770
4657,1.697006,4.355827
4658,2.087740,4.540664


In [64]:
preproc.save_obj(ASI, "ASI")

In [65]:
ASI_pca = preproc.pca(ASI)
preproc.pca_variance(ASI_pca)

1 PCA components  out of 2 components with variance sum 0.9924299020615133 needed for obtaining sum of variance > 0.95


array([0.9924299, 0.0075701])

In [66]:
ASI_pca_df = preproc.create_pca_df(ASI_pca, ASI)

In [67]:
preproc.save_obj(ASI_pca_df, "ASI_pca_df")

------

# incorporating coordinates and timeclass

In [68]:
mineralogy_inc_pca_df = mineralogy_pca_df1.iloc[:, 0: 7]
time = pd.read_excel("../_RESULTS/time_fix.xlsx", index_col=0, usecols=[0, 9])
coordinates = pd.read_excel("../_INTERPOLATION/coordinates_UTM.xlsx", index_col=0, usecols=[0, 1, 2])

**remark** : make sure to use the right coordinate system

In [69]:
coordinates

Unnamed: 0,Y_UTM,X_UTM
1,3.166850e+06,5.512837e+06
2,1.436111e+06,6.102354e+06
3,2.874853e+06,7.966637e+06
4,2.565651e+06,7.521202e+06
5,2.540731e+06,8.757731e+06
...,...,...
4655,2.759727e+06,8.616851e+06
4656,2.927123e+06,5.721711e+06
4657,2.397980e+06,6.939549e+06
4658,2.684981e+06,7.505261e+06


In [70]:
classification_renaming_dict = {"Tr" : "1",
                                "Tr-J" : "2",
                                "J" : "3",
                                "K" : "4",
                                "Pg" : "5",
                                "Mz" : "6"}                         

In [71]:
time['time'] = time['time'].replace(classification_renaming_dict)

In [72]:
x8 = time

In [73]:
mineralogy_inc_pca_df

Unnamed: 0,PC01,PC02,PC03,PC04,PC05,PC06,PC07
1,-0.989677,0.063658,-1.077309,0.949276,0.196830,-1.052766,-0.196021
2,-1.170683,0.906362,-0.457936,0.315585,0.836318,0.547351,-0.165217
3,-1.952305,1.084163,0.126341,-0.059648,0.787068,-1.028997,-0.907513
4,-1.265500,-0.665278,0.792680,1.554351,0.290969,0.026927,-0.145361
5,-1.401423,0.090158,0.667637,-0.203788,-1.529623,-1.127234,-0.414450
...,...,...,...,...,...,...,...
4655,1.969576,-1.154415,0.734286,0.877843,-0.311676,-0.413926,0.060452
4656,2.765959,-0.679754,0.061989,-0.052525,-0.149849,-0.699663,0.034609
4657,2.610638,-1.211222,0.315811,0.056660,-0.001314,-0.147221,-0.224187
4658,2.415960,-0.786335,0.173592,-0.202025,-0.649209,-0.020704,0.652916


In [74]:
x1 = mineralogy_inc_pca_df.iloc[:,:1]
x2 = mineralogy_inc_pca_df.iloc[:,1:2]
x3 = mineralogy_inc_pca_df.iloc[:,2:3]
x4 = mineralogy_inc_pca_df.iloc[:,3:4]
x5 = mineralogy_inc_pca_df.iloc[:,4:5]
x6 = mineralogy_inc_pca_df.iloc[:,5:6]
x7 = mineralogy_inc_pca_df.iloc[:,6:7]

x9 = coordinates.iloc[:,:1]
x10 = coordinates.iloc[:,1:2]

In [75]:
scaler1 = StandardScaler().fit(x1)
scaler2 = StandardScaler().fit(x2)
scaler3 = StandardScaler().fit(x3)
scaler4 = StandardScaler().fit(x4)
scaler5 = StandardScaler().fit(x5)
scaler6 = StandardScaler().fit(x6)
scaler7 = StandardScaler().fit(x7)
scaler8 = StandardScaler().fit(x8)
scaler9 = StandardScaler().fit(x9)
scaler10 = StandardScaler().fit(x10)

In [76]:
x1_scaled = scaler1.transform(x1)
x2_scaled = scaler2.transform(x2)
x3_scaled = scaler3.transform(x3)
x4_scaled = scaler4.transform(x4)
x5_scaled = scaler5.transform(x5)
x6_scaled = scaler6.transform(x6)
x7_scaled = scaler7.transform(x7)
x8_scaled = scaler8.transform(x8)
x9_scaled = scaler9.transform(x9)
x10_scaled = scaler10.transform(x10)

In [77]:
scaler9.mean_

array([6482553.5916579])

In [78]:
scaled_inc_pca_df = mineralogy_inc_pca_df.iloc[:,:0]

In [79]:
x1_df = pd.DataFrame(data =x1_scaled,columns=['PC01'])
x2_df = pd.DataFrame(data =x2_scaled,columns=['PC02'])
x3_df = pd.DataFrame(data =x3_scaled,columns=['PC03'])
x4_df = pd.DataFrame(data =x4_scaled,columns=['PC04'])
x5_df = pd.DataFrame(data =x5_scaled,columns=['PC05'])
x6_df = pd.DataFrame(data =x6_scaled,columns=['PC06'])
x7_df = pd.DataFrame(data =x7_scaled,columns=['PC07'])
x8_df = pd.DataFrame(data =x8_scaled,columns=['time'])
x9_df = pd.DataFrame(data =x9_scaled,columns=['X_UTM'])
x10_df = pd.DataFrame(data =x10_scaled,columns=['Y_UTM'])

In [80]:
scaled_inc_pca_df = pd.concat([mineralogy_inc_pca_df.iloc[:,:0], x1_df, x2_df, x3_df, x4_df, x5_df, x6_df, x7_df, x8_df, x9_df, x10_df], axis=1, sort=True)
scaled_inc_pca_df.index = np.arange(1, len(scaled_inc_pca_df) + 1)
scaled_inc_pca_df = scaled_inc_pca_df.dropna()

scaled_time_pca_df = pd.concat([mineralogy_inc_pca_df.iloc[:,:0], x1_df, x2_df, x3_df, x4_df, x5_df, x6_df, x7_df, x8_df], axis=1, sort=True)
scaled_time_pca_df.index = np.arange(1, len(scaled_time_pca_df) + 1)
scaled_time_pca_df = scaled_time_pca_df.dropna()

scaled_coordinates_pca_df = pd.concat([mineralogy_inc_pca_df.iloc[:,:0], x1_df, x2_df, x3_df, x4_df, x5_df, x6_df, x7_df, x9_df, x10_df], axis=1, sort=True)
scaled_coordinates_pca_df.index = np.arange(1, len(scaled_coordinates_pca_df) + 1)
scaled_coordinates_pca_df = scaled_coordinates_pca_df.dropna()

In [81]:
scaled_coordinates_pca_df

Unnamed: 0,PC01,PC02,PC03,PC04,PC05,PC06,PC07,X_UTM,Y_UTM
1,-0.821697,0.073729,-1.747331,1.849427,0.424171,-2.688559,-0.670874,-0.148099,0.173109
2,-0.971980,1.049750,-0.742744,0.614839,1.802281,1.397826,-0.565449,-0.225403,0.186731
3,-1.620936,1.255681,0.204918,-0.116210,1.696146,-2.627857,-3.105933,-0.161141,0.229809
4,-1.050704,-0.770527,1.285680,3.028266,0.627044,0.068766,-0.497492,-0.174952,0.219516
5,-1.163557,0.104422,1.082868,-0.397029,-3.296363,-2.878734,-1.418440,-0.176065,0.248088
...,...,...,...,...,...,...,...,...,...
4655,1.635276,-1.337046,1.190968,1.710257,-0.671667,-1.057086,0.206893,-0.166283,0.244833
4656,2.296487,-0.787293,0.100543,-0.102332,-0.322927,-1.786802,0.118447,-0.158806,0.177936
4657,2.167530,-1.402840,0.512226,0.110387,-0.002831,-0.375974,-0.767273,-0.182441,0.206076
4658,2.005894,-0.910735,0.281555,-0.393594,-1.399058,-0.052875,2.234582,-0.169622,0.219148


In [82]:
scaled_time_pca_df

Unnamed: 0,PC01,PC02,PC03,PC04,PC05,PC06,PC07,time
1,-0.821697,0.073729,-1.747331,1.849427,0.424171,-2.688559,-0.670874,0.467685
2,-0.971980,1.049750,-0.742744,0.614839,1.802281,1.397826,-0.565449,-2.133358
3,-1.620936,1.255681,0.204918,-0.116210,1.696146,-2.627857,-3.105933,0.467685
4,-1.050704,-0.770527,1.285680,3.028266,0.627044,0.068766,-0.497492,0.467685
5,-1.163557,0.104422,1.082868,-0.397029,-3.296363,-2.878734,-1.418440,0.467685
...,...,...,...,...,...,...,...,...
4655,1.635276,-1.337046,1.190968,1.710257,-0.671667,-1.057086,0.206893,0.467685
4656,2.296487,-0.787293,0.100543,-0.102332,-0.322927,-1.786802,0.118447,0.467685
4657,2.167530,-1.402840,0.512226,0.110387,-0.002831,-0.375974,-0.767273,-0.399329
4658,2.005894,-0.910735,0.281555,-0.393594,-1.399058,-0.052875,2.234582,0.467685


In [83]:
preproc.save_obj(scaled_inc_pca_df, "scaled_inc_pca_df")
preproc.save_obj(scaled_time_pca_df, "scaled_time_pca_df")
preproc.save_obj(scaled_coordinates_pca_df, "scaled_coordinates_pca_df")

-----

# grid points cluster analysis

## Area 1 

In [84]:
area1_int = pd.read_excel("../_CIPW/CIPW/AREA1/interpolated_data.xlsx", index_col=0)
area1_int = area1_int.dropna()
area1_int.drop(["oth", ], axis = 1, inplace = True)
area1_int = preproc.replace_zero(area1_int, 0.01)


In [85]:
area1_int.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 55413 entries, 2809 to 205544
Data columns (total 10 columns):
SiO2     55413 non-null float64
TiO2     55413 non-null float64
Al2O3    55413 non-null float64
Fe2O3    55413 non-null float64
FeO      55413 non-null float64
MnO      55413 non-null float64
MgO      55413 non-null float64
CaO      55413 non-null float64
Na2O     55413 non-null float64
K2O      55413 non-null float64
dtypes: float64(10)
memory usage: 4.7 MB


In [86]:
area1_int

Unnamed: 0,SiO2,TiO2,Al2O3,Fe2O3,FeO,MnO,MgO,CaO,Na2O,K2O
2809,70.222982,0.306007,15.037751,0.652161,2.100375,0.039497,1.063120,2.105502,3.870846,3.740172
2810,70.240242,0.305221,15.034542,0.652185,2.095248,0.039421,1.059408,2.098104,3.870482,3.742184
2811,70.253542,0.304611,15.032047,0.652144,2.091362,0.039362,1.056553,2.092448,3.870193,3.743731
2812,70.241073,0.312743,15.003450,0.619717,2.223126,0.042901,1.036778,2.105143,3.909642,3.782753
2813,70.243463,0.312634,15.002904,0.619474,2.222875,0.042899,1.036236,2.104256,3.909656,3.783102
...,...,...,...,...,...,...,...,...,...,...
205097,71.839678,0.283426,14.517509,0.948295,1.714704,0.045674,0.624198,1.249518,4.051043,4.229271
205541,71.822233,0.284638,14.521715,0.958462,1.712651,0.045724,0.626124,1.251051,4.052232,4.228004
205542,71.824211,0.284493,14.521284,0.957263,1.712859,0.045716,0.625929,1.250916,4.052057,4.228092
205543,71.826556,0.284328,14.520734,0.955941,1.713074,0.045707,0.625675,1.250699,4.051882,4.228246


In [87]:
area1_int_clr = preproc.clr(area1_int)
area1_int_clr.head()

Unnamed: 0,SiO2,TiO2,Al2O3,Fe2O3,FeO,MnO,MgO,CaO,Na2O,K2O
2809,3.617717,-1.818107,2.076606,-1.061423,0.108158,-3.865485,-0.57275,0.110596,0.719515,0.685173
2810,3.619309,-1.819333,2.077737,-1.06004,0.107059,-3.866074,-0.574903,0.108421,0.720766,0.687057
2811,3.620542,-1.820289,2.078616,-1.059059,0.106247,-3.866515,-0.576557,0.106766,0.721736,0.688514
2812,3.607553,-1.806754,2.063899,-1.122872,0.154534,-3.793236,-0.608263,0.100003,0.719065,0.686071
2813,3.607763,-1.806927,2.06404,-1.123088,0.154597,-3.79312,-0.608609,0.099758,0.719245,0.68634


In [88]:
area1_int_pca = preproc.pca(area1_int_clr)
preproc.pca_variance(area1_int_pca)

2 PCA components  out of 10 components with variance sum 0.9773112140707415 needed for obtaining sum of variance > 0.95


array([6.52307621e-01, 3.25003593e-01, 2.26887859e-02, 5.47025444e-31,
       1.72784347e-31, 1.36543201e-31, 8.31338518e-32, 6.65649746e-32,
       5.62173424e-32, 3.73045601e-32])

In [89]:
area1_int_df = preproc.create_pca_df(area1_int_pca, area1_int_clr)

In [90]:
area1_int_df

Unnamed: 0,PC01,PC02,PC03,PC04,PC05,PC06,PC07,PC08,PC09,PC10
2809,-0.871616,-0.061954,-0.178992,-3.667464e-16,-6.732535e-17,2.033403e-16,-1.077045e-16,-4.769786e-17,-2.249744e-16,-2.429622e-16
2810,-0.867083,-0.061057,-0.180370,-7.451425e-16,1.624210e-17,-1.705633e-16,-3.907189e-16,-2.133986e-16,-4.304799e-17,-2.793085e-16
2811,-0.863637,-0.060284,-0.181416,-9.220021e-16,6.328708e-18,-1.430243e-17,9.122456e-17,8.590459e-17,1.582475e-16,1.372954e-16
2812,-0.881656,-0.001032,-0.083585,-7.859609e-16,-2.094071e-16,-7.524761e-16,-1.179243e-16,1.622333e-16,2.295189e-16,-1.097492e-16
2813,-0.881230,-0.000526,-0.083510,-7.583104e-16,4.920769e-17,-2.476059e-16,-4.686063e-17,-2.128640e-16,5.045648e-16,-1.338189e-17
...,...,...,...,...,...,...,...,...,...,...
205097,-0.049269,-0.211667,0.023654,-3.985039e-16,1.049181e-16,-3.073668e-17,-2.316495e-16,8.972486e-17,4.643697e-16,-6.205094e-17
205541,-0.047306,-0.221904,0.022992,-5.638857e-16,1.370082e-18,-1.317205e-16,3.132650e-17,-9.524689e-17,-4.777928e-16,1.173006e-17
205542,-0.047587,-0.220724,0.022998,-6.280217e-16,4.503544e-16,-4.097656e-16,-8.273617e-17,2.521313e-16,-7.196328e-17,8.275516e-18
205543,-0.047827,-0.219399,0.023041,-7.122765e-16,2.530445e-17,1.490451e-16,-2.164953e-16,-1.068615e-16,-3.819137e-17,-1.552907e-17


In [91]:
preproc.save_obj(area1_int_df, "area1_int_df")

-----

## Area 2

In [92]:
area2_int = pd.read_excel("../_CIPW/CIPW/AREA2/interpolated_data.xlsx", index_col=0)
area2_int = area2_int.dropna()
area2_int.drop(["oth", ], axis = 1, inplace = True)
area2_int = preproc.replace_zero(area2_int, 0.01)

In [93]:
area2_int.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 80997 entries, 5679 to 235767
Data columns (total 10 columns):
SiO2     80997 non-null float64
TiO2     80997 non-null float64
Al2O3    80997 non-null float64
Fe2O3    80997 non-null float64
FeO      80997 non-null float64
MnO      80997 non-null float64
MgO      80997 non-null float64
CaO      80997 non-null float64
Na2O     80997 non-null float64
K2O      80997 non-null float64
dtypes: float64(10)
memory usage: 6.8 MB


In [94]:
area2_int

Unnamed: 0,SiO2,TiO2,Al2O3,Fe2O3,FeO,MnO,MgO,CaO,Na2O,K2O
5679,71.325684,0.336266,14.575051,1.924127,1.548361,0.080167,0.649239,1.485709,3.633363,3.820793
5680,71.334523,0.335790,14.571812,1.924360,1.546116,0.080114,0.647729,1.482653,3.633472,3.822544
5681,71.343460,0.335309,14.568532,1.924595,1.543847,0.080060,0.646204,1.479566,3.633581,3.824317
6029,71.315004,0.336840,14.578956,1.923846,1.551074,0.080232,0.651066,1.489404,3.633231,3.818681
6030,71.324390,0.336335,14.575525,1.924093,1.548690,0.080175,0.649460,1.486156,3.633347,3.820537
...,...,...,...,...,...,...,...,...,...,...
235415,70.343834,0.374706,15.029898,1.245316,2.075276,0.080455,0.931258,2.065778,3.615349,3.545129
235416,69.086644,0.442032,15.307041,1.380547,2.333729,0.087959,1.163224,2.505622,3.589204,3.379170
235417,69.084714,0.442142,15.307411,1.380848,2.334076,0.087971,1.163582,2.506285,3.589161,3.378941
235418,69.082825,0.442249,15.307773,1.381146,2.334414,0.087983,1.163931,2.506932,3.589120,3.378717


In [95]:
area2_int_clr = preproc.clr(area2_int)
area2_int_clr.head()

Unnamed: 0,SiO2,TiO2,Al2O3,Fe2O3,FeO,MnO,MgO,CaO,Na2O,K2O
5679,3.565327,-1.791783,1.977382,-0.047457,-0.264733,-3.225571,-1.133884,-0.306038,0.588229,0.638528
5680,3.566192,-1.792458,1.9779,-0.046595,-0.265442,-3.225497,-1.135472,-0.307355,0.589,0.639728
5681,3.567067,-1.79314,1.978425,-0.045723,-0.266161,-3.225422,-1.137079,-0.308689,0.58978,0.640941
6029,3.564283,-1.79097,1.976755,-0.048497,-0.263876,-3.225659,-1.131968,-0.304448,0.587299,0.637081
6030,3.5652,-1.791685,1.977306,-0.047583,-0.264629,-3.225581,-1.133652,-0.305845,0.588116,0.638353


In [96]:
area2_int_pca = preproc.pca(area2_int_clr)
preproc.pca_variance(area2_int_pca)

2 PCA components  out of 10 components with variance sum 1.0 needed for obtaining sum of variance > 0.95


array([7.47058428e-01, 2.52941572e-01, 1.10503556e-30, 1.98470893e-31,
       1.56975156e-31, 1.11055683e-31, 9.03270660e-32, 6.65789822e-32,
       6.20646259e-32, 3.62823923e-32])

In [97]:
area2_int_df = preproc.create_pca_df(area2_int_pca, area2_int_clr)

In [98]:
area2_int_df

Unnamed: 0,PC01,PC02,PC03,PC04,PC05,PC06,PC07,PC08,PC09,PC10
5679,0.404377,-0.641168,8.490958e-16,2.929988e-16,-4.239957e-16,4.226417e-16,7.677841e-17,-4.546072e-16,-2.328264e-16,-1.460760e-16
5680,0.407373,-0.641369,5.504198e-16,-2.745024e-16,-3.677782e-16,1.316523e-16,6.052416e-17,-1.464797e-16,1.304448e-17,1.553163e-16
5681,0.410405,-0.641572,9.099892e-16,3.713079e-16,2.164559e-16,-1.807361e-17,1.001039e-16,6.661919e-17,2.522757e-16,1.861644e-16
6029,0.400763,-0.640926,8.307272e-16,-1.006030e-17,-2.355624e-16,1.808371e-16,5.350751e-17,-2.361346e-16,-3.357920e-16,-7.033226e-17
6030,0.403939,-0.641139,6.107434e-16,2.983970e-16,-1.696658e-17,1.819489e-16,-9.767308e-17,-2.834820e-16,-4.416844e-16,-2.276861e-17
...,...,...,...,...,...,...,...,...,...,...
235415,-0.151609,-0.208043,6.051222e-16,-2.027507e-17,3.766853e-16,-4.453283e-16,-6.804779e-17,-6.826307e-17,-1.828744e-16,-1.086399e-16
235416,-0.417064,-0.303705,6.712210e-16,-8.265424e-16,2.275533e-16,-1.341333e-16,1.610870e-16,1.900142e-16,7.408841e-18,1.001307e-16
235417,-0.417416,-0.303912,7.975709e-16,-9.134360e-17,-1.191996e-16,1.267072e-16,-5.769684e-17,-2.464214e-17,-1.754152e-16,8.191294e-17
235418,-0.417759,-0.304116,9.698704e-16,-4.291248e-16,3.036520e-16,-8.411313e-17,-6.883899e-17,3.172698e-16,-1.992958e-16,1.362059e-16


In [99]:
preproc.save_obj(area2_int_df, "area2_int_df")

----

## Area 3

In [100]:
area3_int = pd.read_excel("../_CIPW/CIPW/AREA3/interpolated_data.xlsx", index_col=0)
area3_int = area3_int.dropna()
area3_int.drop(["oth", ], axis = 1, inplace = True)
area3_int = preproc.replace_zero(area3_int, 0.01)

In [101]:
area3_int.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 122714 entries, 24087 to 360787
Data columns (total 10 columns):
SiO2     122714 non-null float64
TiO2     122714 non-null float64
Al2O3    122714 non-null float64
Fe2O3    122714 non-null float64
FeO      122714 non-null float64
MnO      122714 non-null float64
MgO      122714 non-null float64
CaO      122714 non-null float64
Na2O     122714 non-null float64
K2O      122714 non-null float64
dtypes: float64(10)
memory usage: 10.3 MB


In [102]:
area3_int

Unnamed: 0,SiO2,TiO2,Al2O3,Fe2O3,FeO,MnO,MgO,CaO,Na2O,K2O
24087,68.967331,0.385195,15.493254,2.100784,2.128221,0.063870,1.005087,2.267443,3.407114,3.336449
24088,68.944036,0.386313,15.497973,2.104559,2.132595,0.063970,1.009427,2.274831,3.406487,3.333415
24089,68.920923,0.387428,15.502677,2.108133,2.136985,0.064069,1.013761,2.282212,3.405864,3.330416
24698,68.991563,0.383881,15.487023,2.102098,2.122082,0.063765,0.999979,2.258423,3.407755,3.339450
24699,68.965146,0.385153,15.492443,2.106197,2.127099,0.063878,1.004909,2.266837,3.407046,3.336009
...,...,...,...,...,...,...,...,...,...,...
360180,65.291125,0.605366,16.253105,1.070777,3.431397,0.075659,1.996416,3.978195,3.286069,3.008045
360784,65.903204,0.572190,16.193187,1.029246,3.302574,0.073447,1.833952,3.734468,3.306430,3.072428
360785,65.887846,0.573016,16.194780,1.030744,3.305576,0.073506,1.837927,3.740422,3.305934,3.070726
360786,66.037233,0.564805,16.176964,1.046303,3.259607,0.073087,1.797860,3.675090,3.311253,3.083656


In [103]:
area3_int_clr = preproc.clr(area3_int)
area3_int_clr.head()

Unnamed: 0,SiO2,TiO2,Al2O3,Fe2O3,FeO,MnO,MgO,CaO,Na2O,K2O
24087,3.431511,-1.756129,1.938283,-0.059811,-0.046835,-3.553032,-0.797047,0.016531,0.423744,0.402785
24088,3.429699,-1.754704,1.937113,-0.05949,-0.046257,-3.552944,-0.794214,0.01831,0.422086,0.400401
24089,3.427904,-1.75328,1.935957,-0.059253,-0.045659,-3.552856,-0.791388,0.02009,0.420443,0.398042
24698,3.4334,-1.758007,1.939418,-0.057648,-0.048187,-3.553143,-0.800605,0.014082,0.425469,0.405222
24699,3.431341,-1.756374,1.938092,-0.057377,-0.047501,-3.553042,-0.797363,0.016125,0.423585,0.402515


In [104]:
area3_int_pca = preproc.pca(area3_int_clr)
preproc.pca_variance(area3_int_pca)

2 PCA components  out of 10 components with variance sum 0.9999999999999999 needed for obtaining sum of variance > 0.95


array([5.97909591e-01, 4.02090409e-01, 8.77184909e-31, 1.17746878e-31,
       1.14806577e-31, 9.32427464e-32, 6.74254388e-32, 5.32834492e-32,
       5.00893230e-32, 3.20818355e-32])

In [105]:
area3_int_df = preproc.create_pca_df(area3_int_pca, area3_int_clr)

In [106]:
area3_int_df

Unnamed: 0,PC01,PC02,PC03,PC04,PC05,PC06,PC07,PC08,PC09,PC10
24087,0.375124,-0.910685,3.768294e-16,8.713247e-17,-4.833689e-16,-3.147789e-16,8.422070e-17,-3.322899e-17,3.173369e-17,4.509341e-17
24088,0.371535,-0.914409,3.199339e-16,-1.524910e-17,-2.355615e-16,-5.392535e-16,2.597432e-16,-5.490657e-16,1.152151e-16,-2.421905e-16
24089,0.367905,-0.918050,6.065883e-16,9.342967e-17,-1.425713e-16,-4.439968e-16,1.034926e-17,1.162314e-16,-3.170630e-16,-1.504354e-17
24698,0.381191,-0.908308,9.970303e-16,-1.303746e-16,-2.463894e-16,2.275078e-17,-2.975128e-16,6.914313e-17,3.988292e-17,-2.098070e-16
24699,0.377027,-0.912484,4.755348e-16,2.495077e-16,3.440318e-16,-1.695928e-16,2.139599e-16,-2.698483e-16,-2.390419e-16,2.352277e-16
...,...,...,...,...,...,...,...,...,...,...
360180,-0.840538,-0.816965,6.791533e-16,5.080986e-16,-4.135054e-16,-4.680344e-16,1.436105e-17,1.022127e-16,-1.111561e-16,-1.506044e-16
360784,-0.771534,-0.737394,8.783340e-16,8.504456e-17,-2.122089e-16,-1.657295e-16,1.435211e-16,-9.962480e-17,-8.264941e-17,3.747353e-18
360785,-0.772994,-0.739737,8.559951e-16,2.236129e-16,2.517945e-16,-2.141132e-17,9.047226e-17,-3.397317e-16,-2.364786e-19,-9.182057e-17
360786,-0.738209,-0.737369,7.869737e-16,2.321319e-16,-3.041077e-16,1.605639e-16,1.276032e-16,-2.205236e-16,2.022731e-17,3.091396e-17


In [107]:
preproc.save_obj(area3_int_df, "area3_int_df")

----

## Area 5

In [109]:
area5_int = pd.read_excel("../_CIPW/CIPW/AREA5/interpolated_data.xlsx", index_col=0)
area5_int = area5_int.dropna()
area5_int.drop(["oth", ], axis = 1, inplace = True)
area5_int = preproc.replace_zero(area5_int, 0.01)

In [110]:
area5_int.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 177302 entries, 5249 to 363040
Data columns (total 10 columns):
SiO2     177302 non-null float64
TiO2     177302 non-null float64
Al2O3    177302 non-null float64
Fe2O3    177302 non-null float64
FeO      177302 non-null float64
MnO      177302 non-null float64
MgO      177302 non-null float64
CaO      177302 non-null float64
Na2O     177302 non-null float64
K2O      177302 non-null float64
dtypes: float64(10)
memory usage: 14.9 MB


In [111]:
area5_int

Unnamed: 0,SiO2,TiO2,Al2O3,Fe2O3,FeO,MnO,MgO,CaO,Na2O,K2O
5249,64.492596,0.779755,16.021952,1.304893,3.469333,0.117919,2.322823,3.577399,3.188924,3.516064
5250,64.492298,0.779772,16.021955,1.304972,3.469331,0.117914,2.322968,3.577482,3.188925,3.516038
5251,64.492001,0.779789,16.021957,1.305049,3.469331,0.117910,2.323112,3.577564,3.188927,3.516012
5252,64.491707,0.779805,16.021960,1.305126,3.469332,0.117906,2.323254,3.577646,3.188928,3.515986
5253,64.491416,0.779822,16.021962,1.305200,3.469334,0.117903,2.323393,3.577727,3.188929,3.515960
...,...,...,...,...,...,...,...,...,...,...
363036,70.138919,0.345290,15.337498,0.637838,2.070944,0.049323,0.858231,2.102867,3.323969,4.203429
363037,70.138862,0.345294,15.337505,0.637821,2.070978,0.049323,0.858241,2.102886,3.323962,4.203431
363038,70.138820,0.345297,15.337510,0.637804,2.071009,0.049323,0.858247,2.102901,3.323956,4.203433
363039,70.138792,0.345299,15.337513,0.637787,2.071034,0.049323,0.858251,2.102911,3.323950,4.203437


In [112]:
area5_int_clr = preproc.clr(area5_int)
area5_int_clr.head()

Unnamed: 0,SiO2,TiO2,Al2O3,Fe2O3,FeO,MnO,MgO,CaO,Na2O,K2O
5249,3.1067,-1.308625,1.71411,-0.793729,0.184112,-3.197612,-0.217067,0.214786,0.099833,0.197492
5250,3.106684,-1.308616,1.714098,-0.793681,0.1841,-3.197659,-0.217016,0.214797,0.099822,0.197472
5251,3.106667,-1.308607,1.714086,-0.793633,0.184088,-3.197705,-0.216967,0.214808,0.09981,0.197453
5252,3.10665,-1.308597,1.714074,-0.793587,0.184076,-3.19775,-0.216917,0.214819,0.099799,0.197434
5253,3.106634,-1.308588,1.714062,-0.793542,0.184065,-3.197794,-0.216869,0.21483,0.099787,0.197415


In [113]:
area5_int_pca = preproc.pca(area5_int_clr)
preproc.pca_variance(area5_int_pca)

3 PCA components  out of 10 components with variance sum 1.0 needed for obtaining sum of variance > 0.95


array([6.09260700e-01, 2.32458752e-01, 1.58280547e-01, 4.55190112e-31,
       1.10374352e-31, 1.06795994e-31, 7.12835014e-32, 5.47153389e-32,
       5.29792506e-32, 4.75457042e-32])

In [114]:
area5_int_df = preproc.create_pca_df(area5_int_pca, area5_int_clr)

In [115]:
area5_int_df

Unnamed: 0,PC01,PC02,PC03,PC04,PC05,PC06,PC07,PC08,PC09,PC10
5249,-1.404205,-0.816380,0.364863,2.230240e-16,-7.458378e-16,-2.630968e-16,-1.268764e-16,-7.652368e-17,1.018737e-16,-7.035127e-17
5250,-1.404230,-0.816436,0.364794,2.650962e-16,-9.224636e-17,-2.022676e-16,-5.048884e-16,-3.639250e-16,4.520836e-16,-3.139251e-16
5251,-1.404254,-0.816491,0.364727,4.707145e-16,-7.755778e-16,-3.659371e-17,-4.372298e-17,-1.587913e-16,3.396261e-16,-2.958054e-16
5252,-1.404279,-0.816545,0.364661,-1.321715e-16,-1.097337e-16,2.022198e-16,-8.980674e-17,-3.773590e-16,6.304860e-18,-2.904494e-16
5253,-1.404303,-0.816598,0.364597,6.065224e-16,-5.430471e-16,7.126414e-17,-2.187345e-17,-9.417185e-17,1.608373e-16,-3.403836e-16
...,...,...,...,...,...,...,...,...,...,...
363036,-0.515891,0.133543,0.090137,6.663693e-16,-2.048241e-16,2.186516e-16,-8.885714e-18,1.340709e-16,2.209199e-16,7.197480e-17
363037,-0.515920,0.133563,0.090145,1.017894e-15,2.111895e-16,-1.830772e-16,-1.232119e-17,-3.552539e-16,-6.498558e-17,-9.952099e-17
363038,-0.515945,0.133584,0.090154,2.830538e-16,-6.062016e-17,-2.149024e-16,4.913307e-16,4.457311e-17,-1.896184e-16,-4.111851e-17
363039,-0.515965,0.133605,0.090162,7.057447e-16,-2.340290e-17,3.017164e-16,-2.747837e-17,-1.895714e-16,3.624160e-16,-7.721357e-17


In [116]:
preproc.save_obj(area5_int_df, "area5_int_df")

---