# Preprocessing

In [510]:
import numpy as np
import pandas as pd
import os
import pickle
import utm
import random

In [511]:
import prepostprocessing.cleaning as cleaning
import prepostprocessing.pre_processing as preproc

from qapf import qapf
from qapf import cipw

In [512]:
from sklearn.preprocessing import StandardScaler

In [513]:
# Load jupyter extension to reload packages before executing user code.
# https://ipython.readthedocs.io/en/stable/config/extensions/autoreload.html
%load_ext autoreload
# Reload all packages (except those excluded by %aimport) every time before executing the Python code typed.
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Mineralogy

**To do**
* ~~Clean last points in Excel file while using "sum" as check~~

In [356]:
mineralogy = pd.read_excel("../_CLEANED/Vistelius_data_cleaned_all.xlsx", index_col=0)

mineralogyP = pd.read_excel("../_CLEANED/Vistelius_data_cleaned_all.xlsx", index_col=0)

### Check for wrong entries

In [357]:
np.isclose(mineralogy.loc[:, :"oth"].sum(axis=1), mineralogy.loc[:, "sum"])

array([ True,  True,  True, ...,  True,  True,  True])

In [358]:
wrong_sum = mineralogy.loc[~np.isclose(mineralogy.loc[:, :"oth"].sum(axis=1), mineralogy.loc[:, "sum"]), :]

In [359]:
# Check to see if any remaining incorrect lines are present
wrong_sum

Unnamed: 0,SiO2,TiO2,Al2O3,Fe2O3,FeO,MnO,MgO,CaO,Na2O,K2O,P2O5,l.i.,oth,sum,hs


In [360]:
mineralogy.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4659 entries, 1 to 4659
Data columns (total 15 columns):
SiO2     4659 non-null float64
TiO2     4626 non-null float64
Al2O3    4659 non-null float64
Fe2O3    4657 non-null float64
FeO      4659 non-null float64
MnO      4544 non-null float64
MgO      4658 non-null float64
CaO      4659 non-null float64
Na2O     4659 non-null float64
K2O      4659 non-null float64
P2O5     3834 non-null float64
l.i.     4659 non-null float64
oth      874 non-null float64
sum      4659 non-null float64
hs       2240 non-null float64
dtypes: float64(15)
memory usage: 582.4 KB


In [361]:
wrong_sum.loc[:, :"oth"].sum(axis=1)

Series([], dtype: float64)

In [362]:
mineralogy = preproc.replace_nan(mineralogy, 0)
mineralogyP = preproc.replace_nan(mineralogyP, 0)

In [363]:
mineralogy['oth'] = mineralogy['P2O5'] + mineralogy['l.i.'] + mineralogy['oth']
mineralogy.drop(["l.i.", "P2O5", ], axis = 1, inplace = True)
mineralogyP.drop(["l.i.", ], axis = 1, inplace = True)

In [364]:
mineralogyP.head()

Unnamed: 0,SiO2,TiO2,Al2O3,Fe2O3,FeO,MnO,MgO,CaO,Na2O,K2O,P2O5,oth,sum,hs
1,80.8,0.04,10.16,0.61,1.72,0.0,0.4,0.55,2.0,3.59,0.0,0.0,100.22,0.0
2,80.0,0.1,10.1,0.17,0.56,0.02,0.4,0.35,2.3,5.1,0.05,0.0,99.65,0.0
3,79.92,0.05,9.89,0.16,1.73,0.02,0.12,0.14,0.75,6.15,0.0,0.08,100.03,0.3
4,79.65,0.04,9.64,1.15,0.75,0.1,0.45,0.67,3.71,4.25,0.0,0.0,100.67,0.01
5,79.18,0.08,10.24,0.64,2.6,0.04,0.05,1.25,1.52,3.08,0.01,0.16,100.6,0.28


### Cleaning
**To do**
* ~~Replace zero values~~


In [365]:
# Would not do this this way since it becomes less clear what the variable means
# You should also replace 'minralogy' in all remaining cells by 'x' if you would want to do this
# x = mineralogy

In [366]:
# Replace zero values
mineralogy = preproc.replace_zero(mineralogy, 0.01)
mineralogyP = preproc.replace_zero(mineralogyP, 0.01)

* ~~Normalize~~

In [367]:
# Normalize specific columns
mineralogy.loc[:, :"oth"] = preproc.normalize(mineralogy.loc[:, :"oth"])# , total=mineralogy['sum'])
mineralogyP.loc[:, :"oth"] = preproc.normalize(mineralogyP.loc[:, :"oth"])# , total=mineralogy['sum'])

In [368]:
# Renew 'sum' column to reflect changes applied during cleaning
mineralogy["sum"] = mineralogy.loc[:, :"oth"].sum(axis=1)
mineralogyP["sum"] = mineralogyP.loc[:, :"oth"].sum(axis=1)

In [369]:
# Check that sum of all variables + 'sum' == 200
assert all(np.isclose(mineralogy.loc[:, :'sum'].sum(axis=1), 200.0))
assert all(np.isclose(mineralogyP.loc[:, :'sum'].sum(axis=1), 200.0))

### centred log-ratio (clr) transformation

In [370]:
mineralogy1 = mineralogy.drop(columns=['oth', 'hs', 'sum'])
mineralogyP1 = mineralogyP.drop(columns=['oth', 'hs', 'sum'])

In [371]:
mineralogy_clr1 = preproc.clr(mineralogy1)
mineralogy_clr1.head()

mineralogy_clrP1 = preproc.clr(mineralogyP1)
mineralogy_clrP1.head()

Unnamed: 0,SiO2,TiO2,Al2O3,Fe2O3,FeO,MnO,MgO,CaO,Na2O,K2O,P2O5
1,4.865939,-2.744914,2.79242,-0.020334,1.016286,-4.131208,-0.442329,-0.123875,1.167109,1.752114,-4.131208
2,4.779437,-1.905174,2.709946,-1.374546,-0.182408,-3.514612,-0.51888,-0.652412,1.23032,2.026651,-2.598322
3,5.170339,-2.206419,3.080837,-1.043269,1.337434,-3.12271,-1.330951,-1.1768,0.501631,2.605765,-3.815857
4,4.566008,-3.03051,2.454287,0.328128,-0.099316,-2.114219,-0.610142,-0.212112,1.499398,1.635285,-4.416805
5,4.76913,-2.128322,2.723708,-0.04888,1.352918,-2.821469,-2.598326,0.62055,0.816117,1.522336,-4.207763


### Principal Component Analysis (PCA)

In [372]:
mineralogy_pca1 = preproc.pca(mineralogy_clr1)
preproc.pca_variance(mineralogy_pca1)

6 PCA components  out of 10 components with variance sum 0.9653845247577469 needed for obtaining sum of variance > 0.95


array([4.36495556e-01, 2.24309391e-01, 1.14379085e-01, 7.92733011e-02,
       6.47910351e-02, 4.61361562e-02, 2.56884815e-02, 7.25584036e-03,
       1.67115334e-03, 2.35922349e-32])

In [373]:
mineralogy_pca_df1 = preproc.create_pca_df(mineralogy_pca1, mineralogy_clr1)

In [374]:
mineralogy_pca_df1

Unnamed: 0,PC01,PC02,PC03,PC04,PC05,PC06,PC07,PC08,PC09,PC10
1,-0.989677,0.063658,-1.077309,0.949276,0.196830,-1.052766,-0.196021,0.313477,0.228518,-3.805685e-16
2,-1.170683,0.906362,-0.457936,0.315585,0.836318,0.547351,-0.165217,0.278892,0.207743,2.205780e-16
3,-1.952305,1.084163,0.126341,-0.059648,0.787068,-1.028997,-0.907513,0.941895,0.141972,-5.419556e-17
4,-1.265500,-0.665278,0.792680,1.554351,0.290969,0.026927,-0.145361,-0.158186,0.272325,-2.268773e-16
5,-1.401423,0.090158,0.667637,-0.203788,-1.529623,-1.127234,-0.414450,0.462700,0.259138,-2.570066e-16
...,...,...,...,...,...,...,...,...,...,...
4655,1.969576,-1.154415,0.734286,0.877843,-0.311676,-0.413926,0.060452,-0.189310,-0.243017,1.875335e-16
4656,2.765959,-0.679754,0.061989,-0.052525,-0.149849,-0.699663,0.034609,0.112326,-0.131631,2.651447e-18
4657,2.610638,-1.211222,0.315811,0.056660,-0.001314,-0.147221,-0.224187,0.059057,-0.149794,-8.143807e-16
4658,2.415960,-0.786335,0.173592,-0.202025,-0.649209,-0.020704,0.652916,0.082545,-0.300729,2.716508e-17


In [375]:
preproc.save_obj(mineralogy_pca_df1, "mineralogy_pca_df1")

In [376]:
metadata = pd.read_excel("../_INTERPOLATION/coordinates_full_data.xlsx", index_col=0, usecols=[0, 23, 24, 25, 26, 27, 28])

In [377]:
preproc.save_obj(metadata, "metadata")

In [378]:
preproc.save_obj(mineralogy_clr1, "mineralogy_clr1")

----

### making time and coordinate classes

In [379]:
time = pd.read_excel("../_RESULTS/time_fix.xlsx", index_col=0, usecols=[0, 9])
coordinates = pd.read_excel("../_INTERPOLATION/coordinates_UTM.xlsx", index_col=0, usecols=[0, 1, 2])

In [380]:
classification_renaming_dict = {"Tr" : "1",
                                "Tr-J" : "2",
                                "J" : "3",
                                "K" : "4",
                                "Pg" : "5",
                                "Mz" : "6"}   

In [381]:
time['time'] = time['time'].replace(classification_renaming_dict)

In [382]:
coordinates

Unnamed: 0,Y_UTM,X_UTM
1,-1.119047e+06,5.136464e+06
2,-2.003327e+06,6.758106e+06
3,3.254870e+05,6.945620e+06
4,-1.851743e+05,6.896371e+06
5,7.165453e+05,7.654590e+06
...,...,...
4655,7.373749e+05,7.408189e+06
4656,-1.164355e+06,5.436888e+06
4657,-7.101936e+05,6.640923e+06
4658,-1.218014e+05,6.800765e+06


# Feature engineering: Frost

## Fe_Number

In [383]:
mineralogy_clr1

Unnamed: 0,SiO2,TiO2,Al2O3,Fe2O3,FeO,MnO,MgO,CaO,Na2O,K2O
1,4.452818,-3.158035,2.379300,-0.433455,0.603165,-4.544329,-0.855450,-0.536996,0.753988,1.338993
2,4.519605,-2.165007,2.450114,-1.634378,-0.442240,-3.774445,-0.778712,-0.912244,0.970488,1.766819
3,4.788753,-2.588005,2.699251,-1.424854,0.955849,-3.504296,-1.712536,-1.558386,0.120045,2.224179
4,4.124327,-3.472191,2.012606,-0.113553,-0.540997,-2.555900,-1.051822,-0.653792,1.057717,1.193604
5,4.348354,-2.549098,2.302932,-0.469657,0.932142,-3.242245,-3.019102,0.199774,0.395341,1.101560
...,...,...,...,...,...,...,...,...,...,...
4655,2.635045,-2.240153,1.668789,0.110001,0.304876,-2.812672,0.033822,0.783394,0.117309,-0.600410
4656,2.640723,-1.289140,1.587809,-0.329790,0.906973,-3.591725,0.334201,0.846209,-0.310814,-0.794444
4657,2.529955,-1.250274,1.502001,0.047427,0.262462,-3.222617,0.374695,0.744131,-0.404219,-0.583560
4658,2.737015,-1.059568,1.909712,-0.357610,0.248526,-3.484370,-0.177484,1.059988,0.219398,-1.095607


In [384]:
Fe_num1 = pd.read_excel("../_CLEANED/Vistelius_data_cleaned_all.xlsx", index_col=0, usecols = 0)

  usecols = _maybe_convert_usecols(usecols)


In [385]:
Fe_num1['Fe'] = mineralogy_clr1['FeO'] + mineralogy_clr1['Fe2O3'] 
Fe_num1['Fe+Mg']= mineralogy_clr1['MgO'] +mineralogy_clr1['FeO'] + mineralogy_clr1['Fe2O3']
Fe_num1['SiO2'] = mineralogy_clr1['SiO2']

In [386]:
Fe_x1 = Fe_num1.iloc[:,:1]
Fe_x2 = Fe_num1.iloc[:,1:2]
Fe_x3 = Fe_num1.iloc[:,2:3]

#Fe_x4 = time
#Fe_x5 = coordinates.iloc[:,:1]
#Fe_x6 = coordinates.iloc[:,1:2]

In [387]:
#Fe_x4

In [388]:
Fe_num1

Unnamed: 0,Fe,Fe+Mg,SiO2
1,0.169710,-0.685739,4.452818
2,-2.076618,-2.855331,4.519605
3,-0.469006,-2.181542,4.788753
4,-0.654550,-1.706372,4.124327
5,0.462485,-2.556617,4.348354
...,...,...,...
4655,0.414876,0.448698,2.635045
4656,0.577183,0.911383,2.640723
4657,0.309888,0.684583,2.529955
4658,-0.109084,-0.286567,2.737015


In [389]:
scaler1 = StandardScaler().fit(Fe_x1)
scaler2 = StandardScaler().fit(Fe_x2)
scaler3 = StandardScaler().fit(Fe_x3)

#scaler4 = StandardScaler().fit(Fe_x4)
#scaler5 = StandardScaler().fit(Fe_x5)
#scaler6 = StandardScaler().fit(Fe_x6)

In [390]:
Fe_x1_scaled = scaler1.transform(Fe_x1)
Fe_x2_scaled = scaler2.transform(Fe_x2)
Fe_x3_scaled = scaler3.transform(Fe_x3)

#Fe_x4_scaled = scaler4.transform(Fe_x4)
#Fe_x5_scaled = scaler5.transform(Fe_x5)
#Fe_x6_scaled = scaler6.transform(Fe_x6)

In [391]:
Fe_x1_df = pd.DataFrame(data =Fe_x1_scaled,columns=['Fe'])
Fe_x2_df = pd.DataFrame(data =Fe_x2_scaled,columns=['Fe+Mg'])
Fe_x3_df = pd.DataFrame(data =Fe_x3_scaled,columns=['SiO2'])

#Fe_x4_df = pd.DataFrame(data =Fe_x4_scaled,columns=['time'])
#Fe_x5_df = pd.DataFrame(data =Fe_x5_scaled,columns=['Y_UTM'])
#Fe_x6_df = pd.DataFrame(data =Fe_x6_scaled,columns=['X_UTM'])

In [392]:
Fe_num = pd.concat([Fe_num1.iloc[:,:0], Fe_x1_df, Fe_x2_df], axis=1, sort=True)
Fe_num.index = np.arange(1, len(Fe_num) + 1)
Fe_num = Fe_num.dropna()

In [393]:
Fe_num

Unnamed: 0,Fe,Fe+Mg
1,1.324521,1.004741
2,-1.884229,-1.104543
3,0.412153,-0.449484
4,0.147114,0.012478
5,1.742733,-0.814132
...,...,...
4655,1.674727,2.107645
4656,1.906572,2.557469
4657,1.524758,2.336973
4658,0.926281,1.392818


In [394]:
preproc.save_obj(Fe_num, "Fe_num")

In [395]:
Fe_num

Unnamed: 0,Fe,Fe+Mg
1,1.324521,1.004741
2,-1.884229,-1.104543
3,0.412153,-0.449484
4,0.147114,0.012478
5,1.742733,-0.814132
...,...,...
4655,1.674727,2.107645
4656,1.906572,2.557469
4657,1.524758,2.336973
4658,0.926281,1.392818


In [396]:
Fe_num_pca = preproc.pca(Fe_num)
preproc.pca_variance(Fe_num_pca)

2 PCA components  out of 2 components with variance sum 1.0 needed for obtaining sum of variance > 0.95


array([0.83456435, 0.16543565])

In [397]:
Fe_num_pca_df = preproc.create_pca_df(Fe_num_pca, Fe_num)

In [398]:
preproc.save_obj(Fe_num_pca_df, "Fe_num_pca_df")

## MALI

In [399]:
MALI1 = pd.read_excel("../_CLEANED/Vistelius_data_cleaned_all.xlsx", index_col=0, usecols = 0)

  usecols = _maybe_convert_usecols(usecols)


In [400]:
MALI1['Na2O'] = mineralogy_clr1['Na2O']
MALI1['K2O'] = mineralogy_clr1['K2O']
MALI1['CaO'] = mineralogy_clr1['CaO']
MALI1['SiO2'] = mineralogy_clr1['SiO2']


In [401]:
MALI_x1 = MALI1.iloc[:,:1]
MALI_x2 = MALI1.iloc[:,1:2]
MALI_x3 = MALI1.iloc[:,2:3]
MALI_x4 = MALI1.iloc[:,3:4]

MALI_x5 = time

In [402]:
MALI1

Unnamed: 0,Na2O,K2O,CaO,SiO2
1,0.753988,1.338993,-0.536996,4.452818
2,0.970488,1.766819,-0.912244,4.519605
3,0.120045,2.224179,-1.558386,4.788753
4,1.057717,1.193604,-0.653792,4.124327
5,0.395341,1.101560,0.199774,4.348354
...,...,...,...,...
4655,0.117309,-0.600410,0.783394,2.635045
4656,-0.310814,-0.794444,0.846209,2.640723
4657,-0.404219,-0.583560,0.744131,2.529955
4658,0.219398,-1.095607,1.059988,2.737015


In [403]:
scaler1 = StandardScaler().fit(MALI_x1)
scaler2 = StandardScaler().fit(MALI_x2)
scaler3 = StandardScaler().fit(MALI_x3)
scaler4 = StandardScaler().fit(MALI_x4)

scaler5 = StandardScaler().fit(MALI_x5)

In [404]:
MALI_x1_scaled = scaler1.transform(MALI_x1)
MALI_x2_scaled = scaler2.transform(MALI_x2)
MALI_x3_scaled = scaler3.transform(MALI_x3)
MALI_x4_scaled = scaler4.transform(MALI_x4)

MALI_x5_scaled = scaler5.transform(MALI_x5)

In [405]:
MALI_x1_scaled = pd.DataFrame(data =MALI_x1_scaled,columns=['Na2O'])
MALI_x2_scaled = pd.DataFrame(data =MALI_x2_scaled,columns=['K2O'])
MALI_x3_scaled = pd.DataFrame(data =MALI_x3_scaled,columns=['CaO'])
MALI_x4_scaled = pd.DataFrame(data =MALI_x4_scaled,columns=['SiO2'])

MALI_x5_scaled = pd.DataFrame(data =MALI_x5_scaled,columns=['time'])

In [406]:
MALI = pd.concat([MALI1.iloc[:,:0], MALI_x1_scaled, MALI_x2_scaled, MALI_x3_scaled], axis=1, sort=True)
MALI.index = np.arange(1, len(MALI) + 1)
MALI = MALI.dropna()

In [407]:
MALI

Unnamed: 0,Na2O,K2O,CaO
1,0.122888,0.940794,-0.636242
2,0.628920,1.627214,-1.229407
3,-1.358851,2.361020,-2.250782
4,0.832804,0.707527,-0.820865
5,-0.715393,0.559848,0.528392
...,...,...,...
4655,-1.365247,-2.170856,1.450937
4656,-2.365914,-2.482172,1.550231
4657,-2.584233,-2.143822,1.388872
4658,-1.126630,-2.965370,1.888157


In [408]:
preproc.save_obj(MALI, "MALI")

In [409]:
MALI_pca = preproc.pca(MALI)
preproc.pca_variance(MALI_pca)

3 PCA components  out of 3 components with variance sum 1.0 needed for obtaining sum of variance > 0.95


array([0.75985197, 0.13716298, 0.10298505])

In [410]:
MALI_pca_df = preproc.create_pca_df(MALI_pca, MALI)

In [411]:
preproc.save_obj(MALI_pca_df, "MALI_pca_df")

-----

## ASI

In [412]:
ASI = pd.read_excel("../_CLEANED/Vistelius_data_cleaned_all.xlsx", index_col=0, usecols = 0)

  usecols = _maybe_convert_usecols(usecols)


In [413]:
ASI['Al'] = mineralogy_clrP1['Al2O3']
ASI['Ca-P+Na+K'] = mineralogy_clrP1['Al2O3'] - (1.67* mineralogy_clrP1['P2O5']) + mineralogy_clrP1['Na2O'] + mineralogy_clrP1['K2O']


In [414]:
ASI

Unnamed: 0,Al,Ca-P+Na+K
1,2.792420,12.610761
2,2.709946,10.306114
3,3.080837,12.560714
4,2.454287,12.965033
5,2.723708,12.089127
...,...,...
4655,1.834805,4.456205
4656,1.827464,5.203770
4657,1.697006,4.355827
4658,2.087740,4.540664


In [415]:
preproc.save_obj(ASI, "ASI")

In [416]:
ASI_pca = preproc.pca(ASI)
preproc.pca_variance(ASI_pca)

1 PCA components  out of 2 components with variance sum 0.9924299020615133 needed for obtaining sum of variance > 0.95


array([0.9924299, 0.0075701])

In [417]:
ASI_pca_df = preproc.create_pca_df(ASI_pca, ASI)

In [418]:
preproc.save_obj(ASI_pca_df, "ASI_pca_df")

------

# incorporating coordinates and timeclass

In [419]:
mineralogy_inc_pca_df = mineralogy_pca_df1.iloc[:, 0: 7]
time = pd.read_excel("../_RESULTS/time_fix.xlsx", index_col=0, usecols=[0, 9])
coordinates = pd.read_excel("../_INTERPOLATION/coordinates_UTM.xlsx", index_col=0, usecols=[0, 1, 2])

**remark** : make sure to use the right coordinate system

In [420]:
coordinates

Unnamed: 0,Y_UTM,X_UTM
1,-1.119047e+06,5.136464e+06
2,-2.003327e+06,6.758106e+06
3,3.254870e+05,6.945620e+06
4,-1.851743e+05,6.896371e+06
5,7.165453e+05,7.654590e+06
...,...,...
4655,7.373749e+05,7.408189e+06
4656,-1.164355e+06,5.436888e+06
4657,-7.101936e+05,6.640923e+06
4658,-1.218014e+05,6.800765e+06


In [421]:
classification_renaming_dict = {"Tr" : "1",
                                "Tr-J" : "2",
                                "J" : "3",
                                "K" : "4",
                                "Pg" : "5",
                                "Mz" : "6"}                         

In [422]:
time['time'] = time['time'].replace(classification_renaming_dict)

In [423]:
x8 = time

In [424]:
mineralogy_inc_pca_df

Unnamed: 0,PC01,PC02,PC03,PC04,PC05,PC06,PC07
1,-0.989677,0.063658,-1.077309,0.949276,0.196830,-1.052766,-0.196021
2,-1.170683,0.906362,-0.457936,0.315585,0.836318,0.547351,-0.165217
3,-1.952305,1.084163,0.126341,-0.059648,0.787068,-1.028997,-0.907513
4,-1.265500,-0.665278,0.792680,1.554351,0.290969,0.026927,-0.145361
5,-1.401423,0.090158,0.667637,-0.203788,-1.529623,-1.127234,-0.414450
...,...,...,...,...,...,...,...
4655,1.969576,-1.154415,0.734286,0.877843,-0.311676,-0.413926,0.060452
4656,2.765959,-0.679754,0.061989,-0.052525,-0.149849,-0.699663,0.034609
4657,2.610638,-1.211222,0.315811,0.056660,-0.001314,-0.147221,-0.224187
4658,2.415960,-0.786335,0.173592,-0.202025,-0.649209,-0.020704,0.652916


In [425]:
x1 = mineralogy_inc_pca_df.iloc[:,:1]
x2 = mineralogy_inc_pca_df.iloc[:,1:2]
x3 = mineralogy_inc_pca_df.iloc[:,2:3]
x4 = mineralogy_inc_pca_df.iloc[:,3:4]
x5 = mineralogy_inc_pca_df.iloc[:,4:5]
x6 = mineralogy_inc_pca_df.iloc[:,5:6]
x7 = mineralogy_inc_pca_df.iloc[:,6:7]

x9 = coordinates.iloc[:,:1]
x10 = coordinates.iloc[:,1:2]

In [426]:
scaler1 = StandardScaler().fit(x1)
scaler2 = StandardScaler().fit(x2)
scaler3 = StandardScaler().fit(x3)
scaler4 = StandardScaler().fit(x4)
scaler5 = StandardScaler().fit(x5)
scaler6 = StandardScaler().fit(x6)
scaler7 = StandardScaler().fit(x7)
scaler8 = StandardScaler().fit(x8)
scaler9 = StandardScaler().fit(x9)
scaler10 = StandardScaler().fit(x10)

In [427]:
x1_scaled = scaler1.transform(x1)
x2_scaled = scaler2.transform(x2)
x3_scaled = scaler3.transform(x3)
x4_scaled = scaler4.transform(x4)
x5_scaled = scaler5.transform(x5)
x6_scaled = scaler6.transform(x6)
x7_scaled = scaler7.transform(x7)
x8_scaled = scaler8.transform(x8)
x9_scaled = scaler9.transform(x9)
x10_scaled = scaler10.transform(x10)

In [428]:
scaler9.mean_

array([9362012.22323719])

In [429]:
scaled_inc_pca_df = mineralogy_inc_pca_df.iloc[:,:0]

In [430]:
x1_df = pd.DataFrame(data =x1_scaled,columns=['PC01'])
x2_df = pd.DataFrame(data =x2_scaled,columns=['PC02'])
x3_df = pd.DataFrame(data =x3_scaled,columns=['PC03'])
x4_df = pd.DataFrame(data =x4_scaled,columns=['PC04'])
x5_df = pd.DataFrame(data =x5_scaled,columns=['PC05'])
x6_df = pd.DataFrame(data =x6_scaled,columns=['PC06'])
x7_df = pd.DataFrame(data =x7_scaled,columns=['PC07'])
x8_df = pd.DataFrame(data =x8_scaled,columns=['time'])
x9_df = pd.DataFrame(data =x9_scaled,columns=['X_UTM'])
x10_df = pd.DataFrame(data =x10_scaled,columns=['Y_UTM'])

In [431]:
scaled_inc_pca_df = pd.concat([mineralogy_inc_pca_df.iloc[:,:0], x1_df, x2_df, x3_df, x4_df, x5_df, x6_df, x7_df, x8_df, x9_df, x10_df], axis=1, sort=True)
scaled_inc_pca_df.index = np.arange(1, len(scaled_inc_pca_df) + 1)
scaled_inc_pca_df = scaled_inc_pca_df.dropna()

scaled_time_pca_df = pd.concat([mineralogy_inc_pca_df.iloc[:,:0], x1_df, x2_df, x3_df, x4_df, x5_df, x6_df, x7_df, x8_df], axis=1, sort=True)
scaled_time_pca_df.index = np.arange(1, len(scaled_time_pca_df) + 1)
scaled_time_pca_df = scaled_time_pca_df.dropna()

scaled_coordinates_pca_df = pd.concat([mineralogy_inc_pca_df.iloc[:,:0], x1_df, x2_df, x3_df, x4_df, x5_df, x6_df, x7_df, x9_df, x10_df], axis=1, sort=True)
scaled_coordinates_pca_df.index = np.arange(1, len(scaled_coordinates_pca_df) + 1)
scaled_coordinates_pca_df = scaled_coordinates_pca_df.dropna()

In [432]:
scaled_coordinates_pca_df

Unnamed: 0,PC01,PC02,PC03,PC04,PC05,PC06,PC07,X_UTM,Y_UTM
1,-0.821697,0.073729,-1.747331,1.849427,0.424171,-2.688559,-0.670874,-0.197192,0.187730
2,-0.971980,1.049750,-0.742744,0.614839,1.802281,1.397826,-0.565449,-0.213829,0.200041
3,-1.620936,1.255681,0.204918,-0.116210,1.696146,-2.627857,-3.105933,-0.170014,0.201465
4,-1.050704,-0.770527,1.285680,3.028266,0.627044,0.068766,-0.497492,-0.179622,0.201091
5,-1.163557,0.104422,1.082868,-0.397029,-3.296363,-2.878734,-1.418440,-0.162657,0.206847
...,...,...,...,...,...,...,...,...,...
4655,1.635276,-1.337046,1.190968,1.710257,-0.671667,-1.057086,0.206893,-0.162265,0.204976
4656,2.296487,-0.787293,0.100543,-0.102332,-0.322927,-1.786802,0.118447,-0.198044,0.190011
4657,2.167530,-1.402840,0.512226,0.110387,-0.002831,-0.375974,-0.767273,-0.189500,0.199151
4658,2.005894,-0.910735,0.281555,-0.393594,-1.399058,-0.052875,2.234582,-0.178430,0.200365


In [433]:
scaled_time_pca_df

Unnamed: 0,PC01,PC02,PC03,PC04,PC05,PC06,PC07,time
1,-0.821697,0.073729,-1.747331,1.849427,0.424171,-2.688559,-0.670874,0.467685
2,-0.971980,1.049750,-0.742744,0.614839,1.802281,1.397826,-0.565449,-2.133358
3,-1.620936,1.255681,0.204918,-0.116210,1.696146,-2.627857,-3.105933,0.467685
4,-1.050704,-0.770527,1.285680,3.028266,0.627044,0.068766,-0.497492,0.467685
5,-1.163557,0.104422,1.082868,-0.397029,-3.296363,-2.878734,-1.418440,0.467685
...,...,...,...,...,...,...,...,...
4655,1.635276,-1.337046,1.190968,1.710257,-0.671667,-1.057086,0.206893,0.467685
4656,2.296487,-0.787293,0.100543,-0.102332,-0.322927,-1.786802,0.118447,0.467685
4657,2.167530,-1.402840,0.512226,0.110387,-0.002831,-0.375974,-0.767273,-0.399329
4658,2.005894,-0.910735,0.281555,-0.393594,-1.399058,-0.052875,2.234582,0.467685


In [434]:
preproc.save_obj(scaled_inc_pca_df, "scaled_inc_pca_df")
preproc.save_obj(scaled_time_pca_df, "scaled_time_pca_df")
preproc.save_obj(scaled_coordinates_pca_df, "scaled_coordinates_pca_df")

-----

# grid points cluster analysis

## Area 1 

In [537]:
area1_int = pd.read_excel("../_CIPW/CIPW/AREA1/interpolated_data.xlsx", index_col=0)
area1_int = area1_int.dropna()
area1_int.drop(["oth", ], axis = 1, inplace = True)
area1_int = preproc.replace_zero(area1_int, 0.01)


In [538]:
area1_int.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 55413 entries, 2809 to 205544
Data columns (total 10 columns):
SiO2     55413 non-null float64
TiO2     55413 non-null float64
Al2O3    55413 non-null float64
Fe2O3    55413 non-null float64
FeO      55413 non-null float64
MnO      55413 non-null float64
MgO      55413 non-null float64
CaO      55413 non-null float64
Na2O     55413 non-null float64
K2O      55413 non-null float64
dtypes: float64(10)
memory usage: 4.7 MB


In [539]:
area1_int

Unnamed: 0,SiO2,TiO2,Al2O3,Fe2O3,FeO,MnO,MgO,CaO,Na2O,K2O
2809,70.222982,0.306007,15.037751,0.652161,2.100375,0.039497,1.063120,2.105502,3.870846,3.740172
2810,70.240242,0.305221,15.034542,0.652185,2.095248,0.039421,1.059408,2.098104,3.870482,3.742184
2811,70.253542,0.304611,15.032047,0.652144,2.091362,0.039362,1.056553,2.092448,3.870193,3.743731
2812,70.241073,0.312743,15.003450,0.619717,2.223126,0.042901,1.036778,2.105143,3.909642,3.782753
2813,70.243463,0.312634,15.002904,0.619474,2.222875,0.042899,1.036236,2.104256,3.909656,3.783102
...,...,...,...,...,...,...,...,...,...,...
205097,71.839678,0.283426,14.517509,0.948295,1.714704,0.045674,0.624198,1.249518,4.051043,4.229271
205541,71.822233,0.284638,14.521715,0.958462,1.712651,0.045724,0.626124,1.251051,4.052232,4.228004
205542,71.824211,0.284493,14.521284,0.957263,1.712859,0.045716,0.625929,1.250916,4.052057,4.228092
205543,71.826556,0.284328,14.520734,0.955941,1.713074,0.045707,0.625675,1.250699,4.051882,4.228246


In [540]:
area1_int_clr = preproc.clr(area1_int)
area1_int_clr.head()

Unnamed: 0,SiO2,TiO2,Al2O3,Fe2O3,FeO,MnO,MgO,CaO,Na2O,K2O
2809,3.617717,-1.818107,2.076606,-1.061423,0.108158,-3.865485,-0.57275,0.110596,0.719515,0.685173
2810,3.619309,-1.819333,2.077737,-1.06004,0.107059,-3.866074,-0.574903,0.108421,0.720766,0.687057
2811,3.620542,-1.820289,2.078616,-1.059059,0.106247,-3.866515,-0.576557,0.106766,0.721736,0.688514
2812,3.607553,-1.806754,2.063899,-1.122872,0.154534,-3.793236,-0.608263,0.100003,0.719065,0.686071
2813,3.607763,-1.806927,2.06404,-1.123088,0.154597,-3.79312,-0.608609,0.099758,0.719245,0.68634


In [541]:
area1_int_pca = preproc.pca(area1_int_clr)
preproc.pca_variance(area1_int_pca)

2 PCA components  out of 10 components with variance sum 0.9773112140707415 needed for obtaining sum of variance > 0.95


array([6.52307621e-01, 3.25003593e-01, 2.26887859e-02, 5.47025444e-31,
       1.72784347e-31, 1.36543201e-31, 8.31338518e-32, 6.65649746e-32,
       5.62173424e-32, 3.73045601e-32])

In [542]:
area1_int_df = preproc.create_pca_df(area1_int_pca, area1_int_clr)

In [543]:
area1_int_df

Unnamed: 0,PC01,PC02,PC03,PC04,PC05,PC06,PC07,PC08,PC09,PC10
2809,-0.871616,-0.061954,-0.178992,-3.667464e-16,-6.732535e-17,2.033403e-16,-1.077045e-16,-4.769786e-17,-2.249744e-16,-2.429622e-16
2810,-0.867083,-0.061057,-0.180370,-7.451425e-16,1.624210e-17,-1.705633e-16,-3.907189e-16,-2.133986e-16,-4.304799e-17,-2.793085e-16
2811,-0.863637,-0.060284,-0.181416,-9.220021e-16,6.328708e-18,-1.430243e-17,9.122456e-17,8.590459e-17,1.582475e-16,1.372954e-16
2812,-0.881656,-0.001032,-0.083585,-7.859609e-16,-2.094071e-16,-7.524761e-16,-1.179243e-16,1.622333e-16,2.295189e-16,-1.097492e-16
2813,-0.881230,-0.000526,-0.083510,-7.583104e-16,4.920769e-17,-2.476059e-16,-4.686063e-17,-2.128640e-16,5.045648e-16,-1.338189e-17
...,...,...,...,...,...,...,...,...,...,...
205097,-0.049269,-0.211667,0.023654,-3.985039e-16,1.049181e-16,-3.073668e-17,-2.316495e-16,8.972486e-17,4.643697e-16,-6.205094e-17
205541,-0.047306,-0.221904,0.022992,-5.638857e-16,1.370082e-18,-1.317205e-16,3.132650e-17,-9.524689e-17,-4.777928e-16,1.173006e-17
205542,-0.047587,-0.220724,0.022998,-6.280217e-16,4.503544e-16,-4.097656e-16,-8.273617e-17,2.521313e-16,-7.196328e-17,8.275516e-18
205543,-0.047827,-0.219399,0.023041,-7.122765e-16,2.530445e-17,1.490451e-16,-2.164953e-16,-1.068615e-16,-3.819137e-17,-1.552907e-17


In [544]:
preproc.save_obj(area1_int_df, "area1_int_df")

-----

## Area 2

In [545]:
area2_int = pd.read_excel("../_CIPW/CIPW/AREA2/interpolated_data.xlsx", index_col=0)
area2_int = area2_int.dropna()
area2_int.drop(["oth", ], axis = 1, inplace = True)
area2_int = preproc.replace_zero(area2_int, 0.01)

In [546]:
area2_int.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 80997 entries, 5679 to 235767
Data columns (total 10 columns):
SiO2     80997 non-null float64
TiO2     80997 non-null float64
Al2O3    80997 non-null float64
Fe2O3    80997 non-null float64
FeO      80997 non-null float64
MnO      80997 non-null float64
MgO      80997 non-null float64
CaO      80997 non-null float64
Na2O     80997 non-null float64
K2O      80997 non-null float64
dtypes: float64(10)
memory usage: 6.8 MB


In [547]:
area2_int

Unnamed: 0,SiO2,TiO2,Al2O3,Fe2O3,FeO,MnO,MgO,CaO,Na2O,K2O
5679,71.325684,0.336266,14.575051,1.924127,1.548361,0.080167,0.649239,1.485709,3.633363,3.820793
5680,71.334523,0.335790,14.571812,1.924360,1.546116,0.080114,0.647729,1.482653,3.633472,3.822544
5681,71.343460,0.335309,14.568532,1.924595,1.543847,0.080060,0.646204,1.479566,3.633581,3.824317
6029,71.315004,0.336840,14.578956,1.923846,1.551074,0.080232,0.651066,1.489404,3.633231,3.818681
6030,71.324390,0.336335,14.575525,1.924093,1.548690,0.080175,0.649460,1.486156,3.633347,3.820537
...,...,...,...,...,...,...,...,...,...,...
235415,70.343834,0.374706,15.029898,1.245316,2.075276,0.080455,0.931258,2.065778,3.615349,3.545129
235416,69.086644,0.442032,15.307041,1.380547,2.333729,0.087959,1.163224,2.505622,3.589204,3.379170
235417,69.084714,0.442142,15.307411,1.380848,2.334076,0.087971,1.163582,2.506285,3.589161,3.378941
235418,69.082825,0.442249,15.307773,1.381146,2.334414,0.087983,1.163931,2.506932,3.589120,3.378717


In [548]:
area2_int_clr = preproc.clr(area2_int)
area2_int_clr.head()

Unnamed: 0,SiO2,TiO2,Al2O3,Fe2O3,FeO,MnO,MgO,CaO,Na2O,K2O
5679,3.565327,-1.791783,1.977382,-0.047457,-0.264733,-3.225571,-1.133884,-0.306038,0.588229,0.638528
5680,3.566192,-1.792458,1.9779,-0.046595,-0.265442,-3.225497,-1.135472,-0.307355,0.589,0.639728
5681,3.567067,-1.79314,1.978425,-0.045723,-0.266161,-3.225422,-1.137079,-0.308689,0.58978,0.640941
6029,3.564283,-1.79097,1.976755,-0.048497,-0.263876,-3.225659,-1.131968,-0.304448,0.587299,0.637081
6030,3.5652,-1.791685,1.977306,-0.047583,-0.264629,-3.225581,-1.133652,-0.305845,0.588116,0.638353


In [549]:
area2_int_pca = preproc.pca(area2_int_clr)
preproc.pca_variance(area2_int_pca)

2 PCA components  out of 10 components with variance sum 1.0 needed for obtaining sum of variance > 0.95


array([7.47058428e-01, 2.52941572e-01, 1.10503556e-30, 1.98470893e-31,
       1.56975156e-31, 1.11055683e-31, 9.03270660e-32, 6.65789822e-32,
       6.20646259e-32, 3.62823923e-32])

In [550]:
area2_int_df = preproc.create_pca_df(area2_int_pca, area2_int_clr)

In [551]:
area2_int_df

Unnamed: 0,PC01,PC02,PC03,PC04,PC05,PC06,PC07,PC08,PC09,PC10
5679,0.404377,-0.641168,8.490958e-16,2.929988e-16,-4.239957e-16,4.226417e-16,7.677841e-17,-4.546072e-16,-2.328264e-16,-1.460760e-16
5680,0.407373,-0.641369,5.504198e-16,-2.745024e-16,-3.677782e-16,1.316523e-16,6.052416e-17,-1.464797e-16,1.304448e-17,1.553163e-16
5681,0.410405,-0.641572,9.099892e-16,3.713079e-16,2.164559e-16,-1.807361e-17,1.001039e-16,6.661919e-17,2.522757e-16,1.861644e-16
6029,0.400763,-0.640926,8.307272e-16,-1.006030e-17,-2.355624e-16,1.808371e-16,5.350751e-17,-2.361346e-16,-3.357920e-16,-7.033226e-17
6030,0.403939,-0.641139,6.107434e-16,2.983970e-16,-1.696658e-17,1.819489e-16,-9.767308e-17,-2.834820e-16,-4.416844e-16,-2.276861e-17
...,...,...,...,...,...,...,...,...,...,...
235415,-0.151609,-0.208043,6.051222e-16,-2.027507e-17,3.766853e-16,-4.453283e-16,-6.804779e-17,-6.826307e-17,-1.828744e-16,-1.086399e-16
235416,-0.417064,-0.303705,6.712210e-16,-8.265424e-16,2.275533e-16,-1.341333e-16,1.610870e-16,1.900142e-16,7.408841e-18,1.001307e-16
235417,-0.417416,-0.303912,7.975709e-16,-9.134360e-17,-1.191996e-16,1.267072e-16,-5.769684e-17,-2.464214e-17,-1.754152e-16,8.191294e-17
235418,-0.417759,-0.304116,9.698704e-16,-4.291248e-16,3.036520e-16,-8.411313e-17,-6.883899e-17,3.172698e-16,-1.992958e-16,1.362059e-16


In [552]:
preproc.save_obj(area2_int_df, "area2_int_df")

----

## Area 3

In [553]:
area3_int = pd.read_excel("../_CIPW/CIPW/AREA3/interpolated_data.xlsx", index_col=0)
area3_int = area3_int.dropna()
area3_int.drop(["oth", ], axis = 1, inplace = True)
area3_int = preproc.replace_zero(area3_int, 0.01)

In [554]:
area3_int.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 54455 entries, 11189 to 160544
Data columns (total 10 columns):
SiO2     54455 non-null float64
TiO2     54455 non-null float64
Al2O3    54455 non-null float64
Fe2O3    54455 non-null float64
FeO      54455 non-null float64
MnO      54455 non-null float64
MgO      54455 non-null float64
CaO      54455 non-null float64
Na2O     54455 non-null float64
K2O      54455 non-null float64
dtypes: float64(10)
memory usage: 4.6 MB


In [555]:
area3_int

Unnamed: 0,SiO2,TiO2,Al2O3,Fe2O3,FeO,MnO,MgO,CaO,Na2O,K2O
11189,68.973826,0.384729,15.490616,2.105048,2.125391,0.063841,1.003265,2.264021,3.407279,3.337133
11190,68.933533,0.386681,15.498923,2.110996,2.133132,0.064013,1.010837,2.276947,3.406195,3.331904
11191,68.893888,0.388613,15.507131,2.116467,2.140859,0.064183,1.018360,2.289786,3.405126,3.326788
11192,68.856651,0.390440,15.514863,2.121303,2.148205,0.064343,1.025487,2.301946,3.404120,3.322008
11596,69.065490,0.380060,15.469415,2.099817,2.105257,0.063444,0.985195,2.232583,3.409713,3.348840
...,...,...,...,...,...,...,...,...,...,...
160540,65.844594,0.575363,16.199450,1.031606,3.315901,0.073653,1.849313,3.758044,3.304477,3.066352
160541,65.888449,0.572938,16.194183,1.038694,3.300992,0.073544,1.837354,3.738182,3.306089,3.069814
160542,65.963747,0.568772,16.185019,1.050153,3.275856,0.073351,1.816909,3.704308,3.308833,3.075885
160543,65.954892,0.569303,16.186553,1.042429,3.282323,0.073342,1.819655,3.709889,3.308405,3.075937


In [556]:
area3_int_clr = preproc.clr(area3_int)
area3_int_clr.head()

Unnamed: 0,SiO2,TiO2,Al2O3,Fe2O3,FeO,MnO,MgO,CaO,Na2O,K2O
11189,3.432017,-1.756925,1.938524,-0.057372,-0.047754,-3.553076,-0.79845,0.015432,0.424204,0.403402
11190,3.428883,-1.754417,1.93651,-0.0571,-0.046669,-3.552921,-0.793481,0.018575,0.421336,0.399284
11191,3.425809,-1.751929,1.934542,-0.05701,-0.045551,-3.552769,-0.788565,0.0217,0.418524,0.395249
11192,3.422931,-1.749577,1.932703,-0.057065,-0.044463,-3.552624,-0.783928,0.024659,0.415891,0.391474
11596,3.439188,-1.763293,1.942997,-0.054017,-0.05143,-3.55346,-0.810784,0.007292,0.43076,0.412747


In [557]:
area3_int_pca = preproc.pca(area3_int_clr)
preproc.pca_variance(area3_int_pca)

2 PCA components  out of 10 components with variance sum 1.0 needed for obtaining sum of variance > 0.95


array([5.98138421e-01, 4.01861579e-01, 3.87080825e-31, 1.31333072e-31,
       1.10359749e-31, 9.12883257e-32, 6.18384818e-32, 5.37724228e-32,
       4.64576740e-32, 3.50855223e-32])

In [558]:
area3_int_df = preproc.create_pca_df(area3_int_pca, area3_int_clr)

In [559]:
area3_int_df

Unnamed: 0,PC01,PC02,PC03,PC04,PC05,PC06,PC07,PC08,PC09,PC10
11189,0.378920,-0.911470,5.439131e-16,-1.952341e-16,-2.814163e-16,2.566793e-16,-4.017168e-16,-2.583177e-16,-1.439430e-16,-5.706962e-17
11190,0.372451,-0.917743,1.876171e-16,8.945326e-17,2.833851e-16,-1.164986e-16,1.366745e-16,-3.112835e-16,-1.473322e-16,-1.207413e-16
11191,0.365942,-0.923789,3.212005e-16,-3.298020e-16,-1.041261e-16,-2.887258e-16,-8.421969e-17,-1.789867e-16,-5.016274e-17,1.234255e-17
11192,0.359717,-0.929365,1.312915e-16,-2.854636e-16,1.055485e-16,2.685475e-17,-1.264535e-16,4.191175e-17,-1.948273e-16,-2.054843e-16
11596,0.397430,-0.899513,5.705767e-16,1.050845e-16,-8.080592e-17,7.356383e-18,-2.336581e-17,1.270848e-16,-7.990720e-17,-3.184433e-16
...,...,...,...,...,...,...,...,...,...,...
160540,-0.778978,-0.744624,1.247884e-16,-4.458693e-17,-5.274837e-16,4.295536e-16,-7.294304e-18,-2.613299e-16,-2.176553e-16,-2.134437e-17
160541,-0.767105,-0.745673,6.640944e-16,1.317419e-16,6.051890e-17,-7.647030e-17,-5.084030e-16,7.613067e-17,-2.469211e-16,-4.755967e-17
160542,-0.747192,-0.746888,1.307851e-17,-4.474222e-16,3.963108e-17,-1.378375e-17,-4.195711e-16,-1.828671e-16,-2.264948e-16,2.980998e-17
160543,-0.753829,-0.742470,6.015401e-16,-4.235754e-16,-4.340958e-18,4.227186e-16,-2.789172e-17,1.825506e-16,-1.102268e-16,-6.836111e-17


In [560]:
preproc.save_obj(area3_int_df, "area3_int_df")

----

## Area 4

In [561]:
area4_int = pd.read_excel("../_CIPW/CIPW/AREA4/interpolated_data.xlsx", index_col=0)
area4_int = area4_int.dropna()
area4_int.drop(["oth", ], axis = 1, inplace = True)
area4_int = preproc.replace_zero(area4_int, 0.01)

In [562]:
area4_int.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59479 entries, 2499 to 134976
Data columns (total 10 columns):
SiO2     59479 non-null float64
TiO2     59479 non-null float64
Al2O3    59479 non-null float64
Fe2O3    59479 non-null float64
FeO      59479 non-null float64
MnO      59479 non-null float64
MgO      59479 non-null float64
CaO      59479 non-null float64
Na2O     59479 non-null float64
K2O      59479 non-null float64
dtypes: float64(10)
memory usage: 5.0 MB


In [563]:
area4_int

Unnamed: 0,SiO2,TiO2,Al2O3,Fe2O3,FeO,MnO,MgO,CaO,Na2O,K2O
2499,68.157353,0.389587,15.940766,1.459134,2.561572,0.056441,1.543992,3.051329,3.567657,2.134805
2500,68.150893,0.389888,15.941639,1.460142,2.562972,0.056480,1.545360,3.053444,3.567597,2.134063
2501,68.146669,0.390094,15.942191,1.460837,2.563919,0.056509,1.546200,3.054770,3.567559,2.133670
2502,68.144610,0.390206,15.942435,1.461226,2.564424,0.056529,1.546533,3.055337,3.567544,2.133604
2503,68.144608,0.390229,15.942387,1.461325,2.564508,0.056540,1.546389,3.055187,3.567550,2.133841
...,...,...,...,...,...,...,...,...,...,...
134972,69.341569,0.321180,15.276715,0.844648,1.992609,0.033076,0.895321,1.911588,3.143461,5.509776
134973,69.349371,0.320701,15.273379,0.844562,1.989720,0.033033,0.893315,1.908378,3.143485,5.514530
134974,69.359296,0.320101,15.269328,0.844390,1.986195,0.032981,0.890875,1.904469,3.143549,5.519897
134975,69.371339,0.319381,15.264571,0.844127,1.982046,0.032919,0.888009,1.899876,3.143655,5.525843


In [564]:
area4_int_clr = preproc.clr(area4_int)
area4_int_clr.head()

Unnamed: 0,SiO2,TiO2,Al2O3,Fe2O3,FeO,MnO,MgO,CaO,Na2O,K2O
2499,3.414602,-1.749886,1.961663,-0.429374,0.133404,-3.681773,-0.372846,0.30836,0.464692,-0.048842
2500,3.41412,-1.749499,1.96133,-0.429071,0.133563,-3.681472,-0.372348,0.308666,0.464288,-0.049577
2501,3.413793,-1.749236,1.9611,-0.42886,0.133668,-3.681217,-0.372069,0.308835,0.464013,-0.050026
2502,3.413618,-1.749094,1.96097,-0.428738,0.13372,-3.681012,-0.371999,0.308875,0.463863,-0.050202
2503,3.413586,-1.749067,1.960936,-0.428702,0.133721,-3.680857,-0.372123,0.308795,0.463833,-0.050122


In [565]:
area4_int_pca = preproc.pca(area4_int_clr)
preproc.pca_variance(area4_int_pca)

4 PCA components  out of 10 components with variance sum 0.9999999999999999 needed for obtaining sum of variance > 0.95


array([6.44692309e-01, 1.85364036e-01, 1.06554614e-01, 6.33890403e-02,
       1.86013230e-31, 9.80462890e-32, 7.54045993e-32, 5.44979498e-32,
       4.32215351e-32, 3.60683654e-32])

In [566]:
area4_int_df = preproc.create_pca_df(area4_int_pca, area4_int_clr)

In [567]:
area4_int_df

Unnamed: 0,PC01,PC02,PC03,PC04,PC05,PC06,PC07,PC08,PC09,PC10
2499,0.485689,-0.125740,-0.055342,0.246923,-1.039289e-16,-1.302962e-16,-2.587433e-16,-7.067557e-17,-2.311174e-16,1.870939e-16
2500,0.486885,-0.126015,-0.055743,0.246641,-3.899310e-16,-5.753650e-16,-3.178221e-16,1.094210e-17,-1.028628e-16,-6.561663e-17
2501,0.487637,-0.126147,-0.056041,0.246396,1.749958e-17,5.290171e-16,-3.432184e-16,-4.959165e-16,2.595399e-16,1.489980e-16
2502,0.487965,-0.126145,-0.056236,0.246190,-3.953032e-16,1.114381e-16,-4.375435e-16,1.184405e-16,1.459494e-16,9.663860e-17
2503,0.487892,-0.126016,-0.056330,0.246027,-2.536859e-16,3.304752e-16,-4.077117e-16,5.206356e-16,-4.974291e-19,2.463790e-16
...,...,...,...,...,...,...,...,...,...,...
134972,-0.823145,-0.212690,-0.299398,0.431477,-7.193210e-16,5.551460e-16,8.122340e-17,-9.539794e-17,2.986868e-16,-1.868084e-16
134973,-0.825401,-0.210903,-0.299287,0.432355,-3.077927e-16,4.892259e-16,4.208353e-16,-3.757978e-17,1.507400e-16,7.176897e-17
134974,-0.828128,-0.208760,-0.299055,0.433406,-3.048143e-16,1.673033e-16,3.445884e-16,5.202831e-17,9.141722e-17,9.053739e-17
134975,-0.831322,-0.206267,-0.298698,0.434626,-1.881016e-17,3.270835e-16,1.704870e-16,1.612700e-16,-2.190454e-16,-3.638746e-17


In [568]:
preproc.save_obj(area4_int_df, "area4_int_df")

## Area 5

In [569]:
area5_int = pd.read_excel("../_CIPW/CIPW/AREA5/interpolated_data.xlsx", index_col=0)
area5_int = area5_int.dropna()
area5_int.drop(["oth", ], axis = 1, inplace = True)
area5_int = preproc.replace_zero(area5_int, 0.01)

In [570]:
area5_int.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 60209 entries, 1996 to 123586
Data columns (total 10 columns):
SiO2     60209 non-null float64
TiO2     60209 non-null float64
Al2O3    60209 non-null float64
Fe2O3    60209 non-null float64
FeO      60209 non-null float64
MnO      60209 non-null float64
MgO      60209 non-null float64
CaO      60209 non-null float64
Na2O     60209 non-null float64
K2O      60209 non-null float64
dtypes: float64(10)
memory usage: 5.1 MB


In [571]:
area5_int

Unnamed: 0,SiO2,TiO2,Al2O3,Fe2O3,FeO,MnO,MgO,CaO,Na2O,K2O
1996,64.492324,0.779771,16.021955,1.304960,3.469338,0.117915,2.322952,3.577475,3.188924,3.516040
1997,64.491820,0.779800,16.021959,1.305092,3.469338,0.117909,2.323196,3.577614,3.188927,3.515996
1998,64.491327,0.779828,16.021964,1.305219,3.469341,0.117902,2.323433,3.577751,3.188929,3.515952
1999,64.490847,0.779855,16.021969,1.305340,3.469347,0.117896,2.323662,3.577884,3.188930,3.515910
2000,64.490387,0.779882,16.021974,1.305454,3.469356,0.117891,2.323880,3.578012,3.188931,3.515870
...,...,...,...,...,...,...,...,...,...,...
123582,70.138886,0.345292,15.337506,0.637853,2.070939,0.049323,0.858240,2.102874,3.323972,4.203421
123583,70.138775,0.345299,15.337522,0.637823,2.071004,0.049323,0.858259,2.102911,3.323960,4.203422
123584,70.138706,0.345304,15.337530,0.637794,2.071055,0.049324,0.858270,2.102936,3.323949,4.203427
123585,70.138680,0.345306,15.337530,0.637765,2.071092,0.049324,0.858271,2.102947,3.323940,4.203435


In [572]:
area5_int_clr = preproc.clr(area5_int)
area5_int_clr.head()

Unnamed: 0,SiO2,TiO2,Al2O3,Fe2O3,FeO,MnO,MgO,CaO,Na2O,K2O
1996,3.106685,-1.308616,1.714099,-0.793689,0.184102,-3.19765,-0.217023,0.214796,0.099822,0.197474
1997,3.106657,-1.3086,1.714078,-0.793608,0.184082,-3.197728,-0.216938,0.214814,0.099803,0.197441
1998,3.106629,-1.308584,1.714059,-0.793531,0.184063,-3.197802,-0.216856,0.214833,0.099783,0.197408
1999,3.106602,-1.308569,1.714039,-0.793458,0.184045,-3.197872,-0.216777,0.21485,0.099764,0.197377
2000,3.106576,-1.308553,1.714021,-0.79339,0.184028,-3.197937,-0.216703,0.214867,0.099745,0.197346


In [573]:
area5_int_pca = preproc.pca(area5_int_clr)
preproc.pca_variance(area5_int_pca)

3 PCA components  out of 10 components with variance sum 1.0 needed for obtaining sum of variance > 0.95


array([6.09225681e-01, 2.32476806e-01, 1.58297513e-01, 3.98711542e-31,
       1.21647309e-31, 9.40885894e-32, 6.13241177e-32, 6.03063262e-32,
       5.14894195e-32, 3.09583294e-32])

In [574]:
area5_int_df = preproc.create_pca_df(area5_int_pca, area5_int_clr)

In [575]:
area5_int_df

Unnamed: 0,PC01,PC02,PC03,PC04,PC05,PC06,PC07,PC08,PC09,PC10
1996,-1.404360,-0.816303,0.365033,-4.081481e-16,3.902347e-16,-3.556127e-16,-2.838719e-17,1.450981e-16,-1.870483e-16,2.118815e-16
1997,-1.404402,-0.816395,0.364919,-6.622449e-16,-3.618581e-16,-6.747147e-16,-3.307569e-16,-4.321083e-17,1.870591e-16,-1.552170e-16
1998,-1.404444,-0.816486,0.364810,-8.289077e-16,-4.977368e-16,-7.514300e-17,4.930142e-16,3.109769e-16,3.922534e-17,9.156759e-17
1999,-1.404485,-0.816572,0.364707,-9.128211e-16,-5.519901e-17,-5.802901e-16,-3.276596e-18,3.603118e-16,-8.580826e-17,-1.815730e-16
2000,-1.404526,-0.816654,0.364611,-7.009625e-16,-1.680333e-16,7.696369e-18,5.069393e-16,8.697547e-17,-1.880096e-16,1.298028e-16
...,...,...,...,...,...,...,...,...,...,...
123582,-0.516033,0.133608,0.090183,-4.516425e-16,3.252055e-16,2.876175e-16,1.159547e-17,7.687785e-17,2.122814e-17,2.003950e-16
123583,-0.516086,0.133643,0.090197,-3.452497e-16,-5.701888e-16,4.549955e-16,2.243574e-16,1.569767e-16,1.876207e-17,-1.915397e-16
123584,-0.516128,0.133679,0.090211,-8.443746e-16,-7.192301e-18,2.692906e-16,2.349037e-16,2.786820e-16,1.400359e-16,-9.615518e-18
123585,-0.516157,0.133716,0.090226,-2.673639e-16,2.654459e-16,-1.868731e-16,-6.503191e-18,3.085155e-16,2.430331e-17,-1.814433e-16


In [576]:
preproc.save_obj(area5_int_df, "area5_int_df")

---

# combined areas

## PCA

In [475]:
combined_int_clr = pd.concat([area1_int_clr, area2_int_clr, area3_int_clr, area4_int_clr, area5_int_clr])

In [476]:
combined_int_clr

Unnamed: 0,SiO2,TiO2,Al2O3,Fe2O3,FeO,MnO,MgO,CaO,Na2O,K2O
2809,3.617717,-1.818107,2.076606,-1.061423,0.108158,-3.865485,-0.572750,0.110596,0.719515,0.685173
2810,3.619309,-1.819333,2.077737,-1.060040,0.107059,-3.866074,-0.574903,0.108421,0.720766,0.687057
2811,3.620542,-1.820289,2.078616,-1.059059,0.106247,-3.866515,-0.576557,0.106766,0.721736,0.688514
2812,3.607553,-1.806754,2.063899,-1.122872,0.154534,-3.793236,-0.608263,0.100003,0.719065,0.686071
2813,3.607763,-1.806927,2.064040,-1.123088,0.154597,-3.793120,-0.608609,0.099758,0.719245,0.686340
...,...,...,...,...,...,...,...,...,...,...
363036,3.609093,-1.704756,2.088916,-1.091055,0.086619,-3.650758,-0.794267,0.101917,0.559775,0.794516
363037,3.609090,-1.704747,2.088914,-1.091085,0.086634,-3.650755,-0.794258,0.101923,0.559770,0.794514
363038,3.609088,-1.704740,2.088913,-1.091113,0.086647,-3.650752,-0.794252,0.101929,0.559767,0.794513
363039,3.609087,-1.704735,2.088913,-1.091139,0.086659,-3.650750,-0.794248,0.101934,0.559765,0.794514


In [477]:
combined_int_pca = preproc.pca(combined_int_clr)
preproc.pca_variance(combined_int_pca)

4 PCA components  out of 10 components with variance sum 0.9783388860470209 needed for obtaining sum of variance > 0.95


array([5.59659924e-01, 2.83820126e-01, 7.98987742e-02, 5.49600618e-02,
       1.33578547e-02, 4.56400628e-03, 2.98069757e-03, 6.77729896e-04,
       8.08254659e-05, 1.52048234e-31])

In [478]:
combined_int_df = preproc.create_pca_df(combined_int_pca, combined_int_clr)

In [479]:
combined_int_df

Unnamed: 0,PC01,PC02,PC03,PC04,PC05,PC06,PC07,PC08,PC09,PC10
2809,-0.156071,0.415668,0.292946,0.122852,0.090998,0.097529,0.022490,0.070398,0.004281,-5.947402e-16
2810,-0.151599,0.414158,0.293223,0.123356,0.091647,0.097604,0.022548,0.069931,0.004318,-2.262792e-16
2811,-0.148159,0.413081,0.293405,0.123764,0.092138,0.097657,0.022604,0.069577,0.004347,-6.877648e-16
2812,-0.157101,0.465223,0.195572,0.101890,0.094256,0.078665,0.018640,0.082849,0.003593,-5.773814e-16
2813,-0.156534,0.465403,0.195300,0.101977,0.094323,0.078618,0.018671,0.082832,0.003595,-2.943190e-16
...,...,...,...,...,...,...,...,...,...,...
363036,-0.053823,0.415002,0.077979,-0.102632,-0.057779,0.019189,-0.039607,-0.015951,-0.002681,-1.599371e-16
363037,-0.053838,0.415033,0.077970,-0.102634,-0.057778,0.019187,-0.039609,-0.015951,-0.002682,9.095537e-17
363038,-0.053850,0.415063,0.077962,-0.102635,-0.057776,0.019185,-0.039611,-0.015951,-0.002682,-2.174139e-16
363039,-0.053858,0.415091,0.077953,-0.102635,-0.057776,0.019183,-0.039612,-0.015952,-0.002682,1.747118e-16


In [480]:
preproc.save_obj(combined_int_df, "combined_int_df")

----

## Fe_Number

In [483]:
Fe_combined1 = pd.read_excel("../_CLUSTER/groups_time_area/interpolation/combined_int_values.xlsx", index_col=0, usecols = 0)
Fe_combined = pd.read_excel("../_CLUSTER/groups_time_area/interpolation/combined_int_values.xlsx", index_col=0, usecols = 0)


  usecols = _maybe_convert_usecols(usecols)


In [484]:
Fe_combined1

2809
2810
2811
2812
2813
...
363036
363037
363038
363039
363040


In [485]:
Fe_combined1['Fe'] = combined_int_clr['FeO'] + combined_int_clr['Fe2O3'] 
Fe_combined1['Fe+Mg']= combined_int_clr['MgO'] +combined_int_clr['FeO'] + combined_int_clr['Fe2O3']


In [486]:
Fe_comb_x1 = Fe_combined1.iloc[:,:1]
Fe_comb_x2 = Fe_combined1.iloc[:,1:2]


In [487]:
Fe_combined1

Unnamed: 0,Fe,Fe+Mg
2809,-0.953265,-1.526015
2810,-0.952981,-1.527883
2811,-0.952812,-1.529369
2812,-0.968339,-1.576602
2813,-0.968490,-1.577100
...,...,...
363036,-1.004436,-1.798703
363037,-1.004451,-1.798709
363038,-1.004466,-1.798717
363039,-1.004480,-1.798728


In [488]:
scaler1 = StandardScaler().fit(Fe_comb_x1)
scaler2 = StandardScaler().fit(Fe_comb_x2)


In [489]:
Fe_comb_x1_scaled = scaler1.transform(Fe_comb_x1)
Fe_comb_x2_scaled = scaler2.transform(Fe_comb_x2)


In [490]:
Fe_comb_x1_df = pd.DataFrame(data =Fe_comb_x1_scaled,columns=['Fe'])
Fe_comb_x2_df = pd.DataFrame(data =Fe_comb_x2_scaled,columns=['Fe+Mg'])

In [491]:
Fe_combined['Fe'] = Fe_comb_x1_df['Fe']
Fe_combined['Fe+Mg']= Fe_comb_x2_df['Fe+Mg']

In [492]:
Fe_combined

Unnamed: 0,Fe,Fe+Mg
2809,-1.356979,-0.534569
2810,-1.270884,-0.533841
2811,-1.292053,-0.523423
2812,-1.310933,-0.519251
2813,-1.287209,-0.586043
...,...,...
363036,0.257854,0.376717
363037,0.318076,0.374703
363038,0.320229,0.371428
363039,0.321691,0.367330


In [493]:
preproc.save_obj(Fe_combined, "Fe_combined")

In [494]:
Fe_combined_pca = preproc.pca(Fe_combined)
preproc.pca_variance(Fe_combined_pca)

2 PCA components  out of 2 components with variance sum 1.0 needed for obtaining sum of variance > 0.95


array([0.87211172, 0.12788828])

In [495]:
Fe_combined_pca_df = preproc.create_pca_df(Fe_combined_pca, Fe_combined)

In [496]:
preproc.save_obj(Fe_combined_pca_df, "Fe_combined_pca_df")

## MALI

In [497]:
MALI_combined1 = pd.read_excel("../_CLUSTER/groups_time_area/interpolation/combined_int_values.xlsx", index_col=0, usecols = 0)
MALI_combined = pd.read_excel("../_CLUSTER/groups_time_area/interpolation/combined_int_values.xlsx", index_col=0, usecols = 0)

  usecols = _maybe_convert_usecols(usecols)


In [498]:
MALI_combined1['Na2O'] = combined_int_clr['Na2O']
MALI_combined1['K2O'] = combined_int_clr['K2O']
MALI_combined1['CaO'] = combined_int_clr['CaO']
MALI_combined1['SiO2'] = combined_int_clr['SiO2']

In [499]:
MALI_combined_x1 = MALI_combined1.iloc[:,:1]
MALI_combined_x2 = MALI_combined1.iloc[:,1:2]
MALI_combined_x3 = MALI_combined1.iloc[:,2:3]
MALI_combined_x4 = MALI_combined1.iloc[:,3:4]

In [500]:
MALI_combined1

Unnamed: 0,Na2O,K2O,CaO,SiO2
2809,0.719515,0.685173,0.110596,3.617717
2810,0.720766,0.687057,0.108421,3.619309
2811,0.721736,0.688514,0.106766,3.620542
2812,0.719065,0.686071,0.100003,3.607553
2813,0.719245,0.686340,0.099758,3.607763
...,...,...,...,...
363036,0.559775,0.794516,0.101917,3.609093
363037,0.559770,0.794514,0.101923,3.609090
363038,0.559767,0.794513,0.101929,3.609088
363039,0.559765,0.794514,0.101934,3.609087


In [501]:
scaler1 = StandardScaler().fit(MALI_combined_x1)
scaler2 = StandardScaler().fit(MALI_combined_x2)
scaler3 = StandardScaler().fit(MALI_combined_x3)
scaler4 = StandardScaler().fit(MALI_combined_x4)

In [502]:
MALI_combined_x1_scaled = scaler1.transform(MALI_combined_x1)
MALI_combined_x2_scaled = scaler2.transform(MALI_combined_x2)
MALI_combined_x3_scaled = scaler3.transform(MALI_combined_x3)
MALI_combined_x4_scaled = scaler4.transform(MALI_combined_x4)

In [503]:
MALI_combined_x1_df = pd.DataFrame(data =MALI_combined_x1_scaled,columns=['Na2O'])
MALI_combined_x2_df = pd.DataFrame(data =MALI_combined_x2_scaled,columns=['K2O'])
MALI_combined_x3_df = pd.DataFrame(data =MALI_combined_x3_scaled,columns=['CaO'])
MALI_combined_x4_df = pd.DataFrame(data =MALI_combined_x4_scaled,columns=['SiO2'])

In [504]:
MALI_combined['Na2O'] = MALI_combined_x1_df['Na2O']
MALI_combined['K2O'] = MALI_combined_x2_df['K2O']
MALI_combined['CaO'] = MALI_combined_x3_df['CaO']
MALI_combined['SiO2'] = MALI_combined_x4_df['SiO2']

In [505]:
MALI_combined

Unnamed: 0,Na2O,K2O,CaO,SiO2
2809,0.756949,0.321229,0.341758,0.293983
2810,0.814191,0.362977,0.200366,0.358516
2811,0.792138,0.346882,0.251435,0.336688
2812,0.776167,0.335243,0.289201,0.319856
2813,0.879962,0.407784,0.124019,0.434175
...,...,...,...,...
363036,-0.143381,-0.354502,0.263038,-0.115509
363037,-0.120446,-0.354177,0.215359,-0.105831
363038,-0.118472,-0.348547,0.208966,-0.103478
363039,-0.115815,-0.342132,0.202026,-0.100313


In [506]:
preproc.save_obj(MALI_combined, "MALI_combined")

In [507]:
MALI_combined_pca = preproc.pca(MALI_combined)
preproc.pca_variance(MALI_combined_pca)

3 PCA components  out of 4 components with variance sum 0.9964188346015275 needed for obtaining sum of variance > 0.95


array([0.85623602, 0.08871336, 0.05146946, 0.00358117])

In [508]:
MALI_combined_pca_df = preproc.create_pca_df(MALI_combined_pca, MALI_combined)

In [509]:
preproc.save_obj(MALI_combined_pca_df, "MALI_combined_pca_df")