# Preprocessing

In [71]:
import numpy as np
import pandas as pd
import os
import pickle
import utm
import random

In [72]:
import prepostprocessing.cleaning as cleaning
import prepostprocessing.pre_processing as preproc

In [73]:
# Load jupyter extension to reload packages before executing user code.
# https://ipython.readthedocs.io/en/stable/config/extensions/autoreload.html
%load_ext autoreload
# Reload all packages (except those excluded by %aimport) every time before executing the Python code typed.
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Mineralogy

**To do**
* ~~Clean last points in Excel file while using "sum" as check~~

In [74]:
mineralogy = pd.read_excel("../_CLEANED/Vistelius_data_cleaned_all.xlsx", index_col=0)

### Check for wrong entries

In [75]:
np.isclose(mineralogy.loc[:, :"oth"].sum(axis=1), mineralogy.loc[:, "sum"])

array([ True,  True,  True, ...,  True,  True,  True])

In [76]:
wrong_sum = mineralogy.loc[~np.isclose(mineralogy.loc[:, :"oth"].sum(axis=1), mineralogy.loc[:, "sum"]), :]

In [77]:
# Check to see if any remaining incorrect lines are present
wrong_sum

Unnamed: 0,SiO2,TiO2,Al2O3,Fe2O3,FeO,MnO,MgO,CaO,Na2O,K2O,P2O5,l.i.,oth,sum,hs


In [78]:
mineralogy.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4659 entries, 1 to 4659
Data columns (total 15 columns):
SiO2     4659 non-null float64
TiO2     4626 non-null float64
Al2O3    4659 non-null float64
Fe2O3    4657 non-null float64
FeO      4659 non-null float64
MnO      4544 non-null float64
MgO      4658 non-null float64
CaO      4659 non-null float64
Na2O     4659 non-null float64
K2O      4659 non-null float64
P2O5     3834 non-null float64
l.i.     4659 non-null float64
oth      874 non-null float64
sum      4659 non-null float64
hs       2240 non-null float64
dtypes: float64(15)
memory usage: 582.4 KB


In [79]:
wrong_sum.loc[:, :"oth"].sum(axis=1)

Series([], dtype: float64)

In [80]:
mineralogy = preproc.replace_nan(mineralogy, 0)

In [81]:
mineralogy['oth'] = mineralogy['P2O5'] + mineralogy['l.i.'] + mineralogy['oth']
mineralogy.drop(["l.i.", "P2O5", ], axis = 1, inplace = True)

In [82]:
mineralogy.head()

Unnamed: 0,SiO2,TiO2,Al2O3,Fe2O3,FeO,MnO,MgO,CaO,Na2O,K2O,oth,sum,hs
1,80.8,0.04,10.16,0.61,1.72,0.0,0.4,0.55,2.0,3.59,0.35,100.22,0.0
2,80.0,0.1,10.1,0.17,0.56,0.02,0.4,0.35,2.3,5.1,0.55,99.65,0.0
3,79.92,0.05,9.89,0.16,1.73,0.02,0.12,0.14,0.75,6.15,1.1,100.03,0.3
4,79.65,0.04,9.64,1.15,0.75,0.1,0.45,0.67,3.71,4.25,0.26,100.67,0.01
5,79.18,0.08,10.24,0.64,2.6,0.04,0.05,1.25,1.52,3.08,1.92,100.6,0.28


### Cleaning
**To do**
* ~~Replace zero values~~


In [83]:
# Would not do this this way since it becomes less clear what the variable means
# You should also replace 'minralogy' in all remaining cells by 'x' if you would want to do this
# x = mineralogy

In [84]:
# Replace zero values
mineralogy = preproc.replace_zero(mineralogy, 0.01)

In [85]:
mineralogy.to_excel("../_CLEANED/Vistelius_data_cleaned.xlsx")

* ~~Replace nan values~~

In [86]:
# Also replace NaN values by 0.01
mineralogy = preproc.replace_nan(mineralogy, 0.01)

* ~~Normalize~~

In [87]:
# Normalize specific columns
mineralogy.loc[:, :"oth"] = preproc.normalize(mineralogy.loc[:, :"oth"])# , total=mineralogy['sum'])

In [88]:
# Renew 'sum' column to reflect changes applied during cleaning
mineralogy["sum"] = mineralogy.loc[:, :"oth"].sum(axis=1)

In [89]:
# Check that sum of all variables + 'sum' == 200
assert all(np.isclose(mineralogy.loc[:, :'sum'].sum(axis=1), 200.0))

In [90]:
mineralogy.to_excel("../_INTERPOLATION/normalised_values.xlsx")

### centred log-ratio (clr) transformation

In [91]:
mineralogy_clr = preproc.clr(mineralogy)
mineralogy_clr.head()

Unnamed: 0,SiO2,TiO2,Al2O3,Fe2O3,FeO,MnO,MgO,CaO,Na2O,K2O,oth,sum,hs
1,4.51918,-3.091673,2.445661,-0.367093,0.669527,-4.477967,-0.789088,-0.470634,0.82035,1.405355,-0.922619,4.73467,-4.47567
2,4.534383,-2.150229,2.464892,-1.6196,-0.427462,-3.759667,-0.763934,-0.897466,0.985266,1.781597,-0.445481,4.75402,-4.45632
3,4.425654,-2.951104,2.336152,-1.787953,0.592749,-3.867395,-2.075636,-1.921485,-0.243054,1.86108,0.139938,4.650098,-1.159045
4,4.285378,-3.31114,2.173657,0.047498,-0.379946,-2.394849,-0.890771,-0.492741,1.218768,1.354655,-1.439337,4.519584,-4.690756
5,4.046324,-2.851128,2.000902,-0.771686,0.630112,-3.544275,-3.321132,-0.102256,0.093311,0.79953,0.326926,4.285753,-1.592383


### Principal Component Analysis (PCA)

In [92]:
mineralogy_pca = preproc.pca(mineralogy_clr)
preproc.pca_variance(mineralogy_pca)

7 PCA components  out of 13 components with variance sum 0.9535306717860614 needed for obtaining sum of variance > 0.95


array([3.59050577e-01, 2.60958082e-01, 1.29041804e-01, 6.59819203e-02,
       5.75172491e-02, 4.46332610e-02, 3.63477772e-02, 2.62125369e-02,
       1.48989319e-02, 4.34031836e-03, 9.77100841e-04, 4.04401875e-05,
       2.24837794e-32])

In [93]:
mineralogy_pca_df = preproc.create_pca_df(mineralogy_pca, mineralogy_clr)

In [94]:
mineralogy_pca_df

Unnamed: 0,PC01,PC02,PC03,PC04,PC05,PC06,PC07,PC08,PC09,PC10,PC11,PC12,PC13
1,-0.447174,-1.225068,0.116980,-1.118125,0.072895,0.968142,0.135201,-1.046815,-0.163684,0.405531,0.210871,0.012263,-4.978631e-16
2,-0.376621,-1.410760,0.977915,-0.596661,-0.400204,0.249458,0.797682,0.539827,-0.157654,0.334672,0.207434,0.038930,1.605954e-16
3,3.187993,-1.541733,1.059821,-0.101622,-1.098362,-0.327546,0.899679,-1.063429,-0.901138,0.906119,0.141421,0.049035,2.368786e-16
4,-0.733521,-1.464722,-0.690817,0.790791,-0.052407,1.647700,0.216261,0.043582,-0.113234,-0.046977,0.278490,0.029399,-6.753686e-16
5,2.656838,-0.964694,0.074225,0.373453,-1.419034,-0.564535,-1.424684,-1.158970,-0.447363,0.403525,0.278001,0.015713,3.613596e-16
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4655,0.892870,2.326275,-1.147952,0.585820,-0.731866,0.744900,-0.265913,-0.424258,0.030811,-0.273406,-0.207447,0.065138,4.183668e-16
4656,-2.408156,2.450686,-0.553429,0.009524,0.068795,0.014798,-0.218064,-0.688277,0.071384,0.193791,-0.133836,0.084928,6.753052e-16
4657,1.106035,3.018540,-1.204171,0.246965,-0.323195,-0.017034,0.076226,-0.156054,-0.219544,0.018146,-0.119151,0.087319,-4.693849e-16
4658,1.357590,2.827987,-0.736605,-0.025258,-0.809447,-0.410685,-0.559875,-0.039578,0.624331,-0.042759,-0.275724,0.061361,-3.417382e-16


____

## Coordinates

In [95]:
coordinates = pd.read_excel("../_DATA/full_with_coordinates.xlsx", index_col=0, usecols=[0, 1, 2, 3])

In [96]:
coordinates

Unnamed: 0,Latitude,Longitude,past_mer
1,"44°31'30.0""","138°37'30.0""",
2,"54°12'10.0""","119°24'0.0""",
3,"62°36'0.0""","155°36'0.0""",
4,"61°35'0.0""","146°2'0.0""",
5,"68°55'0.0""","164°24'0.0""",
...,...,...,...
4655,"66°42'0.0""","164°23'0.0""",
4656,"46°56'30.0""","137°5'3.0""",
4657,"58°12'0.0""","138°12'0.0""",
4658,"60°51'0.0""","147°31'0.0""",


In [97]:
# Delete negative signs in "Longitude" column for dms2dec function to work properly
coordinates["Longitude"] = coordinates["Longitude"].str.replace("-", "")

In [98]:
# Include W in "Longitude" column
sum_ = 0

for index, row in coordinates.iterrows():
    if ("W" in str(row["past_mer"])) or ("w" in str(row["past_mer"])):
        coordinates.loc[index, "Longitude"] = row["Longitude"] + "W"
        
        sum_ += 1

In [99]:
# Check that all occurences of "W" or "w" are catched
assert sum_ == int(coordinates["past_mer"].value_counts())

### Convert from degrees to decimal format

In [100]:
coordinates["Y"] = coordinates.loc[:, "Latitude"].apply(cleaning.dms2dec)
coordinates["X"] = coordinates.loc[:, "Longitude"].apply(cleaning.dms2dec)

In [101]:
# Check
coordinates.loc[42, "X"]

149.38333333333333

In [102]:
coordinates

Unnamed: 0,Latitude,Longitude,past_mer,Y,X
1,"44°31'30.0""","138°37'30.0""",,44.525000,138.625000
2,"54°12'10.0""","119°24'0.0""",,54.202778,119.400000
3,"62°36'0.0""","155°36'0.0""",,62.600000,155.600000
4,"61°35'0.0""","146°2'0.0""",,61.583333,146.033333
5,"68°55'0.0""","164°24'0.0""",,68.916667,164.400000
...,...,...,...,...,...
4655,"66°42'0.0""","164°23'0.0""",,66.700000,164.383333
4656,"46°56'30.0""","137°5'3.0""",,46.941667,137.084167
4657,"58°12'0.0""","138°12'0.0""",,58.200000,138.200000
4658,"60°51'0.0""","147°31'0.0""",,60.850000,147.516667


coordinates = coordinates.rename({"Y" : "Latitude"}, axis=1)
coordinates = coordinates.rename({"X" : "Longitude"}, axis=1)


** plotting in Qgis does not work --> will look into it (something to do with qgis) **

### adding noise to coordinates

**Although the idea was to only add noise to those data points that have replicates, I think it's okay to add noise to all data points.**

In [103]:
coordinates_noise = pd.read_excel("../_INTERPOLATION/coordinates_decimal.xlsx", index_col=0)

In [104]:
coordinates_noise = coordinates_noise.rename({"Y": "Y_without_noise", "X": "X_without_noise"}, axis=1)

In [105]:
# coordinates_noise["random_value"] = np.random.random(size=len(coordinates_noise))
# coordinates_noise["mean"] = 0
# coordinates_noise["std"] = 0.00007

# You don't seem to use the 'random_value' anywhere so commented it
# Instead of assigning the mean and std to every row it is much faster to define it once
# and then use it when calling the random normal sample.
noise_mean = 0
noise_std = 0.00007

In [106]:
# You can set up your own 'pseudo' random number generator with np.random.RandomState(x)
# where x is the random seed that you can choose, I just chose 4343
# This way the samples from the normal distribution will be random but will be the same every time you
# rerun this cell, so that your results in the second notebook 'Interpolation' will also stay the same
# even after rerunning. Otherwise, at every run of the next cell, numpy will choose a new random seed.
# You may also find online that people say to just set the RandomState with np.random.seed = 4343
# but this will affect every random process you start anywhere and that's not what you want for now
pnrg = np.random.RandomState(4343)

In [107]:
# coordinates_noise["normal_distribution"] =  np.random.normal(noise_mean, noise_std)

# Would sample two random numbers so that the coordinates can change in all direction and not just NE or SW
coordinates_noise["noise_for_X"] =  pnrg.normal(noise_mean, noise_std, size=coordinates_noise.shape[0])
coordinates_noise["noise_for_Y"] =  pnrg.normal(noise_mean, noise_std, size=coordinates_noise.shape[0])

In [108]:
coordinates_noise["Y"] = coordinates_noise["Y_without_noise"] + coordinates_noise["noise_for_X"]
coordinates_noise["X"] = coordinates_noise["X_without_noise"] + coordinates_noise["noise_for_Y"]

In [109]:
# NEW
coordinates_noise

Unnamed: 0,Latitude,Longitude,past_mer,Y_without_noise,X_without_noise,noise_for_X,noise_for_Y,Y,X
1,"44°31'30.0""","138°37'30.0""",,44.525027,138.625027,0.000058,-0.000078,44.525085,138.624949
2,"54°12'10.0""","119°24'0.0""",,54.202861,119.400083,0.000016,-0.000237,54.202877,119.399847
3,"62°36'0.0""","155°36'0.0""",,62.600001,155.600001,0.000044,-0.000109,62.600044,155.599891
4,"61°35'0.0""","146°2'0.0""",,61.583302,146.033302,0.000132,-0.000076,61.583433,146.033226
5,"68°55'0.0""","164°24'0.0""",,68.916552,164.399886,-0.000019,-0.000077,68.916533,164.399808
...,...,...,...,...,...,...,...,...,...
4655,"66°42'0.0""","164°23'0.0""",,66.699978,164.383312,-0.000118,0.000014,66.699861,164.383326
4656,"46°56'30.0""","137°5'3.0""",,46.941677,137.084177,-0.000016,-0.000020,46.941661,137.084157
4657,"58°12'0.0""","138°12'0.0""",,58.199973,138.199973,-0.000079,-0.000040,58.199894,138.199934
4658,"60°51'0.0""","147°31'0.0""",,60.849992,147.516659,-0.000163,0.000135,60.849829,147.516794


In [110]:
# OLD
coordinates_noise

Unnamed: 0,Latitude,Longitude,past_mer,Y_without_noise,X_without_noise,noise_for_X,noise_for_Y,Y,X
1,"44°31'30.0""","138°37'30.0""",,44.525027,138.625027,0.000058,-0.000078,44.525085,138.624949
2,"54°12'10.0""","119°24'0.0""",,54.202861,119.400083,0.000016,-0.000237,54.202877,119.399847
3,"62°36'0.0""","155°36'0.0""",,62.600001,155.600001,0.000044,-0.000109,62.600044,155.599891
4,"61°35'0.0""","146°2'0.0""",,61.583302,146.033302,0.000132,-0.000076,61.583433,146.033226
5,"68°55'0.0""","164°24'0.0""",,68.916552,164.399886,-0.000019,-0.000077,68.916533,164.399808
...,...,...,...,...,...,...,...,...,...
4655,"66°42'0.0""","164°23'0.0""",,66.699978,164.383312,-0.000118,0.000014,66.699861,164.383326
4656,"46°56'30.0""","137°5'3.0""",,46.941677,137.084177,-0.000016,-0.000020,46.941661,137.084157
4657,"58°12'0.0""","138°12'0.0""",,58.199973,138.199973,-0.000079,-0.000040,58.199894,138.199934
4658,"60°51'0.0""","147°31'0.0""",,60.849992,147.516659,-0.000163,0.000135,60.849829,147.516794


In [111]:
coordinates_noise.to_excel("../_DATA/full_with_coordinates_noice_final_new.xlsx")

In [112]:
coordinates = pd.read_excel("../_DATA/full_with_coordinates_noice_final.xlsx", index_col=0, usecols=[0, 1, 2, 3, 10, 11])

In [113]:
coordinates

Unnamed: 0,Latitude,Longitude,past_mer,Y,X
1,"44°31'30.0""","138°37'30.0""",,44.525040,138.625040
2,"54°12'10.0""","119°24'0.0""",,54.202798,119.400020
3,"62°36'0.0""","155°36'0.0""",,62.599978,155.599978
4,"61°35'0.0""","146°2'0.0""",,61.583249,146.033249
5,"68°55'0.0""","164°24'0.0""",,68.916572,164.399905
...,...,...,...,...,...
4655,"66°42'0.0""","164°23'0.0""",,66.699992,164.383326
4656,"46°56'30.0""","137°5'3.0""",,46.941584,137.084084
4657,"58°12'0.0""","138°12'0.0""",,58.200076,138.200076
4658,"60°51'0.0""","147°31'0.0""",,60.849958,147.516625


### Convert to UTM coordinates

**To do**
* Group samples into certain groups based on spatial distribution
* Recalculate utm coordinates based on fixed zone (fixed letter and number)

In [114]:
coordinates_utm = coordinates.apply(lambda row : utm.from_latlon(row["Y"], row["X"], force_zone_number=55, force_zone_letter="W"), axis=1)
coordinates_utm = coordinates_utm.apply(pd.Series)
coordinates_utm.columns = ["Y_UTM", "X_UTM", "ZoneNumber", "ZoneLetter"]

In [115]:
coordinates_utm

Unnamed: 0,Y_UTM,X_UTM,ZoneNumber,ZoneLetter
1,-1.655413e+05,4.964422e+06,55,W
2,-1.275960e+06,6.364711e+06,55,W
3,9.406014e+05,6.970457e+06,55,W
4,4.486752e+05,6.828136e+06,55,W
5,1.190516e+06,7.743846e+06,55,W
...,...,...,...,...
4655,1.259115e+06,7.504777e+06,55,W
4656,-2.544172e+05,5.246604e+06,55,W
4657,-1.634235e+04,6.484789e+06,55,W
4658,5.280736e+05,6.746186e+06,55,W


In [116]:
coordinates_utm["ZoneNumber"].value_counts()

55    4659
Name: ZoneNumber, dtype: int64

In [117]:
coordinates_utm["ZoneLetter"].value_counts()

W    4659
Name: ZoneLetter, dtype: int64

In [118]:
(coordinates_utm["ZoneNumber"].astype(str) + coordinates_utm["ZoneLetter"]).value_counts()

55W    4659
dtype: int64

In [119]:
coordinates_utm.to_excel("../_INTERPOLATION/coordinates_UTM.xlsx")
cof= pd.read_excel("../_RESULTS/working_data.xlsx", index_col=0, usecols = lambda column : column not in ["Lat_deg", "Lat_min", "Lat_sec", "Long_deg", "Long_min", "Long_sec", "past_mer"] )

In [120]:
cof

Unnamed: 0,type_granite,time,massif,sampler,others,sampler+year
512,Granite coarse-grained,Tr-J,Yugalkan massif,T.A.Alfer'eva,,"A.D.Kanischev,1959"
1299,Granite coarse-grained,Tr-J,,G.L.Znamenskaya,,"N.P.Kostyakov,1961"
1536,Bt granite porphyraceous,Tr-J,,E.G.Ivanova,,"L.A.Kozubova,1957"
1760,Bt granite medium-grained,Tr-J,,,,"V.I.Fel'dman,1956"
1890,Plagiogranite,Tr-J,,NI.Serebryakova,,"A.V.Vnukov,1959"
...,...,...,...,...,...,...
1150,Granite leucocratic,J,Upper-Buy massif,,.N.P.Mel'nikova,"V.V.Starchenko,1968"
1396,Granite,Tr,Ust'-Nerchugan massifmassif,N.Aolebedeva,,"K.F.Khatskevich,1967"
2116,Granite,,,N.I.Serebryakov~,,"V.Yu.Shenfil,1962"
2973,Bt granite-porphyry,,Ergelyakh massif,G.P.Ignatovich,,"G.G.Naumov,1978"


In [121]:
coordinates_full = pd.concat([coordinates_utm, coordinates, mineralogy, cof], axis = 1)

In [122]:
coordinates_full.to_excel("../_INTERPOLATION/coordinates_full_data.xlsx")


### grouping the data

In [123]:
areax = coordinates_full
areax["area"] = ""


In [124]:
areay = areax[areax["X"].between(148.0, 152.0)]
area8 = areay[areay["Y"].between(69.0, 70)]
area8["area"] = 8
area8.to_excel("../_INTERPOLATION/area3_a.xlsx")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


____

## Metadata

In [125]:
metadata = pd.read_excel("../_INTERPOLATION/coordinates_full_data.xlsx", index_col=0, usecols=[0, 23, 24, 25, 26, 27, 28])

In [126]:
metadata

Unnamed: 0,type_granite,time,massif,sampler,others,sampler+year
1,Granite leucogranitic,K,,,,"V.N.Musin,1970"
2,Granite leucocratic,Tr,,V.I.Zhigalova,,"E.A.Ivanov,1969"
3,Granite leucocratic coarse-grained,K,Omsukchan massif,P.M.Bosek,,"O.S.Gracheva,1948"
4,Granite leucocratic,K,Buksandzhin massif,A.Kh.Brovtman,,"A.F.Mikhaylov,1948"
5,Granite-porphyry micropegmatitic,K,Attykveem massif,L.G.Semenova,0th.:S-0.16,"A.I.Sadovsky,1963"
...,...,...,...,...,...,...
4655,Diorite,K,Egdegkych massif,,Oth.:co2-0.12,"V.A.lgnat'ev,1964"
4656,Quartz diorite,K,Verkhneplotnikovsky massif,,,"A.A.Syas'ko,1969"
4657,Diorite,J,,,"Oth.:co2-0.02,so3-0.0l","N.N.Remizov,1967"
4658,Diorite,K,,,,"A.P.Osipov,1966"


In [127]:
metadata["type_granite"].value_counts()

Granite                                                     656
Granodiorite                                                523
Granite-porphyry                                            282
Bt granite                                                  268
Granite leucocratic                                         220
                                                           ... 
Granodiorite porphyraceous medium-grained uneven-grained      1
Granite-porphyry leucogranitic                                1
Two mica granite uneven-grained                               1
Bt-Hb diorite-granodiorite medium-grained                     1
Quartz monzonite(granodiorite)                                1
Name: type_granite, Length: 401, dtype: int64

In [128]:
metadata["massif"].value_counts()

Ulakhan-Sis massif         58
Khoboyotuu-Echiy massif    56
Vladimirsky massif         35
Zimov'e massif             33
Bom-Gorkhon massif         31
                           ..
Upper-Myaundzhin massif     1
Trakt massif                1
Aplitic massif              1
Maksunuokha-Tas massif      1
Udoma massif                1
Name: massif, Length: 941, dtype: int64

In [129]:
metadata["time"].value_counts()

K       2522
J        825
Tr       517
Pg       365
Tr-J     338
Mz        88
Name: time, dtype: int64

In [130]:
metadata["sampler"].value_counts()

L.S.Voronova         120
D.M.Shuster           87
N.A.Lebedeva          86
V.I.Zhigalova         70
N.P.Mel'nikova        51
                    ... 
T.M.Sablina            1
P.A.Volkov             1
OoP.Maratkanova        1
Nl oSerebryakova       1
Z.l.Shlyadinskaya      1
Name: sampler, Length: 736, dtype: int64

In [131]:
metadata["sampler+year"].value_counts()

G.A.Valuy,1975         76
V.A.Popeko,1968        61
V.A.Faradzhev,1971     43
R.O.Galabala,1976      37
V.S.Ivanov,1968        37
                       ..
Z.M.Mendel'son,1941     1
V.I.Vysotsky,1963       1
A.V.Demin,1964          1
M.G.Zolotov,1949        1
M.N.Kozhemyako,1947     1
Name: sampler+year, Length: 1392, dtype: int64

## data for each area

In [132]:
metadata_area8 = pd.read_excel("../_INTERPOLATION/area3_a.xlsx", index_col=0, usecols=[0, 23, 24, 25, 26, 27, 28])

In [133]:
coordinates_utm_area8 = pd.read_excel("../_INTERPOLATION/area3_a.xlsx", index_col=0, usecols=[0, 1, 2, 3, 4])

In [134]:
coordinates_area8 = pd.read_excel("../_INTERPOLATION/area3_a.xlsx", index_col=0, usecols=[0, 5, 6, 7, 8, 9])

In [135]:
mineralogy_area8 = pd.read_excel("../_INTERPOLATION/area3_a.xlsx", index_col=0, usecols=[0, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20])

In [136]:
mineralogy_clr_area8 = preproc.clr(mineralogy_area8)

In [137]:
mineralogy_pca_area8 = preproc.pca(mineralogy_clr_area8)
preproc.pca_variance(mineralogy_pca_area8)


6 PCA components  out of 11 components with variance sum 0.9542886960234712 needed for obtaining sum of variance > 0.95


array([4.21794666e-01, 2.11933263e-01, 1.03212795e-01, 9.69390259e-02,
       7.31783864e-02, 4.72305596e-02, 2.19770251e-02, 1.72156060e-02,
       5.63065726e-03, 8.88015614e-04, 3.13771337e-32])

In [138]:
mineralogy_pca_area8_df = preproc.create_pca_df(mineralogy_pca_area8, mineralogy_clr_area8)

## Saving of data

In [139]:
# Save data as pickle files to use them in later notebooks
preproc.save_obj(mineralogy_area8, "mineralogy_area8") # mineralogy
preproc.save_obj(mineralogy_clr_area8, "mineralogy_clr_area8") # mineralogy clr
preproc.save_obj(mineralogy_pca_area8, "mineralogy_pca_area8") # mineralogy pca info
preproc.save_obj(mineralogy_pca_area8_df, "mineralogy_pca_area8_df") # mineralogy pca scores

preproc.save_obj(coordinates_area8, "coordinates_area8") # coordinates latlon
preproc.save_obj(coordinates_utm_area8, "coordinates_utm_area8") # coordinates utm
preproc.save_obj(metadata_area8, "metadata_area8") # metadata

____

In [140]:
coordinates_utm_area8

Unnamed: 0,Y_UTM,X_UTM,ZoneNumber,ZoneLetter
15,598837.908412,7715890.0,55,W
119,635439.679361,7725109.0,55,W
400,592263.14065,7717475.0,55,W
888,603288.106669,7703025.0,55,W
1305,636193.302213,7723289.0,55,W
1433,547179.692295,7695679.0,55,W
1511,598296.065461,7697260.0,55,W
1650,637385.481435,7725232.0,55,W
1905,591461.767224,7704428.0,55,W
1942,585316.07941,7693012.0,55,W
