# Preprocessing

In [1]:
import numpy as np
import pandas as pd
import os
import pickle
import utm
import random

In [2]:
import prepostprocessing.cleaning as cleaning
import prepostprocessing.pre_processing as preproc

In [3]:
# Load jupyter extension to reload packages before executing user code.
# https://ipython.readthedocs.io/en/stable/config/extensions/autoreload.html
%load_ext autoreload
# Reload all packages (except those excluded by %aimport) every time before executing the Python code typed.
%autoreload 2

## Mineralogy

**To do**
* ~~Clean last points in Excel file while using "sum" as check~~

In [4]:
mineralogy = pd.read_excel("../_CLEANED/Vistelius_data_cleaned.xlsx", index_col=0)

### Check for wrong entries

In [5]:
np.isclose(mineralogy.loc[:, :"oth"].sum(axis=1), mineralogy.loc[:, "sum"])

array([ True,  True,  True, ...,  True,  True,  True])

In [6]:
wrong_sum = mineralogy.loc[~np.isclose(mineralogy.loc[:, :"oth"].sum(axis=1), mineralogy.loc[:, "sum"]), :]

In [7]:
# Check to see if any remaining incorrect lines are present
wrong_sum

Unnamed: 0,SiO2,TiO2,Al2O3,Fe2O3,FeO,MnO,MgO,CaO,Na2O,K2O,P2O5,l.i.,oth,sum,hs


In [8]:
mineralogy.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4659 entries, 1 to 4659
Data columns (total 15 columns):
SiO2     4659 non-null float64
TiO2     4626 non-null float64
Al2O3    4659 non-null float64
Fe2O3    4657 non-null float64
FeO      4659 non-null float64
MnO      4544 non-null float64
MgO      4658 non-null float64
CaO      4659 non-null float64
Na2O     4659 non-null float64
K2O      4659 non-null float64
P2O5     3834 non-null float64
l.i.     4659 non-null float64
oth      874 non-null float64
sum      4659 non-null float64
hs       2240 non-null float64
dtypes: float64(15)
memory usage: 582.4 KB


In [9]:
wrong_sum.loc[:, :"oth"].sum(axis=1)

Series([], dtype: float64)

In [10]:
mineralogy.head()

Unnamed: 0,SiO2,TiO2,Al2O3,Fe2O3,FeO,MnO,MgO,CaO,Na2O,K2O,P2O5,l.i.,oth,sum,hs
1,80.8,0.04,10.16,0.61,1.72,,0.4,0.55,2.0,3.59,,0.35,,100.22,
2,80.0,0.1,10.1,0.17,0.56,0.02,0.4,0.35,2.3,5.1,0.05,0.5,,99.65,
3,79.92,0.05,9.89,0.16,1.73,0.02,0.12,0.14,0.75,6.15,,1.02,0.08,100.03,0.3
4,79.65,0.04,9.64,1.15,0.75,0.1,0.45,0.67,3.71,4.25,,0.26,,100.67,0.01
5,79.18,0.08,10.24,0.64,2.6,0.04,0.05,1.25,1.52,3.08,0.01,1.75,0.16,100.6,0.28


### Cleaning
**To do**
* ~~Replace zero values~~


In [11]:
# Would not do this this way since it becomes less clear what the variable means
# You should also replace 'minralogy' in all remaining cells by 'x' if you would want to do this
# x = mineralogy

In [12]:
# Replace zero values
mineralogy = preproc.replace_zero(mineralogy, 0.01)

In [13]:
mineralogy.to_excel("../_CLEANED/Vistelius_data_cleaned.xlsx")

* ~~Replace nan values~~

In [14]:
# Also replace NaN values by 0.01
mineralogy = preproc.replace_nan(mineralogy, 0.01)

* ~~Normalize~~

In [15]:
# Normalize specific columns
mineralogy.loc[:, :"oth"] = preproc.normalize(mineralogy.loc[:, :"oth"])# , total=mineralogy['sum'])

In [16]:
# Renew 'sum' column to reflect changes applied during cleaning
mineralogy["sum"] = mineralogy.loc[:, :"oth"].sum(axis=1)

In [17]:
# Check that sum of all variables + 'sum' == 200
assert all(np.isclose(mineralogy.loc[:, :'sum'].sum(axis=1), 200.0))

In [18]:
mineralogy.to_excel("../_INTERPOLATION/normalised_values.xlsx")

### centred log-ratio (clr) transformation

In [19]:
mineralogy_clr = preproc.clr(mineralogy)
mineralogy_clr.head()

Unnamed: 0,SiO2,TiO2,Al2O3,Fe2O3,FeO,MnO,MgO,CaO,Na2O,K2O,P2O5,l.i.,oth,sum,hs
1,5.116216,-2.494637,3.042697,0.229942,1.266563,-3.880932,-0.192052,0.126402,1.417386,2.002391,-3.880932,-0.325583,-3.880932,5.331906,-3.878435
2,5.027136,-1.657475,2.957645,-1.126847,0.065291,-3.266913,-0.271181,-0.404712,1.478019,2.27435,-2.350623,-0.048037,-3.96006,5.246874,-3.963466
3,4.900117,-2.476641,2.810616,-1.31349,1.067213,-3.392932,-1.601172,-1.447021,0.231409,2.335543,-4.086079,0.538894,-2.006637,5.124661,-0.684482
4,4.911676,-2.684842,2.799955,0.673796,0.246352,-1.768551,-0.264473,0.133557,1.845066,1.980953,-4.071136,-0.813039,-4.071136,5.146081,-4.06426
5,4.525075,-2.372377,2.479653,-0.292936,1.108863,-3.065524,-2.842381,0.376495,0.572062,1.278281,-4.451819,0.712967,-1.67923,4.764504,-1.113632


### Principal Component Analysis (PCA)

In [20]:
mineralogy_pca = preproc.pca(mineralogy_clr)
preproc.pca_variance(mineralogy_pca)

9 PCA components  out of 15 components with variance sum 0.966625886419962 needed for obtaining sum of variance > 0.95


array([2.72769107e-01, 1.98486337e-01, 1.35269270e-01, 1.11034812e-01,
       8.48676592e-02, 5.96963124e-02, 4.62857519e-02, 3.20709944e-02,
       2.61456423e-02, 1.88742451e-02, 1.07280680e-02, 3.03849513e-03,
       7.04237437e-04, 2.90679570e-05, 1.71494552e-32])

In [21]:
mineralogy_pca_df = preproc.create_pca_df(mineralogy_pca, mineralogy_clr)

In [22]:
mineralogy_pca_df

Unnamed: 0,PC01,PC02,PC03,PC04,PC05,PC06,PC07,PC08,PC09,PC10,PC11,PC12,PC13,PC14,PC15
1,0.070870,-1.570739,-0.662051,0.609276,0.342458,-0.268580,-1.071697,0.923562,0.139705,-1.049387,-0.157287,0.386721,0.210287,0.012389,4.991755e-16
2,-0.261688,-1.375756,0.805870,-0.300059,0.759149,-0.675690,-0.451482,0.306834,0.781251,0.542305,-0.157483,0.341606,0.209525,0.039844,2.172231e-16
3,3.914195,-1.572119,0.813464,1.076293,0.784873,-1.234797,0.391221,-0.305773,0.900094,-1.047324,-0.883750,0.863245,0.139512,0.048587,5.252676e-16
4,-0.160643,-1.815379,-1.297618,0.321477,-0.189479,0.327796,0.756335,1.567153,0.233893,0.037679,-0.114667,-0.074479,0.277424,0.028466,-1.286639e-16
5,3.508039,-1.311824,0.204961,1.717288,-0.080573,-1.328450,1.018126,-0.541024,-1.417501,-1.140806,-0.429481,0.342445,0.272092,0.014137,7.446751e-16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4655,-0.006406,2.916627,-0.195252,-1.112546,-1.041502,-0.522344,0.727264,0.878206,-0.288274,-0.425628,0.025780,-0.221353,-0.205672,0.066938,-1.660513e-16
4656,-2.943692,2.152769,-0.382931,0.289568,-0.498874,0.052471,-0.014788,0.022861,-0.220074,-0.690008,0.068836,0.207449,-0.131385,0.085468,4.239758e-16
4657,0.390174,3.309386,0.031281,-0.029892,-1.308320,-0.381930,0.393931,0.030935,0.068661,-0.155013,-0.229203,0.029254,-0.115538,0.087171,1.345464e-16
4658,0.424394,3.407414,-0.188329,-0.859408,-0.620405,-1.057895,0.184711,-0.269273,-0.595109,-0.036959,0.622897,-0.002238,-0.274277,0.062982,4.152935e-16


____

## Coordinates

In [23]:
coordinates = pd.read_excel("../_DATA/full_with_coordinates.xlsx", index_col=0, usecols=[0, 1, 2, 3])

In [24]:
coordinates

Unnamed: 0,Latitude,Longitude,past_mer
1,"44°31'30.0""","138°37'30.0""",
2,"54°12'10.0""","119°24'0.0""",
3,"62°36'0.0""","155°36'0.0""",
4,"61°35'0.0""","146°2'0.0""",
5,"68°55'0.0""","164°24'0.0""",
...,...,...,...
4655,"66°42'0.0""","164°23'0.0""",
4656,"46°56'30.0""","137°5'3.0""",
4657,"58°12'0.0""","138°12'0.0""",
4658,"60°51'0.0""","147°31'0.0""",


In [25]:
# Delete negative signs in "Longitude" column for dms2dec function to work properly
coordinates["Longitude"] = coordinates["Longitude"].str.replace("-", "")

In [26]:
# Include W in "Longitude" column
sum_ = 0

for index, row in coordinates.iterrows():
    if ("W" in str(row["past_mer"])) or ("w" in str(row["past_mer"])):
        coordinates.loc[index, "Longitude"] = row["Longitude"] + "W"
        
        sum_ += 1

In [27]:
# Check that all occurences of "W" or "w" are catched
assert sum_ == int(coordinates["past_mer"].value_counts())

### Convert from degrees to decimal format

In [28]:
coordinates["Y"] = coordinates.loc[:, "Latitude"].apply(cleaning.dms2dec)
coordinates["X"] = coordinates.loc[:, "Longitude"].apply(cleaning.dms2dec)

In [29]:
# Check
coordinates.loc[42, "X"]

149.38333333333333

In [30]:
coordinates

Unnamed: 0,Latitude,Longitude,past_mer,Y,X
1,"44°31'30.0""","138°37'30.0""",,44.525000,138.625000
2,"54°12'10.0""","119°24'0.0""",,54.202778,119.400000
3,"62°36'0.0""","155°36'0.0""",,62.600000,155.600000
4,"61°35'0.0""","146°2'0.0""",,61.583333,146.033333
5,"68°55'0.0""","164°24'0.0""",,68.916667,164.400000
...,...,...,...,...,...
4655,"66°42'0.0""","164°23'0.0""",,66.700000,164.383333
4656,"46°56'30.0""","137°5'3.0""",,46.941667,137.084167
4657,"58°12'0.0""","138°12'0.0""",,58.200000,138.200000
4658,"60°51'0.0""","147°31'0.0""",,60.850000,147.516667


coordinates = coordinates.rename({"Y" : "Latitude"}, axis=1)
coordinates = coordinates.rename({"X" : "Longitude"}, axis=1)


** plotting in Qgis does not work --> will look into it (something to do with qgis) **

### adding noise to coordinates

**Although the idea was to only add noise to those data points that have replicates, I think it's okay to add noise to all data points.**

In [31]:
coordinates_noise = pd.read_excel("../_INTERPOLATION/coordinates_decimal.xlsx", index_col=0)

In [32]:
coordinates_noise = coordinates_noise.rename({"Y": "Y_without_noise", "X": "X_without_noise"}, axis=1)

In [33]:
# coordinates_noise["random_value"] = np.random.random(size=len(coordinates_noise))
# coordinates_noise["mean"] = 0
# coordinates_noise["std"] = 0.00007

# You don't seem to use the 'random_value' anywhere so commented it
# Instead of assigning the mean and std to every row it is much faster to define it once
# and then use it when calling the random normal sample.
noise_mean = 0
noise_std = 0.00007

In [34]:
# You can set up your own 'pseudo' random number generator with np.random.RandomState(x)
# where x is the random seed that you can choose, I just chose 4343
# This way the samples from the normal distribution will be random but will be the same every time you
# rerun this cell, so that your results in the second notebook 'Interpolation' will also stay the same
# even after rerunning. Otherwise, at every run of the next cell, numpy will choose a new random seed.
# You may also find online that people say to just set the RandomState with np.random.seed = 4343
# but this will affect every random process you start anywhere and that's not what you want for now
pnrg = np.random.RandomState(4343)

In [35]:
# coordinates_noise["normal_distribution"] =  np.random.normal(noise_mean, noise_std)

# Would sample two random numbers so that the coordinates can change in all direction and not just NE or SW
coordinates_noise["noise_for_X"] =  pnrg.normal(noise_mean, noise_std, size=coordinates_noise.shape[0])
coordinates_noise["noise_for_Y"] =  pnrg.normal(noise_mean, noise_std, size=coordinates_noise.shape[0])

In [36]:
coordinates_noise["Y"] = coordinates_noise["Y_without_noise"] + coordinates_noise["noise_for_X"]
coordinates_noise["X"] = coordinates_noise["X_without_noise"] + coordinates_noise["noise_for_Y"]

In [37]:
# NEW
coordinates_noise

Unnamed: 0,Latitude,Longitude,past_mer,Y_without_noise,X_without_noise,noise_for_X,noise_for_Y,Y,X
1,"44°31'30.0""","138°37'30.0""",,44.525027,138.625027,0.000058,-0.000078,44.525085,138.624949
2,"54°12'10.0""","119°24'0.0""",,54.202861,119.400083,0.000016,-0.000237,54.202877,119.399847
3,"62°36'0.0""","155°36'0.0""",,62.600001,155.600001,0.000044,-0.000109,62.600044,155.599891
4,"61°35'0.0""","146°2'0.0""",,61.583302,146.033302,0.000132,-0.000076,61.583433,146.033226
5,"68°55'0.0""","164°24'0.0""",,68.916552,164.399886,-0.000019,-0.000077,68.916533,164.399808
...,...,...,...,...,...,...,...,...,...
4655,"66°42'0.0""","164°23'0.0""",,66.699978,164.383312,-0.000118,0.000014,66.699861,164.383326
4656,"46°56'30.0""","137°5'3.0""",,46.941677,137.084177,-0.000016,-0.000020,46.941661,137.084157
4657,"58°12'0.0""","138°12'0.0""",,58.199973,138.199973,-0.000079,-0.000040,58.199894,138.199934
4658,"60°51'0.0""","147°31'0.0""",,60.849992,147.516659,-0.000163,0.000135,60.849829,147.516794


In [38]:
# OLD
coordinates_noise

Unnamed: 0,Latitude,Longitude,past_mer,Y_without_noise,X_without_noise,noise_for_X,noise_for_Y,Y,X
1,"44°31'30.0""","138°37'30.0""",,44.525027,138.625027,0.000058,-0.000078,44.525085,138.624949
2,"54°12'10.0""","119°24'0.0""",,54.202861,119.400083,0.000016,-0.000237,54.202877,119.399847
3,"62°36'0.0""","155°36'0.0""",,62.600001,155.600001,0.000044,-0.000109,62.600044,155.599891
4,"61°35'0.0""","146°2'0.0""",,61.583302,146.033302,0.000132,-0.000076,61.583433,146.033226
5,"68°55'0.0""","164°24'0.0""",,68.916552,164.399886,-0.000019,-0.000077,68.916533,164.399808
...,...,...,...,...,...,...,...,...,...
4655,"66°42'0.0""","164°23'0.0""",,66.699978,164.383312,-0.000118,0.000014,66.699861,164.383326
4656,"46°56'30.0""","137°5'3.0""",,46.941677,137.084177,-0.000016,-0.000020,46.941661,137.084157
4657,"58°12'0.0""","138°12'0.0""",,58.199973,138.199973,-0.000079,-0.000040,58.199894,138.199934
4658,"60°51'0.0""","147°31'0.0""",,60.849992,147.516659,-0.000163,0.000135,60.849829,147.516794


In [39]:
coordinates_noise.to_excel("../_DATA/full_with_coordinates_noice_final_new.xlsx")

In [40]:
coordinates = pd.read_excel("../_DATA/full_with_coordinates_noice_final.xlsx", index_col=0, usecols=[0, 1, 2, 3, 10, 11])

In [41]:
coordinates

Unnamed: 0,Latitude,Longitude,past_mer,Y,X
1,"44°31'30.0""","138°37'30.0""",,44.525040,138.625040
2,"54°12'10.0""","119°24'0.0""",,54.202798,119.400020
3,"62°36'0.0""","155°36'0.0""",,62.599978,155.599978
4,"61°35'0.0""","146°2'0.0""",,61.583249,146.033249
5,"68°55'0.0""","164°24'0.0""",,68.916572,164.399905
...,...,...,...,...,...
4655,"66°42'0.0""","164°23'0.0""",,66.699992,164.383326
4656,"46°56'30.0""","137°5'3.0""",,46.941584,137.084084
4657,"58°12'0.0""","138°12'0.0""",,58.200076,138.200076
4658,"60°51'0.0""","147°31'0.0""",,60.849958,147.516625


### Convert to UTM coordinates

**To do**
* Group samples into certain groups based on spatial distribution
* Recalculate utm coordinates based on fixed zone (fixed letter and number)

In [42]:
coordinates_utm = coordinates.apply(lambda row : utm.from_latlon(row["Y"], row["X"], force_zone_number=48, force_zone_letter="U"), axis=1)
coordinates_utm = coordinates_utm.apply(pd.Series)
coordinates_utm.columns = ["Y_UTM", "X_UTM", "ZoneNumber", "ZoneLetter"]

In [43]:
coordinates_utm

Unnamed: 0,Y_UTM,X_UTM,ZoneNumber,ZoneLetter
1,3.166850e+06,5.512837e+06,48,U
2,1.436111e+06,6.102354e+06,48,U
3,2.874853e+06,7.966637e+06,48,U
4,2.565651e+06,7.521202e+06,48,U
5,2.540731e+06,8.757731e+06,48,U
...,...,...,...,...
4655,2.759727e+06,8.616851e+06,48,U
4656,2.927123e+06,5.721711e+06,48,U
4657,2.397980e+06,6.939549e+06,48,U
4658,2.684981e+06,7.505261e+06,48,U


In [44]:
coordinates_utm["ZoneNumber"].value_counts()

48    4659
Name: ZoneNumber, dtype: int64

In [45]:
coordinates_utm["ZoneLetter"].value_counts()

U    4659
Name: ZoneLetter, dtype: int64

In [46]:
(coordinates_utm["ZoneNumber"].astype(str) + coordinates_utm["ZoneLetter"]).value_counts()

48U    4659
dtype: int64

In [47]:
coordinates_utm.to_excel("../_INTERPOLATION/coordinates_UTM.xlsx")
cof= pd.read_excel("../_RESULTS/working_data.xlsx", index_col=0, usecols = lambda column : column not in ["Lat_deg", "Lat_min", "Lat_sec", "Long_deg", "Long_min", "Long_sec", "past_mer"] )

In [48]:
cof

Unnamed: 0,type_granite,time,massif,sampler,others,sampler+year
512,Granite coarse-grained,Tr-J,Yugalkan massif,T.A.Alfer'eva,,"A.D.Kanischev,1959"
1299,Granite coarse-grained,Tr-J,,G.L.Znamenskaya,,"N.P.Kostyakov,1961"
1536,Bt granite porphyraceous,Tr-J,,E.G.Ivanova,,"L.A.Kozubova,1957"
1760,Bt granite medium-grained,Tr-J,,,,"V.I.Fel'dman,1956"
1890,Plagiogranite,Tr-J,,NI.Serebryakova,,"A.V.Vnukov,1959"
...,...,...,...,...,...,...
1150,Granite leucocratic,J,Upper-Buy massif,,.N.P.Mel'nikova,"V.V.Starchenko,1968"
1396,Granite,Tr,Ust'-Nerchugan massifmassif,N.Aolebedeva,,"K.F.Khatskevich,1967"
2116,Granite,,,N.I.Serebryakov~,,"V.Yu.Shenfil,1962"
2973,Bt granite-porphyry,,Ergelyakh massif,G.P.Ignatovich,,"G.G.Naumov,1978"


In [49]:
coordinates_full = pd.concat([coordinates_utm, coordinates, mineralogy, cof], axis = 1)

In [50]:
coordinates_full.to_excel("../_INTERPOLATION/coordinates_full_data.xlsx")


### grouping the data

In [51]:
areax = coordinates_full
areax["area"] = ""


In [52]:
areay1 = areax[areax["X"].between(107.20, 121)]
area1 = areay1[areay1["Y"].between(48.85, 56.6)]
area1["area"] = 1
area1.to_excel("../_INTERPOLATION/area1.xlsx")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [53]:
areay2 = areax[areax["X"].between(131.75, 141.5)]
area2 = areay2[areay2["Y"].between(42, 56)]
area2["area"] = 2
area2.to_excel("../_INTERPOLATION/area2.xlsx")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [54]:
areay3 = areax[areax["X"].between(133, 160.5)]
area3 = areay3[areay3["Y"].between(58.6, 70.85)]
area3["area"] = 3
area3.to_excel("../_INTERPOLATION/area3.xlsx")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [55]:
areay4 = areax[areax["X"].between(160.5, 180)]
area4 = areay4[areay4["Y"].between(58, 75)]
area4["area"] = 4
area4.to_excel("../_INTERPOLATION/area4.xlsx")

In [56]:
areay5 = areax[areax["X"].between(-180, -165)]
area5 = areay5[areay5["Y"].between(63, 70)]
area5["area"] = 5
area5.to_excel("../_INTERPOLATION/area5.xlsx")

In [57]:
area12 = area1.append(area2)
area123 = area12.append(area3) 
area1234 = area123.append(area4) 
area = area1234.append(area5) 
area.to_excel("../_INTERPOLATION/area_subdivided.xlsx")

____

## Metadata

In [58]:
metadata = pd.read_excel("../_INTERPOLATION/coordinates_full_data.xlsx", index_col=0, usecols=[0, 25, 26, 27, 28, 29, 30])

In [59]:
metadata

Unnamed: 0,type_granite,time,massif,sampler,others,sampler+year
1,Granite leucogranitic,K,,,,"V.N.Musin,1970"
2,Granite leucocratic,Tr,,V.I.Zhigalova,,"E.A.Ivanov,1969"
3,Granite leucocratic coarse-grained,K,Omsukchan massif,P.M.Bosek,,"O.S.Gracheva,1948"
4,Granite leucocratic,K,Buksandzhin massif,A.Kh.Brovtman,,"A.F.Mikhaylov,1948"
5,Granite-porphyry micropegmatitic,K,Attykveem massif,L.G.Semenova,0th.:S-0.16,"A.I.Sadovsky,1963"
...,...,...,...,...,...,...
4655,Diorite,K,Egdegkych massif,,Oth.:co2-0.12,"V.A.lgnat'ev,1964"
4656,Quartz diorite,K,Verkhneplotnikovsky massif,,,"A.A.Syas'ko,1969"
4657,Diorite,J,,,"Oth.:co2-0.02,so3-0.0l","N.N.Remizov,1967"
4658,Diorite,K,,,,"A.P.Osipov,1966"


In [60]:
metadata["type_granite"].value_counts()

Granite                                         656
Granodiorite                                    523
Granite-porphyry                                282
Bt granite                                      268
Granite leucocratic                             220
                                               ... 
Granite leucocratic medium-coarse-grained         1
Bt-Amf granite porphyroblastic                    1
Augen diorite                                     1
Granosyenite-porphyry(granodiorite-porphyry)      1
Hb-Bt granosyenite                                1
Name: type_granite, Length: 401, dtype: int64

In [61]:
metadata["massif"].value_counts()

Ulakhan-Sis massif           58
Khoboyotuu-Echiy massif      56
Vladimirsky massif           35
Zimov'e massif               33
Bom-Gorkhon massif           31
                             ..
Monkity massif                1
Nunligran massif              1
Upper-Tiryakhtyakh massif     1
Supkan'ya massif              1
Kucheger massif               1
Name: massif, Length: 941, dtype: int64

In [62]:
metadata["time"].value_counts()

K       2522
J        825
Tr       517
Pg       365
Tr-J     338
Mz        88
Name: time, dtype: int64

In [63]:
metadata["sampler"].value_counts()

L.S.Voronova       120
D.M.Shuster         87
N.A.Lebedeva        86
V.I.Zhigalova       70
N.P.Mel'nikova      51
                  ... 
OoP.Maratkanova      1
MP.Merkul~eva        1
G.IMakarova          1
SYaGermenzon         1
AP.Lyubel'skaya      1
Name: sampler, Length: 736, dtype: int64

In [64]:
metadata["sampler+year"].value_counts()

G.A.Valuy,1975         76
V.A.Popeko,1968        61
V.A.Faradzhev,1971     43
V.S.Ivanov,1968        37
R.O.Galabala,1976      37
                       ..
G.V.Pavlov,1961         1
K.S.Sukhov,1962         1
V.N.Zavaritsky,1935     1
l.I.Puschin,1970        1
Ya.I.Fayn,1955          1
Name: sampler+year, Length: 1392, dtype: int64

## data for each area

In [65]:
metadata_area1 = pd.read_excel("../_INTERPOLATION/area1.xlsx", index_col=0, usecols=[0, 25, 26, 27, 28, 29, 30])
metadata_area2 = pd.read_excel("../_INTERPOLATION/area2.xlsx", index_col=0, usecols=[0, 25, 26, 27, 28, 29, 30])
metadata_area3 = pd.read_excel("../_INTERPOLATION/area3.xlsx", index_col=0, usecols=[0, 25, 26, 27, 28, 29, 30])
metadata_area4 = pd.read_excel("../_INTERPOLATION/area4.xlsx", index_col=0, usecols=[0, 25, 26, 27, 28, 29, 30])
metadata_area5 = pd.read_excel("../_INTERPOLATION/area5.xlsx", index_col=0, usecols=[0, 25, 26, 27, 28, 29, 30])




In [66]:
coordinates_utm_area1 = pd.read_excel("../_INTERPOLATION/area1.xlsx", index_col=0, usecols=[0, 1, 2, 3, 4])
coordinates_utm_area2 = pd.read_excel("../_INTERPOLATION/area2.xlsx", index_col=0, usecols=[0, 1, 2, 3, 4])
coordinates_utm_area3 = pd.read_excel("../_INTERPOLATION/area3.xlsx", index_col=0, usecols=[0, 1, 2, 3, 4])
coordinates_utm_area4 = pd.read_excel("../_INTERPOLATION/area4.xlsx", index_col=0, usecols=[0, 1, 2, 3, 4])
coordinates_utm_area5 = pd.read_excel("../_INTERPOLATION/area5.xlsx", index_col=0, usecols=[0, 1, 2, 3, 4])

In [67]:
coordinates_area1 = pd.read_excel("../_INTERPOLATION/area1.xlsx", index_col=0, usecols=[0, 5, 6, 7, 8, 9])
coordinates_area2 = pd.read_excel("../_INTERPOLATION/area2.xlsx", index_col=0, usecols=[0, 5, 6, 7, 8, 9])
coordinates_area3 = pd.read_excel("../_INTERPOLATION/area3.xlsx", index_col=0, usecols=[0, 5, 6, 7, 8, 9])
coordinates_area4 = pd.read_excel("../_INTERPOLATION/area4.xlsx", index_col=0, usecols=[0, 5, 6, 7, 8, 9])
coordinates_area5 = pd.read_excel("../_INTERPOLATION/area5.xlsx", index_col=0, usecols=[0, 5, 6, 7, 8, 9])

In [68]:
mineralogy_area1 = pd.read_excel("../_INTERPOLATION/area1.xlsx", index_col=0, usecols=[0, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22])
mineralogy_area2 = pd.read_excel("../_INTERPOLATION/area2.xlsx", index_col=0, usecols=[0, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22])
mineralogy_area3 = pd.read_excel("../_INTERPOLATION/area3.xlsx", index_col=0, usecols=[0, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22])
mineralogy_area4 = pd.read_excel("../_INTERPOLATION/area4.xlsx", index_col=0, usecols=[0, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22])
mineralogy_area5 = pd.read_excel("../_INTERPOLATION/area5.xlsx", index_col=0, usecols=[0, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22])

In [69]:
mineralogy_clr_area1 = preproc.clr(mineralogy_area1)
mineralogy_clr_area2 = preproc.clr(mineralogy_area2)
mineralogy_clr_area3 = preproc.clr(mineralogy_area3)
mineralogy_clr_area4 = preproc.clr(mineralogy_area4)
mineralogy_clr_area5 = preproc.clr(mineralogy_area5)

In [70]:
mineralogy_pca_area1 = preproc.pca(mineralogy_clr_area1)
preproc.pca_variance(mineralogy_pca_area1)

mineralogy_pca_area2 = preproc.pca(mineralogy_clr_area2)
preproc.pca_variance(mineralogy_pca_area2)

mineralogy_pca_area3 = preproc.pca(mineralogy_clr_area3)
preproc.pca_variance(mineralogy_pca_area3)

mineralogy_pca_area4 = preproc.pca(mineralogy_clr_area4)
preproc.pca_variance(mineralogy_pca_area4)

mineralogy_pca_area5 = preproc.pca(mineralogy_clr_area5)
preproc.pca_variance(mineralogy_pca_area5)

8 PCA components  out of 13 components with variance sum 0.9555579066346551 needed for obtaining sum of variance > 0.95
8 PCA components  out of 13 components with variance sum 0.9593403254547035 needed for obtaining sum of variance > 0.95
8 PCA components  out of 13 components with variance sum 0.9627406809523947 needed for obtaining sum of variance > 0.95
8 PCA components  out of 13 components with variance sum 0.9597457432608614 needed for obtaining sum of variance > 0.95
7 PCA components  out of 13 components with variance sum 0.96013561671489 needed for obtaining sum of variance > 0.95


array([3.26599017e-01, 2.13750993e-01, 1.57958438e-01, 1.02940256e-01,
       7.36900126e-02, 5.32433330e-02, 3.19535673e-02, 1.74469492e-02,
       1.30907170e-02, 7.53890350e-03, 1.25766416e-03, 5.30149449e-04,
       2.74294555e-32])

In [71]:
mineralogy_pca_area1_df = preproc.create_pca_df(mineralogy_pca_area1, mineralogy_clr_area1)
mineralogy_pca_area2_df = preproc.create_pca_df(mineralogy_pca_area2, mineralogy_clr_area2)
mineralogy_pca_area3_df = preproc.create_pca_df(mineralogy_pca_area3, mineralogy_clr_area3)
mineralogy_pca_area4_df = preproc.create_pca_df(mineralogy_pca_area4, mineralogy_clr_area4)
mineralogy_pca_area5_df = preproc.create_pca_df(mineralogy_pca_area5, mineralogy_clr_area5)


## Saving of data

In [72]:
# Save data as pickle files to use them in later notebooks
preproc.save_obj(mineralogy_area1, "mineralogy_area1") # mineralogy
preproc.save_obj(mineralogy_clr_area1, "mineralogy_clr_area1") # mineralogy clr
preproc.save_obj(mineralogy_pca_area1, "mineralogy_pca_area1") # mineralogy pca info
preproc.save_obj(mineralogy_pca_area1_df, "mineralogy_pca_area1_df") # mineralogy pca scores

preproc.save_obj(coordinates_area1, "coordinates_area1") # coordinates latlon
preproc.save_obj(coordinates_utm_area1, "coordinates_utm_area1") # coordinates utm
preproc.save_obj(metadata_area1, "metadata_area1") # metadata

____

In [73]:
coordinates_utm_area1

Unnamed: 0,Y_UTM,X_UTM,ZoneNumber,ZoneLetter
2,1.436111e+06,6.102354e+06,48,U
6,1.113927e+06,5.652652e+06,48,U
9,1.068636e+06,5.598845e+06,48,U
10,1.098840e+06,5.690263e+06,48,U
11,9.048985e+05,5.630297e+06,48,U
...,...,...,...,...
4538,1.304961e+06,5.902228e+06,48,U
4552,1.328849e+06,6.028456e+06,48,U
4574,1.413275e+06,6.019949e+06,48,U
4585,1.455811e+06,5.990438e+06,48,U
