# Preprocessing

In [1]:
import numpy as np
import pandas as pd
import os
import pickle

import utm

In [2]:
import prepostprocessing.cleaning as cleaning
import prepostprocessing.pre_processing as preproc

In [3]:
# Load jupyter extension to reload packages before executing user code.
# https://ipython.readthedocs.io/en/stable/config/extensions/autoreload.html
%load_ext autoreload
# Reload all packages (except those excluded by %aimport) every time before executing the Python code typed.
%autoreload 2

## Mineralogy

**To do**
* Clean last points in Excel file while using "sum" as check

In [6]:
# Load the cleaned data set; the first spreadsheet column becomes the index.
mineralogy = pd.read_excel(
    "../_CLEANED/Vistelius_data_cleaned.xlsx",
    index_col=0,
)

In [23]:
# Compare the row-wise total of all component columns (first column up to and
# including "oth") with the total reported in the "sum" column.
component_total = mineralogy.loc[:, :"oth"].sum(axis=1)
reported_total = mineralogy["sum"]
np.isclose(component_total, reported_total)

array([ True,  True,  True, ...,  True,  True,  True])

In [28]:
# Keep only the rows whose recomputed component total disagrees with the reported "sum".
sum_matches = np.isclose(mineralogy.loc[:, :"oth"].sum(axis=1), mineralogy["sum"])
wrong_sum = mineralogy.loc[~sum_matches]

In [32]:
# Show the rows whose reported "sum" does not match the recomputed total.
wrong_sum

Unnamed: 0,SiO2,TiO2,Al2O3,Fe2O3,FeO,MnO,MgO,CaO,Na2O,K2O,P2O5,l.i.,oth,sum,hs
841,74.75,0.16,14.19,1.13,0.48,0.28,0.28,0.96,5.63,1.52,0.82,0.2,,100.49,0.26
1342,73.53,0.0,16.4,1.38,0.54,0.07,0.27,0.25,4.55,2.12,0.02,1.06,,100.27,
1529,73.1,0.11,15.01,0.55,1.15,0.03,0.36,1.08,3.46,4.74,0.06,0.24,,99.95,
1550,73.06,0.46,13.36,0.71,2.02,0.06,0.37,1.24,3.57,4.34,,0.31,,99.6,0.35
1706,72.66,0.72,13.58,1.48,1.08,0.05,0.25,0.96,3.79,4.3,0.01,1.14,,100.1,0.05
2561,70.2,0.25,15.59,0.44,2.76,0.06,1.12,2.27,3.13,3.84,0.09,0.46,,100.31,0.2
2661,69.9,0.48,14.16,0.95,2.75,0.09,1.02,2.83,5.0,2.4,0.2,0.44,,100.23,
2890,69.18,0.52,14.46,1.01,2.2,0.07,1.38,2.43,3.97,3.78,0.16,0.7,,99.85,
3004,68.71,0.28,14.7,1.41,2.24,,0.61,3.08,4.69,3.96,,0.41,,100.11,
3938,64.76,0.56,15.31,0.96,4.54,0.11,1.57,1.95,3.0,6.2,0.12,0.94,,100.08,


In [67]:
# Column dtypes and non-null counts; several columns (e.g. "oth", "hs", "P2O5") contain missing values.
mineralogy.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4659 entries, 1 to 4659
Data columns (total 15 columns):
SiO2     4659 non-null float64
TiO2     4626 non-null float64
Al2O3    4659 non-null float64
Fe2O3    4657 non-null float64
FeO      4659 non-null float64
MnO      4544 non-null float64
MgO      4658 non-null float64
CaO      4659 non-null float64
Na2O     4659 non-null float64
K2O      4659 non-null float64
P2O5     3833 non-null float64
l.i.     4659 non-null float64
oth      874 non-null float64
sum      4659 non-null float64
hs       2240 non-null float64
dtypes: float64(15)
memory usage: 582.4 KB


In [31]:
# Recomputed component totals for the mismatching rows, to compare against their reported "sum".
wrong_sum.loc[:, :"oth"].sum(axis=1)

841     100.40
1342    100.19
1529     99.89
1550     99.50
1706    100.02
2561    100.21
2661    100.22
2890     99.86
3004    100.09
3938    100.02
4388     99.91
dtype: float64

### Cleaning
**To do**
* Replace zero values
* Normalize

### Centred log-ratio (clr) transformation

In [33]:
# Apply the centred log-ratio transform from the project's pre_processing module.
# NOTE(review): as recorded below, this call emitted a warning at `np.log(data)` and
# then raised an AssertionError. Presumably the zero/missing values listed in the
# "Cleaning" to-do (and possibly the non-component "sum"/"hs" columns) must be
# handled before calling clr — confirm against preproc.clr.
mineralogy_clr = preproc.clr(mineralogy)
mineralogy_clr

  log_data = np.log(data)


AssertionError: 

### Principal Component Analysis (PCA)

In [None]:
# Run the project's PCA helper on the clr-transformed data, then report its variance summary.
# NOTE(review): depends on `mineralogy_clr`, whose cell failed in the recorded run.
mineralogy_pca = preproc.pca(mineralogy_clr)
preproc.pca_variance(mineralogy_pca)

In [None]:
# Build a DataFrame from the PCA result and the clr data (presumably the PCA scores — confirm in preproc).
mineralogy_pca_df = preproc.create_pca_df(mineralogy_pca, mineralogy_clr)

____

## Coordinates

In [42]:
# Read only the first four spreadsheet columns: the index plus
# "Latitude", "Longitude" and "past_mer".
coordinates = pd.read_excel(
    "../_DATA/full_with_coordinates.xlsx",
    index_col=0,
    usecols=[0, 1, 2, 3],
)

In [43]:
# Inspect the raw coordinates (degree/minute/second strings).
coordinates

Unnamed: 0,Latitude,Longitude,past_mer
0,"44°31'30.0""","138°37'30.0""",
1,"54°12'10.0""","119°24'0.0""",
2,"62°36'0.0""","155°36'0.0""",
3,"61°35'0.0""","146°2'0.0""",
4,"68°55'0.0""","164°24'0.0""",
...,...,...,...
4654,"66°42'0.0""","164°23'0.0""",
4655,"46°56'30.0""","137°5'3.0""",
4656,"58°12'0.0""","138°12'0.0""",
4657,"60°51'0.0""","147°31'0.0""",


In [44]:
# Delete negative signs in the "Longitude" column for the dms2dec function to work properly.
# regex=False makes this a literal replacement regardless of the pandas version
# (the default value of `regex` changed between pandas releases).
coordinates["Longitude"] = coordinates["Longitude"].str.replace("-", "", regex=False)

In [45]:
# Append "W" to the longitude of every sample whose "past_mer" entry contains "W"/"w".
# Vectorised and idempotent: a longitude that already ends in "W" is left unchanged,
# so re-running this cell does not append the suffix a second time.
west_mask = coordinates["past_mer"].astype(str).str.contains("w", case=False, regex=False)
needs_suffix = west_mask & ~coordinates["Longitude"].str.endswith("W", na=False)
coordinates.loc[needs_suffix, "Longitude"] = coordinates.loc[needs_suffix, "Longitude"] + "W"

# Number of samples flagged as western; used by the sanity check in the next cell.
sum_ = int(west_mask.sum())

In [73]:
# Check that all occurrences of "W" or "w" are caught: every non-empty "past_mer"
# entry must have been counted. Counting non-null entries directly avoids calling
# int() on the value_counts() Series, which only works when it has exactly one entry.
n_past_mer = int(coordinates["past_mer"].notna().sum())
assert sum_ == n_past_mer, f"{sum_} 'W' flags handled, but {n_past_mer} non-empty 'past_mer' entries found"

### Convert from degrees to decimal format

In [47]:
# Convert the degree/minute/second strings to decimal degrees:
# latitude goes to "Y", longitude to "X".
coordinates["Y"] = coordinates["Latitude"].apply(cleaning.dms2dec)
coordinates["X"] = coordinates["Longitude"].apply(cleaning.dms2dec)

In [48]:
# Check: spot-check one converted longitude (sample 42); the recorded result is a
# negative value, i.e. a longitude that was flagged as western.
coordinates.loc[42, "X"]

-178.83333333333334

### Convert to UTM coordinates

**To do**
* Group samples into certain groups based on spatial distribution
* Recalculate UTM coordinates based on a fixed zone (fixed letter and number)

In [49]:
# Convert each (latitude, longitude) pair in decimal degrees to UTM.
# utm.from_latlon returns a 4-tuple per point, stored below as
# "X" (easting), "Y" (northing), "ZoneNumber" and "ZoneLetter".
# Building the frame once from a list of tuples avoids the slow row-wise
# DataFrame.apply followed by apply(pd.Series), and also works for an empty frame.
utm_records = [
    utm.from_latlon(lat, lon)
    for lat, lon in zip(coordinates["Y"], coordinates["X"])
]
coordinates_utm = pd.DataFrame(
    utm_records,
    index=coordinates.index,
    columns=["X", "Y", "ZoneNumber", "ZoneLetter"],
)

In [52]:
# Inspect the UTM coordinates together with their zone number and letter.
coordinates_utm

Unnamed: 0,X,Y,ZoneNumber,ZoneLetter
0,311272.566098,4.932930e+06,54,T
1,656538.925786,6.008743e+06,50,U
2,633468.918083,6.943713e+06,56,V
3,448679.859182,6.828145e+06,55,V
4,475912.972942,7.645188e+06,58,W
...,...,...,...,...
4654,472781.240413,7.398072e+06,58,W
4655,658620.016371,5.200790e+06,53,T
4656,335447.154681,6.454395e+06,54,V
4657,528075.823175,6.746190e+06,55,V


In [53]:
# Number of samples per UTM zone number; the samples are spread over many zones
# (including zones 1 and 2 next to zones 47-60).
coordinates_utm["ZoneNumber"].value_counts()

53    971
49    802
50    689
54    655
55    279
56    232
58    174
60    151
1     139
59    133
48    132
57    130
52     90
2      40
51     39
47      3
Name: ZoneNumber, dtype: int64

In [54]:
# Number of samples per UTM zone letter.
coordinates_utm["ZoneLetter"].value_counts()

U    1875
W    1117
T     878
V     758
X      31
Name: ZoneLetter, dtype: int64

In [63]:
# Count samples per full UTM zone label (zone number followed by zone letter, e.g. "49U").
zone_number_text = coordinates_utm["ZoneNumber"].astype(str)
zone_labels = zone_number_text + coordinates_utm["ZoneLetter"]
zone_labels.value_counts()

49U    759
50U    658
53T    643
55V    197
56V    196
54W    193
54V    178
53U    167
53W    148
1W     139
54T    129
60W    126
59W    125
54U    124
58W    111
55W     82
48U     77
58V     63
52W     58
57V     57
57W     56
48T     55
2W      40
49T     40
51U     38
56W     36
52U     32
54X     31
60V     25
50V     20
57U     17
53V     13
50T     11
59V      8
47U      3
49W      3
51V      1
dtype: int64

____

## Metadata

In [34]:
# Everything in the workbook except the raw coordinate columns is kept as sample metadata.
metadata = (
    pd.read_excel("../_DATA/full_with_coordinates.xlsx", index_col=0)
    .drop(columns=["Latitude", "Longitude", "past_mer"])
)

In [35]:
# Inspect the metadata columns (rock type, age, massif, sampler information).
metadata

Unnamed: 0,type_granite,time,massif,sampler,others,sampler+year
0,Granite leucogranitic,K2,,,,"V.N.Musin,1970"
1,Granite leucocratic,Tr,,V.I.Zhigalova,,"E.A.Ivanov,1969"
2,Granite leucocratic coarse-grained,K2,Omsukchan massif,P.M.Bosek,,"O.S.Gracheva,1948"
3,Granite leucocratic,K1,Buksandzhin massif,A.Kh.Brovtman,,"A.F.Mikhaylov,1948"
4,Granite-porphyry micropegmatitic,K1,Attykveem massif,L.G.Semenova,0th.:S-0.16,"A.I.Sadovsky,1963"
...,...,...,...,...,...,...
4654,Diorite,K1,Egdegkych massif,,Oth.:co2-0.12,"V.A.lgnat'ev,1964"
4655,Quartz diorite,K2,Verkhneplotnikovsky massif,,,"A.A.Syas'ko,1969"
4656,Diorite,J3,,,"Oth.:co2-0.02,so3-0.0l","N.N.Remizov,1967"
4657,Diorite,K2,,,,"A.P.Osipov,1966"


In [36]:
# Frequency of each rock-type label; there are several hundred distinct labels, many occurring once.
metadata["type_granite"].value_counts()

Granite                                    656
Granodiorite                               523
Granite-porphyry                           282
Bt granite                                 268
Granite leucocratic                        220
                                          ... 
Granodiorite leucocratic                     1
Mu granite porphyraceous coarse-grained      1
Px-Amf granodiorite                          1
Amf-Bt granite-porphyry                      1
Hb-Bt granite(granosyenite)                  1
Name: type_granite, Length: 401, dtype: int64

In [37]:
# Frequency of each massif name.
metadata["massif"].value_counts()

Ulakhan-Sis massif         58
Khoboyotuu-Echiy massif    56
Vladimirsky massif         35
Zimov'e massif             33
Bom-Gorkhon massif         31
                           ..
Shivkin massif              1
Kyragas massif              1
Dogdin massif               1
Alchan massif               1
Chekhalin massif            1
Name: massif, Length: 942, dtype: int64

In [38]:
# Frequency of each age label.
# NOTE(review): the recorded output contains labels that look like inconsistent
# spellings of the same age (e.g. "K2_Pg" vs "K2-Pg", "J 1_2" vs "J1_2") — confirm
# and harmonise before grouping on this column.
metadata["time"].value_counts()

K2         1271
K1         1151
Tr          465
J3          411
J2          154
K2-Pg1      127
Tr-J1       122
Tr3-J1      121
J1_2        108
Tr-J         82
Pg1          78
Pg           74
J1           69
Mz           57
K            52
J            46
K2-Pg        42
J2_3         35
J3-K1        31
Mz1          28
Tr3          28
Pg1_2        20
Pg2          18
Tr2          17
K1_2          9
Tr-J2         7
Tr3-J         3
Tr2_3         3
K2_Pg         2
Tr1           2
Mz3           2
N             2
J-K           2
Tr3-J1N       2
Pg3           2
Mz1-N         1
J3-Pg         1
J2-K          1
J3-Pg1        1
K1-K2         1
J-Pg3         1
J 1_2         1
Pg2_3         1
Tr1-J1        1
Name: time, dtype: int64

In [39]:
# Frequency of each sampler name.
# NOTE(review): some single-occurrence names in the recorded output look like
# transcription errors of more frequent ones (e.g. "NP.Me l'nikova" vs
# "N.P.Mel'nikova") — confirm before using this column for grouping.
metadata["sampler"].value_counts()

L.S.Voronova       120
D.M.Shuster         87
N.A.Lebedeva        86
V.I.Zhigalova       70
N.P.Mel'nikova      49
                  ... 
R.PKopnova           1
A.V.Skri pina        1
NP.Me l'nikova       1
N.A.Krivitskaya      1
E.N.Grigoryan        1
Name: sampler, Length: 739, dtype: int64

In [40]:
# Frequency of each "name,year" entry in the "sampler+year" column.
metadata["sampler+year"].value_counts()

G.A.Valuy,1975         76
V.A.Popeko,1968        61
V.A.Faradzhev,1971     43
V.S.Ivanov,1968        37
R.O.Galabala,1976      37
                       ..
I.N.Trumpe,1958         1
N.I.Tikhomirov,1938     1
A.I.Gus'kova,1954       1
M.N.Zlobin,1941         1
M.P.Krutous,1956        1
Name: sampler+year, Length: 1392, dtype: int64

## Saving of data

In [None]:
# Save data as pickle files to use them in later notebooks.
# NOTE(review): save_obj is called with the object only, as in the original cell;
# confirm whether it also expects a target name or path.
preproc.save_obj(mineralogy) # mineralogy
preproc.save_obj(mineralogy_clr) # mineralogy clr
preproc.save_obj(mineralogy_pca) # mineralogy pca info
preproc.save_obj(mineralogy_pca_df) # mineralogy pca scores

# The original `___` placeholders resolve to IPython's third-last cell output,
# so the intended frames are passed explicitly instead.
preproc.save_obj(coordinates) # coordinates latlon
preproc.save_obj(coordinates_utm) # coordinates utm
preproc.save_obj(metadata) # metadata

____