# Import Libraries

In [58]:
import os
import seaborn as sns
import pandas as pd

from matplotlib import pyplot as plt

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

# Load Data

In [17]:
curr_dir = (os.getcwd()).replace('\\', '/')
data_path = os.path.join(curr_dir, 'Data')
str_sed_data = pd.read_csv(data_path + '/stream_sediments_57F11.csv')

In [18]:
str_sed_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 203 entries, 0 to 202
Data columns (total 71 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   gid        203 non-null    int64  
 1   objectid   203 non-null    int64  
 2   sampleno   203 non-null    object 
 3   longitude  203 non-null    float64
 4   latitude   203 non-null    float64
 5   sio2       203 non-null    float64
 6   al2o3      203 non-null    float64
 7   fe2o3      203 non-null    float64
 8   tio2       203 non-null    float64
 9   cao        203 non-null    float64
 10  mgo        203 non-null    float64
 11  mno        203 non-null    float64
 12  na2o       203 non-null    float64
 13  k2o        203 non-null    float64
 14  p2o5       203 non-null    float64
 15  loi        203 non-null    float64
 16  ba         203 non-null    float64
 17  ga         203 non-null    float64
 18  sc         203 non-null    float64
 19  v          203 non-null    float64
 20  th        

In [19]:
str_sed_data.describe()

Unnamed: 0,gid,objectid,longitude,latitude,sio2,al2o3,fe2o3,tio2,cao,mgo,...,w,u,pt,pd,in_,f,te,tl,hg,cd
count,203.0,203.0,203.0,203.0,203.0,203.0,203.0,203.0,203.0,203.0,...,203.0,203.0,203.0,203.0,203.0,203.0,203.0,203.0,203.0,203.0
mean,3886.817734,3886.817734,77.625159,14.374757,61.577385,15.343576,5.106995,0.072808,1.711232,3.511527,...,2.5,2.898556,0.0,0.0,0.0,0.0,0.0,0.0,8.507389,51.55665
std,392.690039,392.690039,0.077262,0.07324,4.585337,1.115737,1.852035,0.037739,0.78106,1.287493,...,0.0,1.197724,0.0,0.0,0.0,0.0,0.0,0.0,9.023413,9.966801
min,2947.0,2947.0,77.5,14.259009,44.1,12.09,2.46,0.03,0.58,1.27,...,2.5,0.507633,0.0,0.0,0.0,0.0,0.0,0.0,2.5,50.0
25%,3705.5,3705.5,77.564815,14.313063,59.146,14.625,3.975,0.05,1.2,2.63,...,2.5,2.125364,0.0,0.0,0.0,0.0,0.0,0.0,5.0,50.0
50%,3934.0,3934.0,77.62037,14.367117,62.75,15.36,4.65,0.06,1.52,3.21,...,2.5,2.646843,0.0,0.0,0.0,0.0,0.0,0.0,7.0,50.0
75%,4160.5,4160.5,77.694444,14.439189,64.325,16.12,5.525,0.07,1.965,4.04,...,2.5,3.588826,0.0,0.0,0.0,0.0,0.0,0.0,9.0,50.0
max,5073.0,5073.0,77.749999,14.493242,69.86,17.97,12.25,0.23,5.72,8.38,...,2.5,7.094,0.0,0.0,0.0,0.0,0.0,0.0,80.0,128.0


Max and Min:
- Longitude
- Latitude

In [24]:
print(f"Maximum longitude co-ordinate: {max(str_sed_data['longitude'])}")
print(f"Minimum longitude co-ordinate: {min(str_sed_data['longitude'])}")

print(f"Maximum latitude co-ordinate: {max(str_sed_data['latitude'])}")
print(f"Minimum latitude co-ordinate: {min(str_sed_data['latitude'])}")

Maximum longitude co-ordinate: 77.749999
Minimum longitude co-ordinate: 77.5
Maximum latitude co-ordinate: 14.493242
Minimum latitude co-ordinate: 14.259009


In [25]:
str_sed_data.head()

Unnamed: 0,gid,objectid,sampleno,longitude,latitude,sio2,al2o3,fe2o3,tio2,cao,...,u,pt,pd,toposheet,in_,f,te,tl,hg,cd
0,2947,2947,57F11/177/SS/12,77.657407,14.475225,61.921,14.606,4.51,0.05,2.28,...,3.016905,0,0,57F11,0,0,0,0,8.0,50
1,2948,2948,57F11/178/SS/12,77.675926,14.475225,62.077,13.75,4.22,0.05,2.43,...,3.37855,0,0,57F11,0,0,0,0,9.0,50
2,2949,2949,57F11/179/SS/12,77.694444,14.475225,68.645,13.41,3.02,0.05,1.32,...,3.509,0,0,57F11,0,0,0,0,2.5,50
3,2950,2950,57F11/180/SS/12,77.712963,14.475225,64.95,14.28,3.46,0.05,1.75,...,4.241,0,0,57F11,0,0,0,0,19.0,50
4,2951,2951,57F11/181/SS/12,77.731481,14.475225,64.302,14.01,4.08,0.05,1.84,...,4.218,0,0,57F11,0,0,0,0,19.0,50


In [43]:
str_sed_data['toposheet'].value_counts()

toposheet
57F11    196
57F07      7
Name: count, dtype: int64

## Elements composition from different samples

In [34]:
elements = []

for col in list(str_sed_data.columns):
    if col not in ['gid', 'objectid', 'sampleno', 'longitude', 'latitude', 'toposheet']:
        elements.append(col)

In [51]:
elements_df = str_sed_data[elements]
elements_df.head()

Unnamed: 0,sio2,al2o3,fe2o3,tio2,cao,mgo,mno,na2o,k2o,p2o5,...,w,u,pt,pd,in_,f,te,tl,hg,cd
0,61.921,14.606,4.51,0.05,2.28,3.24,1.52,2.46,0.61,0.13,...,2.5,3.016905,0,0,0,0,0,0,8.0,50
1,62.077,13.75,4.22,0.05,2.43,4.14,1.52,2.55,0.64,0.12,...,2.5,3.37855,0,0,0,0,0,0,9.0,50
2,68.645,13.41,3.02,0.05,1.32,3.19,2.89,3.08,0.75,0.08,...,2.5,3.509,0,0,0,0,0,0,2.5,50
3,64.95,14.28,3.46,0.05,1.75,3.1,2.34,2.82,0.6,0.12,...,2.5,4.241,0,0,0,0,0,0,19.0,50
4,64.302,14.01,4.08,0.05,1.84,3.24,2.12,2.83,0.75,0.16,...,2.5,4.218,0,0,0,0,0,0,19.0,50


In [53]:
elem_rem = []
for col in elements:
    if (elements_df[col] == 0).all():
        elem_rem.append(col)
print(f"Elements which don't contribute in the sediment composition: {elem_rem}")

Elements which don't contribute in the sediment composition: ['pt', 'pd', 'in_', 'f', 'te', 'tl']


In [54]:
elements_df.drop(elem_rem, axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  elements_df.drop(elem_rem, axis=1, inplace=True)


In [None]:
sns.set(rc = {'figure.figsize':(12,12)})
ax = sns.heatmap(elements_df.corr(), 
                 cmap='YlGnBu', 
                 annot=True)
ax.set_title('Co-relation heatmap', 
             fontsize=18)

## Normalise the values

In [None]:
ct = ColumnTransformer(
    [('sc', StandardScaler(), [elements_df.columns])], 
    remainder='passthrough'
)

norm_data = ct.fit_transform(str_sed_data)
norm_str_sed_data = pd.DataFrame(norm_data, index=str_sed_data, col)