# Step 1: Data Aggregation and Preparation

## Installing prerequisite libraries and defining prerequisite function





In [None]:
from IPython.display import Javascript
def resize_colab_cell():
  display(Javascript('google.colab.output.setIframeHeight(0, true, {maxHeight: 300})'))
get_ipython().events.register('pre_run_cell', resize_colab_cell)

In [None]:
# Installing ase python library

!pip install --upgrade git+https://gitlab.com/ase/ase.git@master


<IPython.core.display.Javascript object>

Collecting git+https://gitlab.com/ase/ase.git@master
  Cloning https://gitlab.com/ase/ase.git (to revision master) to /tmp/pip-req-build-bj94c8tq
  Running command git clone -q https://gitlab.com/ase/ase.git /tmp/pip-req-build-bj94c8tq


In [None]:
import ase.io
import numpy as np
import collections
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
%matplotlib inline
from IPython.display import display

import sys, os
from ase.io.trajectory import Trajectory

<IPython.core.display.Javascript object>

  import pandas.util.testing as tm


In [None]:
from scipy import stats 
from scipy.special import boxcox1p

# Model related libraries 
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,ExtraTreesRegressor,AdaBoostRegressor,StackingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.impute import KNNImputer
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
# from xgboost import XGBRegressor


# for feature slection
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,MinMaxScaler,StandardScaler
from sklearn.metrics import r2_score,mean_absolute_error,median_absolute_error,mean_squared_error


<IPython.core.display.Javascript object>

In [None]:
# defining a function "load_data(filename)" to load trajectory data

def load_data(filename):
  ''' Input:    filename   - string, name of traj containing the data
      Returns:  atoms_list - list of ASE atoms objects https://wiki.fysik.dtu.dk/ase/ase/atoms.html
                atoms_data - list of dictionaries (same len/order as in atoms_list); keys: 
                             'idx': indices of Al atoms in the ASE atoms objects (np.array)
                             'iso': isotropic shielding of Al atoms (np.array)
  '''
  # load ASE trajectory file https://wiki.fysik.dtu.dk/ase/ase/io/io.html
  atoms_list = ase.io.read(filename, index=':', format='traj')
  
  # features and target values in data will have the same order as in key_names
  atoms_data = []
  for atoms in atoms_list:
    # get indices of Al atoms in the structure
    idx = np.where(atoms.get_atomic_numbers() == 13)[0]
    # get isotropic shielding from the 'info' dictionary for every Al atom in 'atoms'
    iso = np.array([v['iso'] for v in atoms.info.values()])
    atoms_data.append({'idx': idx,
                       'iso': iso})
  return atoms_list, atoms_data

<IPython.core.display.Javascript object>

## loading data





In [None]:
data_file_location="/content/drive/MyDrive/Academics/Mission PhD in MIT/University-Application/Application/Prague-Charles-University/python-task/data.traj"

<IPython.core.display.Javascript object>

In [None]:
# loading data by using "load_data()" function
data= load_data(data_file_location)
print("data type :",type(data))

data

<IPython.core.display.Javascript object>

data type : <class 'tuple'>


([Atoms(symbols='O24NaAlSi11', pbc=True, cell=[[8.99, 0.0, 0.0], [-0.672978999999999, 9.056028999999999, 0.0], [-0.835060000000006, -0.909109999999994, 9.184415000000003]]),
  Atoms(symbols='O24NaAlSi11', pbc=True, cell=[[8.99, 0.0, 0.0], [-0.672978999999999, 9.056028999999999, 0.0], [-0.835060000000006, -0.909109999999994, 9.184415000000003]]),
  Atoms(symbols='O24NaAlSi11', pbc=True, cell=[[8.99, 0.0, 0.0], [-0.672978999999999, 9.056028999999999, 0.0], [-0.835060000000006, -0.909109999999994, 9.184415000000003]]),
  Atoms(symbols='O24NaAlSi11', pbc=True, cell=[[8.99, 0.0, 0.0], [-0.672978999999999, 9.056028999999999, 0.0], [-0.835060000000006, -0.909109999999994, 9.184415000000003]]),
  Atoms(symbols='O24NaAlSi11', pbc=True, cell=[[8.99, 0.0, 0.0], [-0.672978999999999, 9.056028999999999, 0.0], [-0.835060000000006, -0.909109999999994, 9.184415000000003]]),
  Atoms(symbols='O24NaAlSi11', pbc=True, cell=[[8.99, 0.0, 0.0], [-0.672978999999999, 9.056028999999999, 0.0], [-0.835060000000006

In [None]:
from google.colab import drive
drive.mount('/content/drive')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## atoms_data

In [None]:
atoms_data=data[1]
print("data length :",len(atoms_data))
atoms_data

<IPython.core.display.Javascript object>

data length : 1557


[{'idx': array([25]), 'iso': array([486.44861351])},
 {'idx': array([25]), 'iso': array([488.81963006])},
 {'idx': array([25]), 'iso': array([493.75034879])},
 {'idx': array([25]), 'iso': array([489.56519083])},
 {'idx': array([25]), 'iso': array([487.25631837])},
 {'idx': array([25]), 'iso': array([490.7713656])},
 {'idx': array([25]), 'iso': array([486.64376248])},
 {'idx': array([25]), 'iso': array([485.17146118])},
 {'idx': array([25]), 'iso': array([492.52818613])},
 {'idx': array([25]), 'iso': array([490.83234686])},
 {'idx': array([25]), 'iso': array([493.27518564])},
 {'idx': array([25]), 'iso': array([491.27245083])},
 {'idx': array([25]), 'iso': array([491.8651455])},
 {'idx': array([25]), 'iso': array([489.18311833])},
 {'idx': array([25]), 'iso': array([493.0596475])},
 {'idx': array([25]), 'iso': array([488.60053506])},
 {'idx': array([25]), 'iso': array([490.6996296])},
 {'idx': array([25]), 'iso': array([488.45618369])},
 {'idx': array([25]), 'iso': array([492.37484352])

In [None]:
atoms_data=pd.DataFrame.from_dict(atoms_data)
atoms_data    
    

<IPython.core.display.Javascript object>

Unnamed: 0,idx,iso
0,[25],[486.44861350943205]
1,[25],[488.8196300611301]
2,[25],[493.75034879423293]
3,[25],[489.56519083046305]
4,[25],[487.256318369585]
...,...,...
1552,"[54, 55, 56]","[493.9725311244411, 501.1908768035607, 498.654..."
1553,"[54, 55, 56]","[493.11263745761636, 503.10605394710774, 496.0..."
1554,"[54, 55, 56]","[492.0389279937553, 496.2079707295236, 498.636..."
1555,"[54, 55, 56]","[497.1016773424867, 501.1233663161216, 499.609..."


In [None]:
iso_list_main=[]
idx_list_main=[]


for i in range(len(atoms_data)):

    iso_list=(atoms_data.iso)[i][0]
    iso_list_main.append(iso_list)
    
    idx_list=(atoms_data.idx)[i][0]
    idx_list_main.append(idx_list)
    
# iso_list_main
# idx_list_main

dataset_atoms_data = pd.DataFrame({'idx': idx_list_main, 'iso': iso_list_main})
dataset_atoms_data

<IPython.core.display.Javascript object>

Unnamed: 0,idx,iso
0,25,486.448614
1,25,488.819630
2,25,493.750349
3,25,489.565191
4,25,487.256318
...,...,...
1552,54,493.972531
1553,54,493.112637
1554,54,492.038928
1555,54,497.101677


## atomlist

In [None]:
from ase import Atom

<IPython.core.display.Javascript object>

In [None]:
atoms_list=data[0]
print("data length :",len(atoms_list))
atoms_list

<IPython.core.display.Javascript object>

data length : 1557


[Atoms(symbols='O24NaAlSi11', pbc=True, cell=[[8.99, 0.0, 0.0], [-0.672978999999999, 9.056028999999999, 0.0], [-0.835060000000006, -0.909109999999994, 9.184415000000003]]),
 Atoms(symbols='O24NaAlSi11', pbc=True, cell=[[8.99, 0.0, 0.0], [-0.672978999999999, 9.056028999999999, 0.0], [-0.835060000000006, -0.909109999999994, 9.184415000000003]]),
 Atoms(symbols='O24NaAlSi11', pbc=True, cell=[[8.99, 0.0, 0.0], [-0.672978999999999, 9.056028999999999, 0.0], [-0.835060000000006, -0.909109999999994, 9.184415000000003]]),
 Atoms(symbols='O24NaAlSi11', pbc=True, cell=[[8.99, 0.0, 0.0], [-0.672978999999999, 9.056028999999999, 0.0], [-0.835060000000006, -0.909109999999994, 9.184415000000003]]),
 Atoms(symbols='O24NaAlSi11', pbc=True, cell=[[8.99, 0.0, 0.0], [-0.672978999999999, 9.056028999999999, 0.0], [-0.835060000000006, -0.909109999999994, 9.184415000000003]]),
 Atoms(symbols='O24NaAlSi11', pbc=True, cell=[[8.99, 0.0, 0.0], [-0.672978999999999, 9.056028999999999, 0.0], [-0.835060000000006, -0.9

In [None]:
# atoms_list=pd.DataFrame.from_dict(atoms_list)
# # atoms_list

<IPython.core.display.Javascript object>

In [None]:
atoms_list[0]

<IPython.core.display.Javascript object>

Atoms(symbols='O24NaAlSi11', pbc=True, cell=[[8.99, 0.0, 0.0], [-0.672978999999999, 9.056028999999999, 0.0], [-0.835060000000006, -0.909109999999994, 9.184415000000003]])

### positions based features

In [None]:
dataset_position_main= pd.DataFrame()


for i in range(len(atoms_list)):

    a1=atoms_list[i]
    position=a1.get_positions().flatten()
    dataset_position = pd.DataFrame({i: position})
    dataset_position=dataset_position.T
    dataset_position_main = pd.concat([dataset_position_main,dataset_position], ignore_index=True)
    
    
# dataset_position   
dataset_position_main=dataset_position_main.fillna(0)
# dataset_position_main.to_excel("dataset_position_main.xlsx")
dataset_position_main

<IPython.core.display.Javascript object>

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,257,258,259,260,261,262,263,264,265,266
0,-0.731503,5.107745,7.804457,4.635843,0.100921,7.920823,6.779093,1.000704,5.444429,7.493853,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.323777,4.821551,7.746136,4.898302,-0.063683,7.901260,6.709995,0.827481,5.489984,7.244149,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.468969,5.162387,7.428079,5.039972,-0.068103,7.805926,6.633251,0.942144,5.644466,7.145261,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.707422,5.049390,7.822183,4.715939,0.188620,7.775893,6.955133,0.926423,5.381516,7.571735,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.589937,5.085590,7.623891,4.551480,0.303018,7.840643,6.759740,0.933135,5.496689,7.634863,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1552,13.856194,1.504067,2.479810,13.562707,2.970068,2.373920,7.225627,9.970719,6.743151,5.964682,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1553,13.404355,1.282632,2.246385,13.647060,2.740447,2.697924,7.388598,9.880125,7.031935,6.105467,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1554,13.334153,1.149743,2.643018,13.418233,2.633618,2.179185,7.342396,10.097740,6.972051,5.884146,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1555,0.291602,1.594114,1.924040,13.572714,2.631844,2.897184,7.215419,9.986682,6.853566,5.821480,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### atomic_numbers based features

In [None]:
dataset_atomic_numbers_main= pd.DataFrame()


for i in range(len(atoms_list)):

    a1=atoms_list[i]
    atomic_numbers=a1.get_atomic_numbers().flatten()
    atomic_numbers = pd.DataFrame({i: atomic_numbers})
    dataset_atomic_numbers=atomic_numbers.T
    dataset_atomic_numbers_main = pd.concat([dataset_atomic_numbers_main,dataset_atomic_numbers], ignore_index=True)
    
    
# atomic_numbers   
dataset_atomic_numbers_main=dataset_atomic_numbers_main.fillna(0)
# dataset_atomic_numbers_main.to_excel("dataset_position_main.xlsx")
dataset_atomic_numbers_main

<IPython.core.display.Javascript object>

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,79,80,81,82,83,84,85,86,87,88
0,8,8,8,8,8,8,8,8,8,8,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,8,8,8,8,8,8,8,8,8,8,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,8,8,8,8,8,8,8,8,8,8,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,8,8,8,8,8,8,8,8,8,8,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,8,8,8,8,8,8,8,8,8,8,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1552,1,1,8,8,8,8,8,8,8,8,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1553,1,1,8,8,8,8,8,8,8,8,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1554,1,1,8,8,8,8,8,8,8,8,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1555,1,1,8,8,8,8,8,8,8,8,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### masses based features

In [None]:
dataset_masses_numbers_main= pd.DataFrame()


for i in range(len(atoms_list)):

    a1=atoms_list[i]
    atomic_masses=a1.get_masses().flatten()
    atomic_masses = pd.DataFrame({i: atomic_masses})
    dataset_atomic_masses=atomic_masses.T
    dataset_masses_numbers_main = pd.concat([dataset_masses_numbers_main,dataset_atomic_masses], ignore_index=True)
    
    
# atomic_masses   
dataset_masses_numbers_main=dataset_masses_numbers_main.fillna(0)
# dataset_masses_numbers_main.to_excel("dataset_position_main.xlsx")
dataset_masses_numbers_main

<IPython.core.display.Javascript object>

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,79,80,81,82,83,84,85,86,87,88
0,15.999,15.999,15.999,15.999,15.999,15.999,15.999,15.999,15.999,15.999,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,15.999,15.999,15.999,15.999,15.999,15.999,15.999,15.999,15.999,15.999,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,15.999,15.999,15.999,15.999,15.999,15.999,15.999,15.999,15.999,15.999,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,15.999,15.999,15.999,15.999,15.999,15.999,15.999,15.999,15.999,15.999,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,15.999,15.999,15.999,15.999,15.999,15.999,15.999,15.999,15.999,15.999,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1552,1.008,1.008,15.999,15.999,15.999,15.999,15.999,15.999,15.999,15.999,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1553,1.008,1.008,15.999,15.999,15.999,15.999,15.999,15.999,15.999,15.999,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1554,1.008,1.008,15.999,15.999,15.999,15.999,15.999,15.999,15.999,15.999,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1555,1.008,1.008,15.999,15.999,15.999,15.999,15.999,15.999,15.999,15.999,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### cell based features

In [None]:
dataset_cell_main= pd.DataFrame()


for i in range(len(atoms_list)):

    a1=atoms_list[i]
    cell=a1.cell[:].flatten()
    cell = pd.DataFrame({i: cell})
    dataset_cell=cell.T
    dataset_cell_main = pd.concat([dataset_cell_main,dataset_cell], ignore_index=True)
    
    
# cell   
dataset_masses_numbers_main=dataset_cell_main.fillna(0)
# dataset_cell_main.to_excel("dataset_position_main.xlsx")
dataset_cell_main

<IPython.core.display.Javascript object>

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,8.990,0.0,0.0,-0.672979,9.056029,0.0,-0.83506,-0.90911,9.184415
1,8.990,0.0,0.0,-0.672979,9.056029,0.0,-0.83506,-0.90911,9.184415
2,8.990,0.0,0.0,-0.672979,9.056029,0.0,-0.83506,-0.90911,9.184415
3,8.990,0.0,0.0,-0.672979,9.056029,0.0,-0.83506,-0.90911,9.184415
4,8.990,0.0,0.0,-0.672979,9.056029,0.0,-0.83506,-0.90911,9.184415
...,...,...,...,...,...,...,...,...,...
1552,13.738,0.0,0.0,1.608014,13.643567,0.0,0.00000,0.00000,7.542000
1553,13.738,0.0,0.0,1.608014,13.643567,0.0,0.00000,0.00000,7.542000
1554,13.738,0.0,0.0,1.608014,13.643567,0.0,0.00000,0.00000,7.542000
1555,13.738,0.0,0.0,1.608014,13.643567,0.0,0.00000,0.00000,7.542000


In [None]:
# position=a1.get_positions().flatten()
# position


# atomic_num=a1.get_atomic_numbers().flatten()
# atomic_num

# atomic_mass=a1.get_masses().flatten()
# atomic_mass

# cell=a1.cell[:].flatten()
# cell

<IPython.core.display.Javascript object>

In [None]:
# dataset_position = pd.DataFrame({'position': position})
# dataset_position=dataset_position.T
# dataset_position


# dataset_atomic_num = pd.DataFrame({'atomic_num': atomic_num})
# dataset_atomic_num=dataset_atomic_num.T

# dataset_atomic_mass = pd.DataFrame({'atomic_mass': atomic_mass})
# dataset_atomic_mass=dataset_atomic_mass.T

# dataset_cell = pd.DataFrame({'cell': cell})
# dataset_cell=dataset_cell.T



<IPython.core.display.Javascript object>

### Main Dataset_1

In [None]:
frames = [dataset_position_main, dataset_atomic_numbers_main, dataset_masses_numbers_main, dataset_cell_main,  dataset_atoms_data]

Main_Dataset_1 = pd.concat(frames, axis=1)
Main_Dataset_1

<IPython.core.display.Javascript object>

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1.1,2.1,3.1,4.1,5.1,6.1,7.1,8.1,idx,iso
0,-0.731503,5.107745,7.804457,4.635843,0.100921,7.920823,6.779093,1.000704,5.444429,7.493853,...,0.0,0.0,-0.672979,9.056029,0.0,-0.83506,-0.90911,9.184415,25,486.448614
1,-0.323777,4.821551,7.746136,4.898302,-0.063683,7.901260,6.709995,0.827481,5.489984,7.244149,...,0.0,0.0,-0.672979,9.056029,0.0,-0.83506,-0.90911,9.184415,25,488.819630
2,-0.468969,5.162387,7.428079,5.039972,-0.068103,7.805926,6.633251,0.942144,5.644466,7.145261,...,0.0,0.0,-0.672979,9.056029,0.0,-0.83506,-0.90911,9.184415,25,493.750349
3,-0.707422,5.049390,7.822183,4.715939,0.188620,7.775893,6.955133,0.926423,5.381516,7.571735,...,0.0,0.0,-0.672979,9.056029,0.0,-0.83506,-0.90911,9.184415,25,489.565191
4,-0.589937,5.085590,7.623891,4.551480,0.303018,7.840643,6.759740,0.933135,5.496689,7.634863,...,0.0,0.0,-0.672979,9.056029,0.0,-0.83506,-0.90911,9.184415,25,487.256318
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1552,13.856194,1.504067,2.479810,13.562707,2.970068,2.373920,7.225627,9.970719,6.743151,5.964682,...,0.0,0.0,1.608014,13.643567,0.0,0.00000,0.00000,7.542000,54,493.972531
1553,13.404355,1.282632,2.246385,13.647060,2.740447,2.697924,7.388598,9.880125,7.031935,6.105467,...,0.0,0.0,1.608014,13.643567,0.0,0.00000,0.00000,7.542000,54,493.112637
1554,13.334153,1.149743,2.643018,13.418233,2.633618,2.179185,7.342396,10.097740,6.972051,5.884146,...,0.0,0.0,1.608014,13.643567,0.0,0.00000,0.00000,7.542000,54,492.038928
1555,0.291602,1.594114,1.924040,13.572714,2.631844,2.897184,7.215419,9.986682,6.853566,5.821480,...,0.0,0.0,1.608014,13.643567,0.0,0.00000,0.00000,7.542000,54,497.101677


# Preprocessing the dataset

In [None]:
### cleaning(dataset) :: Cleaning the dataset 

### function created in order to take only numerical features and  remove inf, nan values


def cleaning(dataset):

    print("Shape of dataset before Cleaning", dataset.shape)
    print("Datatypes in this dataset", dataset.dtypes)
    
    df1=dataset.select_dtypes(include=['float64','int64']) # taking only the Columns that contain Numerical Values
    
    df2=df1.loc[:,~df1.columns.duplicated()]  # removing duplicate Columns 
    
    df3=df2.replace([np.inf, -np.inf], np.nan).dropna(axis=1)
    
    
    print("\nShape of dataset after Cleaning", df3.shape)
    print("Datatypes in this dataset", df3.dtypes)
    return df3





## variance(dataset,threshold_value)

# function created in order to remove features that are constant 

# parameter1 -- dataset 
# parameter2 -- threshold_value (range between 0 to 1)


def variance(dataset,threshold_value):
    
    
    
    from sklearn.feature_selection import VarianceThreshold
    varModel =VarianceThreshold(threshold=threshold_value) #Setting variance threshold to 0 which means features that have same value in all samples.
    varModel.fit(dataset)

    constArr=varModel.get_support()  #get_support() return True and False value for each feature.
    #print(constArr)
    #True: Not a constant feature
    #False: Constant feature(It contains same value in all samples.)

    #To find total number of constant and non constant features we will be using collections.Counter function.
    print("Counter for Dataset",collections.Counter(constArr))
    
    constCol=[col for col in dataset.columns if col not in dataset.columns[constArr]]
    print("Constant feature for Dataset \n \n ",constCol)
    
    print("\nShape of  dataset before and after removing  Features of variance  :: ", threshold_value)

    print('Shape before drop-->',dataset.shape)

    dataset.drop(columns=constCol,axis=1,inplace=True)
    print('Shape after drop-->',dataset.shape)
    
    return dataset

    
    

    
# ### duplicateColumns(dataset)

# function created in order to remove duplicate columns of labels
# datasets contain one or more features that show the same values across all the observations. This means that both features are in essence identical

#The method will find the duplicate columns and return name of duplicated columns in an array
def duplicateColumns(dataset):
    dupliCols=[]
    for i in range(0,len(dataset.columns)):
        col1=dataset.columns[i]
        for col2 in dataset.columns[i+1:]:
            if dataset[col1].equals(dataset[col2]):
                dupliCols.append(col1+','+col2)
                
    print('\n \n \n # Total Duplicated columns in  dataset ::',len(dupliCols))
    print("\n\n # Duplicate in  dataset\n\n",dupliCols)
    
    
    #Get the duplicate column names for  Dataset
    dCols =[col.split(',')[1] for col in dupliCols]
    print("\n # Duplicate Columns \n \n ", dCols)
    
    #Find the count of unique columns
    print("\n # Length of Unique Columns for  dataset :: ", len(set(dCols)))
    
    print('\n # Shape before droping duplicate columns for  dataset -->',dataset.shape)
    dataset = dataset.drop(columns=dCols,axis=1)
    print('\n # Shape after droping duplicate columns  dataset-->',dataset.shape)
    
    return dataset

    
    
    
    

### Mul_correlation(dataset,threshold)

# function created to remove Multicollinearity

def Mul_correlation(dataset,threshold):
    col_corr=set() # set will contains unique values.
    corr_matrix=dataset.corr() #finding the correlation between columns.
    for i in range(len(corr_matrix.columns)): #number of columns
        for j in range(i):
            if abs(corr_matrix.iloc[i,j])>threshold: #checking the correlation between columns.
                colName=corr_matrix.columns[i] #getting the column name
                col_corr.add(colName) #adding the correlated column name heigher than threshold value.
                
    
    print("\n \n # Length Correlated columns for  Dataset:", len(col_corr))
    print('\n # Correlated columns for  Dataset:\n \n ',col_corr) 
    
    print('\n # Shape before droping Correlated duplicate columns for  dataset-->',dataset.shape)
    dataset=dataset.drop(columns=col_corr,axis=1)
    print('\n # Shape after droping Correlated duplicate columns for Original dataset-->',dataset.shape)
    
    return dataset

#     return col_corr #returning set of column names





### Correlation 

# finding the correlated labels with target

# Original Datasets
#Correlation with output variable
def rel_feature(dataset, target, thres, corr_method):
    
    cor = dataset.corr(method=corr_method)
    cor_target = abs(cor[target])
    #Selecting highly correlated features
    Relevant_Features_Original = cor_target[cor_target>thres]
    print("Relevant_Features_Original \n \n ", Relevant_Features_Original, '\n' )

    Relevant_Features =dataset.loc[:, abs(dataset.corr(method=corr_method)[target]) > thres]
    return Relevant_Features


# rel_feature(dataset, target, thres)

# corr_method = {‘pearson’, ‘kendall’, ‘spearman’} 


## Clean(dataset) --  function to clean dataset at once

def Clean(dataset):
    df1=cleaning(dataset)
    df2=variance(df1,0)
    df3=variance(df2,0.1)
    df4=duplicateColumns(df3)
    df5=Mul_correlation(df4,0.9)
    
    
    return df5  

<IPython.core.display.Javascript object>

In [None]:
df=Main_Dataset_1.copy()


target = df.iso
features = df.drop('iso', axis = 1)

<IPython.core.display.Javascript object>

In [None]:
features

<IPython.core.display.Javascript object>

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,0.1,1.1,2.1,3.1,4.1,5.1,6.1,7.1,8.1,idx
0,-0.731503,5.107745,7.804457,4.635843,0.100921,7.920823,6.779093,1.000704,5.444429,7.493853,...,8.990,0.0,0.0,-0.672979,9.056029,0.0,-0.83506,-0.90911,9.184415,25
1,-0.323777,4.821551,7.746136,4.898302,-0.063683,7.901260,6.709995,0.827481,5.489984,7.244149,...,8.990,0.0,0.0,-0.672979,9.056029,0.0,-0.83506,-0.90911,9.184415,25
2,-0.468969,5.162387,7.428079,5.039972,-0.068103,7.805926,6.633251,0.942144,5.644466,7.145261,...,8.990,0.0,0.0,-0.672979,9.056029,0.0,-0.83506,-0.90911,9.184415,25
3,-0.707422,5.049390,7.822183,4.715939,0.188620,7.775893,6.955133,0.926423,5.381516,7.571735,...,8.990,0.0,0.0,-0.672979,9.056029,0.0,-0.83506,-0.90911,9.184415,25
4,-0.589937,5.085590,7.623891,4.551480,0.303018,7.840643,6.759740,0.933135,5.496689,7.634863,...,8.990,0.0,0.0,-0.672979,9.056029,0.0,-0.83506,-0.90911,9.184415,25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1552,13.856194,1.504067,2.479810,13.562707,2.970068,2.373920,7.225627,9.970719,6.743151,5.964682,...,13.738,0.0,0.0,1.608014,13.643567,0.0,0.00000,0.00000,7.542000,54
1553,13.404355,1.282632,2.246385,13.647060,2.740447,2.697924,7.388598,9.880125,7.031935,6.105467,...,13.738,0.0,0.0,1.608014,13.643567,0.0,0.00000,0.00000,7.542000,54
1554,13.334153,1.149743,2.643018,13.418233,2.633618,2.179185,7.342396,10.097740,6.972051,5.884146,...,13.738,0.0,0.0,1.608014,13.643567,0.0,0.00000,0.00000,7.542000,54
1555,0.291602,1.594114,1.924040,13.572714,2.631844,2.897184,7.215419,9.986682,6.853566,5.821480,...,13.738,0.0,0.0,1.608014,13.643567,0.0,0.00000,0.00000,7.542000,54


In [None]:
features= Clean(features)
features

<IPython.core.display.Javascript object>

Shape of dataset before Cleaning (1557, 375)
Datatypes in this dataset 0      float64
1      float64
2      float64
3      float64
4      float64
        ...   
5      float64
6      float64
7      float64
8      float64
idx      int64
Length: 375, dtype: object

Shape of dataset after Cleaning (1557, 268)
Datatypes in this dataset 0      float64
1      float64
2      float64
3      float64
4      float64
        ...   
263    float64
264    float64
265    float64
266    float64
idx      int64
Length: 268, dtype: object
Counter for Dataset Counter({True: 268})
Constant feature for Dataset 
 
  []

Shape of  dataset before and after removing  Features of variance  ::  0
Shape before drop--> (1557, 268)
Shape after drop--> (1557, 268)
Counter for Dataset Counter({True: 268})
Constant feature for Dataset 
 
  []

Shape of  dataset before and after removing  Features of variance  ::  0.1
Shape before drop--> (1557, 268)
Shape after drop--> (1557, 268)

 
 
 # Total Duplicated columns in  d

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,194,196,205,213,216,226,237,242,243,245
0,-0.731503,5.107745,7.804457,4.635843,0.100921,7.920823,6.779093,1.000704,5.444429,7.493853,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
1,-0.323777,4.821551,7.746136,4.898302,-0.063683,7.901260,6.709995,0.827481,5.489984,7.244149,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
2,-0.468969,5.162387,7.428079,5.039972,-0.068103,7.805926,6.633251,0.942144,5.644466,7.145261,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
3,-0.707422,5.049390,7.822183,4.715939,0.188620,7.775893,6.955133,0.926423,5.381516,7.571735,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
4,-0.589937,5.085590,7.623891,4.551480,0.303018,7.840643,6.759740,0.933135,5.496689,7.634863,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1552,13.856194,1.504067,2.479810,13.562707,2.970068,2.373920,7.225627,9.970719,6.743151,5.964682,...,7.130358,13.535510,8.515632,14.672404,6.714327,6.370318,0.0,0.0,0.0,0.0
1553,13.404355,1.282632,2.246385,13.647060,2.740447,2.697924,7.388598,9.880125,7.031935,6.105467,...,7.108486,13.534691,8.484116,14.654799,6.723550,6.450678,0.0,0.0,0.0,0.0
1554,13.334153,1.149743,2.643018,13.418233,2.633618,2.179185,7.342396,10.097740,6.972051,5.884146,...,7.204948,13.627604,8.575255,14.599416,6.834753,6.479057,0.0,0.0,0.0,0.0
1555,0.291602,1.594114,1.924040,13.572714,2.631844,2.897184,7.215419,9.986682,6.853566,5.821480,...,7.271468,0.000000,8.723697,1.066539,6.796779,6.335527,0.0,0.0,0.0,0.0


### Main_dataset_2

In [None]:
Main_Dataset_2 = pd.concat([features,target], axis=1)
Main_Dataset_2

<IPython.core.display.Javascript object>

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,196,205,213,216,226,237,242,243,245,iso
0,-0.731503,5.107745,7.804457,4.635843,0.100921,7.920823,6.779093,1.000704,5.444429,7.493853,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,486.448614
1,-0.323777,4.821551,7.746136,4.898302,-0.063683,7.901260,6.709995,0.827481,5.489984,7.244149,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,488.819630
2,-0.468969,5.162387,7.428079,5.039972,-0.068103,7.805926,6.633251,0.942144,5.644466,7.145261,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,493.750349
3,-0.707422,5.049390,7.822183,4.715939,0.188620,7.775893,6.955133,0.926423,5.381516,7.571735,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,489.565191
4,-0.589937,5.085590,7.623891,4.551480,0.303018,7.840643,6.759740,0.933135,5.496689,7.634863,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,487.256318
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1552,13.856194,1.504067,2.479810,13.562707,2.970068,2.373920,7.225627,9.970719,6.743151,5.964682,...,13.535510,8.515632,14.672404,6.714327,6.370318,0.0,0.0,0.0,0.0,493.972531
1553,13.404355,1.282632,2.246385,13.647060,2.740447,2.697924,7.388598,9.880125,7.031935,6.105467,...,13.534691,8.484116,14.654799,6.723550,6.450678,0.0,0.0,0.0,0.0,493.112637
1554,13.334153,1.149743,2.643018,13.418233,2.633618,2.179185,7.342396,10.097740,6.972051,5.884146,...,13.627604,8.575255,14.599416,6.834753,6.479057,0.0,0.0,0.0,0.0,492.038928
1555,0.291602,1.594114,1.924040,13.572714,2.631844,2.897184,7.215419,9.986682,6.853566,5.821480,...,0.000000,8.723697,1.066539,6.796779,6.335527,0.0,0.0,0.0,0.0,497.101677


<IPython.core.display.Javascript object>