<a href="https://colab.research.google.com/github/Estiven-99/Predicci-n-de-Propiedades-Moleculares-/blob/main/Proyecto%20de%20Predicci%C3%B3n%20Final%20Greedy%20Forest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Primero instalamos el cliente de Kaggle y subimos el archivo kaggle.json (el archivo adjunto), que contiene las credenciales necesarias para descargar los datos. Después importamos las librerías que vamos a utilizar.**

In [None]:
# Install (or upgrade) the Kaggle command-line client quietly in the current runtime.
!pip install -U -q kaggle
# Create the per-user Kaggle configuration directory if it does not exist yet.
!mkdir -p ~/.kaggle
# Colab-only helper: prompts for a file upload; kaggle.json is expected here
# because the next cell copies it into ~/.kaggle/.
from google.colab import files
files.upload()

In [None]:
!cp kaggle.json ~/.kaggle/

In [None]:
# Download the competition archive with the Kaggle CLI into the working directory
# (requires the kaggle.json credentials copied into ~/.kaggle/ beforehand).
!kaggle competitions download -c champs-scalar-coupling

In [None]:
!unzip champs-scalar-coupling.zip

In [None]:
# Core numerical / tabular / plotting stack used throughout the notebook.
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn
import warnings
# Silence every warning for the whole session (this also hides deprecation notices).
warnings.filterwarnings('ignore')
from sklearn.metrics import make_scorer,mean_squared_error
from sklearn.model_selection import cross_val_score
import random
# Seeds only Python's built-in `random` module; NumPy's generator is not seeded here.
random.seed(42)
import os
# Leftover from the original Kaggle kernel layout (kept disabled):
#print(os.listdir("../input"))

# **Necesitamos los siguientes datos, los cuales hemos descargado y descomprimido en el directorio de trabajo (en el kernel original de Kaggle se encontraban en la carpeta `../input`)**

In [None]:
# Load every competition table from the working directory (where the archive
# was extracted). The original Kaggle kernel read the same files from ../input/.

# Pair-level tables (one row per atom pair).
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
scalar_coupling_cont = pd.read_csv('scalar_coupling_contributions.csv')

# Molecule-level tables.
pot_energy = pd.read_csv('potential_energy.csv')
dipole_moment = pd.read_csv('dipole_moments.csv')

# Atom-level tables.
mulliken_charges = pd.read_csv('mulliken_charges.csv')
magnetic_shield_tensor = pd.read_csv('magnetic_shielding_tensors.csv')
structures = pd.read_csv('structures.csv')

In [None]:
# Report (rows, columns) for every loaded table, in the original reporting order.
shape_report = [
    ('Shape of potential energy dataset:', pot_energy),
    ('Shape of mulliken_charges dataset:', mulliken_charges),
    ('Shape of train dataset:', train_df),
    ('Shape of scalar coupling contributions dataset:', scalar_coupling_cont),
    ('Shape of test dataset:', test_df),
    ('Shape of magnetic shielding tensors dataset:', magnetic_shield_tensor),
    ('Shape of dipole moments dataset:', dipole_moment),
    ('Shape of structures dataset:', structures),
]
for message, frame in shape_report:
    print(message, frame.shape)

# **Iniciamos la exploración de los datasets**

In [None]:
def summarize_dataset(df, n_rows=6):
    """Print the column dtypes and rounded descriptive statistics of ``df``,
    then render its first ``n_rows`` rows.

    ``display`` is used instead of a bare ``df.head()`` expression because a
    notebook cell only renders its *last* expression; every earlier bare
    ``head()`` call would be silently discarded.
    """
    print('Data Types:\n',df.dtypes)
    print('Descriptive statistics:\n',np.round(df.describe(),3))
    display(df.head(n_rows))


# Same tables, same order as the original step-by-step exploration.
for dataset in (
    pot_energy,              # potential energy
    mulliken_charges,        # Mulliken charges
    train_df,                # training pairs
    scalar_coupling_cont,    # scalar coupling contributions
    test_df,                 # test pairs
    magnetic_shield_tensor,  # magnetic shielding tensors
    structures,              # molecular structures (atom coordinates)
):
    summarize_dataset(dataset)

# **Realizamos un mapa de la estructura atómica y lo probamos**

In [None]:
def map_atom_data(df,atom_idx,structures_df=None):
    """Attach the element and coordinates of one atom of each pair to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Pair table containing ``molecule_name`` and ``atom_index_<atom_idx>``.
    atom_idx : int
        Which atom of the pair to look up; it is used as the suffix of the
        key column and of the new columns.
    structures_df : pandas.DataFrame, optional
        Atom table with columns ``molecule_name``, ``atom_index``, ``atom``,
        ``x``, ``y`` and ``z``. Defaults to the notebook-level ``structures``
        frame, which keeps existing two-argument calls working.

    Returns
    -------
    pandas.DataFrame
        A new frame (``df`` is not modified) with ``atom_<i>``, ``x_<i>``,
        ``y_<i>`` and ``z_<i>`` appended. The join is a left join, so rows
        without a match in the atom table get NaN in the new columns.
    """
    if structures_df is None:
        structures_df=structures
    merged=pd.merge(df,structures_df,how='left',
                    left_on=['molecule_name',f'atom_index_{atom_idx}'],
                    right_on=['molecule_name','atom_index'])
    # The right-hand key duplicates atom_index_<atom_idx>; drop it.
    merged=merged.drop('atom_index',axis=1)
    # Suffix the looked-up columns so both atoms of a pair can coexist.
    return merged.rename(columns={'atom':f'atom_{atom_idx}',
                                  'x':f'x_{atom_idx}',
                                  'y':f'y_{atom_idx}',
                                  'z':f'z_{atom_idx}'})

# Look up both atoms of every pair (index 0, then index 1) in train and test.
for atom_idx in (0, 1):
    train_df = map_atom_data(train_df, atom_idx)
    test_df = map_atom_data(test_df, atom_idx)

In [None]:
# Coordinates of the two atoms of every pair as (n_pairs, 3) arrays.
train_m_0=train_df[['x_0','y_0','z_0']].values
train_m_1=train_df[['x_1','y_1','z_1']].values

test_m_0=test_df[['x_0','y_0','z_0']].values
# Fix: this previously re-read the atom-0 columns, which made the test
# dist_vector identically zero. It must use the atom-1 coordinates.
test_m_1=test_df[['x_1','y_1','z_1']].values


# dist_vector is the Euclidean distance between the two atoms.
# dist_X / dist_Y / dist_Z are the *squared* per-axis coordinate differences.
train_df['dist_vector']=np.linalg.norm(train_m_0-train_m_1,axis=1)
train_df['dist_X']=(train_df['x_0']-train_df['x_1'])**2
train_df['dist_Y']=(train_df['y_0']-train_df['y_1'])**2
train_df['dist_Z']=(train_df['z_0']-train_df['z_1'])**2


# Same features for the test set, computed exactly as for train.
test_df['dist_vector']=np.linalg.norm(test_m_0-test_m_1,axis=1)
test_df['dist_X']=(test_df['x_0']-test_df['x_1'])**2
test_df['dist_Y']=(test_df['y_0']-test_df['y_1'])**2
test_df['dist_Z']=(test_df['z_0']-test_df['z_1'])**2

In [None]:
# Carry the coupling type over as `type_0` (appended as the last column), then
# discard the molecule name and the original `type` column.
train_df = (
    train_df
    .assign(type_0=train_df['type'])
    .drop(columns=['molecule_name', 'type'])
)
display(train_df.head(6))

test_df = (
    test_df
    .assign(type_0=test_df['type'])
    .drop(columns=['molecule_name', 'type'])
)
display(test_df.head(10))

# **Convertimos las variables categóricas y hacemos histogramas de visualización**

In [None]:
# Store the coupling type and both element symbols as pandas categoricals,
# first in the training frame and then in the test frame.
for frame in (train_df, test_df):
    for column in ('type_0', 'atom_0', 'atom_1'):
        frame[column] = frame[column].astype('category')

In [None]:
# Histogram of the regression target over the training pairs.
plt.hist(train_df['scalar_coupling_constant'])
plt.ylabel('No of times')
# Fix: the axis label was misspelled ("copling").
plt.xlabel('scalar coupling constant')
plt.show()

In [None]:
# Histogram of the Euclidean distance between the two atoms of each training pair.
ax = plt.gca()
ax.hist(train_df['dist_vector'])
ax.set_ylabel('No of times')
ax.set_xlabel('Distance vector')
plt.show()

In [None]:
# Histogram of the squared x-coordinate difference (dist_X) over the training pairs.
ax = plt.gca()
ax.hist(train_df['dist_X'])
ax.set_ylabel('No of times')
ax.set_xlabel('X distance vector')
plt.show()

In [None]:
# Histogram of the squared y-coordinate difference (dist_Y) over the training pairs.
ax = plt.gca()
ax.hist(train_df['dist_Y'])
ax.set_ylabel('No of times')
ax.set_xlabel('Y distance vector')
plt.show()

In [None]:
# Histogram of the squared z-coordinate difference (dist_Z) over the training pairs.
ax = plt.gca()
ax.hist(train_df['dist_Z'])
ax.set_ylabel('No of times')
ax.set_xlabel('Z distance vector')
plt.show()

In [None]:
# Preview the first rows of the training frame with the engineered columns.
train_df.head(5)

In [None]:
# Distribution plot of the target on a 10x8 inch figure.
# NOTE(review): the labels are set before plotting; seaborn may replace the
# x label with the Series name - confirm on the rendered figure.
plt.figure(figsize=(10, 8))
ax = plt.gca()
ax.set_ylabel('Frequency')
ax.set_xlabel('scalar coupling constant')
sn.distplot(train_df['scalar_coupling_constant'], ax=ax)

In [None]:
# Distribution plot of the inter-atomic distance on a 10x8 inch figure.
# NOTE(review): the labels are set before plotting; seaborn may replace the
# x label with the Series name - confirm on the rendered figure.
plt.figure(figsize=(10, 8))
ax = plt.gca()
ax.set_ylabel('Frequency')
ax.set_xlabel('Distance vector')
sn.distplot(train_df['dist_vector'], ax=ax)

In [None]:
# Distribution plot of dist_X (squared x difference) on a 10x8 inch figure.
plt.figure(figsize=(10, 8))
ax = plt.gca()
ax.set_ylabel('Frequency')
ax.set_xlabel('dist_X')
sn.distplot(train_df['dist_X'], ax=ax)

In [None]:
# Distribution plot of dist_Y (squared y difference) on a 10x8 inch figure.
plt.figure(figsize=(10, 8))
ax = plt.gca()
ax.set_ylabel('Frequency')
ax.set_xlabel('dist_Y')
sn.distplot(train_df['dist_Y'], ax=ax)

In [None]:
# Distribution plot of dist_Z (squared z difference) on a 10x8 inch figure.
plt.figure(figsize=(10, 8))
ax = plt.gca()
ax.set_ylabel('Frequency')
ax.set_xlabel('dist_Z')
sn.distplot(train_df['dist_Z'], ax=ax)

In [None]:
# Feature columns, in the order in which they are selected.
Attributes = [
    'atom_index_0', 'atom_index_1', 'type_0',
    'x_0', 'y_0', 'z_0', 'atom_0',
    'atom_1', 'x_1', 'y_1', 'z_1',
    'dist_vector', 'dist_X', 'dist_Y', 'dist_Z',
]
# Categorical features among them.
cat_attributes = ['type_0', 'atom_0', 'atom_1']
# Regression target (kept as a one-element list, so y_target is a DataFrame).
target_label = ['scalar_coupling_constant']

X_train = train_df.loc[:, Attributes]
X_test = test_df.loc[:, Attributes]
y_target = train_df.loc[:, target_label]

In [None]:
# One-hot encode the categorical features.
X_train=pd.get_dummies(data=X_train,columns=cat_attributes)
X_test=pd.get_dummies(data=X_test,columns=cat_attributes)
# The two frames are encoded independently, so a category present in only one
# of them would produce different columns. Align test to the training layout:
# missing indicators are added as 0, unseen ones are dropped, order is matched.
X_test=X_test.reindex(columns=X_train.columns,fill_value=0)

In [None]:
# Sanity check: (rows, columns) of the encoded train and test feature matrices.
print(X_train.shape,X_test.shape)

In [None]:
# Keep the current feature matrix and target under extra names.
# These are plain references to the same objects, not copies.
X_1=X_train
y_1=y_target

In [None]:
# A cell only renders its last bare expression, so the feature preview was
# silently discarded. Display both previews explicitly.
display(X_train.head(6))
display(y_target.head(6))

In [None]:
# Absolute-correlation threshold above which a column counts as redundant.
threshold=0.95

# Absolute pairwise correlations between the numeric columns only. Selecting
# them explicitly keeps this working on pandas >= 2.0, where corr() no longer
# drops non-numeric (here: category) columns silently.
corr_matrix=train_df.select_dtypes(include='number').corr().abs()


# Keep the strict upper triangle (k=1 excludes the diagonal) so each pair is
# inspected once; everything else becomes NaN. The builtin `bool` replaces the
# `np.bool` alias, which was removed in NumPy 1.24.
upper=corr_matrix.where(np.triu(np.ones(corr_matrix.shape),k=1).astype(bool))

In [None]:
# A column is flagged when any entry of its upper-triangle correlations
# exceeds the threshold (NaN entries never compare greater).
flagged = (upper > threshold).any(axis=0)
to_drop = flagged[flagged].index.tolist()
print('There are are %d columns to remove.'%(len(to_drop)))

In [None]:
# Never prune the regression target, even if it correlates with a feature.
prunable=[column for column in to_drop if column!='scalar_coupling_constant']
train_df=train_df.drop(columns=prunable)
# to_drop was derived from train_df only; ignore columns that test_df lacks
# instead of failing with a KeyError.
test_df=test_df.drop(columns=prunable,errors='ignore')
print('Training data shape',train_df.shape)
print('Testing data shape',test_df.shape)

In [None]:
# Feature columns, in the order in which they are selected.
Attributes = [
    'atom_index_0', 'atom_index_1', 'type_0',
    'x_0', 'y_0', 'z_0', 'atom_0',
    'atom_1', 'x_1', 'y_1', 'z_1',
    'dist_vector', 'dist_X', 'dist_Y', 'dist_Z',
]
# Categorical features among them.
cat_attributes = ['type_0', 'atom_0', 'atom_1']
# Regression target (kept as a one-element list, so y_target is a DataFrame).
target_label = ['scalar_coupling_constant']

X_train = train_df.loc[:, Attributes]
X_test = test_df.loc[:, Attributes]
y_target = train_df.loc[:, target_label]

In [None]:
# One-hot encode the categorical features.
X_train=pd.get_dummies(data=X_train,columns=cat_attributes)
X_test=pd.get_dummies(data=X_test,columns=cat_attributes)
# The two frames are encoded independently, so a category present in only one
# of them would produce different columns. Align test to the training layout:
# missing indicators are added as 0, unseen ones are dropped, order is matched.
X_test=X_test.reindex(columns=X_train.columns,fill_value=0)

In [None]:
# Sanity check: (rows, columns) of the encoded train and test feature matrices.
print(X_train.shape,X_test.shape)

In [None]:
# Show the (rows, columns) shape of the target frame.
display(y_target.shape)

In [None]:
# Keep the current feature matrix and target under extra names.
# These are plain references to the same objects, not copies.
X_2=X_train
y_2=y_target

In [None]:
# A cell only renders its last bare expression, so the two feature previews
# were silently discarded. Display all three previews explicitly.
display(X_train.head(6))
display(X_test.head(6))
display(y_target.head(6))

# **Comenzamos con el modelo de predicción**

In [None]:
# Feature columns, in the order in which they are selected.
Attributes = [
    'atom_index_0', 'atom_index_1', 'type_0',
    'x_0', 'y_0', 'z_0', 'atom_0',
    'atom_1', 'x_1', 'y_1', 'z_1',
    'dist_vector', 'dist_X', 'dist_Y', 'dist_Z',
]
# Categorical features among them.
cat_attributes = ['type_0', 'atom_0', 'atom_1']
# Regression target (kept as a one-element list, so y_target is a DataFrame).
target_label = ['scalar_coupling_constant']

X_train = train_df.loc[:, Attributes]
X_test = test_df.loc[:, Attributes]
y_target = train_df.loc[:, target_label]

In [None]:
# One-hot encode the categorical features.
X_train=pd.get_dummies(data=X_train,columns=cat_attributes)
X_test=pd.get_dummies(data=X_test,columns=cat_attributes)
# The two frames are encoded independently, so a category present in only one
# of them would produce different columns. Align test to the training layout:
# missing indicators are added as 0, unseen ones are dropped, order is matched.
X_test=X_test.reindex(columns=X_train.columns,fill_value=0)

In [None]:
# Sanity check: (rows, columns) of the encoded train and test feature matrices.
print(X_train.shape,X_test.shape)

In [None]:
# A cell only renders its last bare expression, so the feature preview was
# silently discarded. Display both previews explicitly.
display(X_train.head(6))
display(y_target.head(6))

In [None]:
# Absolute-correlation threshold above which a column counts as redundant.
threshold=0.95

# Absolute pairwise correlations between the numeric columns only. Selecting
# them explicitly keeps this working on pandas >= 2.0, where corr() no longer
# drops non-numeric (here: category) columns silently.
corr_matrix=train_df.select_dtypes(include='number').corr().abs()


# Keep the strict upper triangle (k=1 excludes the diagonal) so each pair is
# inspected once; everything else becomes NaN. The builtin `bool` replaces the
# `np.bool` alias, which was removed in NumPy 1.24.
upper=corr_matrix.where(np.triu(np.ones(corr_matrix.shape),k=1).astype(bool))

In [None]:
# A column is flagged when any entry of its upper-triangle correlations
# exceeds the threshold (NaN entries never compare greater).
flagged = (upper > threshold).any(axis=0)
to_drop = flagged[flagged].index.tolist()
print('There are are %d columns to remove.'%(len(to_drop)))

In [None]:
# Feature columns, in the order in which they are selected.
Attributes = [
    'atom_index_0', 'atom_index_1', 'type_0',
    'x_0', 'y_0', 'z_0', 'atom_0',
    'atom_1', 'x_1', 'y_1', 'z_1',
    'dist_vector', 'dist_X', 'dist_Y', 'dist_Z',
]
# Categorical features among them.
cat_attributes = ['type_0', 'atom_0', 'atom_1']
# Regression target (kept as a one-element list, so y_target is a DataFrame).
target_label = ['scalar_coupling_constant']

X_train = train_df.loc[:, Attributes]
X_test = test_df.loc[:, Attributes]
y_target = train_df.loc[:, target_label]

In [None]:
# One-hot encode the categorical features; these matrices feed the models below.
X_train=pd.get_dummies(data=X_train,columns=cat_attributes)
X_test=pd.get_dummies(data=X_test,columns=cat_attributes)
# The two frames are encoded independently, so a category present in only one
# of them would produce different columns. Align test to the training layout:
# missing indicators are added as 0, unseen ones are dropped, order is matched.
X_test=X_test.reindex(columns=X_train.columns,fill_value=0)

In [None]:

# Sanity check: (rows, columns) of the encoded train and test feature matrices.
print(X_train.shape,X_test.shape)

In [None]:
# Show the (rows, columns) shape of the target frame.
display(y_target.shape)

In [None]:
# Keep the current feature matrix and target under extra names.
# These are plain references to the same objects, not copies.
X_2=X_train
y_2=y_target

In [None]:
# A cell only renders its last bare expression, so the two feature previews
# were silently discarded. Display all three previews explicitly.
display(X_train.head(6))
display(X_test.head(6))
display(y_target.head(6))

# **Algoritmo Regularized Greedy Forest (RGF)**

In [None]:
from rgf.sklearn import RGFRegressor,FastRGFRegressor
from sklearn.metrics import make_scorer,mean_squared_error
from sklearn.model_selection import cross_val_score

In [None]:
# 3-fold cross-validation of a Regularized Greedy Forest regressor.
n_folds = 3
rgf = RGFRegressor(
    max_leaf=500,
    algorithm='RGF_Sib',
    test_interval=100,
    loss='LS',
    verbose=False,
)
# greater_is_better=False makes the scorer return the *negated* MSE, so the
# reported scores are negative and values closer to zero are better.
rgf_scores = cross_val_score(
    estimator=rgf,
    X=X_train,
    y=y_target,
    scoring=make_scorer(mean_squared_error, greater_is_better=False),
    cv=n_folds,
)
# Average of the per-fold scores.
rgf_score = sum(rgf_scores) / n_folds
print('rgf_score:', rgf_score)

In [None]:
# Fit the RGF regressor on the full training set.
# NOTE(review): presumably fit() returns the estimator itself (scikit-learn
# convention), in which case rgf_model and rgf are the same object - confirm.
rgf_model=rgf.fit(X_train,y_target)


In [None]:
# Predict the test set, write the predictions into the sample-submission
# layout and save them as RGF_model.csv (without the index column).
# NOTE(review): assumes the test rows are in the same order as
# sample_submission.csv - confirm.
y_pred = rgf_model.predict(X_test)
SCC = pd.read_csv('sample_submission.csv').assign(scalar_coupling_constant=y_pred)
SCC.to_csv('RGF_model.csv', index=False)
y_pred

In [None]:
# 3-fold cross-validation of a FastRGF regressor.
# An earlier configuration (opt_algorithm='rgf', l2=2000.0, min_child_weight=5.0,
# sparse_max_features=80000, sparse_min_occurences=5) was assigned to Frgf and
# overwritten on the very next line, so it never took part in any fit. The dead
# assignment is removed; the estimator that is actually evaluated is unchanged.
Frgf=FastRGFRegressor(n_estimators=1000)
n_folds=3
# NOTE: this scorer reports the plain (positive) MSE, whereas rgf_score above
# is computed with greater_is_better=False and is therefore negated. Compare
# magnitudes, not signs.
Frgf_scores=cross_val_score(Frgf,X_train,y_target,
                          scoring=make_scorer(mean_squared_error),
                          cv=n_folds)
# Average of the per-fold scores.
Frgf_score=sum(Frgf_scores)/n_folds
print('Frgf_score:',Frgf_score)
print(Frgf)