<a href="https://colab.research.google.com/github/06navarro/Predicci-n-de-propiedades-moleculares/blob/main/propiedades_moleculares.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Primero debemos instalar e importar las librerías que vamos a utilizar. Para obtener los datos se debe subir el archivo kaggle.json (adjunto), que contiene las credenciales de la API de Kaggle.

In [None]:
!pip install -U -q kaggle
!mkdir -p ~/.kaggle
from google.colab import files
files.upload()

In [None]:
!cp kaggle.json ~/.kaggle/

In [None]:
# Download the files of the 'champs-scalar-coupling' competition with the Kaggle client.
!kaggle competitions download -c champs-scalar-coupling

In [None]:
!unzip champs-scalar-coupling.zip

In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import make_scorer,mean_squared_error
from sklearn.model_selection import cross_val_score
import random
random.seed(42)
import os
#print(os.listdir("../input"))

Necesitamos los siguientes datos, los cuales hemos descargado y descomprimido en el directorio de trabajo (en Kaggle se encontrarían en la carpeta ../input).

In [None]:
# Load every competition table from the working directory.
# (When running on Kaggle the same files live under '../input/'.)
pot_energy = pd.read_csv('potential_energy.csv')
mulliken_charges = pd.read_csv('mulliken_charges.csv')
train_df = pd.read_csv('train.csv')
scalar_coupling_cont = pd.read_csv('scalar_coupling_contributions.csv')
test_df = pd.read_csv('test.csv')
magnetic_shield_tensor = pd.read_csv('magnetic_shielding_tensors.csv')
dipole_moment = pd.read_csv('dipole_moments.csv')
structures = pd.read_csv('structures.csv')

In [None]:
# Report the (rows, columns) shape of every loaded table.
for label, frame in [
    ('potential energy', pot_energy),
    ('mulliken_charges', mulliken_charges),
    ('train', train_df),
    ('scalar coupling contributions', scalar_coupling_cont),
    ('test', test_df),
    ('magnetic shielding tensors', magnetic_shield_tensor),
    ('dipole moments', dipole_moment),
    ('structures', structures),
]:
    print(f'Shape of {label} dataset:', frame.shape)

Iniciamos la exploración de los datasets

In [None]:
# Summarise each dataset: column dtypes, rounded descriptive statistics and the first rows.
# A notebook cell only renders its last bare expression, so every preview goes through
# display(); otherwise all previews except the final one would be silently discarded.
datasets_to_explore = [
    ('potential energy', pot_energy),
    ('mulliken charges', mulliken_charges),
    ('train', train_df),
    ('scalar coupling contributions', scalar_coupling_cont),
    ('test', test_df),
    ('magnetic shielding tensors', magnetic_shield_tensor),
    ('structures', structures),
]

for dataset_name, dataset in datasets_to_explore:
    print('Dataset:', dataset_name)
    print('Data Types:\n', dataset.dtypes)
    print('Descriptive statistics:\n', np.round(dataset.describe(), 3))
    display(dataset.head(6))

Realizamos un mapa de la estructura atómica y lo probamos

In [None]:
def map_atom_data(df,atom_idx):
    """Attach the `structures` row of one atom of each pair to `df`.

    Left-joins `df` with the module-level `structures` table on
    (`molecule_name`, `atom_index_{atom_idx}`) and renames the joined
    `atom`, `x`, `y`, `z` columns by appending `_{atom_idx}`.

    Parameters:
        df: frame with `molecule_name` and `atom_index_{atom_idx}` columns.
        atom_idx: which atom of the pair to look up (the notebook uses 0 and 1).

    Returns:
        A new frame with the extra `atom_*`, `x_*`, `y_*`, `z_*` columns.
    """
    suffix = f'_{atom_idx}'
    merged = df.merge(
        structures,
        how='left',
        left_on=['molecule_name', 'atom_index' + suffix],
        right_on=['molecule_name', 'atom_index'],
    )
    # The join key coming from `structures` duplicates atom_index_{atom_idx}.
    renamed_columns = {name: name + suffix for name in ('atom', 'x', 'y', 'z')}
    return merged.drop(columns='atom_index').rename(columns=renamed_columns)

# Attach atom type and coordinates for both atoms of every pair (index 0, then index 1).
for atom_idx in (0, 1):
    train_df = map_atom_data(train_df, atom_idx)

for atom_idx in (0, 1):
    test_df = map_atom_data(test_df, atom_idx)

In [None]:
# Coordinates of the first and second atom of every pair, as (n_rows, 3) arrays.
train_m_0 = train_df[['x_0', 'y_0', 'z_0']].values
train_m_1 = train_df[['x_1', 'y_1', 'z_1']].values

test_m_0 = test_df[['x_0', 'y_0', 'z_0']].values
# Fixed: this previously re-read x_0/y_0/z_0, which made every test distance exactly zero.
test_m_1 = test_df[['x_1', 'y_1', 'z_1']].values

# Euclidean distance between the two atoms, plus the squared difference along each axis.
train_df['dist_vector'] = np.linalg.norm(train_m_0 - train_m_1, axis=1)
train_df['dist_X'] = (train_df['x_0'] - train_df['x_1']) ** 2
train_df['dist_Y'] = (train_df['y_0'] - train_df['y_1']) ** 2
train_df['dist_Z'] = (train_df['z_0'] - train_df['z_1']) ** 2

test_df['dist_vector'] = np.linalg.norm(test_m_0 - test_m_1, axis=1)
test_df['dist_X'] = (test_df['x_0'] - test_df['x_1']) ** 2
test_df['dist_Y'] = (test_df['y_0'] - test_df['y_1']) ** 2
test_df['dist_Z'] = (test_df['z_0'] - test_df['z_1']) ** 2

In [None]:
# Keep the coupling type under the name 'type_0'. A plain column copy gives the same values
# as the previous row-by-row identity apply without calling a Python function per row.
train_df['type_0'] = train_df['type']
test_df['type_0'] = test_df['type']

# Drop the molecule identifier and the original type column, then preview the result.
train_df = train_df.drop(columns=['molecule_name', 'type'])
display(train_df.head(6))

test_df = test_df.drop(columns=['molecule_name', 'type'])
display(test_df.head(10))

Hacemos histogramas de visualización

In [None]:
# Store the three text columns with pandas' 'category' dtype, first in train and then in test.
for frame in (train_df, test_df):
    for col in ('type_0', 'atom_0', 'atom_1'):
        frame[col] = frame[col].astype('category')


In [None]:
# Histogram of the target variable (axis label typo 'copling' corrected).
fig, ax = plt.subplots()
ax.hist(train_df['scalar_coupling_constant'])
ax.set_ylabel('No of times')
ax.set_xlabel('scalar coupling constant')
plt.show()

In [None]:
# Histogram of the distance between the two atoms of each pair.
fig, ax = plt.subplots()
ax.hist(train_df['dist_vector'])
ax.set_ylabel('No of times')
ax.set_xlabel('Distance vector')
plt.show()

In [None]:
# Histogram of the squared x-axis difference between the two atoms.
fig, ax = plt.subplots()
ax.hist(train_df['dist_X'])
ax.set_ylabel('No of times')
ax.set_xlabel('X distance vector')
plt.show()

In [None]:
# Histogram of the squared y-axis difference between the two atoms.
fig, ax = plt.subplots()
ax.hist(train_df['dist_Y'])
ax.set_ylabel('No of times')
ax.set_xlabel('Y distance vector')
plt.show()

In [None]:
# Histogram of the squared z-axis difference between the two atoms.
fig, ax = plt.subplots()
ax.hist(train_df['dist_Z'])
ax.set_ylabel('No of times')
ax.set_xlabel('Z distance vector')
plt.show()

In [None]:
# Preview the first five rows of the training frame (head() defaults to 5 rows).
train_df.head()

In [None]:
# Density histogram with a KDE overlay for the target variable.
# sn.distplot is deprecated in seaborn; histplot(kde=True, stat='density') is its replacement.
fig, ax = plt.subplots(figsize=(10, 8))
sn.histplot(train_df['scalar_coupling_constant'], kde=True, stat='density', bins=50, ax=ax)
# Labels are set after plotting because seaborn assigns its own axis labels when drawing.
ax.set_xlabel('scalar coupling constant')
ax.set_ylabel('Density');

In [None]:
# Density histogram with a KDE overlay for the inter-atomic distance.
# sn.distplot is deprecated in seaborn; histplot(kde=True, stat='density') is its replacement.
fig, ax = plt.subplots(figsize=(10, 8))
sn.histplot(train_df['dist_vector'], kde=True, stat='density', bins=50, ax=ax)
# Labels are set after plotting because seaborn assigns its own axis labels when drawing.
ax.set_xlabel('Distance vector')
ax.set_ylabel('Density');

In [None]:
# Density histogram with a KDE overlay for the squared x-axis difference.
# sn.distplot is deprecated in seaborn; histplot(kde=True, stat='density') is its replacement.
fig, ax = plt.subplots(figsize=(10, 8))
sn.histplot(train_df['dist_X'], kde=True, stat='density', bins=50, ax=ax)
# Labels are set after plotting because seaborn assigns its own axis labels when drawing.
ax.set_xlabel('dist_X')
ax.set_ylabel('Density');

In [None]:
# Density histogram with a KDE overlay for the squared y-axis difference.
# sn.distplot is deprecated in seaborn; histplot(kde=True, stat='density') is its replacement.
fig, ax = plt.subplots(figsize=(10, 8))
sn.histplot(train_df['dist_Y'], kde=True, stat='density', bins=50, ax=ax)
# Labels are set after plotting because seaborn assigns its own axis labels when drawing.
ax.set_xlabel('dist_Y')
ax.set_ylabel('Density');

In [None]:
# Density histogram with a KDE overlay for the squared z-axis difference.
# sn.distplot is deprecated in seaborn; histplot(kde=True, stat='density') is its replacement.
fig, ax = plt.subplots(figsize=(10, 8))
sn.histplot(train_df['dist_Z'], kde=True, stat='density', bins=50, ax=ax)
# Labels are set after plotting because seaborn assigns its own axis labels when drawing.
ax.set_xlabel('dist_Z')
ax.set_ylabel('Density');

In [None]:
# Feature columns, the categorical subset that will be one-hot encoded, and the target.
Attributes = [
    'atom_index_0', 'atom_index_1', 'type_0',
    'x_0', 'y_0', 'z_0', 'atom_0',
    'atom_1', 'x_1', 'y_1', 'z_1',
    'dist_vector', 'dist_X', 'dist_Y', 'dist_Z',
]
cat_attributes = ['type_0', 'atom_0', 'atom_1']
target_label = ['scalar_coupling_constant']

# Split the frames into feature matrices and the target frame.
X_train = train_df[Attributes]
X_test = test_df[Attributes]
y_target = train_df[target_label]

In [None]:
# One-hot encode the categorical columns of both feature matrices (train first, then test).
X_train, X_test = (
    pd.get_dummies(data=features, columns=cat_attributes)
    for features in (X_train, X_test)
)

In [None]:
# Shapes of the encoded train and test feature matrices.
print(f'{X_train.shape} {X_test.shape}')

In [None]:
# Keep references to the current feature matrix and target under separate names.
X_1, y_1 = X_train, y_target

In [None]:
# A cell renders only its last bare expression, so both previews go through display();
# previously the feature preview was silently discarded.
display(X_train.head(6))
display(y_target.head(6))

In [None]:
# Columns whose absolute correlation with an earlier column exceeds this value get dropped.
threshold = 0.95

# Absolute pairwise correlations between the numeric columns only; the category columns
# cannot be correlated and make corr() raise on recent pandas versions.
corr_matrix = train_df.select_dtypes(include=[np.number]).corr().abs()

# Keep the strict upper triangle so each pair of columns is considered once.
# The builtin bool replaces np.bool, which has been removed from NumPy.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

In [None]:
# Columns that correlate above the threshold with at least one earlier column.
to_drop = [column for column in upper.columns if (upper[column] > threshold).any()]
print('There are %d columns to remove.' % len(to_drop))

In [None]:
# Remove the flagged columns from both frames and report the resulting shapes.
train_df, test_df = (frame.drop(columns=to_drop) for frame in (train_df, test_df))
print('Training data shape', train_df.shape)
print('Testing data shape', test_df.shape)

In [None]:
# Candidate feature columns. The correlation filter may have removed some of them from
# train_df/test_df, so only the columns that still exist are selected; indexing with the
# full hard-coded list would raise a KeyError as soon as one column had been dropped.
candidate_attributes = [
    'atom_index_0', 'atom_index_1', 'type_0',
    'x_0', 'y_0', 'z_0', 'atom_0',
    'atom_1', 'x_1', 'y_1', 'z_1',
    'dist_vector', 'dist_X', 'dist_Y', 'dist_Z',
]
candidate_cat_attributes = ['type_0', 'atom_0', 'atom_1']

Attributes = [col for col in candidate_attributes if col in train_df.columns]
cat_attributes = [col for col in candidate_cat_attributes if col in Attributes]
target_label = ['scalar_coupling_constant']

# Split the frames into feature matrices and the target frame.
X_train = train_df[Attributes]
X_test = test_df[Attributes]
y_target = train_df[target_label]

In [None]:
# One-hot encode the categorical columns of both feature matrices (train first, then test).
X_train, X_test = (
    pd.get_dummies(data=features, columns=cat_attributes)
    for features in (X_train, X_test)
)

In [None]:
# Shapes of the encoded train and test feature matrices.
print(f'{X_train.shape} {X_test.shape}')

In [None]:
# Show the (rows, columns) shape of the target frame.
display(y_target.shape)

In [None]:
# Keep references to the current feature matrix and target under separate names.
X_2, y_2 = X_train, y_target

In [None]:
# A cell renders only its last bare expression, so every preview goes through display();
# previously the two feature previews were silently discarded.
display(X_train.head(6))
display(X_test.head(6))
display(y_target.head(6))

Comenzamos con el modelo de predicción

In [None]:
# Candidate feature columns. An earlier correlation filter may have removed some of them
# from train_df/test_df, so only the columns that still exist are selected; indexing with
# the full hard-coded list would raise a KeyError as soon as one column had been dropped.
candidate_attributes = [
    'atom_index_0', 'atom_index_1', 'type_0',
    'x_0', 'y_0', 'z_0', 'atom_0',
    'atom_1', 'x_1', 'y_1', 'z_1',
    'dist_vector', 'dist_X', 'dist_Y', 'dist_Z',
]
candidate_cat_attributes = ['type_0', 'atom_0', 'atom_1']

Attributes = [col for col in candidate_attributes if col in train_df.columns]
cat_attributes = [col for col in candidate_cat_attributes if col in Attributes]
target_label = ['scalar_coupling_constant']

# Split the frames into feature matrices and the target frame.
X_train = train_df[Attributes]
X_test = test_df[Attributes]
y_target = train_df[target_label]

In [None]:
# One-hot encode the categorical columns of both feature matrices (train first, then test).
X_train, X_test = (
    pd.get_dummies(data=features, columns=cat_attributes)
    for features in (X_train, X_test)
)

In [None]:
# Shapes of the encoded train and test feature matrices.
print(f'{X_train.shape} {X_test.shape}')

In [None]:
# A cell renders only its last bare expression, so both previews go through display();
# previously the feature preview was silently discarded.
display(X_train.head(6))
display(y_target.head(6))

In [None]:
# Columns whose absolute correlation with an earlier column exceeds this value get dropped.
threshold = 0.95

# Absolute pairwise correlations between the numeric columns only; the category columns
# cannot be correlated and make corr() raise on recent pandas versions.
corr_matrix = train_df.select_dtypes(include=[np.number]).corr().abs()

# Keep the strict upper triangle so each pair of columns is considered once.
# The builtin bool replaces np.bool, which has been removed from NumPy.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

In [None]:
# Columns that correlate above the threshold with at least one earlier column.
to_drop = [column for column in upper.columns if (upper[column] > threshold).any()]
print('There are %d columns to remove.' % len(to_drop))

In [None]:
# Remove the flagged columns from both frames and report the resulting shapes.
train_df, test_df = (frame.drop(columns=to_drop) for frame in (train_df, test_df))
print('Training data shape', train_df.shape)
print('Testing data shape', test_df.shape)

In [None]:
# Candidate feature columns. The correlation filter may have removed some of them from
# train_df/test_df, so only the columns that still exist are selected; indexing with the
# full hard-coded list would raise a KeyError as soon as one column had been dropped.
candidate_attributes = [
    'atom_index_0', 'atom_index_1', 'type_0',
    'x_0', 'y_0', 'z_0', 'atom_0',
    'atom_1', 'x_1', 'y_1', 'z_1',
    'dist_vector', 'dist_X', 'dist_Y', 'dist_Z',
]
candidate_cat_attributes = ['type_0', 'atom_0', 'atom_1']

Attributes = [col for col in candidate_attributes if col in train_df.columns]
cat_attributes = [col for col in candidate_cat_attributes if col in Attributes]
target_label = ['scalar_coupling_constant']

# Split the frames into feature matrices and the target frame.
X_train = train_df[Attributes]
X_test = test_df[Attributes]
y_target = train_df[target_label]

In [None]:
# One-hot encode the categorical columns of both feature matrices (train first, then test).
X_train, X_test = (
    pd.get_dummies(data=features, columns=cat_attributes)
    for features in (X_train, X_test)
)

In [None]:
# Shapes of the encoded train and test feature matrices.
print(f'{X_train.shape} {X_test.shape}')

In [None]:
# Show the (rows, columns) shape of the target frame.
display(y_target.shape)

In [None]:
# Keep references to the current feature matrix and target under separate names.
X_2, y_2 = X_train, y_target

In [None]:
# A cell renders only its last bare expression, so every preview goes through display();
# previously the two feature previews were silently discarded.
display(X_train.head(6))
display(X_test.head(6))
display(y_target.head(6))

Modelo lineal

In [None]:
from sklearn import linear_model

# Cross-validate an ordinary least-squares model and report the mean squared error
# averaged over the folds.
n_folds = 5
linear_reg = linear_model.LinearRegression()
mse_scorer = make_scorer(mean_squared_error)

lin_reg_score = cross_val_score(
    linear_reg,
    X_train,
    y_target,
    scoring=mse_scorer,
    cv=n_folds,
)
lin_score = sum(lin_reg_score) / n_folds
print('Lin_score:', lin_score)

In [None]:
# Fit on the full training set and report the fitted parameters.
lr_model = linear_reg.fit(X_train, y_target)

# For scikit-learn regressors, score() is the coefficient of determination (R^2),
# not an accuracy, and here it is measured on the training data itself.
score = np.round(lr_model.score(X_train, y_target), 3)
print('R^2 on training data:', score)

model_coeff = np.round(lr_model.coef_, 3)
print('Model coefficients:', model_coeff)
model_intercept = np.round(lr_model.intercept_, 3)
print('Model intercept value:', model_intercept)

In [None]:
from sklearn.metrics import r2_score

# Predict the test set and store the predictions in the sample-submission file layout.
y_pred = lr_model.predict(X_test)

SCC = pd.read_csv('sample_submission.csv')
SCC['scalar_coupling_constant'] = y_pred
SCC.to_csv('Linear_Regression_model.csv', index=False)

y_pred

Modelo de regresión Lasso

In [None]:
#Tiempo de ejecucion de 1 hr
from sklearn import linear_model

# Cross-validate a Lasso model and report the mean squared error averaged over the folds.
n_folds = 5
lasso = linear_model.Lasso(alpha=0.001)
mse_scorer = make_scorer(mean_squared_error)

lasso_fold_scores = cross_val_score(
    lasso,
    X_train,
    y_target,
    scoring=mse_scorer,
    cv=n_folds,
)
lasso_score = sum(lasso_fold_scores) / n_folds
print('lasso_score:', lasso_score)
print(lasso)

In [None]:
# Fit the Lasso model on the full training set.
lasso_model = lasso.fit(X_train, y_target)

# For scikit-learn regressors, score() is the coefficient of determination (R^2),
# not an accuracy, and here it is measured on the training data itself.
score = np.round(lasso_model.score(X_train, y_target), 3)
print('R^2 on training data:', score)

In [None]:
# Predict the test set and store the predictions in the sample-submission file layout.
y_pred = lasso_model.predict(X_test)

SCC = pd.read_csv('sample_submission.csv')
SCC['scalar_coupling_constant'] = y_pred
SCC.to_csv('Lasso_Regression_model.csv', index=False)

y_pred

In [None]:
from sklearn import linear_model

# Cross-validate an ElasticNet model and report the mean squared error averaged over the folds.
n_folds = 5
Elast = linear_model.ElasticNet(alpha=0.008, l1_ratio=0.5, random_state=42)
mse_scorer = make_scorer(mean_squared_error)

elast_fold_scores = cross_val_score(
    Elast,
    X_train,
    y_target,
    scoring=mse_scorer,
    cv=n_folds,
)
Elast_score = sum(elast_fold_scores) / n_folds
print('Elast_score:', Elast_score)
print(Elast)

In [None]:
# Fit the ElasticNet estimator itself. The previous code fitted `linear_reg`, so the
# "ElasticNet" score and submission were really those of plain linear regression.
ElasticNet_model = Elast.fit(X_train, y_target)

# For scikit-learn regressors, score() is the coefficient of determination (R^2),
# not an accuracy, and here it is measured on the training data itself.
score = np.round(ElasticNet_model.score(X_train, y_target), 3)
print('R^2 on training data:', score)

In [None]:
# Predict the test set and store the predictions in the sample-submission file layout.
y_pred = ElasticNet_model.predict(X_test)

SCC = pd.read_csv('sample_submission.csv')
SCC['scalar_coupling_constant'] = y_pred
SCC.to_csv('ElasticNet_Regression_model.csv', index=False)

y_pred

In [None]:
from hyperopt import fmin,hp,tpe,Trials,space_eval,STATUS_OK,STATUS_RUNNING
# hyperopt search space: three fixed settings plus one hp.choice per tuned hyperparameter.
hyper_space = {
    'objective': 'regression',
    'metric': 'mape',
    'boosting': 'gbdt',
    'n_estimators': hp.choice(
        'n_estimators', [100, 250, 450, 600, 850, 1000, 2000, 3000, 4000, 5000]
    ),
    'max_depth': hp.choice('max_depth', [5, 10, 15, 20, 25, 30, 35]),
    'num_leaves': hp.choice('num_leaves', [45, 60, 95, 125, 145, 200]),
    'subsample': hp.choice(
        'subsample', [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
    ),
    'colsample_bytree': hp.choice(
        'colsample_bytree', [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    ),
    'learning_rate': hp.choice('learning_rate', [0.1, 0.2, 0.3, 0.35, 0.4, 0.5]),
    'reg_lambda': hp.choice(
        'reg_lambda', [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    ),
    'reg_alpha': hp.choice(
        'reg_alpha', [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    ),
    'min_child_samples': hp.choice('min_child_samples', [3, 6, 8, 12, 15]),
}

In [None]:
def metric(df,pred):
    """Average over coupling types of the log of the per-type mean absolute error.

    Parameters:
        df: frame with a 'scalar_coupling_constant' column (true values) and a
            'type' column (group label of each row).
        pred: predictions aligned with the rows of `df` (array-like or Series).

    Returns:
        The mean, across the distinct values of 'type', of
        log(max(mean absolute error within the type, 1e-9)).

    Notes:
        The grouping key used to be the nested list [['type']], which pandas does not
        accept as a column name; it is now the 'type' column itself. The function also
        no longer adds a 'diff' column to the caller's frame.
    """
    abs_error = (df['scalar_coupling_constant'] - pred).abs()
    mean_abs_error_by_type = abs_error.groupby(df['type']).mean()
    # Floor at 1e-9 so a perfectly predicted type does not produce log(0) = -inf.
    return np.log(mean_abs_error_by_type.clip(lower=1e-9)).mean()