## Importation

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from mendeleev import element

from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MaxAbsScaler

import warnings
warnings.filterwarnings('ignore')

## Opening from text file - HALIDES

In [None]:
# Load the halide table; read_table already returns a DataFrame.
df = pd.read_table("ICSD-halides.txt")

In [None]:
# Rename the "Unnamed: 4" column to the label column, initialise every label
# to 0, then replace missing cells with '-'.
df = df.rename(columns={"Unnamed: 4": "Perovskite_label"})
df["Perovskite_label"] = 0
df = df.fillna('-')

### The ones that are perovskites

In [None]:
# Set Perovskite_label to 1 for every row whose StructureType contains
# "Perovskite" or "perovskite".
# The write goes through a boolean mask with .loc so that df itself is
# modified; chained assignment (df.Perovskite_label[l] = 1) writes to a
# temporary Series and has no effect on df under pandas copy-on-write.
is_perovskite = df['StructureType'].astype(str).str.contains(
    r'[Pp]erovskite', regex=True
)
df.loc[is_perovskite, 'Perovskite_label'] = 1

### Get rid of COMPLEX

In [None]:
# Row labels of the entries whose formula contains a '.'.
a = [l for l in range(len(df)) if '.' in df.StructuredFormula[l]]

# Remove those rows, put the rows labelled 1 first, and renumber from 0.
df_complex = (
    df.drop(a, axis=0)
    .sort_values(by=['Perovskite_label'], ascending=False)
    .reset_index(drop=True)
)

### Get rid of DUPLICATE taking into account polymorphism

In [None]:
# Keep only the first row of each StructuredFormula and renumber from 0.
# df_complex is sorted with Perovskite_label == 1 rows first, so a formula
# that appears with both labels is kept with label 1.
# drop_duplicates replaces the previous hand-written scan, which tested
# membership in a growing list (quadratic in the number of rows).
df_complex_multiplicate = (
    df_complex
    .drop_duplicates(subset='StructuredFormula', keep='first')
    .reset_index(drop=True)
)

## Code - FINISHED and now extraction

In [None]:
# Write only the formula and its label to CSV, without the row index.
export_columns = ['StructuredFormula', 'Perovskite_label']
df_complex_multiplicate[export_columns].to_csv('Parsed-halides.csv', index=False)


## DataFrame for the Oxide Perovskite

In [None]:
# Read the three oxide tables and stack them into one DataFrame.
# read_table already returns a DataFrame, so no extra wrapping is needed.
dfo1 = pd.read_table("ICSD-oxides-st.txt")
dfo2 = pd.read_table("ICSD-oxides-2nd.txt")
dfo3 = pd.read_table("ICSD-oxides-3nd.txt")

oxide_parts = [dfo1, dfo2, dfo3]
dfoxide = pd.concat(oxide_parts)

In [None]:
# Rename the "Unnamed: 9" column to the label column, initialise every label
# to 0, replace missing cells with '-', and renumber the concatenated rows
# from 0.
dfoxide = dfoxide.rename(columns={"Unnamed: 9": "Perovskite_label"})
dfoxide["Perovskite_label"] = 0
dfoxide = dfoxide.fillna('-').reset_index(drop=True)

## Labelling

In [None]:
# Set Perovskite_label to 1 for every row whose StructureType contains
# "Perovskite" or "perovskite".
# The write goes through a boolean mask with .loc so that dfoxide itself is
# modified; chained assignment (dfoxide.Perovskite_label[l] = 1) writes to a
# temporary Series and has no effect on dfoxide under pandas copy-on-write.
is_oxide_perovskite = dfoxide['StructureType'].astype(str).str.contains(
    r'[Pp]erovskite', regex=True
)
dfoxide.loc[is_oxide_perovskite, 'Perovskite_label'] = 1

### Get rid of COMPLEX

In [None]:
# Row labels of the oxide entries whose formula contains a '.'.
aoxide = [
    l for l in range(len(dfoxide)) if '.' in dfoxide.StructuredFormula[l]
]

# Remove those rows, put the rows labelled 1 first, and renumber from 0.
dfoxide_complex = (
    dfoxide.drop(aoxide, axis=0)
    .sort_values(by=['Perovskite_label'], ascending=False)
    .reset_index(drop=True)
)

### Get rid of DUPLICATE taking into account polymorphism

In [None]:
# Keep only the first row of each StructuredFormula and renumber from 0.
# dfoxide_complex is sorted with Perovskite_label == 1 rows first, so a
# formula that appears with both labels is kept with label 1.
# drop_duplicates replaces the previous hand-written scan, which tested
# membership in a growing list (quadratic in the number of rows).
dfoxide_complex_multiplicate = (
    dfoxide_complex
    .drop_duplicates(subset='StructuredFormula', keep='first')
    .reset_index(drop=True)
)

# Remove parentheses from the formulas. regex=False makes '(' and ')' literal
# characters regardless of the pandas version's default for `regex`.
# NOTE(review): formulas that differ only by parentheses become identical
# after this step but are not de-duplicated again - confirm this is intended.
dfoxide_complex_multiplicate['StructuredFormula'] = (
    dfoxide_complex_multiplicate['StructuredFormula']
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)


## Code - FINISHED and now extraction

In [None]:
# Write only the formula and its label to CSV, without the row index.
export_columns = ['StructuredFormula', 'Perovskite_label']
dfoxide_complex_multiplicate[export_columns].to_csv('Parsed-oxides.csv', index=False)


In [None]:
# Build the four-token dataset: split every formula on whitespace and keep
# only the compounds made of exactly four tokens.
# An explicit copy is taken so the columns added below belong to NN itself
# rather than to a slice of dfoxide_complex_multiplicate.
NN = dfoxide_complex_multiplicate[['StructuredFormula', 'Perovskite_label']].copy()

atom_columns = ['Atom%d' % k for k in range(1, 11)]
atoms = NN['StructuredFormula'].str.split(expand=True)
# Force exactly ten token columns: missing ones are added as NaN and any
# beyond the tenth are discarded. The previous direct assignment raised
# whenever the split did not produce exactly ten columns.
atoms = atoms.reindex(columns=range(len(atom_columns)))
atoms.columns = atom_columns
NN = NN.join(atoms)

# Exactly four tokens <=> Atom4 is present and Atom5 is missing.
# isna/notna treat both None and NaN as missing, unlike `== None`.
has_four_atoms = NN['Atom4'].notna() & NN['Atom5'].isna()
NN = NN[has_four_atoms]

# Independent copy, so later column edits on NN_final do not act on a view.
NN_final = NN[['Atom1', 'Atom2', 'Atom3', 'Atom4', 'Perovskite_label']].copy()



In [None]:
# Strip the digits from every atom token, leaving what remains of the token
# (e.g. "O3" -> "O").
# regex=True is required: with pandas >= 2 the default is a literal replace,
# so the pattern '\d+' would match nothing and the digits would stay.
NN_final = NN_final.copy()  # make sure we write to our own frame, not a slice
for atom_col in ['Atom1', 'Atom2', 'Atom3', 'Atom4']:
    NN_final[atom_col] = NN_final[atom_col].str.replace(r'\d+', '', regex=True)

NN_final

## From the dataframe to the NumPy array of features. Each compound is described by 4 features, each one being the atomic number (Z) of one of its atoms. The array also contains the label.


In [None]:
# Replace each element symbol in Atom1..Atom4 by its atomic number.
# Each distinct symbol is looked up once with mendeleev's element() and the
# result reused for every row, instead of one lookup per cell.
# Whole columns are assigned at once; the previous chained assignment
# (NN_final.Atom1.iloc[l] = ...) writes to a temporary Series and has no
# effect on NN_final under pandas copy-on-write.
atom_cols = ['Atom1', 'Atom2', 'Atom3', 'Atom4']
NN_final = NN_final.copy()  # make sure we write to our own frame, not a slice

symbols = pd.unique(NN_final[atom_cols].to_numpy().ravel())
atomic_number_of = {symbol: element(symbol).atomic_number for symbol in symbols}

for atom_col in atom_cols:
    NN_final[atom_col] = NN_final[atom_col].map(atomic_number_of)
    

In [None]:
# Shuffle the rows and convert the table to a NumPy array
# (columns: Atom1..Atom4, then Perovskite_label).
# The shuffle is seeded so that the row order - and therefore the seeded
# train/test splits computed from Features - is the same on every run.
NN_4atoms = NN_final.sample(frac=1, random_state=1)

Features = NN_4atoms.to_numpy()

## DOING THE NN

In [None]:
# Train one MLP on the four atomic-number features and report its accuracy.
# `normalize` is imported here because the notebook never binds the bare name
# `sklearn`, so `sklearn.preprocessing.normalize` raised a NameError.
from sklearn.preprocessing import normalize

X = Features[:, 0:4]
y = Features[:, -1].astype('int')

# Scale every sample (row) to unit L2 norm.
X = normalize(X, norm='l2', axis=1, copy=True, return_norm=False)

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y,
                                                    random_state=1)

# Fit the scaler on the training split only, then apply it to both splits.
# Previously the transformed array was discarded, so no scaling took place.
transformer = MaxAbsScaler().fit(X_train)
X_train = transformer.transform(X_train)
X_test = transformer.transform(X_test)

# Note: scikit-learn documents `learning_rate` as used only by solver='sgd',
# so it has no effect with 'lbfgs'.
clf = MLPClassifier(random_state=1, max_iter=1750, activation='relu', solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(100,150,50),learning_rate='adaptive').fit(X_train, y_train)
# Alternative configuration kept from the original cell:
#clf = MLPClassifier(random_state=1, max_iter=360, activation='relu', solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(100,150,50,100),learning_rate='adaptive').fit(X_train, y_train)

clf.predict_proba(X_test[:1])

clf.predict(X_test[:, :])

print(clf.score(X_test, y_test))
# Score on the full data set (training rows included), scaled like the
# data the model was fitted on.
clf.score(transformer.transform(X), y)

In [None]:
# Measure test accuracy as a function of max_iter (20, 30, ..., 1990).
# `normalize` is imported here because the notebook never binds the bare name
# `sklearn`, so `sklearn.preprocessing.normalize` raised a NameError.
from sklearn.preprocessing import normalize

X = Features[:, 0:4]
y = Features[:, -1].astype('int')

# Scale every sample (row) to unit L2 norm.
X = normalize(X, norm='l2', axis=1, copy=True, return_norm=False)

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y,
                                                    random_state=1)

# Fit the scaler on the training split only, then apply it to both splits.
# Previously the transformed array was discarded, so no scaling took place.
transformer = MaxAbsScaler().fit(X_train)
X_train = transformer.transform(X_train)
X_test = transformer.transform(X_test)

# score[k] is the test accuracy obtained with max_iter = 20 + 10 * k.
score = []
for l in range(20, 2000, 10):
    clf = MLPClassifier(random_state=1, max_iter=l, activation='relu', solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(20,20,20,20),learning_rate='adaptive').fit(X_train, y_train)
    score.append(clf.score(X_test, y_test))


In [None]:
# Plot test accuracy against max_iter and report the best setting.
# The sweep starts at max_iter=20 in steps of 10, so entry k of `score`
# belongs to max_iter = 20 + 10 * k. Plotting against these values makes the
# x-axis match its label (it previously showed the list position instead).
max_iters = [20 + 10 * k for k in range(len(score))]

plt.plot(max_iters, score)
plt.ylabel('score')
plt.xlabel('epochs or max_iter')
plt.show()


best_score = max(score)
print(best_score)
# max_iter value that produced the best score (first one in case of ties)
max_iters[score.index(best_score)]

In [None]:
# Try every combination of three hidden-layer widths and record the test
# accuracy of each fitted network.
import itertools

from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

Features = NN_4atoms.to_numpy()

# score[t] is the accuracy of trial t; layer[0][t], layer[1][t], layer[2][t]
# are the three hidden-layer widths used in that trial.
# (`layer` was previously never defined, and `score` was initialised with the
# three empty lists intended for it.)
score = []
layer = [[], [], []]

X = Features[:, 0:4]
y = Features[:, -1].astype('int')

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=1)

# Fit the scaler on the training split and apply it to both splits; the
# scaled training data was previously computed but never used.
max_abs_scaler = preprocessing.MaxAbsScaler()
X_train_maxabs = max_abs_scaler.fit_transform(X_train)
X_test_maxabs = max_abs_scaler.transform(X_test)

# Same iteration order as three nested loops over l, i, g (l outermost).
widths = range(20, 1000, 30)
for l, i, g in itertools.product(widths, repeat=3):
    clf = MLPClassifier(random_state=1, max_iter=240, activation='relu', solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(l,i,g),learning_rate='adaptive').fit(X_train_maxabs, y_train)

    score.append(clf.score(X_test_maxabs, y_test))
    layer[0].append(l)
    layer[1].append(i)
    layer[2].append(g)

In [None]:
# Plot the accuracy of every recorded trial.
# Only numeric entries are plotted: any placeholder lists stored in `score`
# are skipped explicitly, instead of relying on a fixed offset that also
# hides real trials when `score` holds nothing but numbers.
trial_scores = [s for s in score if not isinstance(s, list)]

plt.plot(range(len(trial_scores)), trial_scores)
plt.ylabel('score')
plt.xlabel('trials')
plt.show()



In [None]:
# Fit one MLP and draw its ROC curve on the test split.
# roc_curve and auc are imported here because the notebook never imports them.
from sklearn.metrics import roc_curve, auc

X = Features[:, 0:4]
y = Features[:, -1].astype('int')

# Number of distinct labels (previously this held y.shape, a tuple).
n_classes = np.unique(y).size

# Row-wise L2 normalisation is not applied in this cell (the call was
# commented out in the original).

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y,
                                                    random_state=1)

# Fit the scaler on the training split only, then apply it to both splits.
# Previously the transformed array was discarded, so no scaling took place.
transformer = MaxAbsScaler().fit(X_train)
X_train = transformer.transform(X_train)
X_test = transformer.transform(X_test)

roc_max_iter = 240
roc_clf = MLPClassifier(random_state=1, max_iter=roc_max_iter, activation='relu', solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(80,50,680),learning_rate='adaptive').fit(X_train, y_train)

# Probability of class 1 from the model fitted above. The curve was
# previously computed from `clf`, a model left over from another cell.
y_score = roc_clf.predict_proba(X_test)[:, 1]

fpr2, tpr2, threshold = roc_curve(y_test, y_score)
roc_auc2 = auc(fpr2, tpr2)

# image drawing
plt.figure()
plt.title('Receiver Operating Characteristic %d iter' % roc_max_iter)
# The legend shows the AUC computed above rather than a hard-coded number.
plt.plot(fpr2, tpr2, label = 'MLP AUC = %0.2f' % roc_auc2)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

In [None]:
# Exhaustive hyper-parameter search for the (80, 50, 680) network using
# 5-fold cross-validation on the training split.
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

X = Features[:, 0:4]
y = Features[:, -1].astype('int')

X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=11
)

mlp_gs = MLPClassifier(max_iter=240)

# Every combination of the values below is evaluated.
parameter_space = dict(
    hidden_layer_sizes=[(80, 50, 680)],
    activation=['tanh', 'relu', 'logistic'],
    solver=['sgd', 'adam', 'lbfgs'],
    alpha=10.0 ** -np.arange(1, 7),
    learning_rate=["constant", "invscaling", "adaptive"],
    learning_rate_init=10.0 ** -np.arange(1, 6),
    random_state=np.arange(1, 4),
    tol=10.0 ** -np.arange(1, 6),
)

# fit() returns the search object itself, so the calls can be chained.
clf = GridSearchCV(mlp_gs, parameter_space, n_jobs=-1, cv=5).fit(X_train, y_train)

print(clf.score(X_test, y_test))
print(clf.best_estimator_)