In [3]:
import numpy as np
import pandas as pd
import time
import sklearn
from sklearn import neural_network
import datetime
import matplotlib.pyplot as plt
import plotly.plotly as py
import plotly.graph_objs as go
import plotly.tools as tls
from plotly.offline import iplot, init_notebook_mode

%load_ext autoreload
%autoreload 2

# Seed NumPy's global random number generator with a fixed value (1).
np.random.seed(1)

import warnings
# Suppress warnings of category FutureWarning for the rest of the session.
warnings.filterwarnings("ignore", category=FutureWarning)

from functools import reduce
# Show up to 1000 columns / rows before pandas truncates the rendered frame.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)
# None means "never truncate cell contents". The legacy spelling -1 is
# deprecated and rejected by current pandas releases.
pd.set_option('display.max_colwidth', None)

In [5]:
# Load the train, test, structures and sample-submission CSV files
def load_dataset(path):
    """Read the four competition CSV files stored in the directory ``path``.

    Parameters
    ----------
    path : str or os.PathLike
        Directory holding ``train.csv``, ``test.csv``, ``structures.csv`` and
        ``sample_submission.csv``. A trailing path separator is optional.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test, structures, submission)`` in that order.
    """
    import os  # local import so this cell does not rely on another cell's imports

    def _read(filename):
        # os.path.join inserts the separator only when it is missing, so both
        # '/data/dir' and '/data/dir/' resolve to the same file.
        return pd.read_csv(os.path.join(path, filename))

    train = _read('train.csv')
    test = _read('test.csv')
    structures = _read('structures.csv')
    submission = _read('sample_submission.csv')
    return train, test, structures, submission

# Load all four tables into module-level names used by the cells below.
# NOTE(review): the directory is a hard-coded absolute path on one machine;
# it must be edited before this notebook can run anywhere else.
train, test, structures, submission = load_dataset(
    path=
    '/Users/ak/Documents/personal/Personal/molecular/champs-scalar-coupling/')

In [None]:
# Merging all the data together
from sklearn import preprocessing

def merging_structure_train_test(data, structures, atom_idx):
    """Left-join the structure row of one atom of each pair onto ``data``.

    Rows are matched on ``molecule_name`` and on ``atom_index_<atom_idx>`` in
    ``data`` against ``atom_index`` in ``structures``. The joined ``atom``,
    ``x``, ``y`` and ``z`` columns are suffixed with ``_<atom_idx>`` and the
    helper ``atom_index`` column from ``structures`` is discarded.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``molecule_name`` and ``atom_index_<atom_idx>``.
    structures : pandas.DataFrame
        Frame containing ``molecule_name``, ``atom_index``, ``atom``, ``x``,
        ``y`` and ``z``.
    atom_idx : int
        Which atom of the pair to look up; used to build the column suffix.

    Returns
    -------
    pandas.DataFrame
        A new frame; neither input is modified.
    """
    suffix = f'_{atom_idx}'
    joined = data.merge(structures,
                        how='left',
                        left_on=['molecule_name', 'atom_index' + suffix],
                        right_on=['molecule_name', 'atom_index'])
    # Map each structure column to its per-atom name, e.g. 'x' -> 'x_0'.
    per_atom_names = {column: column + suffix for column in ('atom', 'x', 'y', 'z')}
    return joined.drop(labels=['atom_index'], axis=1).rename(columns=per_atom_names)


def _attach_pair_distance(data):
    """Join both atoms' coordinates onto ``data`` and reduce them to ``distance``."""
    merged = data
    for atom_idx in (0, 1):
        merged = merging_structure_train_test(merged,
                                              structures=structures,
                                              atom_idx=atom_idx)

    # Euclidean distance between atom 0 and atom 1 of every pair.
    merged['distance'] = np.sqrt(
        (merged['x_0'] - merged['x_1'])**2 +
        (merged['y_0'] - merged['y_1'])**2 +
        (merged['z_0'] - merged['z_1'])**2)

    # Only the distance is kept as a feature; the element symbols and the raw
    # coordinates are dropped.
    return merged.drop(labels=['atom_0', 'atom_1',
                               'x_0', 'y_0', 'z_0',
                               'x_1', 'y_1', 'z_1'],
                       axis=1)


def merging_data(train, test):
    """Add a ``distance`` column to the train and test pair tables.

    Each table is joined twice against the module-level ``structures`` frame
    (once per atom of the pair). The Euclidean distance between the two atoms
    is stored in ``distance`` and the intermediate ``atom_*`` / coordinate
    columns are removed.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Pair tables with ``molecule_name``, ``atom_index_0`` and
        ``atom_index_1`` columns.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(merged_train_data, merged_test_data)``; the inputs are not modified.

    Notes
    -----
    Reads the global ``structures`` frame, which must be loaded before this
    function is called.
    """
    # Train and test go through exactly the same steps, so they share a helper.
    return _attach_pair_distance(train), _attach_pair_distance(test)

# Convert the textual data into categorical data
def preporocessing_data(merged_train_data, merged_test_data):
    """Replace the string columns ``molecule_name`` and ``type`` by integer codes.

    Parameters
    ----------
    merged_train_data, merged_test_data : pandas.DataFrame
        Frames containing ``molecule_name`` and ``type`` columns. Both frames
        are modified in place and also returned.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(merged_train_data, merged_test_data)`` with the two columns encoded.
    """
    # molecule_name is encoded independently per frame, as before: each frame
    # gets its own 0..k-1 numbering of the molecules it contains.
    for frame in (merged_train_data, merged_test_data):
        frame['molecule_name'] = preprocessing.LabelEncoder().fit_transform(
            frame['molecule_name'])

    # `type` is a model feature, so the same string must map to the same code
    # in train and test. Fitting separately on each frame only agrees when
    # both frames contain exactly the same set of types; fitting once on the
    # union guarantees it.
    type_encoder = preprocessing.LabelEncoder()
    type_encoder.fit(
        pd.concat([merged_train_data['type'], merged_test_data['type']],
                  ignore_index=True))
    merged_train_data['type'] = type_encoder.transform(merged_train_data['type'])
    merged_test_data['type'] = type_encoder.transform(merged_test_data['type'])
    return merged_train_data, merged_test_data

# Build the per-pair distance feature for both tables, then integer-encode
# their string columns.
merged_train, merged_test = merging_data(train, test)
merged_train, merged_test = preporocessing_data(merged_train, merged_test)


In [None]:
# Splitting the training and testing dataset to generate the X and y columns.
def split_data(merged_train_data, merged_test_data):
    """Turn the merged frames into model-ready NumPy arrays.

    Parameters
    ----------
    merged_train_data : pandas.DataFrame
        Training frame containing ``id``, ``molecule_name``,
        ``scalar_coupling_constant`` and the feature columns.
    merged_test_data : pandas.DataFrame
        Test frame containing the same feature columns.

    Returns
    -------
    tuple of numpy.ndarray
        ``(train_x, train_y, test_x)``. ``train_y`` is rounded to 4 decimals.
        ``train_x`` and ``test_x`` are both passed through
        ``sklearn.preprocessing.normalize``.

    Raises
    ------
    KeyError
        If the test frame lacks one of the training feature columns.
    """
    from sklearn.preprocessing import normalize

    target_column = 'scalar_coupling_constant'
    non_feature_columns = ('id', 'molecule_name', target_column)
    # Select features by name, in training order, for BOTH frames so the two
    # matrices are guaranteed to have identical column layout (the previous
    # positional `iloc[:, 2:]` on the test frame silently depended on column
    # order).
    feature_columns = [
        column for column in merged_train_data.columns
        if column not in non_feature_columns
    ]

    train_x = np.array(merged_train_data[feature_columns])
    train_y = np.around(np.array(merged_train_data[target_column]).reshape(
        (train_x.shape[0], )),
                        decimals=4)

    # Perform Normalization on the training dataset
    train_x = normalize(train_x)

    print("Shape of the train_x", train_x.shape)
    print("Shape of the train_y", train_y.shape)

    # Testing data: must receive the same transformation as the training
    # data, otherwise the model predicts on inputs on a different scale from
    # the one it was fitted on.
    test_x = normalize(np.array(merged_test_data[feature_columns]))
    print("Shape of the test_x", test_x.shape)
    return train_x, train_y, test_x

# Materialise the feature/target arrays consumed by the model cells below.
train_x, train_y, test_x = split_data(merged_train, merged_test)

In [None]:
# Neural Network using sklearn
def nnl(train_x, train_y):
    """Grid-search an ``MLPRegressor`` and plot the cross-validated scores.

    Parameters
    ----------
    train_x : array-like
        Feature matrix passed to ``GridSearchCV.fit``.
    train_y : array-like
        Target values passed to ``GridSearchCV.fit``.

    Returns
    -------
    sklearn.model_selection.GridSearchCV
        The fitted search object.

    Notes
    -----
    Prints the best and per-candidate scores and writes the score plot to
    ``nnl_max_iter.png``. The previous file name ``max_depth.png`` was shared
    by every model function in this notebook, so each run overwrote the
    others' plot.
    """
    from sklearn import neural_network
    from sklearn.model_selection import GridSearchCV

    print("Starting the Neural Network model")
    regressor = neural_network.MLPRegressor()

    # Provide parameters to the neural network
    hidden_layer_sizes = [(10, 10, 5)]
    max_iteration = [10, 15]
    # NOTE(review): per the scikit-learn docs `learning_rate` is only used
    # with solver='sgd'; the default solver is 'adam' - confirm this is intended.
    learning_rate = ["adaptive"]
    param_grid = dict(hidden_layer_sizes=hidden_layer_sizes,
                      max_iter=max_iteration,
                      learning_rate=learning_rate)
    # 5-fold grid search scored by negative mean squared error, using all cores.
    grid_search = GridSearchCV(regressor,
                               param_grid,
                               scoring='neg_mean_squared_error',
                               n_jobs=-1,
                               cv=5,
                               verbose=1)
    grid_result = grid_search.fit(train_x, train_y)
    print("Best: %f using %s" %
          (grid_result.best_score_, grid_result.best_params_))
    means = grid_result.cv_results_['mean_test_score']
    stds = grid_result.cv_results_['std_test_score']
    params = grid_result.cv_results_['params']
    for mean, stdev, param in zip(means, stds, params):
        print("%f (%f) with: %r" % (mean, stdev, param))

    # Plot on a dedicated figure so successive model plots never share axes.
    # `max_iter` is the only parameter with more than one candidate, so there
    # is exactly one score per entry of `max_iteration`.
    fig, ax = plt.subplots()
    ax.errorbar(max_iteration, means, yerr=stds)
    ax.set_title("Neural Network losses")
    ax.set_xlabel('max_iter')
    ax.set_ylabel('Negative MSE')
    fig.savefig('nnl_max_iter.png')
    return grid_search

# Run the MLP grid search; the returned search object is kept for prediction.
grid_search_nnl = nnl(train_x, train_y)


In [None]:
# Random Forest Regression

# Grid search over a random forest regressor
def rfr(train_x, train_y):
    """Grid-search a ``RandomForestRegressor`` and plot the cross-validated scores.

    Parameters
    ----------
    train_x : array-like
        Feature matrix passed to ``GridSearchCV.fit``.
    train_y : array-like
        Target values passed to ``GridSearchCV.fit``.

    Returns
    -------
    sklearn.model_selection.GridSearchCV
        The fitted search object.

    Notes
    -----
    Prints the best and per-candidate scores and writes the score plot to
    ``rfr_n_estimators.png``. The notebook magics, pandas display options and
    global reseeding that used to be repeated here belong to the notebook's
    first cell and were removed from the function body. Reproducibility is
    now pinned on the estimator itself through ``random_state``.
    """
    from sklearn import ensemble
    from sklearn.model_selection import GridSearchCV

    forest = ensemble.RandomForestRegressor(random_state=1)

    # Candidate forest sizes to compare.
    n_estimators = [100, 150]
    param_grid = dict(n_estimators=n_estimators)
    # 5-fold grid search scored by negative mean squared error, using all cores.
    grid_search = GridSearchCV(forest,
                               param_grid,
                               scoring='neg_mean_squared_error',
                               n_jobs=-1,
                               cv=5,
                               verbose=1)
    grid_result = grid_search.fit(train_x, train_y)
    print("Best: %f using %s" %
          (grid_result.best_score_, grid_result.best_params_))
    means = grid_result.cv_results_['mean_test_score']
    stds = grid_result.cv_results_['std_test_score']
    params = grid_result.cv_results_['params']
    for mean, stdev, param in zip(means, stds, params):
        print("%f (%f) with: %r" % (mean, stdev, param))

    # Plot on a dedicated figure so successive model plots never share axes.
    fig, ax = plt.subplots()
    ax.errorbar(n_estimators, means, yerr=stds)
    ax.set_title("Random Forest losses")
    ax.set_xlabel('n_estimators')
    ax.set_ylabel('Negative MSE')
    fig.savefig('rfr_n_estimators.png')
    return grid_search

# Run the random forest grid search; the returned search object is kept for prediction.
grid_search_rfr = rfr(train_x, train_y)


In [None]:
# Implementing the XGBoost model
import numpy as np
import pandas as pd
import time
from sklearn import ensemble
import datetime
# The package is imported without an alias on purpose: the function below is
# itself named `xgb`, which would shadow an `xgb` module alias and make
# `xgb.XGBRegressor` resolve to an attribute of the function.
import xgboost
from sklearn.model_selection import GridSearchCV


def xgb(train_x, train_y):
    """Grid-search ``max_depth`` of an ``XGBRegressor`` and plot the CV scores.

    Parameters
    ----------
    train_x : array-like
        Feature matrix; ``train_x.shape[1]`` sets the largest depth tried.
    train_y : array-like
        Target values passed to ``GridSearchCV.fit``.

    Returns
    -------
    sklearn.model_selection.GridSearchCV
        The fitted search object.

    Notes
    -----
    Prints the candidate depths and the best and per-candidate scores, and
    writes the score plot to ``max_depth.png``.
    """
    booster = xgboost.XGBRegressor(n_estimators=100)
    # Try every depth from 1 up to the number of features.
    max_depth = list(range(1, train_x.shape[1] + 1))
    print(max_depth)
    param_grid = dict(max_depth=max_depth)
    # 2-fold grid search scored by negative mean squared error, using all cores.
    grid_search = GridSearchCV(booster,
                               param_grid,
                               scoring='neg_mean_squared_error',
                               n_jobs=-1,
                               cv=2,
                               verbose=1)
    grid_result = grid_search.fit(train_x, train_y)
    print("Best: %f using %s" %
          (grid_result.best_score_, grid_result.best_params_))
    means = grid_result.cv_results_['mean_test_score']
    stds = grid_result.cv_results_['std_test_score']
    params = grid_result.cv_results_['params']
    for mean, stdev, param in zip(means, stds, params):
        print("%f (%f) with: %r" % (mean, stdev, param))

    # Plot on a dedicated figure so successive model plots never share axes.
    # The score is negative MSE, not log loss.
    fig, ax = plt.subplots()
    ax.errorbar(max_depth, means, yerr=stds)
    ax.set_title("XGBoost max_depth vs Negative MSE")
    ax.set_xlabel('max_depth')
    ax.set_ylabel('Negative MSE')
    fig.savefig('max_depth.png')
    return grid_search

# Run the XGBoost grid search; the returned search object is kept for prediction.
grid_search_xgb = xgb(train_x, train_y)


In [None]:
# Submission Report
def submission_report(grid_search, test_x):
    """Predict on ``test_x`` and write a timestamped submission CSV.

    Parameters
    ----------
    grid_search : object
        Fitted estimator or search exposing ``predict``.
    test_x : array-like
        Feature matrix to predict on; one prediction per row of the
        module-level ``submission`` frame is expected.

    Returns
    -------
    bool
        Always ``True`` once the file has been written.

    Notes
    -----
    Overwrites the ``scalar_coupling_constant`` column of the module-level
    ``submission`` frame in place and writes
    ``final_submission_<YYYYmmdd_HHMMSS_ffffff>.csv`` to the working directory.
    """
    # Predict BEFORE touching `submission`: if prediction fails, the frame
    # still has its column and the cell can simply be re-run.
    predictions = grid_search.predict(test_x)

    submission.drop(labels=['scalar_coupling_constant'], axis=1, inplace=True)
    submission['scalar_coupling_constant'] = predictions

    # str(datetime.now()) contains spaces and colons, which are awkward in
    # shells and invalid in Windows file names; use a compact timestamp.
    timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S_%f')
    submission.to_csv('final_submission_' + timestamp + '.csv', index=False)
    return True

# `grid_search` is only a local/parameter name inside the functions above and
# is undefined at notebook level. All three searches are scored with negative
# MSE, so submit with whichever fitted search has the highest best_score_.
best_search = max((grid_search_nnl, grid_search_rfr, grid_search_xgb),
                  key=lambda search: search.best_score_)
submission_report(best_search, test_x)
