In [1]:
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor

from utils import split_data, fit_model, evaluate_model

In [2]:
# Read the training set; the path is resolved relative to the notebook's working directory.
training_df = pd.read_csv(
    filepath_or_buffer='training.csv',
)


In [3]:
# Hold out a validation split from the training frame.
# split_data is a project-local helper (utils); presumably it also separates the
# feature columns from the target column -- TODO confirm against utils.split_data.
X_train, X_validation, y_train, y_validation = split_data(training_df)

# Standardize the features. The scaler is fitted on the training split only and
# then applied unchanged to the validation split, so no validation statistics
# leak into preprocessing. The fitted `scaler` is reused later for the test set.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_validation = scaler.transform(X_validation)

# Reduce dimensionality. A fractional n_components asks PCA to keep as many
# components as are needed to explain that share (here 90%) of the variance.
# As with the scaler, PCA is fitted on the training split only.
pca = PCA(n_components=0.90).fit(X_train)
X_train = pca.transform(X_train)
X_validation = pca.transform(X_validation)

# Fit the model on the training split.
# NOTE(review): best_n / best_d look like the outcome of an earlier
# hyper-parameter search that is not part of this notebook -- confirm provenance.
best_n = 80
best_d = 20
model = RandomForestRegressor(n_estimators=best_n, max_depth=best_d, random_state=0)
tuned_model = fit_model(model, X_train, y_train)

# Evaluate on the held-out validation split (not the training data).
# Score `tuned_model` -- the object that is later used for test-set predictions --
# so the reported metric always describes the estimator that actually predicts,
# even if fit_model returns a different object than the one passed in.
result = evaluate_model(tuned_model, X_validation, y_validation)
print(result)


RandomForestRegressor RMSE = 10.3301


In [4]:
# Read the test set; the path is resolved relative to the notebook's working directory.
test_df = pd.read_csv(
    filepath_or_buffer='test.csv',
)


In [5]:
# Push the test features through the already-fitted preprocessing steps, in the
# same order used for training: standardization first, then the PCA projection.
# Neither transformer is re-fitted here.
X_test = pca.transform(scaler.transform(test_df))

# Generate predictions for the test set with the fitted model.
y_predicted = tuned_model.predict(X_test)


In [6]:
# Build the results table as a NEW frame instead of aliasing test_df.
# With a plain alias, adding the prediction column would also add it to test_df,
# and re-running the prediction cell would then pass the fitted scaler a frame
# with one more column than it was trained on. DataFrame.assign returns a copy
# with the extra column, leaving test_df untouched and this cell re-runnable.
results_df = test_df.assign(predicted_critical_temp=y_predicted)
results_df.head()


Unnamed: 0,number_of_elements,mean_atomic_mass,wtd_mean_atomic_mass,gmean_atomic_mass,wtd_gmean_atomic_mass,entropy_atomic_mass,wtd_entropy_atomic_mass,range_atomic_mass,wtd_range_atomic_mass,std_atomic_mass,...,wtd_mean_Valence,gmean_Valence,wtd_gmean_Valence,entropy_Valence,wtd_entropy_Valence,range_Valence,wtd_range_Valence,std_Valence,wtd_std_Valence,predicted_critical_temp
0,5,62.151602,50.375195,29.794101,22.257122,1.294091,1.350425,137.89753,18.478415,45.430813,...,2.825,2.459509,2.41261,1.494403,1.3877,4,1.025,1.32665,1.464368,32.738976
1,3,83.0047,112.237367,47.127841,75.735455,0.680597,0.509844,175.396,91.301667,74.886978,...,6.0,5.013298,5.773684,1.043353,0.887694,4,3.0,1.699673,1.414214,5.286796
2,4,51.81085,35.312828,43.468565,28.909959,1.241927,1.092581,71.6206,18.196,26.646429,...,2.0,2.0,2.0,1.386294,1.054586,0,1.012658,0.0,0.0,11.095005
3,3,156.022467,158.51394,151.747674,154.353143,1.070081,1.041001,87.2204,42.70314,37.982363,...,3.8,3.684031,3.465724,1.028184,1.050858,3,0.95,1.414214,1.469694,2.445855
4,3,156.022467,162.87496,151.747674,158.578945,1.070081,0.99304,87.2204,59.24016,37.982363,...,3.8,3.684031,3.465724,1.028184,1.017164,3,1.2,1.414214,1.469694,2.65871


In [7]:
# Persist the test features together with their predictions; index=False keeps
# the DataFrame's row index out of the written file.
results_df.to_csv(
    path_or_buf='results.csv',
    index=False,
)
