In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
import pickle
import numpy as np
from sklearn.linear_model import LinearRegression

def create_waterClass_model(data):
    """Train, evaluate and return a random-forest model for ``water_class``.

    ``DMS (mg/L)`` is removed from the frame, ``water_class`` is used as the
    target and every remaining column is used as a feature.  The R-squared
    score on a 20% hold-out split and the rounded prediction for one
    hard-coded sample are printed as a quick sanity check.

    Args:
        data: DataFrame containing ``water_class``, ``DMS (mg/L)`` and the
            feature columns.

    Returns:
        The fitted ``RandomForestRegressor``.
    """
    data = data.drop(['DMS (mg/L)'], axis=1)
    X = data.drop(['water_class'], axis=1)
    y = data['water_class']

    # No feature scaling is applied in this function.
    # NOTE(review): an earlier comment claimed the data was pre-scaled by
    # converting to milliequivalents, but the column names are in mg/L --
    # confirm which is correct.

    # Split the data; the seed keeps the hold-out set stable between runs.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2,
        random_state=42,
    )

    # Train the model.  The forest is seeded as well, otherwise the printed
    # score and the pickled model differ on every run even though the split
    # itself is deterministic.
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Evaluate on the hold-out set.
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    print("R-squared: \n", r2)

    # One hard-coded sample used as a smoke test; its keys are expected to
    # match the feature columns the model was fitted on.
    input_data = {
        'EC-(mS/m)': 479.82,
        'pH-Diss-Water (PH)': 7.94,
        'Ca (mg/L)': 159.16,
        'Mg (mg/L)': 68.55,
        'Na (mg/L)': 898.14,
        'K (mg/L)': 16.42,
        'TAL CaCO3 (mg/L)': 329.1,
        'Cl (mg/L)': 1186.31,
        'SO4 (mg/L)': 554.24,
        'F (mg/L)': 3.14,
    }

    # Predict the water class for the sample; the regressor returns a float,
    # so it is rounded to the nearest integer for display.
    input_df = pd.DataFrame(input_data, index=[0])
    prediction = model.predict(input_df)

    print("Water class prediction:", round(prediction[0]))

    return model

def create_DMS_model(data):
    """Fit and return a linear-regression model that predicts ``DMS (mg/L)``.

    ``water_class`` is discarded, ``DMS (mg/L)`` is the target and all other
    columns are features.  The R-squared score on a 20% hold-out split and
    the prediction for one hard-coded sample (rounded to two decimals) are
    printed.

    Args:
        data: DataFrame containing ``DMS (mg/L)``, ``water_class`` and the
            feature columns.

    Returns:
        The fitted ``LinearRegression`` model.
    """
    target_column = 'DMS (mg/L)'
    without_class = data.drop(['water_class'], axis=1)
    features = without_class.drop([target_column], axis=1)
    target = without_class[target_column]

    # No scaling step is performed here; the columns are used as loaded.

    # Seeded 80/20 split so the reported score is repeatable.
    features_train, features_test, target_train, target_test = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

    regressor = LinearRegression()
    regressor.fit(features_train, target_train)

    # Report the hold-out score.
    holdout_score = r2_score(target_test, regressor.predict(features_test))
    print("R-squared: \n", holdout_score)

    # Single hard-coded sample used as a smoke test of the fitted model.
    sample = {
        'EC-(mS/m)': 479.82,
        'pH-Diss-Water (PH)': 7.94,
        'Ca (mg/L)': 159.16,
        'Mg (mg/L)': 68.55,
        'Na (mg/L)': 898.14,
        'K (mg/L)': 16.42,
        'TAL CaCO3 (mg/L)': 329.1,
        'Cl (mg/L)': 1186.31,
        'SO4 (mg/L)': 554.24,
        'F (mg/L)': 3.14,
    }
    sample_frame = pd.DataFrame(sample, index=[0])
    raw_prediction = regressor.predict(sample_frame)
    prediction = raw_prediction.round(2)

    print("Predicted DMS:", prediction)
    return regressor

def get_clean_data(path='../../water_classification_mgL.xlsx',
                   sheet_name='Sheet1'):
    """Load the water-quality workbook and drop the columns not used for modelling.

    Args:
        path: Location of the Excel workbook.  Defaults to the path this
            script has always read from, so existing calls are unaffected.
        sheet_name: Sheet to read from the workbook.

    Returns:
        DataFrame with the ``*_range`` columns and the sample-metadata
        columns (monitoring point, date, coordinates, region, resource
        unit) removed.
    """
    unused_columns = [
        'pH_range', 'EC_range', 'Ca_range', 'Mg_range', 'Na_range',
        'TAL_range', 'Cl_range', 'F_range', 'SO4_range',
        'Monitoring Point Name', 'Sample Start Date', 'Latitude',
        'Longitude', 'Drainage Region Name', 'resource_unit', 'DMS_range',
    ]

    data = pd.read_excel(path, sheet_name=sheet_name)
    data = data.drop(columns=unused_columns)

    # Show the first rows so the remaining columns can be checked by eye.
    print(data.head())
    return data

def main():
    """Load the data, fit both models and pickle each one to the working directory."""
    frame = get_clean_data()

    # Both models are fitted before anything is written to disk.
    fitted_models = {
        'waterClass_model.pkl': create_waterClass_model(frame),
        'DMS_model.pkl': create_DMS_model(frame),
    }

    for filename, fitted in fitted_models.items():
        with open(filename, 'wb') as handle:
            pickle.dump(fitted, handle)

    # No scaler is pickled: neither model builder fits one.
  
# Run the full train-and-pickle pipeline only when this module is the entry
# point (not when it is imported).
if __name__ == '__main__':
    main()

   DMS (mg/L)  EC-(mS/m)  pH-Diss-Water (PH)  Ca (mg/L)  Mg (mg/L)  Na (mg/L)  \
0       277.0       40.0                8.42       29.8       12.2       28.6   
1       272.0       37.3                8.27       29.1       11.3       23.5   
2       247.0       36.9                8.18       27.6       10.6       25.0   
3       267.0       38.3                8.16       30.8       11.4       24.1   
4       236.0       35.2                8.18       28.6       10.6       20.8   

   K (mg/L)  TAL CaCO3 (mg/L)  Cl (mg/L)  SO4 (mg/L)  F (mg/L)  water_class  
0      2.53             120.9       25.3        29.6      0.22            0  
1      2.56             122.8       19.0        35.1      0.24            0  
2      2.77             108.8       16.8        29.2      0.25            0  
3      2.81             112.7       17.7        39.7      0.23            0  
4      2.27             100.0       14.2        35.2      0.21            0  
R-squared: 
 0.9955335424397401
Water class p