In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import SGDClassifier, LinearRegression
import time
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import tensorflow as tf
%matplotlib inline

def plot_learning_curves(model, X, y):
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2) 
    train_errors, val_errors = [], []
    for m in range(1, len(X_train),10000):
        model.fit(X_train[:m], y_train[:m])
        y_train_predict = model.predict(X_train[:m])
        y_val_predict = model.predict(X_val) 
        train_errors.append(mean_squared_error(y_train_predict, y_train[:m])) 
        val_errors.append(mean_squared_error(y_val_predict, y_val))
    plt.plot(np.sqrt(train_errors), "r-+", linewidth=2, label="train") 
    plt.plot(np.sqrt(val_errors), "b-", linewidth=3, label="val")

In [3]:
# Load Data
FaultSect = pd.read_csv('data/Mon_Dis_JPCC_FAULT_SECT.csv')
CrackIdx = pd.read_csv('data/Mon_Dis_JPCC_CRACK_INDEX.csv')
Rev = pd.read_csv('data/Mon_Dis_JPCC_REV.csv')
TST = pd.read_csv('data/TST_L05B.csv')

In [4]:
FaultSect = FaultSect[['STATE_CODE','SHRP_ID','CRACK_OR_JOINT_EXP','AVG_EDGE_FAULT','AVG_WHEELPATH_FAULT']].dropna()
FaultSect['CRACK_OR_JOINT_EXP'] = LabelEncoder().fit_transform(FaultSect['CRACK_OR_JOINT_EXP'])
# FaultSect

In [5]:
CrackIdx = CrackIdx[['STATE_CODE','SHRP_ID','SURVEY_DATE','HPMS16_CRACKING_PERCENT_JPCC']].dropna()
# CrackIdx

In [6]:
TST = TST[['SHRP_ID','MATL_CODE']]

In [7]:
Rev = Rev.drop(['STATE_CODE_EXP','SURVEY_DATE','JT_SEALED','JT_SEALED_EXP','OTHER'], axis = 1)

In [8]:
Prepared_data = pd.merge(FaultSect,Rev, on = 'SHRP_ID')

In [9]:
Prepared_data = Prepared_data.dropna()

In [10]:
Prepared_data = pd.merge(Prepared_data,TST, on = 'SHRP_ID').dropna()

In [11]:
Prepared_data['MATL_CODE'] = OneHotEncoder().fit_transform(Prepared_data['MATL_CODE'].values.reshape(-1,1)).toarray()
# Prepared_data = Prepared_data.drop(columns='MATL_CODE')
Prepared_data = Prepared_data.drop(['SHRP_ID'], axis = 1)
# Prepared_data['CRACK_OR_JOINT_EXP'] = LabelEncoder().fit_transform(Prepared_data['CRACK_OR_JOINT_EXP'])

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [12]:
Target_Labels = Prepared_data[['AVG_EDGE_FAULT','AVG_WHEELPATH_FAULT']]
Prepared_data = Prepared_data.drop(['AVG_EDGE_FAULT','AVG_WHEELPATH_FAULT'], axis = 1)

In [None]:
st = time.time()
tree_reg = DecisionTreeRegressor()
tree_scores = cross_val_score(tree_reg, Prepared_data, Target_Labels,scoring="neg_mean_squared_error", cv=10) 
rmse_scores = np.sqrt(-tree_scores)
print('Time used:',time.time()-st)
print('rmse scores:',rmse_scores)

In [None]:
plot_learning_curves(tree_reg, Prepared_data, Target_Labels)

In [None]:
st = time.time()
forest_reg = RandomForestRegressor()
forest_scores = cross_val_score(forest_reg, Prepared_data, Target_Labels,scoring="neg_mean_squared_error", cv=10) 
rmse_scores = np.sqrt(-forest_scores)
print('Time used:',time.time()-st)
print('rmse scores:',rmse_scores)
plot_learning_curves(forest_reg, Prepared_data, Target_Labels)

In [None]:
st = time.time()
lin_reg_model = LinearRegression()
lin_scores = cross_val_score(lin_reg_model, Prepared_data, Target_Labels,scoring="neg_mean_squared_error", cv=10) 
rmse_scores = np.sqrt(-lin_scores)
print('Time used:',time.time()-st)
print('rmse scores:',rmse_scores)
plot_learning_curves(lin_reg_model, Prepared_data, Target_Labels)

In [None]:
st = time.time()
from sklearn.svm import SVC 
poly_kernel_svm_clf = Pipeline((
        ("scaler", StandardScaler()),
        ("svm_clf", SVC(kernel="poly", degree=3, coef0=1, C=5))
    ))
poly_svm_scores = cross_val_score(poly_kernel_svm_clf, Prepared_data, Target_Labels,scoring="neg_mean_squared_error", cv=10) 
rmse_scores = np.sqrt(-poly_svm_scores)
print('Time used:',time.time()-st)
print('rmse scores:',rmse_scores)
plot_learning_curves(poly_kernel_svm_clf, Prepared_data, Target_Labels)