In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import SGDClassifier, LinearRegression
import time
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
%matplotlib inline

def plot_learning_curves(model, X, y):
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2) 
    train_errors, val_errors = [], []
    for m in range(1, len(X_train),10000):
        model.fit(X_train[:m], y_train[:m])
        y_train_predict = model.predict(X_train[:m])
        y_val_predict = model.predict(X_val) 
        train_errors.append(mean_squared_error(y_train_predict, y_train[:m])) 
        val_errors.append(mean_squared_error(y_val_predict, y_val))
    plt.plot(np.sqrt(train_errors), "r-+", linewidth=2, label="train") 
    plt.plot(np.sqrt(val_errors), "b-", linewidth=3, label="val")

In [2]:
# Load Data
FaultSect = pd.read_csv('data/Mon_Dis_JPCC_FAULT_SECT.csv')
CrackIdx = pd.read_csv('data/Mon_Dis_JPCC_CRACK_INDEX.csv')
Rev = pd.read_csv('data/Mon_Dis_JPCC_REV.csv')
TST = pd.read_csv('data/TST_L05B.csv')

In [3]:
FaultSect = FaultSect[['STATE_CODE','SHRP_ID','CRACK_OR_JOINT_EXP','AVG_EDGE_FAULT','AVG_WHEELPATH_FAULT']].dropna()
FaultSect['CRACK_OR_JOINT_EXP'] = LabelEncoder().fit_transform(FaultSect['CRACK_OR_JOINT_EXP'])
# FaultSect

In [4]:
CrackIdx = CrackIdx[['STATE_CODE','SHRP_ID','SURVEY_DATE','HPMS16_CRACKING_PERCENT_JPCC']].dropna()
# CrackIdx

In [5]:
TST = TST[['SHRP_ID','MATL_CODE']]

In [6]:
Rev = Rev.drop(['STATE_CODE_EXP','SURVEY_DATE','JT_SEALED','JT_SEALED_EXP','OTHER'], axis = 1)

In [7]:
Prepared_data = pd.merge(FaultSect,Rev, on = 'SHRP_ID')

In [8]:
Prepared_data = Prepared_data.dropna()

In [None]:
Prepared_data = pd.merge(Prepared_data,TST, on = 'SHRP_ID').dropna()

In [None]:
Prepared_data['MATL_CODE'] = OneHotEncoder().fit_transform(Prepared_data['MATL_CODE'].values.reshape(-1,1)).toarray()
# Prepared_data = Prepared_data.drop(columns='MATL_CODE')
Prepared_data = Prepared_data.drop(['SHRP_ID'], axis = 1)
# Prepared_data['CRACK_OR_JOINT_EXP'] = LabelEncoder().fit_transform(Prepared_data['CRACK_OR_JOINT_EXP'])

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [None]:
Target_Labels = Prepared_data[['AVG_EDGE_FAULT','AVG_WHEELPATH_FAULT']]
Prepared_data = Prepared_data.drop(['AVG_EDGE_FAULT','AVG_WHEELPATH_FAULT'], axis = 1)

In [None]:
Corr_data = Prepared_data.corr()
print(Corr_data)

                       STATE_CODE_x  CRACK_OR_JOINT_EXP  STATE_CODE_y  \
STATE_CODE_x               1.000000            0.016499      0.107734   
CRACK_OR_JOINT_EXP         0.016499            1.000000     -0.025657   
STATE_CODE_y               0.107734           -0.025657      1.000000   
CONSTRUCTION_NO            0.044942           -0.159379      0.073157   
CORNER_BREAKS_NO_L         0.014761           -0.039044     -0.025898   
CORNER_BREAKS_NO_M         0.014713           -0.057543     -0.052352   
CORNER_BREAKS_NO_H         0.032870           -0.100567      0.066837   
DURAB_CRACK_NO_L           0.022951           -0.068359      0.027691   
DURAB_CRACK_NO_M           0.023944           -0.066082     -0.012846   
DURAB_CRACK_NO_H           0.032176           -0.088073     -0.029550   
DURAB_CRACK_A_L            0.020391           -0.056204      0.051172   
DURAB_CRACK_A_M            0.016465           -0.046336      0.008869   
DURAB_CRACK_A_H            0.026680           -0.07

In [None]:
from pandas.plotting import scatter_matrix
attributes = [col for col in Prepared_data.columns]
scatter_matrix(Prepared_data[attributes], figsize=(12, 8))