In [156]:
from sklearn import preprocessing
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shutil

# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df,name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = "{}-{}".format(name,x)
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)

# Encode text values to a single dummy variable.  The new columns (which do not replace the old) will have a 1
# at every location where the origional column (name) matches each of the target_values.  One column is added for
# each target value.
def encode_text_single_dummy(df,name,target_values):
    for tv in target_values:
        l = list(df[name].astype(str))
        l = [1 if str(x)==str(tv) else 0 for x in l]
        name2 = "{}-{}".format(name,tv)
        df[name2] = l
    
# Encode text values to indexes(i.e. [1],[2],[3] for red,green,blue).
def encode_text_index(df,name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_

# Encode a numeric column as zscores
def encode_numeric_zscore(df,name,mean=None,sd=None):
    if mean is None:
        mean = df[name].mean()

    if sd is None:
        sd = df[name].std()

    df[name] = (df[name]-mean)/sd

# Convert all missing values in the specified column to the median
def missing_median(df, name):
    med = df[name].median()
    df[name] = df[name].fillna(med)

# Convert all missing values in the specified column to the default
def missing_default(df, name, default_value):
    df[name] = df[name].fillna(default_value)

# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df,target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)

    # find out the type of the target column.  Is it really this hard? :(
    target_type = df[target].dtypes
    target_type = target_type[0] if hasattr(target_type, '__iter__') else target_type
    
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        return df.as_matrix(result).astype(np.float32),df.as_matrix([target]).astype(np.int32)
    else:
        # Regression
        return df.as_matrix(result).astype(np.float32),df.as_matrix([target]).astype(np.float32)

# Nicely formatted time string
def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(h, m, s)

# Regression chart, we will see more of this chart in the next class.
def chart_regression(pred,y):
    t = pd.DataFrame({'pred' : pred, 'y' : y})
    t.sort_values(by=['y'],inplace=True)
    a = plt.plot(t['y'].tolist(),label='expected')
    b = plt.plot(t['pred'].tolist(),label='prediction')
    plt.ylabel('output')
    plt.legend()
    plt.show()
    
# Get a new directory to hold checkpoints from a neural network.  This allows the neural network to be
# loaded later.  If the erase param is set to true, the contents of the directory will be cleared.
def get_model_dir(name,erase):
    base_path = os.path.join(".","dnn")
    model_dir = os.path.join(base_path,name)
    os.makedirs(model_dir,exist_ok=True)
    if erase and len(model_dir)>4 and os.path.isdir(model_dir):
        shutil.rmtree(model_dir,ignore_errors=True) # be careful, this deletes everything below the specified path
    return model_dir

# Remove all rows where the specified column is +/- sd standard deviations
def remove_outliers(df, name, sd):
    drop_rows = df.index[(np.abs(df[name]-df[name].mean())>=(sd*df[name].std()))]
    df.drop(drop_rows,axis=0,inplace=True)
    
# Encode a column to a range between normalized_low and normalized_high.
def encode_numeric_range(df, name, normalized_low =-1, normalized_high =1, 
                         data_low=None, data_high=None):
    
    if data_low is None:
        data_low = min(df[name])
        data_high = max(df[name])
    
    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
                * (normalized_high - normalized_low) + normalized_low
        


In [162]:
from sklearn.model_selection import train_test_split

df = pd.read_csv("auto-mpg.csv",na_values=['NA','?'])


print("Esses testes foram feitos com um base sobre carros e seus atributos. O objetivo é prever quanto"+
      "o carro fará por litro de combustível (ou quantas milhas por galão). Abaixo segue uma porção dessa base.")        

print("\n")
print(df.head())

#procura por colunas que tenham algum Nan
df.isnull().any()
#corrige Nan
df['horsepower'][df['horsepower'].isnull()] = df.horsepower.mean()
df.isnull().any()
#remove nome
df.drop('name',axis=1,inplace=True)



Esses testes foram feitos com um base sobre carros e seus atributos. O objetivo é prever quantoo carro fará por litro de combustível (ou quantas milhas por galão). Abaixo segue uma porção dessa base.


    mpg  cylinders  displacement  horsepower  weight  acceleration  year  \
0  18.0          8         307.0       130.0    3504          12.0    70   
1  15.0          8         350.0       165.0    3693          11.5    70   
2  18.0          8         318.0       150.0    3436          11.0    70   
3  16.0          8         304.0       150.0    3433          12.0    70   
4  17.0          8         302.0       140.0    3449          10.5    70   

   origin                       name  
0       1  chevrolet chevelle malibu  
1       1          buick skylark 320  
2       1         plymouth satellite  
3       1              amc rebel sst  
4       1                ford torino  


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [215]:
from sklearn                        import metrics, svm
from sklearn.linear_model           import LinearRegression
from sklearn.linear_model           import LogisticRegression
from sklearn.tree                   import DecisionTreeClassifier
from sklearn.neighbors              import KNeighborsClassifier
from sklearn.discriminant_analysis  import LinearDiscriminantAnalysis
from sklearn.naive_bayes            import GaussianNB
from sklearn.svm                    import SVC
from sklearn.svm                    import SVR
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score


#Caso 1 - Linear Regression sem cross-validation
# Shuffle
print("Caso 1 - Linear Regression sem cross-validation")
np.random.seed(42)
df = df.reindex(np.random.permutation(df.index))
df.reset_index(inplace=True, drop=True)

# Split into train/test
x_train, x_test, y_train, y_test = train_test_split(df.iloc[:,1:], df.iloc[:,0:1], test_size=0.23, random_state=42) 
#Normalization
scaler = preprocessing.StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

classifier = LinearRegression()
classifier.fit(x_train_scaled, y_train)
# The mean squared error
print("Mean squared error: %.2f" % np.mean((classifier.predict(x_test_scaled) - y_test) ** 2))
pred = classifier.predict(x_test_scaled)
score = metrics.mean_squared_error(y_test, pred)
print("Mean squared error: {}".format(score))
score=cross_val_score(classifier, x_test_scaled, y_test, scoring='neg_mean_squared_error') 
print("Mean squared error: {}".format(score))
# Evaluate success using accuracy
print("Accuracy: %.3f" % classifier.score(X=x_test_scaled,y=y_test))

print("\n\n")

#Caso 2 - Linear Regression com 5 fold cross-validation
# Shuffle
print("Caso 2 - Linear Regression com 5 fold cross-validation")
np.random.seed(42)
df = df.reindex(np.random.permutation(df.index))
df.reset_index(inplace=True, drop=True)

classifier = LinearRegression()
kf = KFold(5)
    
oos_y = []
oos_pred = []
fold = 0

x = df.as_matrix(columns=df.columns[1:])  
y = df.as_matrix(columns=['mpg'])

for train, test in kf.split(x):
    fold+=1
    print("Fold #{}".format(fold))
        
    x_train = x[train]
    y_train = y[train]
    x_test = x[test]
    y_test = y[test]
    
    #Normalization
    scaler = preprocessing.StandardScaler().fit(x_train)
    x_train_scaled = scaler.transform(x_train)
    x_test_scaled = scaler.transform(x_test)
    
    classifier.fit(x_train_scaled, y_train)
    pred = classifier.predict(x_test_scaled)
    oos_y.append(y_test)
    oos_pred.append(pred)        

    # Measure accuracy
    score = (metrics.mean_squared_error(y_test,pred))
    print("Fold score (RMSE): {}".format(score))

    # Evaluate success using accuracy
    print("Accuracy: %.3f" % classifier.score(X=x_test_scaled,y=y_test))

# Build the oos prediction list and calculate the error.
oos_y = np.concatenate(oos_y)
oos_pred = np.concatenate(oos_pred)
score = (metrics.mean_squared_error(oos_y,oos_pred))
print("Final, out of sample score (RMSE): {}".format(score))    


print("\n")

print("\n Tentativa de selecionar melhores features \n")

from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.feature_selection import SelectFromModel

#Removing features with low variance
print("Original shape: {}".format(np.shape(df.iloc[:,1:])))
sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
features = sel.fit_transform(df.iloc[:,1:])
print("Shape apos Removing features with low variance {}".format(np.shape(features))) #nenhuma foi selecionada 
print("\n")
#Tree-based feature selection
clf = ExtraTreesRegressor()
clf = clf.fit(x_train,y_train)
data = np.zeros((1,x_train.shape[1]))
data = pd.DataFrame(data, columns=df.columns[1:])
data.iloc[0] = clf.feature_importances_
print("As features selecionadas com Tree-based feature selection foram: \n")
print(data)

model = SelectFromModel(clf, prefit=True)
X_new = model.transform(df.iloc[:,1:])
print("\n New shape apos Tree-based feature selection: {}".format(X_new.shape))

print("\n Fim tentativa selecionar melhores features \n")

print("Treinando novamente somente com as features relevantes \n")

x_train, x_test, y_train, y_test = train_test_split(X_new[:,1:], X_new[:,0:1], test_size=0.23, random_state=42) 
#Normalization
scaler = preprocessing.StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

classifier = LinearRegression()
classifier.fit(x_train_scaled, y_train)
# The mean squared error
print("Mean squared error: %.2f" % np.mean((classifier.predict(x_test_scaled) - y_test) ** 2))
pred = classifier.predict(x_test_scaled)
score = metrics.mean_squared_error(y_test, pred)
print("Mean squared error: {}".format(score))
score=cross_val_score(classifier, x_test_scaled, y_test, scoring='neg_mean_squared_error') 
print("Mean squared error: {}".format(score))
# Evaluate success using accuracy
print("Accuracy: %.3f" % classifier.score(X=x_test_scaled,y=y_test))



Caso 1 - Linear Regression sem cross-validation
Mean squared error: 10.02
Mean squared error: 10.01658156097819
Mean squared error: [-10.79033883  -7.18706496 -20.23691089]
Accuracy: 0.812



Caso 2 - Linear Regression com 5 fold cross-validation
Fold #1
Fold score (RMSE): 7.4414231194449965
Accuracy: 0.830
Fold #2
Fold score (RMSE): 14.63317341217037
Accuracy: 0.792
Fold #3
Fold score (RMSE): 14.083844512588971
Accuracy: 0.801
Fold #4
Fold score (RMSE): 8.667085016994045
Accuracy: 0.831
Fold #5
Fold score (RMSE): 12.994028636748292
Accuracy: 0.803
Final, out of sample score (RMSE): 11.56759613613566



 Tentativa de selecionar melhores features 

Original shape: (398, 7)
Shape apos Removing features with low variance (398, 7)


As features selecionadas com Tree-based feature selection foram: 

   cylinders  displacement  horsepower   weight  acceleration      year  \
0   0.337626      0.189094    0.049277  0.23691      0.035665  0.135492   

     origin  
0  0.015936  

 New shape apo



In [159]:
classifier = SVR()
#Caso 3 - SVM
# Shuffle
print("Caso 2 - SVM")
np.random.seed(42)
df = df.reindex(np.random.permutation(df.index))
df.reset_index(inplace=True, drop=True)

# Split into train/test
x_train, x_test, y_train, y_test = train_test_split(df.iloc[:,1:], df.iloc[:,0:1], test_size=0.23, random_state=42) 
#Normalization
scaler = preprocessing.StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

classifier.fit(x_train_scaled, y_train)

# The mean squared error
pred = classifier.predict(x_test_scaled)
score = metrics.mean_squared_error(y_test, pred)
print("Mean squared error: {}".format(score))
# Evaluate success using accuracy
print("Accuracy: %.3f" % classifier.score(X=x_test_scaled,y=y_test))

print("\n\n")

#Caso 2
# Shuffle
np.random.seed(42)
df = df.reindex(np.random.permutation(df.index))
df.reset_index(inplace=True, drop=True)

print("Caso 2 - SVM com cross-validation")


classifier = SVR()
kf = KFold(5)    
oos_y = []
oos_pred = []
fold = 0

x = df.as_matrix(columns=df.columns[1:])  
y = df.as_matrix(columns=['mpg'])

for train, test in kf.split(x):
    fold+=1
    print("Fold #{}".format(fold))
        
    x_train = x[train]
    y_train = y[train]
    x_test = x[test]
    y_test = y[test]
    
    #Normalization
    scaler = preprocessing.StandardScaler().fit(x_train)
    x_train_scaled = scaler.transform(x_train)
    x_test_scaled = scaler.transform(x_test)
    
    classifier.fit(x_train_scaled, y_train)
    pred = classifier.predict(x_test_scaled)
    oos_y.append(y_test)
    oos_pred.append(pred)        

    # Measure accuracy
    score = (metrics.mean_squared_error(y_test,pred))
    print("Fold score (RMSE): {}".format(score))

    # Evaluate success using accuracy
    print("Accuracy: %.3f" % classifier.score(X=x_test_scaled,y=y_test))

# Build the oos prediction list and calculate the error.
oos_y = np.concatenate(oos_y)
oos_pred = np.concatenate(oos_pred)
score = (metrics.mean_squared_error(oos_y,oos_pred))
print("Final, out of sample score (RMSE): {}".format(score))    


Caso 2 - SVM
Mean squared error: 5.146443464489598
Accuracy: 0.879



Caso 2 - SVM com cross-validation
Fold #1
Fold score (RMSE): 4.750860656890248
Accuracy: 0.883
Fold #2
Fold score (RMSE): 11.097580127922166
Accuracy: 0.847
Fold #3
Fold score (RMSE): 13.380820206620523
Accuracy: 0.799
Fold #4
Fold score (RMSE): 13.092690781426922
Accuracy: 0.759
Fold #5
Fold score (RMSE): 9.176053794587032
Accuracy: 0.862
Final, out of sample score (RMSE): 10.295406283466676


  y = column_or_1d(y, warn=True)


In [172]:
##Random Forests
from sklearn.ensemble import RandomForestRegressor

# Shuffle
print("Random Forests")
np.random.seed(42)
df = df.reindex(np.random.permutation(df.index))
df.reset_index(inplace=True, drop=True)

# Split into train/test
x_train, x_test, y_train, y_test = train_test_split(df.iloc[:,1:], df.iloc[:,0:1], test_size=0.23, random_state=42) 


classifier = RandomForestRegressor(n_estimators=10)

classifier.fit(x_train, y_train)

# The mean squared error
pred = classifier.predict(x_test)
score = metrics.mean_squared_error(y_test, pred)
print("Mean squared error: {}".format(score))
# Evaluate success using accuracy
print("Accuracy: %.3f" % classifier.score(X=x_test,y=y_test))

print("\n\n")

#Caso 2
# Shuffle
np.random.seed(42)
df = df.reindex(np.random.permutation(df.index))
df.reset_index(inplace=True, drop=True)

print("Random Forests com cross-validation")


classifier = RandomForestRegressor(n_estimators=10)
kf = KFold(5)    
oos_y = []
oos_pred = []
fold = 0

x = df.as_matrix(columns=df.columns[1:])  
y = df.as_matrix(columns=['mpg'])

for train, test in kf.split(x):
    fold+=1
    print("Fold #{}".format(fold))
        
    x_train = x[train]
    y_train = y[train]
    x_test = x[test]
    y_test = y[test]
    
    #Normalization
    #scaler = preprocessing.StandardScaler().fit(x_train)
    #x_train_scaled = scaler.transform(x_train)
    #x_test_scaled = scaler.transform(x_test)
    
    classifier.fit(x_train, y_train)
    pred = classifier.predict(x_test)
    oos_y.append(y_test)
    oos_pred.append(pred)        

    # Measure accuracy
    score = (metrics.mean_squared_error(y_test,pred))
    print("Fold score (RMSE): {}".format(score))

    # Evaluate success using accuracy
    print("Accuracy: %.3f" % classifier.score(X=x_test,y=y_test))

# Build the oos prediction list and calculate the error.
oos_y = np.concatenate(oos_y)
oos_pred = np.concatenate(oos_pred)
score = (metrics.mean_squared_error(oos_y,oos_pred))
print("Final, out of sample score (RMSE): {}".format(score))    



Random Forests
Mean squared error: 10.030458695652172
Accuracy: 0.828



Random Forests com cross-validation
Fold #1
Fold score (RMSE): 10.418991249999998
Accuracy: 0.821
Fold #2
Fold score (RMSE): 11.093125000000004
Accuracy: 0.806
Fold #3
Fold score (RMSE): 5.741582500000002
Accuracy: 0.879
Fold #4
Fold score (RMSE): 6.169834177215187
Accuracy: 0.907
Fold #5
Fold score (RMSE): 10.966988607594937
Accuracy: 0.849
Final, out of sample score (RMSE): 8.879660552763818




In [175]:
##K NearestNeighbors
from sklearn.neighbors import KNeighborsRegressor

# Shuffle
print("KNeighborsRegressor")
np.random.seed(42)
df = df.reindex(np.random.permutation(df.index))
df.reset_index(inplace=True, drop=True)

# Split into train/test
x_train, x_test, y_train, y_test = train_test_split(df.iloc[:,1:], df.iloc[:,0:1], test_size=0.23, random_state=42) 

#Normalization
scaler = preprocessing.StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

classifier = KNeighborsRegressor()

classifier.fit(x_train_scaled, y_train)

# The mean squared error
pred = classifier.predict(x_test_scaled)
score = metrics.mean_squared_error(y_test, pred)
print("Mean squared error: {}".format(score))
# Evaluate success using accuracy
print("Accuracy: %.3f" % classifier.score(X=x_test_scaled,y=y_test))

print("\n")

#Caso 2
# Shuffle
np.random.seed(42)
df = df.reindex(np.random.permutation(df.index))
df.reset_index(inplace=True, drop=True)

print("KNeighborsRegressor com cross-validation")


classifier = KNeighborsRegressor()
kf = KFold(5)    
oos_y = []
oos_pred = []
fold = 0

x = df.as_matrix(columns=df.columns[1:])  
y = df.as_matrix(columns=['mpg'])

for train, test in kf.split(x):
    fold+=1
    print("Fold #{}".format(fold))
        
    x_train = x[train]
    y_train = y[train]
    x_test = x[test]
    y_test = y[test]
    
    #Normalization
    scaler = preprocessing.StandardScaler().fit(x_train)
    x_train_scaled = scaler.transform(x_train)
    x_test_scaled = scaler.transform(x_test)
    
    classifier.fit(x_train_scaled, y_train)
    pred = classifier.predict(x_test_scaled)
    oos_y.append(y_test)
    oos_pred.append(pred)        

    # Measure accuracy
    score = (metrics.mean_squared_error(y_test,pred))
    print("Fold score (RMSE): {}".format(score))

    # Evaluate success using accuracy
    print("Accuracy: %.3f" % classifier.score(X=x_test_scaled,y=y_test))

# Build the oos prediction list and calculate the error.
oos_y = np.concatenate(oos_y)
oos_pred = np.concatenate(oos_pred)
score = (metrics.mean_squared_error(oos_y,oos_pred))
print("Final, out of sample score (RMSE): {}".format(score))    



KNeighborsRegressor
Mean squared error: 5.634786956521739
Accuracy: 0.877


KNeighborsRegressor com cross-validation
Fold #1
Fold score (RMSE): 5.7734499999999995
Accuracy: 0.879
Fold #2
Fold score (RMSE): 5.840995
Accuracy: 0.892
Fold #3
Fold score (RMSE): 10.902890000000001
Accuracy: 0.829
Fold #4
Fold score (RMSE): 10.44900253164557
Accuracy: 0.843
Fold #5
Fold score (RMSE): 13.374724050632912
Accuracy: 0.809
Final, out of sample score (RMSE): 9.254927638190955


In [235]:
from sklearn.neural_network import MLPRegressor

# Shuffle
print("NN MLPRegressor")
np.random.seed(42)
df = df.reindex(np.random.permutation(df.index))
df.reset_index(inplace=True, drop=True)

# Split into train/test
x_train, x_test, y_train, y_test = train_test_split(df.iloc[:,1:], df.iloc[:,0:1], test_size=0.23, random_state=42) 

#Normalization
scaler = preprocessing.StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

classifier = MLPRegressor(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(15, 2), random_state=1)

classifier.fit(x_train_scaled, y_train)

# The mean squared error
pred = classifier.predict(x_test_scaled)
score = metrics.mean_squared_error(y_test, pred)
print("Mean squared error: {}".format(score))
# Evaluate success using accuracy
print("Accuracy: %.3f" % classifier.score(X=x_test_scaled,y=y_test))

print("\n")

#Caso 2
# Shuffle
np.random.seed(42)
df = df.reindex(np.random.permutation(df.index))
df.reset_index(inplace=True, drop=True)

print("MLPRegressor com cross-validation")

classifier = MLPRegressor(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(15, 2), random_state=1)
kf = KFold(5)    
oos_y = []
oos_pred = []
fold = 0

x = df.as_matrix(columns=df.columns[1:])  
y = df.as_matrix(columns=['mpg'])

for train, test in kf.split(x):
    fold+=1
    print("Fold #{}".format(fold))
        
    x_train = x[train]
    y_train = y[train]
    x_test = x[test]
    y_test = y[test]
    
    #Normalization
    scaler = preprocessing.StandardScaler().fit(x_train)
    x_train_scaled = scaler.transform(x_train)
    x_test_scaled = scaler.transform(x_test)
    
    classifier.fit(x_train_scaled, y_train)
    pred = classifier.predict(x_test_scaled)
    oos_y.append(y_test)
    oos_pred.append(pred)        

    # Measure accuracy
    score = (metrics.mean_squared_error(y_test,pred))
    print("Fold score (RMSE): {}".format(score))

    # Evaluate success using accuracy
    print("Accuracy: %.3f" % classifier.score(X=x_test_scaled,y=y_test))

# Build the oos prediction list and calculate the error.
oos_y = np.concatenate(oos_y)
oos_pred = np.concatenate(oos_pred)
score = (metrics.mean_squared_error(oos_y,oos_pred))
print("Final, out of sample score (RMSE): {}".format(score))    



NN MLPRegressor
Mean squared error: 9.473819442913978
Accuracy: 0.855


MLPRegressor com cross-validation
Fold #1
Fold score (RMSE): 10.35780302746871
Accuracy: 0.849
Fold #2


  y = column_or_1d(y, warn=True)


Fold score (RMSE): 8.70451665534507
Accuracy: 0.795
Fold #3
Fold score (RMSE): 4.486055023488167
Accuracy: 0.922
Fold #4
Fold score (RMSE): 8.41235148468656
Accuracy: 0.850
Fold #5
Fold score (RMSE): 8.85285327097766
Accuracy: 0.883
Final, out of sample score (RMSE): 8.160354653772938
