In [25]:
import numpy as np
import tensorflow as tf
from sklearn import preprocessing
from tensorflow.keras.callbacks import LearningRateScheduler
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Concatenate, Flatten, Input, Conv2D
from tensorflow.keras.activations import tanh
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import load_model
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

import time

In [2]:
import os
import dask.dataframe as dd


#Takes path of directory and finds train.csv, test.csv or direct path of singular .csv file
#and returns list of dask dataframe [singular_df_x, singular_df_y] or [train_df_x, train_df_y, test_df_x, test_df_y]
class DataframeExtractor_csv:
    def __init__(self, directory_path, label_names = []):
        self._directory_path = directory_path
        self._label_names = label_names
        self._df_list = []
        self.get_df_list()
        
    @property
    def directory_path(self):
        return self._directory_path
    
    @property
    def labels(self):
        return self._labels
    
    @property
    def df_list(self):
        return self._df_list
    
    #returns list of dask dataframes
    def get_df_list(self):
        
        #If csv file path has been entered
        if self._directory_path.endswith(".csv"):
            print(f"Reading single csv from {self._directory_path}")
            csv_df = dd.read_csv(self._directory_path, assume_missing = True, sample_rows=1000)
            self._df_list.append(csv_df.loc[:, ~csv_df.columns.isin(self._label_names)])
            self._df_list.append(csv_df[self._label_names])
            
        elif self._no_of_csv(self._directory_path) == 1:
            csv_dir = self._get_csv_path(self._directory_path)
            self._check_dir_exists(csv_dir)
            csv_df = dd.read_csv(csv_dir, assume_missing = True, sample_rows=1000)
            self._df_list.append(csv_df.loc[:, ~csv_df.columns.isin(self._label_names)])
            self._df_list.append(csv_df[self._label_names])
            
        #Finding train.csv and test.csv from directory
        else:            
            print(f"Reading train.csv and test.csv of directory {self._directory_path}")
            
            train_dir = os.path.join(self._directory_path, "train.csv")
            self._check_dir_exists(train_dir)
            csv_df = dd.read_csv(train_dir, assume_missing = True, sample_rows=1000)
            self._df_list.append(csv_df.loc[:, ~csv_df.columns.isin(self._label_names)])
            self._df_list.append(csv_df[self._label_names])
            
            
            test_dir = os.path.join(self._directory_path, "test.csv")
            self._check_dir_exists(test_dir)
            csv_df = dd.read_csv(test_dir, assume_missing = True, sample_rows=1000)
            self._df_list.append(csv_df.loc[:, ~csv_df.columns.isin(self._label_names)])
            self._df_list.append(csv_df[self._label_names])

        
        #If no datasets are found
        if not self._df_list:
            raise EmptyListError("No Datasets found")
            
        
    @staticmethod
    def _check_dir_exists(directory):
        if(not os.path.isfile(directory)):
            raise FileNotFoundError(f"Directory {directory} does not exist")
            
    @staticmethod
    def _no_of_csv(directory):
        i = 0
        filenames = os.listdir(directory)
        for filename in filenames:
            if filename.endswith(".csv"):
                i = i+1
        return i
    @staticmethod
    def _get_csv_path(directory):
        filenames = os.listdir(directory)
        for filename in filenames:
            if filename.endswith(".csv"):
                return os.path.join(directory,filename)

In [3]:
directory = "/home/anish/ASC-ML-EXP-DATASETS/LinReg-tabular/weather-history-dataset/weatherHistory.csv"
dataset_list = DataframeExtractor_csv(directory, label_names = ["Apparent Temperature (C)"]).df_list

Reading single csv from /home/anish/ASC-ML-EXP-DATASETS/LinReg-tabular/weather-history-dataset/weatherHistory.csv


In [4]:
dataset_list[0].head()

Unnamed: 0,Formatted Date,Summary,Precip Type,Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars),Daily Summary
0,2006-04-01 00:00:00.000 +0200,Partly Cloudy,rain,9.472222,0.89,14.1197,251.0,15.8263,0.0,1015.13,Partly cloudy throughout the day.
1,2006-04-01 01:00:00.000 +0200,Partly Cloudy,rain,9.355556,0.86,14.2646,259.0,15.8263,0.0,1015.63,Partly cloudy throughout the day.
2,2006-04-01 02:00:00.000 +0200,Mostly Cloudy,rain,9.377778,0.89,3.9284,204.0,14.9569,0.0,1015.94,Partly cloudy throughout the day.
3,2006-04-01 03:00:00.000 +0200,Partly Cloudy,rain,8.288889,0.83,14.1036,269.0,15.8263,0.0,1016.41,Partly cloudy throughout the day.
4,2006-04-01 04:00:00.000 +0200,Mostly Cloudy,rain,8.755556,0.83,11.0446,259.0,15.8263,0.0,1016.51,Partly cloudy throughout the day.


In [5]:
dataset_list[1].head()

Unnamed: 0,Apparent Temperature (C)
0,7.388889
1,7.227778
2,9.377778
3,5.944444
4,6.977778


In [6]:
train_x = dataset_list[0]
train_y = dataset_list[1]

In [7]:
train_x["Loud Cover"].value_counts(dropna = False).compute()

0.0    96453
Name: Loud Cover, dtype: int64

In [8]:
train_x["Precip Type"].value_counts(dropna = False).compute()

rain    85224
snow    10712
NaN       517
Name: Precip Type, dtype: int64

In [9]:
def func(row):   
    if row['Precip Type'] == 'rain':
        return 1  
    elif row['Precip Type'] == 'snow':
        return 2
    else:
        return 0

train_x['PrecipNo'] = train_x.apply(func, axis=1)

You did not provide metadata, so Dask is running your function on a small dataset to guess output types. It is possible that Dask will guess incorrectly.
To provide an explicit output types or to silence this message, please provide the `meta=` keyword, as described in the map or apply function that you are using.
  Before: .apply(func)
  After:  .apply(func, meta=(None, 'int64'))



In [10]:
train_x["PrecipNo"].value_counts(dropna = False).compute()

1    85224
2    10712
0      517
Name: PrecipNo, dtype: int64

In [11]:
train_x = train_x.drop(["Summary", "Precip Type", "Formatted Date", "Daily Summary", "Loud Cover"], axis=1).compute()

In [12]:
train_x['PrecipNo'] = train_x['PrecipNo'].map({0: 'No Precip', 1: 'Rain', 2: 'Snow'})
train_x = dd.get_dummies(train_x, columns=['PrecipNo'], prefix='', prefix_sep='')

In [13]:
train_x.head()

Unnamed: 0,Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Pressure (millibars),No Precip,Rain,Snow
0,9.472222,0.89,14.1197,251.0,15.8263,1015.13,0,1,0
1,9.355556,0.86,14.2646,259.0,15.8263,1015.63,0,1,0
2,9.377778,0.89,3.9284,204.0,14.9569,1015.94,0,1,0
3,8.288889,0.83,14.1036,269.0,15.8263,1016.41,0,1,0
4,8.755556,0.83,11.0446,259.0,15.8263,1016.51,0,1,0


In [14]:
input_dim = train_x.shape[1]
print(input_dim)

9


In [15]:
tf.keras.backend.clear_session()

In [16]:
def baseline_model():
    # create model
    adam_optimizer = Adam(lr = 1e-3)
    model = Sequential()
    model.add(Dense(9, input_dim=input_dim, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    # Compile model
    model.compile(loss='mean_squared_error', optimizer=adam_optimizer, 
                  metrics = [tf.keras.metrics.MeanAbsoluteError(), "acc"])
    return model

In [17]:
X = np.array(train_x.values)
Y = np.array(train_y.values)

In [21]:
estimator = KerasRegressor(build_fn=baseline_model, epochs=30, batch_size=64, verbose=0)
kfold = KFold(n_splits=5)
results = cross_val_score(estimator, X, Y, cv=kfold)

In [22]:
print("Baseline: %.2f (%.2f) MSE" % (results.mean(), results.std()))

Baseline: -0.39 (0.22) MSE


In [23]:
def baseline_model2():
    # create model
    adam_optimizer = Adam(lr = 1e-3)
    model = Sequential()
    model.add(Dense(9, input_dim=input_dim, kernel_initializer='normal', activation='relu'))
    model.add(Dense(32, kernel_initializer='normal'))
    model.add(Dense(32, kernel_initializer='normal'))
    model.add(Dense(1, kernel_initializer='normal'))
    # Compile model
    model.compile(loss='mean_squared_error', optimizer=adam_optimizer, 
                  metrics = [tf.keras.metrics.MeanAbsoluteError(), "acc"])
    return model

In [27]:
start = time.time()

estimator = KerasRegressor(build_fn=baseline_model2, epochs=30, batch_size=64, verbose=0)
kfold = KFold(n_splits=5)
results = cross_val_score(estimator, X, Y, cv=kfold)

end = time.time()
print ("Time elapsed:", end - start)
print("Baseline: %.2f (%.2f) MSE" % (results.mean(), results.std()))

Time elapsed: 166.60303807258606
Baseline: -0.40 (0.16) MSE


In [30]:
def baseline_model3():
    # create model
    adam_optimizer = Adam(lr = 1e-3)
    model = Sequential()
    model.add(Dense(9, input_dim=input_dim, kernel_initializer='normal', activation='relu'))
    model.add(Dense(32, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    # Compile model
    model.compile(loss='mean_absolute_error', optimizer=adam_optimizer, 
                  metrics = [tf.keras.metrics.MeanAbsoluteError(), "acc"])
    return model

In [31]:
start = time.time()

estimator = KerasRegressor(build_fn=baseline_model2, epochs=30, batch_size=64, verbose=0)
kfold = KFold(n_splits=5)
results = cross_val_score(estimator, X, Y, cv=kfold)

end = time.time()
print ("Time elapsed:", end - start)
print("Baseline: %.2f (%.2f) MSE" % (results.mean(), results.std()))

Time elapsed: 162.76261115074158
Baseline: -0.29 (0.09) MSE
