In [54]:
import pandas as pd
import mlflow

## NOTE: Uncomment to use the public tracking server.  Do not use it for data you cannot afford to lose. See note in assignment text. If you leave this line as a comment, mlflow will save the runs to your local filesystem.
# mlflow.set_tracking_uri("http://training.itu.dk:5000/")

# TODO: Set the experiment name
mlflow.set_experiment("<ITU Username> - <Descriptive experiment name>")

# Import some of the sklearn modules you are likely to use.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt

# Start a run
# TODO: Set a descriptive name. This is optional, but makes it easier to keep track of your runs.
with mlflow.start_run(run_name="andbe@itu.dk"):
    # TODO: Insert path to dataset
    df = pd.read_json("dataset.json", orient="split")

    # TODO: Handle missing data
    df = pd.DataFrame(df)
    df = df.dropna()

    #drop columns that are not needed
    df = df.drop(columns=['ANM'])
    df = df.drop(columns=['Non-ANM'])

    print(df.head())

    preprocessing = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), ['Direction']),
    ('num', SimpleImputer(strategy='mean'),["Speed"]),
    ('scaler', StandardScaler(with_mean=False),["Speed"]),
    ])

    pipeline = Pipeline([
        # TODO: You can start with your pipeline from assignment 1
    ('preprocessing', preprocessing),
    ('clf', LinearRegression()),
    ])

    pipeline.fit(df[["Speed","Direction"]],df["Total"])
    
    print(df)


    # TODO: Currently the only metric is MAE. You should add more. What other metrics could you use? Why?
    metrics = [
        ("MAE", mean_absolute_error, []),
        ("MSE", mean_squared_error, []),
        ("R2", r2_score, []),
    ]

    X = df[["Speed","Direction"]]
    y = df["Total"]

    number_of_splits = 5

    #TODO: Log your parameters. What parameters are important to log?
    #HINT: You can get access to the transformers in your pipeline using `pipeline.steps`
    mlflow.log_params({key: val for key, val 
                       in pipeline.get_params().items() if key!='steps'})
    
    for train, test in TimeSeriesSplit(number_of_splits).split(X,y):
        pipeline.fit(X.iloc[train],y.iloc[train])
        predictions = pipeline.predict(X.iloc[test])
        truth = y.iloc[test]

        from matplotlib import pyplot as plt 
        #plt.plot(truth.index, truth.values, label="Truth")
        #plt.plot(truth.index, predictions, label="Predictions")
        #plt.show()
        
        # Calculate and save the metrics for this fold
        for name, func, scores in metrics:
            score = func(truth, predictions)
            scores.append(score)
    
    # Log a summary of the metrics
    for name, _, scores in metrics:
            # NOTE: Here we just log the mean of the scores. 
            # Are there other summarizations that could be interesting?
            mean_score = sum(scores)/number_of_splits
            mlflow.log_metric(f"mean_{name}", mean_score)

                         Total Direction  Lead_hours         Source_time  \
2020-10-09 15:00:00   9.720000        NW         1.0 2020-10-09 12:00:00   
2020-10-09 18:00:00  14.232999        NW         1.0 2020-10-09 15:00:00   
2020-10-09 21:00:00   6.003000       WNW         1.0 2020-10-09 18:00:00   
2020-10-10 00:00:00   3.984000       WNW         1.0 2020-10-09 21:00:00   
2020-10-10 03:00:00   1.618000       NNW         1.0 2020-10-10 00:00:00   

                       Speed  
2020-10-09 15:00:00  5.81152  
2020-10-09 18:00:00  5.81152  
2020-10-09 21:00:00  4.91744  
2020-10-10 00:00:00  5.81152  
2020-10-10 03:00:00  3.12928  
                         Total Direction  Lead_hours         Source_time  \
2020-10-09 15:00:00   9.720000        NW         1.0 2020-10-09 12:00:00   
2020-10-09 18:00:00  14.232999        NW         1.0 2020-10-09 15:00:00   
2020-10-09 21:00:00   6.003000       WNW         1.0 2020-10-09 18:00:00   
2020-10-10 00:00:00   3.984000       WNW         1.0 

In [60]:
import pandas as pd
x = pd.read_json("dataset.json", orient="split")
x.head()


Unnamed: 0,ANM,Non-ANM,Total,Direction,Lead_hours,Source_time,Speed
2020-10-09 12:31:00,2.355,5.0,7.355,,,NaT,
2020-10-09 12:32:00,2.475,4.043,6.518,,,NaT,
2020-10-09 12:33:00,2.633,4.514,7.147,,,NaT,
2020-10-09 12:34:00,3.014,4.235,7.249,,,NaT,
2020-10-09 12:35:00,3.18,3.601,6.781,,,NaT,


In [49]:
parameters = [
    {
        'clf': (LinearRegression(),)
    }, {
        'clf': (RandomForestRegressor(),),
        'clf__n_estimators': (10, 30),
    }, {
        'clf': (DecisionTreeRegressor(),),
        'clf__max_depth': (5, 10, 15),
    },
]

grid = GridSearchCV(pipeline, parameters)

grid.fit(X, y)

best_fit = grid.best_params_
best_fit_score = grid.best_score_

#Calculate the MSE

y_pred = grid.predict(X)
mse = mean_squared_error(y, y_pred)
print("MSE: ", mse)

MSE:  29.853944119482566


In [50]:
import os
import mlflow
    # Setting the MLflow tracking server
mlflow.set_tracking_uri('http://training.itu.dk:5000/')
    # Setting the requried environment variables
os.environ['MLFLOW_S3_ENDPOINT_UR'] = 'http://130.226.140.28:5000'
os.environ['AWS_ACCESS_KEY_ID'] = 'training-bucket-access-key'
os.environ['AWS_SECRET_ACCESS_KEY'] = 'tqvdSsEDnBWTDuGkZYVsRKnTeu'

In [51]:
    
    preprocessing = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), ['Direction']),
    ('num', SimpleImputer(strategy='mean'),["Speed"]),
    ('scaler', StandardScaler(with_mean=False),["Speed"]),
    ])

    #print(preprocessing.fit_transform(df))
    

In [2]:
import pandas as pd
df = pd.read_json("dataset.json", orient="split")

In [5]:
df.dropna()
df.size

1784531