# IronHacks Submission Notebook

## Background
---

The method I am using for this submission is to average the prediction from a random forest model with the prediction from a moving average model (period equal to 3), in order to dampen outliers and improve the accuracy of the prediction.

## Setup
---

In [5]:
%%capture
%logstop
%logstart -t -r -q ipython_command_log.py global

#- IRONHACKS RESEARCH TRACKING CODE
#----------------------------------
# The following code is used to help our research team understand how you use
# our notebook environment. We do not collect any personal information with
# the following code, it is used to measure when and how often you work on
# your submission files.

import os
from datetime import datetime
import IPython.core.history as history

# Look up the most recent entry in the IPython history; the first two fields
# of the returned tuple are used as the session id and the command id.
ha = history.HistoryAccessor()
ha_tail = ha.get_tail(1)
ha_cmd = next(ha_tail)
session_id = str(ha_cmd[0])
command_id = str(ha_cmd[1])
timestamp = datetime.utcnow().isoformat()
history_line = ','.join([session_id, command_id, timestamp]) + '\n'
# Append one CSV row per execution; the context manager closes the file even
# if the write fails.
with open(os.environ['HOME']+'/ipython_session_log.csv', 'a') as logfile:
    logfile.write(history_line)

In [6]:
%%capture

#- INSTALL ADDITIONAL LIBRARIES IF REQUIRED
#------------------------------------------
# This is normally not required. The hub environment comes preinstalled with
# many packages that you can already use without setup. In case there is some
# other library you would like to use that isn't on the list you run this command
# once to install them.  If it is already installed this command has no effect.
!python3 -m pip install google.cloud
!python3 -m pip install numpy
!python3 -m pip install pyarrow
!python3 -m pip install pandas

### Imports

In [7]:
#- IMPORT THE LIBRARIES YOU WILL USE
#------------------------------------------
# You only need to import packages one time per notebook session. To keep your
# notebook clean and organized you can handle all imports at the top of your file.
# The following are included for example purposes, feel free to modify or delete
# anything in this section.

import csv
import pandas as pd
import numpy as np
from google.cloud import bigquery
from google.oauth2 import service_account
from google.cloud.bigquery import magics
magics.context.use_bqstorage_api = True
import pyarrow
from sklearn.linear_model import LogisticRegression
from matplotlib import pyplot as plt
import random
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
np.set_printoptions(suppress=True, linewidth=200, edgeitems=100)

# Pipeline and column transformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer

# Data transformers
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, label_binarize
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# Data splitter and model evaluator
from sklearn.model_selection import train_test_split, validation_curve, learning_curve, GridSearchCV

# Learning models
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble import BaggingClassifier, BaggingRegressor, GradientBoostingClassifier, GradientBoostingRegressor
#from xgboost import XGBClassifier, XGBRegressor  # Need to install

# Performance metrics
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

In [8]:
# CONFIGURE THE BIGQUERY SETTINGS

# Project id handed to the BigQuery client, and the (relative) path of the
# service-account key file used for authentication.
BIGQUERY_PROJECT = 'ironhacks-covid19-data'
BIGQUERY_KEYPATH = 'service-account.json'

# Expose the key file through GOOGLE_APPLICATION_CREDENTIALS before creating
# the client object that the query functions in this notebook use.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = BIGQUERY_KEYPATH
bigquery_client = bigquery.Client(project=BIGQUERY_PROJECT)

## Classes and Functions
---

In [9]:
#- DEFINE YOUR CLASSES AND FUNCTIONS 
#-----------------------------------
# This is not required, but is helpful in keeping your notebook organized. 
# You can use the following cell or several cells to define your functions
# and classes to keep them separate from your analysis or results code.
# In general it useful to define your methods in a separate cell from where
# it is run.

def example_function():
    """Print a fixed greeting to standard output (template placeholder)."""
    greeting = 'Hello World'
    print(greeting)

In [10]:
# This function is to get the data I need from the big query

def get_data():
    """Download the two competition tables from BigQuery.

    Uses the module-level ``bigquery_client``.

    Returns:
        A ``(wp, covid)`` tuple of DataFrames: ``wp`` holds the selected
        columns of ``weekly_patterns`` and ``covid`` holds ``week_number``
        and ``cases`` from ``covid19_cases``. Neither query has an ORDER BY
        clause, so no row order is requested from the server.
    """
    weekly_patterns_sql = """
    SELECT poi_id, top_category, week_number, raw_visit_counts, postal_code
    FROM ironhacks_covid19_competition.weekly_patterns
    """
    covid_cases_sql = """
    SELECT week_number, cases
    FROM ironhacks_covid19_competition.covid19_cases
    """

    # Each query is submitted and materialised before the next one starts.
    wp = bigquery_client.query(weekly_patterns_sql).to_dataframe()
    covid = bigquery_client.query(covid_cases_sql).to_dataframe()
    return wp, covid
    

In [11]:
# Shift the COVID case counts by one and two weeks to account for the delay in collecting the data and in people's reaction to the COVID cases

def add_covid_cases(covid, wp):
    """Attach one- and two-week lagged COVID case counts to the weekly patterns.

    Args:
        covid: DataFrame with ``week_number`` and ``cases`` columns. It is
            not modified.
        wp: DataFrame of weekly patterns with a ``week_number`` column.

    Returns:
        ``wp`` inner-merged on ``week_number`` with two extra columns,
        ``cases_shift1`` and ``cases_shift2``, holding the case counts of the
        previous row and of the row before that in week order. The raw
        ``cases`` column is not carried over.
    """
    # shift() is positional, so the rows must be in week order for the lags to
    # mean "previous week". The source query has no ORDER BY, so sort here; a
    # stable sort keeps the incoming order for equal week numbers.
    nc = covid.sort_values('week_number', kind='mergesort')
    nc['cases_shift1'] = nc['cases'].shift(1)
    nc['cases_shift2'] = nc['cases'].shift(2)
    nc = nc.drop(columns=['cases'])
    # The earliest weeks have no predecessor; every missing value in the frame
    # is filled with the placeholder 7.
    # NOTE(review): the origin of the value 7 is not evident here - confirm.
    nc = nc.fillna(7)
    return wp.merge(nc, on='week_number')

In [12]:
# Change the format of the table so that every category, poi_id, and postal_code value becomes its own column header
# Each of these indicator columns holds 0 or 1 depending on whether the row belongs to that category, POI, or postal code

def change_format(new):
    """One-hot encode the categorical columns of the combined table.

    Args:
        new: DataFrame containing ``top_category``, ``poi_id`` and
            ``postal_code`` columns. It is not modified.

    Returns:
        A new DataFrame with those three columns removed and one indicator
        column per distinct value appended, in the order category, POI,
        postal code.
    """
    encoded_columns = ['top_category', 'poi_id', 'postal_code']
    table = new
    for column in encoded_columns:
        # join() aligns on the row index and returns a new frame each time.
        table = table.join(pd.get_dummies(new[column]))
    return table.drop(columns=encoded_columns)

In [13]:
# Here I'm making my prediction table, which contains all the known information for week 44 except for raw_visit_counts

def prediction_data(new):
    """Build the week-44 feature table from the week-43 rows.

    Args:
        new: combined DataFrame with ``week_number``, ``top_category``,
            ``poi_id``, ``postal_code``, ``raw_visit_counts``,
            ``cases_shift1`` and ``cases_shift2`` columns. It is not modified.

    Returns:
        A DataFrame with one row per week-43 row: ``poi_id`` is kept,
        ``top_category``, ``raw_visit_counts`` and ``postal_code`` are
        removed, indicator columns for category, POI and postal code are
        appended, and the week number and lagged case counts are rewritten
        for week 44.
    """
    week_43 = new[new['week_number'] == 43]

    table2 = week_43
    for column in ('top_category', 'poi_id', 'postal_code'):
        table2 = table2.join(pd.get_dummies(week_43[column]))
    # poi_id stays in the table on purpose; only these three are removed.
    table2 = table2.drop(columns=['top_category', 'raw_visit_counts', 'postal_code'])

    # Roll the row forward one week. The case counts are substituted by value
    # with hard-coded numbers rather than recomputed from the data.
    replacements = (
        ('week_number', 43, 44),
        ('cases_shift1', 25872, 29641),
        ('cases_shift2', 23218, 25872),
    )
    for column, old_value, new_value in replacements:
        table2[column] = table2[column].replace(old_value, new_value)
    return table2

In [14]:
# Making a decision tree model based on the table we've given to it
# I set raw_visit_counts as the Y value and everything else is X
# Also split the data randomly into 70% for the training set and 30% for testing (test_size=0.3)
# Prints out the training score and testing score


def decision_tree(table):
    """Fit a decision tree regressor that predicts ``raw_visit_counts``.

    Args:
        table: DataFrame whose ``raw_visit_counts`` column is the target and
            whose remaining columns are the features.

    Returns:
        The fitted ``DecisionTreeRegressor``. The model's score on the
        training split and on the held-out split is printed.
    """
    target = table['raw_visit_counts']
    features = table.drop(columns=['raw_visit_counts'])

    # 70% of the rows train the model, 30% are held out; the seed is fixed so
    # the split is reproducible.
    features_train, features_test, target_train, target_test = train_test_split(
        features, target, test_size=0.3, random_state=0
    )

    regressor = DecisionTreeRegressor(random_state=0)
    regressor.fit(features_train, target_train)

    train_score = regressor.score(features_train, target_train)
    print('Decision Tree Training score:', train_score)
    test_score = regressor.score(features_test, target_test)
    print('Decision Tree Test score:    ', test_score, '\n')
    return regressor

In [15]:
# same as decision_tree model but used random_forest model
def random_forest(table):
    """Fit a random forest regressor that predicts ``raw_visit_counts``.

    Args:
        table: DataFrame whose ``raw_visit_counts`` column is the target and
            whose remaining columns are the features.

    Returns:
        The fitted ``RandomForestRegressor``. The model's score on the
        training split and on the held-out split is printed.
    """
    target = table['raw_visit_counts']
    features = table.drop(columns=['raw_visit_counts'])

    # 70% of the rows train the model, 30% are held out; the seed is fixed so
    # the split is reproducible.
    features_train, features_test, target_train, target_test = train_test_split(
        features, target, test_size=0.3, random_state=0
    )

    forest = RandomForestRegressor(random_state=0)
    forest.fit(features_train, target_train)

    train_score = forest.score(features_train, target_train)
    print('Random Forest Training score:', train_score)
    test_score = forest.score(features_test, target_test)
    print('Random Forest Test score:    ', test_score, '\n')
    return forest

In [16]:
# Predicts the week 44's raw_visit_counts given the trained model and the week 44 known data

def presict_week_44(table2, decision_tree_model):
    """Predict ``raw_visit_counts`` for every row of the week-44 table.

    Args:
        table2: feature table that still carries a ``poi_id`` column. It is
            left untouched, so the same table can be passed to several models.
        decision_tree_model: any fitted model exposing ``predict(X)``.

    Returns:
        A DataFrame with columns ``poi_id`` and ``raw_visit_counts`` that
        shares the row index of ``table2``.
    """
    poi_id = table2['poi_id']
    # Work on a copy without the identifier column instead of deleting it from
    # the caller's frame; deleting it in place broke any second call.
    # NOTE(review): the remaining columns presumably have to match the columns
    # the model was trained on - confirm against the training table.
    X_predict = table2.drop(columns=['poi_id'])
    raw_visit_counts = decision_tree_model.predict(X_predict)

    return pd.DataFrame({'poi_id': poi_id, 'raw_visit_counts': raw_visit_counts})

In [17]:
 # Moving Average model with period=3

def moving_avg(weekly_pattern):
    """Forecast week-44 visits per POI as the mean of its last three observations.

    For every distinct ``poi_id`` the rows are ordered by ``week_number`` and
    the three most recent ``raw_visit_counts`` values are averaged.

    Changes compared with the earlier loop-based version:
      * ``DataFrame.append`` (removed in pandas 2.0) is no longer used.
      * The per-POI merge loop is replaced by a single group-by pass.
      * A POI with exactly three observations now gets a forecast; the old
        guard required four even though only three values are averaged.
      * An input without any POI returns an empty frame instead of raising.

    Args:
        weekly_pattern: DataFrame with ``poi_id``, ``week_number`` and
            ``raw_visit_counts`` columns. It is not modified.

    Returns:
        A DataFrame with columns ``poi_id`` and ``MA3``, one row per distinct
        POI in order of first appearance. ``MA3`` is NaN when a POI has fewer
        than three observations or when one of the three values is missing.
        Row labels continue after the input's row count, as if the forecast
        rows had been appended to the input.
    """
    window = 3  # number of trailing observations averaged ("MA3")

    all_poi = weekly_pattern['poi_id'].unique()
    if len(all_poi) == 0:
        return pd.DataFrame(columns=['poi_id', 'MA3'])

    history = weekly_pattern[['poi_id', 'week_number', 'raw_visit_counts']]
    # Stable sort: rows sharing a week number keep their incoming order.
    history = history.sort_values(by='week_number', kind='mergesort')
    recent = history.groupby('poi_id').tail(window)

    # min_count makes the sum NaN unless all `window` values are present, so
    # short histories and missing counts both produce NaN rather than a
    # partial average.
    ma3 = recent.groupby('poi_id')['raw_visit_counts'].sum(min_count=window) / window

    first_label = len(weekly_pattern)
    result = pd.DataFrame(
        {'poi_id': all_poi},
        index=pd.RangeIndex(first_label, first_label + len(all_poi)),
    )
    result['MA3'] = result['poi_id'].map(ma3)
    return result


## Analysis
---

In [18]:
# Run the functions and store the predicted result in df
# Download the raw tables and attach the lagged case counts to the patterns.
wp,covid = get_data()
combined = add_covid_cases(covid, wp)
# Build the one-hot training table and the week-44 prediction table from the
# same combined frame.
table = change_format(combined)
table2 = prediction_data(combined)

  "Cannot create BigQuery Storage client, the dependency "


In [19]:
# Random forest model: fit on the training table, then predict week 44 from
# the prediction table.
model = random_forest(table)
rand_forest = presict_week_44(table2, model)

MemoryError: Unable to allocate 532. MiB for an array with shape (1924, 36211) and data type float64

In [None]:
# Moving average model: computed from the raw weekly patterns, independent of
# the random forest cell.
MA3 = moving_avg(wp)

In [1]:
# Combine both models and store the result in df.
# The merge yields the columns poi_id, raw_visit_counts (the random forest
# prediction) and MA3 (the moving average). They are referenced by their real
# names; the earlier positional relabelling swapped the two model labels.
df = rand_forest.merge(MA3, on = 'poi_id')
df['avg'] = (df['raw_visit_counts'] + df['MA3'])/2
# Keep only the identifier and the averaged prediction.
del df['raw_visit_counts']
del df['MA3']

NameError: name 'rand_forest' is not defined

## Results
---

Storing the result into the csv file

In [15]:
## This can also be a good place for you to cleanup any input/output and export your results to a file.

# Write the combined predictions to CSV; index=False leaves the DataFrame
# index out of the file.
df.to_csv('submission_prediction_output.csv',index=False)