In [None]:
import numpy as np
import pandas as pd
from sklearn.svm import SVR
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [None]:
# Read the raw per-state records from the CSV file into a DataFrame.
data = pd.read_csv('states.csv')
# Keep direct handles on the 'date' and 'total' columns of the full dataset.
date, total = data['date'], data['total']

In [None]:
#Encoding the categorical data (the state)
# Each distinct state name gets an integer code in order of first appearance.
#   state_names : distinct names, in first-seen order
#   state_table : [name, code] pairs, in the same order
#   new_state   : the code of every row of data['state'], row by row
state_names = []
state_table = []
state_codes = {}  # name -> code; gives constant-time lookups instead of list scans

for name in data['state']:
    if name not in state_codes:
        code = len(state_names)  # next unused code
        state_codes[name] = code
        state_names.append(name)
        state_table.append([name, code])

new_state = [state_codes[name] for name in data['state']]

# 'date' plus the numeric state code, replacing the original text column.
state = data[['date']].assign(state=new_state)

In [None]:
#Functions for transformation
def state_scale(state, inverse):
    """Scale state codes by the largest code in the global ``new_state`` list.

    Parameters
    ----------
    state : iterable of numbers
        Codes to divide by the maximum (forward) or scaled values to
        multiply by it (inverse).
    inverse : bool
        Truthy -> multiply by the maximum; falsy -> divide by it.
        Any value is accepted; the flag is evaluated by truthiness so a
        non-bool flag can no longer leave the result undefined.

    Returns
    -------
    list
        One scaled value per input element.
    """
    max_code = max(new_state)
    if inverse:
        return [value * max_code for value in state]
    if max_code == 0:
        # Only code 0 exists, so every code scales to 0; avoids dividing by zero.
        return [0.0 for _ in state]
    return [value / max_code for value in state]

def y_scale(y, Max, inverse):
    """Scale target values by a caller-supplied maximum.

    Parameters
    ----------
    y : iterable of numbers
        Values to scale.
    Max : number
        Scaling constant.
    inverse : bool
        Truthy -> multiply every value by ``Max``; falsy -> divide by it.
        The flag is evaluated by truthiness, so a non-bool flag can no
        longer leave the result undefined.

    Returns
    -------
    list
        One scaled value per input element. In the forward direction a
        value equal to 0 is emitted as 0 without dividing.
    """
    if inverse:
        return [value * Max for value in y]
    # Zeros bypass the division, so an all-zero series with Max == 0 still works.
    return [0 if value == 0 else value / Max for value in y]

In [None]:
#Completing the missing values (NaN)
# 'state' and 'date' are set aside and not passed to the imputer
# (presumably because they are identifiers rather than measurements - confirm).
datad = data.drop(columns=['state','date'])
# Every NaN is replaced by the median of its column.
# np.nan is used because the np.NaN alias no longer exists in NumPy 2.0.
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
transformed_values = imputer.fit_transform(datad.values)
# Rebuild a DataFrame with the original column names and row index so the
# 'state'/'date' columns added below line up row by row.
result = pd.DataFrame(transformed_values, columns=datad.columns, index=datad.index)
result['state'] = data['state']
result['date'] = data['date']
result

In [None]:
#This is for one day in the future.
def one_day_forecast(state, result, window_size=1):
    """Fit an SVR on one state's history and print a forecast of 'total' for the next day.

    Parameters
    ----------
    state : str
        State name; must be present in the global ``state_names`` list.
    result : pandas.DataFrame
        Table containing at least the 'state', 'date' and 'total' columns.
        Every column except 'state' (so including 'date' and 'total') is
        standardized and used as a feature.
    window_size : int, default 1
        Number of consecutive days concatenated into one training sample.
        The target of a sample is the 'total' of the day right after its window.

    Returns
    -------
    None
        The forecast is printed. An unknown state prints a message instead.

    Raises
    ------
    ValueError
        If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError("window_size must be at least 1")
    if state not in state_names:
        print("State name is not valid")
        return

    # All records of the requested state, in chronological order.
    final = (
        result.loc[result['state'] == state]
        .drop(columns=['state'])
        .sort_values(by=['date'])
        .reset_index(drop=True)
    )
    total = final['total']
    max_total = max(total)

    cval = StandardScaler().fit_transform(final)

    # Rolling window: sample j holds rows j .. j+window_size-1 flattened into one
    # vector, and its target is the 'total' of row j+window_size.
    n_samples = len(cval) - window_size
    window = [cval[j:j + window_size].ravel() for j in range(n_samples)]
    target = y_scale(total.iloc[window_size:], max_total, False)

    last_date = final['date'].max()
    # The most recent window_size days are the input for the day after the data ends.
    one_day_predict = cval[-window_size:].reshape(1, -1)

    # C and epsilon are the SVR hyperparameters searched over.
    parameters = {'C':[0.001,0.01,0.1,0.5,1.5,2,2.5,3,3.5],'epsilon':[0.1,0.01,0.001,0.0001,0.00001]}
    clf_gs = GridSearchCV(SVR(), parameters)
    clf_gs.fit(window, target)
    # With the default refit=True the search has already refit the best
    # configuration on all samples, so no second manual fit is needed.
    clf = clf_gs.best_estimator_

    svr_pred = clf.predict(one_day_predict)

    print(f'State: {state}')
    # NOTE(review): assumes 'date' is numeric and that +1 yields the next day - confirm.
    print(f'Day: {last_date+1}')
    print(f'Total = {y_scale(svr_pred, max_total, True)}')

# Print the next-day forecast for the state named 'AK', using the imputed table.
one_day_forecast('AK',result)

In [None]:
#This is for last know day. I am doing this in order to calculate the accuracy.
def last_day(state, result, window_size=1):
    """Backtest the one-day SVR forecast on the last known day of one state.

    The last day is held out completely: the scaler, the target scaling
    constant and the training samples are all built from the earlier days
    only. The model then predicts the held-out day's 'total' from the
    window that precedes it. Keeping that day out of training is what
    makes the comparison a measure of accuracy rather than of memorization.

    Parameters
    ----------
    state : str
        State name; must be present in the global ``state_names`` list.
    result : pandas.DataFrame
        Table containing at least the 'state', 'date' and 'total' columns.
        Every column except 'state' is standardized and used as a feature.
    window_size : int, default 1
        Number of consecutive days concatenated into one training sample.

    Returns
    -------
    tuple or None
        ``(actual, predicted)`` where ``actual`` is the 'total' of the last
        day and ``predicted`` is a one-element list with the model's
        estimate for that day. ``None`` (after printing a message) when the
        state is unknown.

    Raises
    ------
    ValueError
        If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError("window_size must be at least 1")
    if state not in state_names:
        print("State name is not valid")
        return None

    # All records of the requested state, in chronological order.
    final = (
        result.loc[result['state'] == state]
        .drop(columns=['state'])
        .sort_values(by=['date'])
        .reset_index(drop=True)
    )
    actual_total = final['total'].values[-1]

    # Everything the model may see: all days before the one being evaluated.
    history = final.iloc[:-1]
    total = history['total']
    max_total = max(total)

    cval = StandardScaler().fit_transform(history)

    # Rolling window: sample j holds rows j .. j+window_size-1 flattened into one
    # vector, and its target is the 'total' of row j+window_size.
    n_samples = len(cval) - window_size
    window = [cval[j:j + window_size].ravel() for j in range(n_samples)]
    target = y_scale(total.iloc[window_size:], max_total, False)

    # The window right before the held-out day is the input for its prediction.
    held_out_input = cval[-window_size:].reshape(1, -1)

    # C and epsilon are the SVR hyperparameters searched over.
    # NOTE(review): this C grid differs from the one used in one_day_forecast - confirm intended.
    parameters = {'C':[0.001,1,10,50,5,100,25,75,150,200,250,230],'epsilon':[0.1,0.01,0.001,0.0001,0.00001]}
    clf_gs = GridSearchCV(SVR(), parameters)
    clf_gs.fit(window, target)
    # With the default refit=True the search has already refit the best
    # configuration on all training samples, so no second manual fit is needed.
    clf = clf_gs.best_estimator_

    svr_pred = clf.predict(held_out_input)

    return actual_total, y_scale(svr_pred, max_total, True)


# Ratio of predicted to actual 'total' on the last known day, one entry per state.
arr = []
# The loop variable is deliberately not called `state`, which is the name of the
# encoded DataFrame built earlier in the notebook.
for state_name in state_names:
    actual, predicted = last_day(state_name, result)
    if actual == 0:
        # No meaningful ratio when the actual value is zero; recorded as 0.
        arr.append(0)
    else:
        # last_day returns the prediction as a one-element list; store the scalar
        # so the final average is a plain number rather than a 1-element array.
        arr.append(predicted[0] / actual)

In [None]:
# Average of the per-state ratios, reported as a percentage.
score_percent = float(sum(arr) / len(arr)) * 100
print(f'Score: {score_percent} %')