In [18]:
# Library Imports.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Allows plots to appear directly in the notebook.
%matplotlib inline

#from patsy import dmatrices
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score 
from sklearn.preprocessing import PolynomialFeatures

# Import pickle to serialise models
import pickle

In [19]:
# Read the training CSV (one row per station_number / hour / weekend flag,
# carrying the available_bikes value) and preview its first five rows.
df_train = pd.read_csv(filepath_or_buffer="mean-bikes-available-train.csv")
df_train.head(n=5)

Unnamed: 0,station_number,hour,weekend,available_bikes
0,2,0,False,11.883333
1,2,0,True,10.847458
2,2,1,False,12.378151
3,2,1,True,11.0
4,2,2,False,12.386555


In [20]:
# Read the test CSV (same columns as the training file) and preview its
# first five rows.
df_test = pd.read_csv(filepath_or_buffer="mean-bikes-available-test.csv")
df_test.head(n=5)

Unnamed: 0,station_number,hour,weekend,available_bikes
0,2,0,False,7.233333
1,2,0,True,5.0
2,2,1,False,7.4
3,2,1,True,6.0
4,2,2,False,7.4


In [21]:
# Collect every distinct station number present in the training data,
# in the order in which each one first appears.
unique_station_numbers = df_train.station_number.unique()
unique_station_numbers

array([  2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  15,
        16,  17,  18,  19,  21,  22,  23,  24,  25,  26,  27,  28,  29,
        30,  31,  32,  33,  34,  36,  37,  38,  39,  40,  41,  42,  43,
        44,  45,  47,  48,  49,  50,  51,  52,  53,  54,  55,  56,  57,
        58,  59,  60,  61,  62,  63,  64,  65,  66,  67,  68,  69,  71,
        72,  73,  74,  75,  76,  77,  78,  79,  80,  82,  83,  84,  85,
        86,  87,  88,  89,  90,  91,  92,  93,  94,  95,  96,  97,  98,
        99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
       112, 113, 114, 115, 116, 117], dtype=int64)

In [22]:
# Split the training data into one dataframe per station.
# A single groupby pass replaces re-filtering the whole frame once per station.
# sort=False keeps the stations in order of first appearance (the same order as
# unique_station_numbers), and every sub-frame keeps its original row labels.
df_train_list = [
    station_df
    for _, station_df in df_train.groupby('station_number', sort=False)
]
df_train_list

[    station_number  hour  weekend  available_bikes
 0                2     0    False        11.883333
 1                2     0     True        10.847458
 2                2     1    False        12.378151
 3                2     1     True        11.000000
 4                2     2    False        12.386555
 5                2     2     True        11.000000
 6                2     3    False        12.420168
 7                2     3     True        11.067797
 8                2     4    False        12.336134
 9                2     4     True        11.000000
 10               2     5    False        12.341667
 11               2     5     True        11.000000
 12               2     6    False        11.166667
 13               2     6     True        10.916667
 14               2     7    False         7.200000
 15               2     7     True        10.216667
 16               2     8    False         4.960000
 17               2     8     True         9.400000
 18         

In [23]:
def split_for_weekday_weekend(dataframe):
    '''Partitions a dataframe by the value of its 'weekend' column.

    Returns a 2-tuple (weekday_rows, weekend_rows): index 0 holds the rows whose
    'weekend' value equals False, index 1 the rows whose value equals True.'''

    is_weekday = dataframe['weekend'] == False
    is_weekend = dataframe['weekend'] == True

    weekday_rows = dataframe[is_weekday]
    weekend_rows = dataframe[is_weekend]
    return weekday_rows, weekend_rows

In [24]:
def create_model(dataframe):
    '''Fits a degree-2 polynomial regression of 'available_bikes' on 'hour'.

    The 'hour' column is expanded with PolynomialFeatures(degree=2) and a
    LinearRegression is fitted on the expanded features.
    Returns the fitted LinearRegression object only; the PolynomialFeatures
    transformer used for the expansion is not returned.'''

    hours = dataframe[['hour']]
    bikes_available = dataframe.available_bikes

    # Expand the single 'hour' feature into its degree-2 polynomial terms.
    hour_poly_terms = PolynomialFeatures(degree=2).fit_transform(hours)

    return LinearRegression().fit(hour_poly_terms, bikes_available)

In [30]:
def create_model_pickle(model, station_number, weekday_bool):
    '''Serialises a model to a pickle file in the current working directory.

    The file name is "mean-bikes-pickle" followed by the station number and then
    "-weekday" when weekday_bool is truthy, otherwise "-weekend".
    No return value.'''

    suffix_by_day_type = {True: "-weekday", False: "-weekend"}
    pickle_path = (
        "mean-bikes-pickle"
        + str(station_number)
        + suffix_by_day_type[bool(weekday_bool)]
    )

    with open(pickle_path, 'wb') as pickle_file:
        pickle.dump(model, pickle_file)

In [31]:
def pickle_generator(dataframe, index=None):
    '''Fits and pickles the weekday and weekend models for a single station.

    dataframe: rows for one station. The 'station_number', 'hour', 'weekend'
        and 'available_bikes' columns are used.
    index: row label at which the station number is read from the
        'station_number' column. Optional; when omitted, the station number is
        taken from the first row of the dataframe, so callers do not need to
        know the frame's row labels.

    No return value, but writes one weekday and one weekend pickle file via
    create_model_pickle.'''

    station_numbers = dataframe['station_number']
    if index is None:
        # Positional lookup: works whatever row labels the frame carries.
        station_number = station_numbers.iloc[0]
    else:
        station_number = station_numbers[index]

    # Split once and unpack, rather than computing the split twice.
    weekday_model_df, weekend_model_df = split_for_weekday_weekend(dataframe)

    # Fit both models before writing anything, so a failed fit leaves no
    # partial output behind.
    weekday_model = create_model(weekday_model_df)
    weekend_model = create_model(weekend_model_df)

    create_model_pickle(weekday_model, station_number, True)
    create_model_pickle(weekend_model, station_number, False)

In [32]:
# Sanity check: for the i-th per-station frame, print the station number stored
# at row label 48*i.
for i in range(len(df_train_list)):
    print(df_train_list[i].loc[48 * i, 'station_number'])

2
3
4
5
6
7
8
9
10
11
12
13
15
16
17
18
19
21
22
23
24
25
26
27
28
29
30
31
32
33
34
36
37
38
39
40
41
42
43
44
45
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
71
72
73
74
75
76
77
78
79
80
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117


In [33]:
# Select the first per-station frame (position 0 of df_train_list) and display it.
dataframe = df_train_list[0]
dataframe

Unnamed: 0,station_number,hour,weekend,available_bikes
0,2,0,False,11.883333
1,2,0,True,10.847458
2,2,1,False,12.378151
3,2,1,True,11.0
4,2,2,False,12.386555
5,2,2,True,11.0
6,2,3,False,12.420168
7,2,3,True,11.067797
8,2,4,False,12.336134
9,2,4,True,11.0


In [34]:
# Fit and pickle the weekday and weekend models for the frame selected above.
# 0 is the row label at which pickle_generator reads the station number.
pickle_generator(dataframe, 0)

In [36]:
# Fit and pickle the weekday and weekend models for every remaining station
# (position 0 of df_train_list was handled in the previous cell).
for i in range(1, len(df_train_list)):
    dataframe = df_train_list[i]
    # Pass the frame's actual first row label instead of assuming that every
    # earlier station contributed exactly 48 rows.
    pickle_generator(dataframe, dataframe.index[0])