## Intro to Business Analytics Assignment 1

### Contributors
- Stanisław Howard
- Alexis Van den Heede, s231860
- Matthias Van Mechelen
- Sven Palac, s231799


In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [26]:
# Read the complete trips CSV into a single DataFrame.
# NOTE(review): the output recorded under this cell is a MemoryError (about 803 MiB requested for a
# float64 block of shape (6, 17548339)); passing `usecols=` / `dtype=` to read_csv may be needed —
# TODO confirm which columns the later cells actually require.
df = pd.read_csv('data/Trips_2018.csv')

MemoryError: Unable to allocate 803. MiB for an array with shape (6, 17548339) and data type float64

In [None]:
# Keep an untouched copy of the freshly loaded frame so later cells can restore it without re-reading the CSV.
# NOTE(review): this holds a second full copy of the data in memory.
df_copy = df.copy()
df.head()

In [None]:
# Re-run this cell to reset df to the state right after loading (taken from df_copy),
# e.g. after a typo in one of the cleaning cells below, instead of reading the CSV again.
df = df_copy.copy()

In [None]:
# The first CSV column has no header and shows up as 'Unnamed: 0';
# name it trip_id and use it as the row index.
df = df.rename(columns={'Unnamed: 0': 'trip_id'}).set_index('trip_id')
df.head()

In [None]:
# Parse both trip timestamps (start and stop) into datetime64 columns using one explicit format.
for time_col in ('starttime', 'stoptime'):
    df[time_col] = pd.to_datetime(df[time_col], format="%Y-%m-%d %H:%M:%S.%f")
df.head()

In [None]:
# One-hot encode 'usertype' as integer dummy columns; drop_first=True drops the first category,
# so a two-valued usertype becomes a single 0/1 column. Then report the resulting shape.
df = pd.get_dummies(df, columns=['usertype'], dtype=int, drop_first=True)
print(df.shape)
df.head()


In [None]:
# Helper for plotting station coordinates.
def plot_map(coords):
    """Scatter-plot station positions.

    coords: 2-D array whose column 0 is plotted on the x axis (longitude)
    and column 1 on the y axis (latitude). The figure is shown immediately.
    """
    longitudes, latitudes = coords[:, 0], coords[:, 1]
    plt.scatter(longitudes, latitudes, s=0.75)
    plt.xlabel('Longitude')
    plt.ylabel('Latitude')
    plt.show()

In [None]:
# Distinct (longitude, latitude) pairs of all start stations, one row per unique pair.
coords = np.unique(
    df[['start_station_longitude', 'start_station_latitude']].values,
    axis=0,
)



In [None]:
plot_map(coords)


Notice the outlier (Montreal, Canada). We quickly remove it from the plotted coordinates to get an overview of the rest of the data. We do not yet delete the outlier data point from the dataframe.

In [None]:
# Drop the outlier from the plotting array only (df is untouched here):
# keep the coordinate pairs whose longitude (column 0) is below -73.6.
coords = coords[coords[:,0] < -73.6]
print(coords.shape) # per the authors: 917 at this point, still including the gridded anomalies

In [None]:
plot_map(coords)
# shows the unique start and end stations present in the dataset.

Notice the very structured grid in the top right corner. After investigation, these data points turned out to be NaNs. We will get rid of these data points and, at the same time, remove the Canada outlier.

In [None]:
# Show the number of missing values per column, then drop every trip whose start or end
# station id is missing (per the notes above, these are the gridded anomalies on the map).
print(df.isnull().sum())
has_station_ids = ~(np.isnan(df['start_station_id']) | np.isnan(df['end_station_id']))
df = df[has_station_ids]
# Get rid of the Canada outlier: keep trips whose start AND end longitude are below -73.6.
df = df[(df['start_station_longitude'] < -73.6) & (df['end_station_longitude'] < -73.6)]
# Distinct (longitude, latitude) pairs of the remaining start stations, plus a backup copy.
coords_start = np.unique(
    df[['start_station_longitude', 'start_station_latitude']].values,
    axis=0,
)
coords_start_copy = coords_start.copy()
# Report the array computed in this cell (the original printed the stale `coords` array).
print(coords_start.shape)

In [None]:
plot_map(coords_start)

In [None]:
# Distinct (longitude, latitude) pairs of all end stations, plus a backup copy.
# Printing both shapes shows that the number of distinct end stations differs from
# the number of distinct start stations.
coords_end = np.unique(
    df[['end_station_longitude', 'end_station_latitude']].values,
    axis=0,
)
coords_end_copy = coords_end.copy()
print(coords_end.shape)
print(coords_start.shape)


In [None]:
#what does the difference look like visually?
plot_map(coords_end)

In [None]:
# Highlight (in red) the end stations that never occur as a start station,
# on top of the map of all start stations.
# Membership is tested on whole (longitude, latitude) pairs. `row in 2d_array` must not be
# used here: numpy evaluates it element-wise, so a station would count as "present" as soon
# as its longitude OR its latitude equals any value in coords_start.
start_pairs = set(map(tuple, coords_start))
is_end_only = np.array(
    [tuple(pair) not in start_pairs for pair in coords_end],
    dtype=bool,
)
# Boolean indexing keeps the (k, 2) shape even when k == 0, so the column slicing below is safe.
coords_end_not_in_start = coords_end[is_end_only]
plt.scatter(coords_start[:,0], coords_start[:,1], s=0.75)
plt.ylabel('Latitude')
plt.xlabel('Longitude')
plt.scatter(coords_end_not_in_start[:,0], coords_end_not_in_start[:,1], s=0.75, c='r')
plt.show()


Since there are more end stations than start stations and all start stations are also an end station, the clustering should be done using the end stations in order to cluster every station considered in the data. 

Clustering the stations

In [None]:
# import kmeans
from sklearn.cluster import KMeans

In [None]:
n_clusters = 20

In [None]:
# Fit k-means on the unique end-station coordinates (a small array) rather than on every trip row in df.
# random_state=0 fixes the centroid initialisation.
# NOTE(review): n_init is left at the library default, which differs between scikit-learn
# versions — consider setting it explicitly so cluster ids are stable; TODO confirm.
kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(coords_end) # using coords here instead of df to increase speed

We train on the unique coordinates instead of the full DataFrame to increase speed, and then use `predict` to store the labels in df. Because the model predicts on the same stations it was trained on, each station is assigned the cluster it belonged to when k-means converged.

In [None]:
# Label every trip with the cluster of its end station (drop_label) and of its
# start station (pick_label), then snapshot the labelled frame.
for label_col, side in (('drop_label', 'end'), ('pick_label', 'start')):
    station_xy = df[[f'{side}_station_longitude', f'{side}_station_latitude']].values
    df[label_col] = kmeans.predict(station_xy)
df_copy2 = df.copy()


In [None]:
# End stations coloured by their k-means cluster, with the cluster centroids overlaid as red crosses.
centroid_lon = kmeans.cluster_centers_[:, 0]
centroid_lat = kmeans.cluster_centers_[:, 1]
plt.scatter(coords_end[:, 0], coords_end[:, 1], s=5, c=kmeans.labels_)
plt.scatter(centroid_lon, centroid_lat, s=50, marker="x", c='red')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.show()

In [None]:
# Restore checkpoint: reset the start/end coordinate arrays and the cleaned, cluster-labelled
# dataframe from their saved copies, so the cells of part 2 can be re-run from a known state.
coords_start = coords_start_copy.copy()
coords_end = coords_end_copy.copy()
df = df_copy2.copy()
# NOTE(review): the expected shape quoted below is the authors' figure; rows were filtered in the
# cleaning cell above, so verify it against the printed value.
print(df.shape) # to check if no data loss, should be (17548339, 15)
df.head()


**Now part 2**

We need to predict the following

In [None]:
# Find the n pick-up clusters with the most trips (pick_label was chosen arbitrarily over
# drop_label) and keep only the trips that depart from them.
# Note: this overwrites df; re-run the restore cell above to get the full frame back.
n = 1
pick_counts = df['pick_label'].value_counts()
largest_cluster = pick_counts.nlargest(n).index[:n]
print(largest_cluster)
departs_from_largest = df['pick_label'].isin(largest_cluster)
df = df[departs_from_largest]


In [None]:
df.head()

In [None]:
# Number of trips per start hour, one line per pick-up cluster.
# Rows of the table are hours (index 'starttime'), columns are pick_label values.
grouby_label = (
    df.groupby(['pick_label', df['starttime'].dt.hour])
    .size()
    .unstack('pick_label')
)
grouby_label.plot(figsize=(20,10), legend=True)
plt.xlabel('Hour')
plt.ylabel('Count')
plt.show()

Start by splitting up the dataset. Necessary because how are you going to aggregate the data by hour if each datapoint has 2 temporal parameters (start and end station times). Note that we can only use the hour and cluster as input to our model, as these are the only variables we have certain information on for the future datapoints for which we predict the demand, so we only have to copy these variables over in the new datasets. 

In [None]:
# Split the trips into a departures table and an arrivals table, each keyed by cluster label
# plus the hour, day and month taken from the relevant timestamp (starttime for departures,
# stoptime for arrivals).
# The tables are built as new DataFrames instead of assigning columns to slices of df, which
# avoids pandas' SettingWithCopyWarning / chained assignment. The trip_id index is irrelevant
# here because the groupby below replaces it anyway.
df_departures = pd.DataFrame({
    "pick_label": df["pick_label"],
    "start_hour": df["starttime"].dt.hour,
    "start_day": df["starttime"].dt.day,
    "start_month": df["starttime"].dt.month,
})
df_arrivals = pd.DataFrame({
    "drop_label": df["drop_label"],
    "stop_hour": df["stoptime"].dt.hour,
    "stop_day": df["stoptime"].dt.day,
    "stop_month": df["stoptime"].dt.month,
})

# Count the trips that share the same label, hour, day and month; the count is the demand target.
# Only combinations with at least one trip produce a row (hours without trips are absent).
df_departures = (
    df_departures
    .groupby(["pick_label", "start_hour", "start_day", "start_month"])
    .size()
    .reset_index(name="count")
)
df_arrivals = (
    df_arrivals
    .groupby(["drop_label", "stop_hour", "stop_day", "stop_month"])
    .size()
    .reset_index(name="count")
)



Note: the link between days that are close to each other is already partly captured, because both day and month are features of the model. I don't think it matters much that day and month are split into separate columns. My concern was that the model would treat the first day of every month as closely related, which it is not. The month is also included, but it is still a fair thought: 1 June is still close in distance to 1 January in this encoding, even though the month differs. It may be better to encode day and month as one continuous day counter instead, which avoids the first days of each month appearing related. Change later.

In [None]:
df_arrivals.head()

In [None]:
df_departures.head()

Train two random forest regressors: one to predict the number of departures and one to predict the number of arrivals. The data has to be split as required in the assignment: the training data contains January - October and the test data contains November - December. Hence we sort the data by month and use this ordering to make the split.

In [None]:
# Order both aggregated tables by month (ascending) ahead of the month-based train/test split.
df_departures = df_departures.sort_values(by=["start_month"])
df_arrivals = df_arrivals.sort_values(by=["stop_month"])

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score



# Month-based split: months 1-10 form the training set, months 11-12 the test set.
dep_train = df_departures[df_departures["start_month"] < 11]
dep_test = df_departures[df_departures["start_month"] >= 11]
arr_train = df_arrivals[df_arrivals["stop_month"] < 11]
arr_test = df_arrivals[df_arrivals["stop_month"] >= 11]

# Features are every column except the target "count".
X_train_dep, y_train_dep = dep_train.drop(columns=["count"]), dep_train["count"]
X_test_dep, y_test_dep = dep_test.drop(columns=["count"]), dep_test["count"]
X_train_arr, y_train_arr = arr_train.drop(columns=["count"]), arr_train["count"]
X_test_arr, y_test_arr = arr_test.drop(columns=["count"]), arr_test["count"]

# Sanity check: list the months that ended up in each split.
print(X_train_dep["start_month"].unique())
print(X_train_arr["stop_month"].unique())
print(X_test_dep["start_month"].unique())
print(X_test_arr["stop_month"].unique())


In [None]:
# Train two random forest regressors with a fixed seed: one on the departures
# training set and one on the arrivals training set.
reg_dep = RandomForestRegressor(random_state=0)
reg_arr = RandomForestRegressor(random_state=0)
reg_dep.fit(X_train_dep, y_train_dep)
reg_arr.fit(X_train_arr, y_train_arr)

Evaluate the models

In [None]:
# Predict the hourly departure and arrival counts for the test months (11-12).
y_pred_dep = reg_dep.predict(X_test_dep)
y_pred_arr = reg_arr.predict(X_test_arr)

# Score both models with R^2 on the held-out test set.
r2_dep = r2_score(y_test_dep, y_pred_dep)
r2_arr = r2_score(y_test_arr, y_pred_arr)
print("R2 departures: ", r2_dep)
print("R2 arrivals: ", r2_arr)

In [None]:
# Check whether the month feature hurts the prediction: same month-based split as before
# (train = months 1-10, test = months 11-12), but the month column is removed from the features.
dep_train = df_departures[df_departures["start_month"] < 11]
dep_test = df_departures[df_departures["start_month"] >= 11]
arr_train = df_arrivals[df_arrivals["stop_month"] < 11]
arr_test = df_arrivals[df_arrivals["stop_month"] >= 11]

X_train_dep, y_train_dep = dep_train.drop(columns=["count", "start_month"]), dep_train["count"]
X_test_dep, y_test_dep = dep_test.drop(columns=["count", "start_month"]), dep_test["count"]
X_train_arr, y_train_arr = arr_train.drop(columns=["count", "stop_month"]), arr_train["count"]
X_test_arr, y_test_arr = arr_test.drop(columns=["count", "stop_month"]), arr_test["count"]

# Train the random forest regressors (fit returns the fitted estimator itself).
reg_dep = RandomForestRegressor(random_state=0).fit(X_train_dep, y_train_dep)
reg_arr = RandomForestRegressor(random_state=0).fit(X_train_arr, y_train_arr)

# Predict the test months and report R^2 for both models.
y_pred_dep = reg_dep.predict(X_test_dep)
y_pred_arr = reg_arr.predict(X_test_arr)
r2_dep = r2_score(y_test_dep, y_pred_dep)
r2_arr = r2_score(y_test_arr, y_pred_arr)
print("R2 departures: ", r2_dep)
print("R2 arrivals: ", r2_arr)

In [None]:
# Grid search for hyperparameter tuning of the two random forests.
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

# Split the data: training data from months 1-10, test data from months 11-12.
X_train_dep = df_departures[df_departures["start_month"] < 11].drop(columns=["count"])
X_test_dep = df_departures[df_departures["start_month"] >= 11].drop(columns=["count"])
y_train_dep = df_departures[df_departures["start_month"] < 11]["count"]
y_test_dep = df_departures[df_departures["start_month"] >= 11]["count"]
X_train_arr = df_arrivals[df_arrivals["stop_month"] < 11].drop(columns=["count"])
X_test_arr = df_arrivals[df_arrivals["stop_month"] >= 11].drop(columns=["count"])
y_train_arr = df_arrivals[df_arrivals["stop_month"] < 11]["count"]
y_test_arr = df_arrivals[df_arrivals["stop_month"] >= 11]["count"]

# Sanity check: list the months that ended up in each split.
print(X_train_dep["start_month"].unique())
print(X_train_arr["stop_month"].unique())
print(X_test_dep["start_month"].unique())
print(X_test_arr["stop_month"].unique())

# R^2 scorer used to rank the parameter combinations.
scorer = make_scorer(r2_score)

# 4 * 5 * 5 * 6 = 600 parameter combinations; with cv=3 that is 1800 fits per search.
param_grid = {
    "n_estimators": [10, 50, 100, 200],
    "max_depth": [None, 5, 10, 20, 30],
    "min_samples_split": [2, 5, 10, 20, 50],
    "min_samples_leaf": [1, 2, 5, 10, 20, 50]
}

# Grid search for the departures model. n_jobs=-1 runs the fits in parallel on all CPU cores;
# with random_state fixed, the selected parameters are the same as for a serial run.
grid_dep = GridSearchCV(RandomForestRegressor(random_state=0), param_grid, cv=3, scoring=scorer, n_jobs=-1)
grid_dep.fit(X_train_dep, y_train_dep)

# Grid search for the arrivals model.
grid_arr = GridSearchCV(RandomForestRegressor(random_state=0), param_grid, cv=3, scoring=scorer, n_jobs=-1)
grid_arr.fit(X_train_arr, y_train_arr)

# Predict the test months with the best estimator found by each search.
y_pred_dep = grid_dep.predict(X_test_dep)
y_pred_arr = grid_arr.predict(X_test_arr)

# Report R^2 on the held-out test set.
r2_dep = r2_score(y_test_dep, y_pred_dep)
r2_arr = r2_score(y_test_arr, y_pred_arr)
print("R2 departures: ", r2_dep)
print("R2 arrivals: ", r2_arr)




In [None]:
# print best hyperparameters
print(grid_dep.best_params_)
print(grid_arr.best_params_)


The month feature makes the model worse, probably because of the split: the test months (November - December) never occur in the training data.

Bad R^2 so feature engineer

In [None]:
# add lag function
def buildLaggedFeatures(s,columns, lag=2,dropna=True):
    '''
    From http://stackoverflow.com/questions/20410312/how-to-create-a-lagged-data-structure-using-pandas-dataframe
    Builds a new DataFrame to facilitate regressing over all possible lagged features

    Parameters
    ----------
    s : pd.DataFrame or pd.Series (subclasses are accepted)
        Input data; rows are shifted in their current order.
    columns : iterable of str
        Columns of a DataFrame input to create lagged copies of. Ignored for a Series.
    lag : int
        Number of lags. For a DataFrame, columns '<col>_lag1' .. '<col>_lag<lag>' are added
        next to all original columns. For a Series, the result has columns 'lag_0' .. 'lag_<lag>'.
    dropna : bool
        If True, rows containing any NaN (in any column of the result) are dropped.

    Returns
    -------
    pd.DataFrame, or None (after printing a message) when s is neither a DataFrame nor a Series.
    '''
    # isinstance (rather than an exact type check) so that subclasses are handled too.
    if isinstance(s, pd.DataFrame):
        # start from all original columns ...
        new_dict = {c: s[c] for c in s.columns}
        for col_name in columns:
            # looked up even when lag == 0, so an unknown column name always raises KeyError
            new_dict[col_name] = s[col_name]
            # ... and append one shifted copy per lag step
            for l in range(1, lag + 1):
                new_dict['%s_lag%d' % (col_name, l)] = s[col_name].shift(l)
        res = pd.DataFrame(new_dict, index=s.index)

    elif isinstance(s, pd.Series):
        the_range = range(lag + 1)
        res = pd.concat([s.shift(i) for i in the_range], axis=1)
        res.columns = ['lag_%d' % i for i in the_range]
    else:
        print('Only works for DataFrame or Series')
        return None
    if dropna:
        return res.dropna()
    return res

In [None]:
# Add lagged copies of the "count" column to both aggregated tables.
# NOTE(review): with lag = 0 buildLaggedFeatures creates no '<col>_lag<k>' columns, so this cell
# only applies its dropna step. Before raising lag, note that shift() works on the current row
# order and the tables were last sorted by month only — TODO confirm the intended lag and ordering.
lag = 0
df_departures = buildLaggedFeatures(df_departures, ["count"], lag=lag)
df_arrivals = buildLaggedFeatures(df_arrivals, ["count"], lag=lag)

In [None]:
df_departures.head()

In [None]:
df_arrivals.head()

In [None]:
# Rebuild the train/test sets for re-training the forests after the lag step:
# train = months 1-10, test = months 11-12, month column excluded from the features.
dep_train = df_departures[df_departures["start_month"] < 11]
dep_test = df_departures[df_departures["start_month"] >= 11]
arr_train = df_arrivals[df_arrivals["stop_month"] < 11]
arr_test = df_arrivals[df_arrivals["stop_month"] >= 11]

X_train_dep, y_train_dep = dep_train.drop(columns=["count", "start_month"]), dep_train["count"]
X_test_dep, y_test_dep = dep_test.drop(columns=["count", "start_month"]), dep_test["count"]
X_train_arr, y_train_arr = arr_train.drop(columns=["count", "stop_month"]), arr_train["count"]
X_test_arr, y_test_arr = arr_test.drop(columns=["count", "stop_month"]), arr_test["count"]


In [None]:
reg_dep = RandomForestRegressor(random_state=0)
reg_arr = RandomForestRegressor(random_state=0)


In [None]:
reg_dep.fit(X_train_dep, y_train_dep)
reg_arr.fit(X_train_arr, y_train_arr)

In [None]:

# Predict the test months with the re-trained forests.
y_pred_dep = reg_dep.predict(X_test_dep)
y_pred_arr = reg_arr.predict(X_test_arr)

# Report R^2 on the held-out test set.
r2_dep = r2_score(y_test_dep, y_pred_dep)
r2_arr = r2_score(y_test_arr, y_pred_arr)
print("R2 departures: ", r2_dep)
print("R2 arrivals: ", r2_arr)

In [None]:
# Linear regression (ordinary least squares) on the features plus squared hour and day terms.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Month-based split (train = months 1-10, test = months 11-12); month is not used as a feature.
dep_train = df_departures[df_departures["start_month"] < 11]
dep_test = df_departures[df_departures["start_month"] >= 11]
arr_train = df_arrivals[df_arrivals["stop_month"] < 11]
arr_test = df_arrivals[df_arrivals["stop_month"] >= 11]

X_train_dep, y_train_dep = dep_train.drop(columns=["count", "start_month"]), dep_train["count"]
X_test_dep, y_test_dep = dep_test.drop(columns=["count", "start_month"]), dep_test["count"]
X_train_arr, y_train_arr = arr_train.drop(columns=["count", "stop_month"]), arr_train["count"]
X_test_arr, y_test_arr = arr_test.drop(columns=["count", "stop_month"]), arr_test["count"]

# Append the square of the hour and of the day to every feature matrix
# (columns "<prefix>_hour^2" and "<prefix>_day^2", in that order).
for features, prefix in (
    (X_train_dep, "start"),
    (X_test_dep, "start"),
    (X_train_arr, "stop"),
    (X_test_arr, "stop"),
):
    for unit in ("hour", "day"):
        features[f"{prefix}_{unit}^2"] = features[f"{prefix}_{unit}"] ** 2

# Fit one model per direction, predict the test months and report R^2.
reg_dep = LinearRegression().fit(X_train_dep, y_train_dep)
reg_arr = LinearRegression().fit(X_train_arr, y_train_arr)
y_pred_dep = reg_dep.predict(X_test_dep)
y_pred_arr = reg_arr.predict(X_test_arr)
r2_dep = r2_score(y_test_dep, y_pred_dep)
r2_arr = r2_score(y_test_arr, y_pred_arr)
print("R2 departures: ", r2_dep)
print("R2 arrivals: ", r2_arr)

print(reg_dep.coef_)

Plot the results

In [None]:
# Predicted vs. actual counts over all clusters: departures on the left, arrivals on the right.
fig, axs = plt.subplots(1, 2, figsize=(10, 5))
axs = axs.ravel()
axs[0].scatter(y_test_dep, y_pred_dep, s=0.75)
axs[0].set_xlabel("Actual")
axs[0].set_ylabel("Predicted")
axs[0].set_title("Departures: All clusters, R2: " + str(round(r2_dep, 3)))
axs[1].scatter(y_test_arr, y_pred_arr, s=0.75)
axs[1].set_xlabel("Actual")
axs[1].set_ylabel("Predicted")
axs[1].set_title("Arrivals: All clusters, R2: " + str(round(r2_arr, 3)))
plt.tight_layout()
plt.show()


# Predicted vs. actual counts per cluster, in a compact grid with two clusters per row
# (departures panel followed by arrivals panel). This allows a visual check of how well
# the model predicts for each individual cluster.
# The cluster columns are named "pick_label" (departures) and "drop_label" (arrivals);
# the feature matrices have no column called "label".
dep_labels = X_test_dep["pick_label"].values
arr_labels = X_test_arr["drop_label"].values
y_true_dep = y_test_dep.values
y_true_arr = y_test_arr.values

n_rows = (n_clusters + 1) // 2  # round up so an odd number of clusters still fits
fig, axs = plt.subplots(n_rows, 4, figsize=(20, 40))
axs = axs.ravel()
for cluster in range(n_clusters):
    dep_mask = dep_labels == cluster
    arr_mask = arr_labels == cluster
    ax_dep, ax_arr = axs[2 * cluster], axs[2 * cluster + 1]

    ax_dep.scatter(y_true_dep[dep_mask], y_pred_dep[dep_mask], s=0.75)
    ax_dep.set_xlabel("Actual")
    ax_dep.set_ylabel("Predicted")
    ax_dep.set_title("Departures cluster " + str(cluster))

    ax_arr.scatter(y_true_arr[arr_mask], y_pred_arr[arr_mask], s=0.75)
    ax_arr.set_xlabel("Actual")
    ax_arr.set_ylabel("Predicted")
    ax_arr.set_title("Arrivals cluster " + str(cluster))
# Lay out the grid once, after all panels are drawn, instead of on every iteration.
plt.tight_layout()
plt.show()

Notice that cluster 12 contains few data points. This is likely the cluster containing the end stations that were never start stations (to be verified). This observation only holds when we keep n_clusters = 20.

In [None]:
# Exploratory: fit a decision tree classifier per cluster and print its test accuracy.
# NOTE(review): no cell visible in this notebook creates the 'label' or 'hour' columns used below
# (the clustering cell creates 'pick_label' / 'drop_label'), so this cell presumably predates
# that rename — TODO confirm.
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

# For each cluster: select its rows, keep the columns used for modelling, split 80/20, train, print the score.
for i in range(n_clusters):
    df_label = df[df['label'] == i]
    df_label = df_label[['hour', 'usertype_Subscriber', 'label']]
    X = df_label[['hour', 'usertype_Subscriber']]
    # NOTE(review): the target is the cluster label itself, which equals i for every row
    # selected above, so the target is constant within each iteration.
    y = df_label['label']
    # random 80/20 split (no random_state, so the split differs between runs)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    clf = DecisionTreeClassifier()
    clf.fit(X_train, y_train)
    print("label: ", i, " score: ", clf.score(X_test, y_test))

In [None]:
# Accuracy of `clf` on X_test / y_test. These names hold whatever the last executed
# training loop left behind, i.e. only the final cluster's model and split.
print(clf.score(X_test, y_test))

# R^2 of the predicted values against the true values
from sklearn.metrics import r2_score

y_pred = clf.predict(X_test)
r2 = r2_score(y_test, y_pred)
print(f'r^2 = {r2}')


In [None]:
# Exploratory: per cluster, train a small neural network (MLP) that predicts
# usertype_Subscriber from the hour and the start-station coordinates.
# NOTE(review): relies on 'label' and 'hour' columns that no visible cell creates — TODO confirm.
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

# one model per cluster label
for i in range(20):
    df_label = df[df['label'] == i]
    # 80/20 split with a fixed seed
    X_train, X_test, y_train, y_test = train_test_split(df_label[['hour','start_station_longitude','start_station_latitude']], df_label['usertype_Subscriber'], test_size=0.2, random_state=0)
    # standardise the features; the scaler is fitted on the training part only
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    # three hidden layers of 10 units each
    clf = MLPClassifier(hidden_layer_sizes=(10,10,10), max_iter=1000)
    clf.fit(X_train, y_train)
    print("label:",i,"score:",clf.score(X_test, y_test))
    


# accuracy of the model left over from the last loop iteration (final cluster only)
print(clf.score(X_test, y_test))

# R^2 of the predicted values against the true values, again for the last cluster only
from sklearn.metrics import r2_score

y_pred = clf.predict(X_test)
r2 = r2_score(y_test, y_pred)
print(f'r^2 = {r2}')


In [None]:
# Number of trips per start hour, one line per pick-up cluster.
# The cluster column is called 'pick_label' (created in the clustering cell);
# there is no 'label' column in df.
grouby_label = df.groupby(['pick_label', df['starttime'].dt.hour]).size().reset_index(name='count')
# rows = hour of day (index 'starttime'), columns = cluster, values = number of trips
grouby_label = grouby_label.pivot(index='starttime', columns='pick_label', values='count')
grouby_label.plot(figsize=(20,10))
plt.ylabel('Count')
plt.xlabel('Hour')
plt.show()

In [None]:
# Group the trips by pick-up cluster ('pick_label'; df has no 'label' column)
# and show the first rows of every group.
grouby_cluster = df.groupby(['pick_label'])
grouby_cluster.head()


In [None]:
# Group trips by pick-up cluster and start hour ('pick_label'; df has no 'label' column),
# then average all other columns within each group.
# NOTE(review): mean() may fail on non-numeric columns depending on the pandas version —
# TODO confirm which columns should be averaged.
grouby_cluster = df.groupby(['pick_label', df['starttime'].dt.hour])
grouby_cluster = grouby_cluster.mean()
grouby_cluster.head()


In [None]:
df.head()

In [None]:
# Demand per trip row = number of trips sharing the same pick-up cluster and start hour.
# The cluster column is 'pick_label'; df has no 'label' column.
df['demand'] = df.groupby(['pick_label', df['starttime'].dt.hour])['pick_label'].transform('count')
df.head()



In [None]:
grouby_label.head()

In [None]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score


In [None]:
# Random 80/20 split of the trip rows: `train` is a seeded 80% sample and `test` is every
# row whose index label is not in the sample.
# NOTE(review): `train` / `test` are referenced only by commented-out code in the cells
# visible below — TODO confirm they are still needed.
train = df.sample(frac=0.8,random_state=42)
test = df.drop(train.index)

In [None]:
# Fit one linear regression per cluster on the hourly counts table:
# X = the table's index values (hour of day), y = that cluster's column (trip count per hour).
# NOTE(review): indexes columns 0..19 of grouby_label, so it needs all 20 clusters to be
# present in that table; the prints below are debugging output.
models = []
for i in range(20):
    X = grouby_label[i].index.values.reshape(-1, 1)
    y = grouby_label[i].values
    regr = linear_model.LinearRegression()
    print(X.shape)
    print(y.shape)
    print(type(X))
    print(type(y))
    print(X[0])
    print(y[0])
    regr.fit(X, y)
    models.append(regr)
    

# Earlier attempt, kept for reference: regress trip duration on start hour using the `train` sample.
# for i in range(20):
    # X = train[train['label'] == i]['starttime'].dt.hour.values.reshape(-1, 1)
    # y = train[train['label'] == i]['tripduration'].values
    # regr = linear_model.LinearRegression()
    # regr.fit(X, y)
    # models.append(regr)
# THIS ISNT USING THE RIGHT X YET, not sure how the groupby df is working rn

# Duplicate of the active loop above without the debug prints.
# for i in range(20):
#     X = grouby_label[i].index.values.reshape(-1, 1)
#     y = grouby_label[i].values
#     regr = linear_model.LinearRegression()
#     regr.fit(X, y)
#     models.append(regr)

In [None]:
# In-sample predictions: evaluate each cluster's model on the same hours it was fitted on.
preds = [
    models[cluster].predict(grouby_label[cluster].index.values.reshape(-1, 1))
    for cluster in range(20)
]


In [None]:
# One panel per cluster in a 4x5 grid: observed hourly counts in black, fitted line in blue.
fig, axs = plt.subplots(4, 5, figsize=(20, 10))
for cluster, ax in enumerate(axs.ravel()[:20]):
    hours = grouby_label[cluster].index.values
    ax.plot(hours, grouby_label[cluster].values, color='black')
    ax.plot(hours, preds[cluster], color='blue', linewidth=3)
    ax.set_title('Cluster ' + str(cluster))
    ax.set_xlabel('Hour')
    ax.set_ylabel('Count')
axs = axs.ravel()
plt.show()

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Fit one linear regression per pick-up cluster (hour of day -> demand) and plot the
# predicted demand per hour for every cluster.
models = {}

# Only clusters that actually occur in df are modelled; fitting on an empty selection would fail.
cluster_labels = sorted(df['pick_label'].unique())

for cluster_label in cluster_labels:
    # Filter the data for the current cluster ('pick_label'; df has no 'label' column)
    cluster_data = df[df['pick_label'] == cluster_label]

    # Feature: hour of day as an (n, 1) integer array. The raw 'starttime' timestamps must not
    # be used, because the predictions below are made for hours, so the model has to be
    # trained on hours as well.
    X = cluster_data['starttime'].dt.hour.values.reshape(-1, 1)
    y = cluster_data['demand'].values

    # Create and train the linear regression model
    model = LinearRegression()
    model.fit(X, y)

    # Store the model under its cluster label
    models[cluster_label] = model

# Hours for which predictions are made: the index of the hourly counts table.
hours_to_predict = grouby_label.index.values
X_predict = hours_to_predict.reshape(-1, 1)

# One prediction column per cluster.
predictions_df = pd.DataFrame()
for cluster_label in cluster_labels:
    predictions_df['Cluster_' + str(cluster_label)] = models[cluster_label].predict(X_predict)

# Plot the predictions
predictions_df['Hour'] = hours_to_predict
predictions_df.set_index('Hour', inplace=True)
predictions_df.plot(figsize=(20, 10))
plt.ylabel('Count')
plt.xlabel('Hour')
plt.show()

Matthias Attempt

Data preparation

In [None]:
# Compare the number of distinct station ids with the number of distinct coordinate pairs,
# for start and end stations: first the two id counts, then the two coordinate counts.
for side in ("start", "end"):
    print(len(df[f"{side}_station_id"].unique()))
for side in ("start", "end"):
    coords = np.unique(
        df[[f'{side}_station_longitude', f'{side}_station_latitude']].values,
        axis=0,
    )
    print(len(coords))


In [None]:
# Copy the dataset and the distinct coordinates.
# NOTE(review): df_matt and coords_copy are not defined in any cell visible in this notebook —
# TODO confirm where they come from.
df = df_matt.copy()
coords = coords_copy.copy()

# Add a 'station_id' column: the row number in `coords` of the trip's start-station
# (longitude, latitude) pair. Rows that match no pair keep NaN.
for i in range(len(coords)):
    # both comparisons must be made on df (the original referenced an undefined name `data`)
    df.loc[(df['start_station_longitude'] == coords[i][0]) & (df['start_station_latitude'] == coords[i][1]), 'station_id'] = i
df.head()
# print(df["station_id"].unique())


#split dataset up. Create two datasets, one with 










In [None]:
#code treasury





    # Create an empty dictionary to store the linear regression models for each station for stations within only one of the 20 clusters
models = {}

# Train a linear regression model (hour of day -> demand) for each station
for station_id in df["station_id"].unique():
    # Filter the data for the current station
    station_data = df[df['station_id'] == station_id]

    # Feature: hour of day as an (n, 1) integer array. LinearRegression needs numeric input,
    # so the raw 'starttime' timestamps are not passed in directly.
    X = station_data['starttime'].dt.hour.values.reshape(-1, 1)
    y = station_data['demand'].values

    # Create and train the linear regression model
    model = LinearRegression()
    model.fit(X, y)

    # Store the model under its station id
    models[station_id] = model

Collaboration # not included in wordcouter.py


In [None]:
#what does bike id mean? unique id for a certain bike