In [535]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pymongo as pm
import pprint
from enum import Enum
from datetime import datetime, timedelta
import pytz

In [536]:

# Connect to the MongoDB server at bigdatadb.polito.it and select the
# 'carsharing' database.
# NOTE(review): the username and password are hard-coded in this cell; consider
# loading them from environment variables before sharing the notebook.
# NOTE(review): tlsAllowInvalidCertificates=True turns off certificate
# validation for this connection.
client = pm.MongoClient('bigdatadb.polito.it',                     
                        ssl=True,                     
                        authSource = 'carsharing',                     
                        username = 'ictts',                     
                        password ='Ict4SM22!',                     
                        tlsAllowInvalidCertificates=True) 
db = client['carsharing'] 

# Handles to the booking and parking collections used by the queries below.
# The variable names keep their existing 'permenant' spelling because other
# cells reference them.
permenant_booking = db['PermanentBookings']
permenant_parking = db['PermanentParkings']
enjoy_permenant_booking = db['enjoy_PermanentBookings']
enjoy_permenant_parking = db['enjoy_PermanentParkings']

#ENUM of cities
class CITY_ENUM(Enum):
    """City names; each value is passed as the 'city' filter of the booking query."""
    TO = 'Torino'
    SEA = 'Seattle'
    STU = 'Stuttgart'
class CITY_TIMEZONES(Enum):
    """Time-zone names, using the same member names as CITY_ENUM.

    The values are handed to get_start_end_unix_zone to compute the query window.
    """
    TO = 'Europe/Rome'
    SEA = 'America/Los_Angeles'
    STU = 'Europe/Berlin'

def get_start_end_unix_zone(timezone):
    """Return the Unix timestamps that bound January 2018 in a given time zone.

    Parameters
    ----------
    timezone : str
        Zone name accepted by ``pytz.timezone`` (for example 'Europe/Rome').

    Returns
    -------
    tuple of float
        ``(start_timestamp, end_timestamp)`` for 2018-01-01 00:00:00 and
        2018-01-31 23:59:59 wall-clock time in that zone.

    Raises
    ------
    pytz.UnknownTimeZoneError
        If ``timezone`` is not a known zone name.
    """
    zone = pytz.timezone(timezone)
    # A pytz zone has to be attached with localize(). Passing it as ``tzinfo``
    # to the datetime constructor silently uses the zone's local-mean-time
    # offset, which shifts the resulting timestamps by several minutes.
    start_timestamp = zone.localize(datetime(2018, 1, 1, 0, 0, 0, 0)).timestamp()
    end_timestamp = zone.localize(datetime(2018, 1, 31, 23, 59, 59, 0)).timestamp()
    return start_timestamp, end_timestamp

#date starts from 01/01/2018 to 31/01/2018 1514761200 - 1517353200
# start_unix_time = datetime.datetime.strptime("01/01/2018", "%d/%m/%Y").timestamp()
# end_unix_time = datetime.datetime.strptime("01/02/2018", "%d/%m/%Y").timestamp()

In [537]:
#Pipeline that fetches the rentals and filters the data:
#rentals that are too short or too long are filtered out,
#a rental is kept only if the car was moved,
#and the results are grouped by date and hour
def filter_pipeline(city,start_unix_time,end_unix_time):
    """Build the MongoDB aggregation pipeline that counts rentals per hour.

    Parameters
    ----------
    city : value matched against the 'city' field.
    start_unix_time : inclusive lower bound applied to 'init_time' and 'final_time'.
    end_unix_time : exclusive upper bound applied to 'init_time' and 'final_time'.

    Returns
    -------
    list of dict
        Five stages: window match, field projection, plausibility match,
        grouping by (day, hour, date) and a sort on the group key.
    """
    # The same bounds are applied to both ends of the rental; each field gets
    # its own dict so the two conditions are independent objects.
    window = {'$gte': start_unix_time, '$lt': end_unix_time}
    select_city_and_window = {
        '$match': {
            'city': city,
            'init_time': dict(window),
            'final_time': dict(window),
        }
    }

    coordinates = "$origin_destination.coordinates"
    # Seconds between start and end, divided by 60 to express minutes.
    minutes_rented = {'$divide': [{'$subtract': ['$final_time', '$init_time']}, 60]}
    # True when the first and second coordinate entries differ.
    car_moved = {
        '$ne': [
            {"$arrayElemAt": [coordinates, 0]},
            {"$arrayElemAt": [coordinates, 1]},
        ]
    }
    derive_fields = {
        '$project': {
            '_id': 0,
            'duration': minutes_rented,
            'day': {'$dayOfMonth': '$init_date'},
            'hour': {'$hour': '$init_date'},
            'date': {'$dateToString': {'format': '%Y-%m-%d', 'date': '$init_date'}},
            'moved': car_moved,
        }
    }

    # Keep only rentals where the car moved and that lasted between 5 and 180
    # minutes (both bounds exclusive).
    keep_plausible_rentals = {
        '$match': {
            'moved': True,
            'duration': {'$gt': 5, '$lt': 180},
        }
    }

    count_per_hour = {
        '$group': {
            '_id': {'day': '$day', 'hour': '$hour', 'date': '$date'},
            'total_count': {'$sum': 1},
        }
    }

    order_by_group_key = {'$sort': {'_id': 1}}

    return [
        select_city_and_window,
        derive_fields,
        keep_plausible_rentals,
        count_per_hour,
        order_by_group_key,
    ]


In [538]:
# Run the hourly-rental pipeline once per city. Each city's (start, end) window
# is unpacked straight into filter_pipeline's two time arguments.
TO_Data = list(enjoy_permenant_booking.aggregate(
    filter_pipeline(CITY_ENUM.TO.value, *get_start_end_unix_zone(CITY_TIMEZONES.TO.value))))
SEA_Data = list(permenant_booking.aggregate(
    filter_pipeline(CITY_ENUM.SEA.value, *get_start_end_unix_zone(CITY_TIMEZONES.SEA.value))))
STU_Data = list(permenant_booking.aggregate(
    filter_pipeline(CITY_ENUM.STU.value, *get_start_end_unix_zone(CITY_TIMEZONES.STU.value))))
cities_data_array = [
    (CITY_ENUM.TO.value, TO_Data),
    (CITY_ENUM.SEA.value, SEA_Data),
    (CITY_ENUM.STU.value, STU_Data),
]

In [539]:
# Number of (day, hour) groups returned for each city; a month with every hour
# present would have 31*24 = 744 groups.
print("TO_Data",len(TO_Data))
print("SEA_Data",len(SEA_Data))
print("STU_Data",len(STU_Data))

TO_Data 744
SEA_Data 726
STU_Data 735


In [540]:
def dfModifier(city_list):
    """Convert the grouped aggregation records of one city into a flat DataFrame.

    Parameters
    ----------
    city_list : list of dict
        Records with an '_id' dict (keys 'date', 'day', 'hour') and a
        'total_count' value.

    Returns
    -------
    pandas.DataFrame
        Columns 'total_count', 'date', 'day', 'hour' and 'myIndex', where
        myIndex = (day-1)*24 + (hour+1), i.e. a 1-based hour counter.
    """
    grouped = pd.DataFrame(city_list, columns=['_id', 'total_count'])
    group_key = grouped['_id']
    # Lift the three parts of the group key into columns of their own.
    flat = grouped.assign(
        date=group_key.apply(lambda key: key['date']),
        day=group_key.apply(lambda key: key['day']),
        hour=group_key.apply(lambda key: key['hour']),
    )
    flat['myIndex'] = (flat['day'] - 1) * 24 + (flat['hour'] + 1)
    return flat.drop(columns=['_id'])
#myIndex is a 1-based hour-of-month counter: (day-1)*24 + (hour+1)
#day | hour
#1   | 0  -> (1-1)*24 + (0+1)   = 1
#1   | 1  -> (1-1)*24 + (1+1)   = 2
#1   | 23 -> (1-1)*24 + (23+1)  = 24
#2   | 0  -> (2-1)*24 + (0+1)   = 25
#2   | 1  -> (2-1)*24 + (1+1)   = 26
#31  | 23 -> (31-1)*24 + (23+1) = 744

# Flatten each city's aggregation result into a DataFrame, then pair every
# frame with its city name.
TO_df, SEA_df, STU_df = (
    dfModifier(city_rows) for city_rows in (TO_Data, SEA_Data, STU_Data)
)
cities_df_array = list(zip(
    (CITY_ENUM.TO.value, CITY_ENUM.SEA.value, CITY_ENUM.STU.value),
    (TO_df, SEA_df, STU_df),
))


In [541]:
def fillMissingValues(df:pd.DataFrame, days_in_month:int=31, year_month:str='2018-01'):
    """Return a copy of ``df`` with one row for every hour of the month.

    Hours whose 'myIndex' is absent are added with 'total_count' set to the
    rounded mean of the existing counts. The input frame is never modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with columns 'total_count', 'date', 'day', 'hour' and 'myIndex'
        (1-based hour counter, (day-1)*24 + hour + 1).
    days_in_month : int, default 31
        Number of days the month is expected to cover.
    year_month : str, default '2018-01'
        'YYYY-MM' prefix used to build the 'date' of the added rows.

    Returns
    -------
    pandas.DataFrame
        Rows sorted by 'myIndex' with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``df`` has no rows, because there is no mean to fill with.
    """
    if df.empty:
        raise ValueError("cannot fill missing hours: the input frame has no rows")

    expected_slots = range(1, days_in_month * 24 + 1)
    missingValues = set(expected_slots).difference(df['myIndex'].tolist())
    dfMean = round(df['total_count'].mean())
    print("Missing values are:", len(missingValues), missingValues)

    if not missingValues:
        # sort_values without inplace returns a new frame, so the caller's
        # frame keeps its row order.
        return df.sort_values(by=['myIndex']).reset_index(drop=True)

    # Build all filler rows first and concatenate once, instead of growing the
    # frame inside the loop.
    filler_rows = []
    for value in sorted(missingValues):
        day_offset, hourOfValue = divmod(value - 1, 24)
        dayOfValue = day_offset + 1
        filler_rows.append({'total_count': dfMean,
                            'date': f'{year_month}-{dayOfValue:02d}',
                            'day': dayOfValue,
                            'hour': hourOfValue,
                            'myIndex': value})

    filled = pd.concat([df, pd.DataFrame(filler_rows)], ignore_index=True)
    return filled.sort_values(by=['myIndex']).reset_index(drop=True)

# Fill the hours without any rental in each city's frame.
To_FilledValues = fillMissingValues(TO_df)
SEA_FilledValues = fillMissingValues(SEA_df)
STU_FilledValues = fillMissingValues(STU_df)
# The former `SEA_FilledValues.at[744,'date'] = '2018-01-31'` patch has been
# removed, as its own TODO requested. The filled frame has 744 rows labelled
# 0..743, so assigning to label 744 appended an extra row that held only a date
# and NaN in every other column.

Missing values are: 0 set()
Missing values are: 18 {736, 737, 738, 739, 740, 741, 742, 743, 726, 727, 728, 729, 730, 731, 732, 733, 734, 735}
Missing values are: 9 {736, 737, 738, 739, 740, 741, 742, 743, 744}


In [542]:
def plotter(plotTitle, df:pd.DataFrame, window:int=24*7):
    """Save a figure of hourly rentals with their rolling mean and rolling std.

    Parameters
    ----------
    plotTitle : str
        Used in the figure title and as the prefix of the output file name.
    df : pandas.DataFrame
        Frame with 'myIndex' (x axis) and 'total_count' (y axis) columns.
    window : int, default 24*7
        Number of rows in the rolling window (one week of hourly rows by default).

    Returns
    -------
    None
        The figure is written to '<plotTitle>-Roolings-mean-std' and then closed.
    """
    rolling = df['total_count'].rolling(window=window)
    mean = rolling.mean()
    std = rolling.std()

    fig, ax = plt.subplots(figsize=(14, 6))
    try:
        ax.plot(df['myIndex'], mean, label='Rolling Mean', color='red')
        ax.plot(df['myIndex'], std, label='Rolling Std', color='green')
        ax.plot(df['myIndex'], df['total_count'], label='Rental', color='blue')
        ax.set_xlabel('Date')
        ax.set_ylabel('Total Count')
        ax.legend()
        ax.grid(True)
        ax.set_title(f'Total Counts in Dates and Hours in - {plotTitle}')
        fig.savefig(f'{plotTitle}-Roolings-mean-std')
    finally:
        # Close the figure rather than only clearing it; a cleared figure stays
        # registered with pyplot and is shown as an empty "0 Axes" output.
        plt.close(fig)

In [543]:
# Save one rolling-statistics figure per city; plotter writes each file using
# the city name as the file-name prefix.
plotter('Torino',To_FilledValues)
plotter('Seattle',SEA_FilledValues)
plotter('Stuttgart',STU_FilledValues)

<Figure size 1400x600 with 0 Axes>

<Figure size 1400x600 with 0 Axes>

<Figure size 1400x600 with 0 Axes>