# Final Clean

In [1]:
import pandas as pd
import datetime
import numpy as np

from geopy.distance import vincenty
from ipykernel import kernelapp as app
from darksky import forecast

In [2]:
""" Loop though x number of CSV Files and Add Together Into One File """


# Make main df with first file
df = pd.read_csv("siri.20121106.csv", header = None, low_memory = False)
df.columns = ["Timestamp", "LineId", "Direction", "JourneyPatternId", "TimeFrame", "VehicleJourneyId", "BusOperator", "Congestion", "Long", "Lat", "Delay", "BlockId", "VehicleId", "StopId", "AtStop"]

# Loop though more files to add
for i in range(7, ?????):
    
    if i < 10:
        i = "0" + str(i)
    
    # Next file to add
    file = "siri.201211" + str(i) + ".csv"

    # Make DataFrame to add to main df
    df_to_add = pd.read_csv(file, header = None, low_memory = False)
    df_to_add.columns = ["Timestamp", "LineId", "Direction", "JourneyPatternId", "TimeFrame", "VehicleJourneyId", "BusOperator", "Congestion", "Long", "Lat", "Delay", "BlockId", "VehicleId", "StopId", "AtStop"]

    # Combine DataFrames
    df = df.append(df_to_add ,ignore_index = True)

In [3]:
# Size of total raw data
df.shape

(7340171, 15)

In [4]:
df.head()

Unnamed: 0,Timestamp,LineId,Direction,JourneyPatternId,TimeFrame,VehicleJourneyId,BusOperator,Congestion,Long,Lat,Delay,BlockId,VehicleId,StopId,AtStop
0,1352246400000000,15,0,00151001,2012-11-06,5929,HN,0,-6.267883,53.4174,0,15105,33502,6317,1
1,1352246400000000,26,0,,2012-11-06,3013,PO,0,-6.462793,53.356247,0,66008,33446,5114,0
2,1352246400000000,140,0,01400001,2012-11-06,6615,HN,0,-6.275084,53.347015,0,140010,40028,895,1
3,1352246400000000,7,0,,2012-11-06,6527,D1,0,-6.146544,53.259068,292,7004,43001,4982,0
4,1352246400000000,40,0,040D1001,2012-11-06,2466,HN,0,-6.261073,53.352112,-338,40207,40029,4725,1


In [5]:
# Constant Column
del df["Direction"]

#### Note About The Following Cell:
This is the way we are sorting the data. First the Timeframe is the most important since it holds a unique Vehicle Journey ID (or it should) for every journey each day. 
Ideally we would then sort by VehicleJourneyId but the reality is that the data is messy. In order to avoid two buses at opposite sides of the city causing issues with the dataset we will now sort by the vehicle ID.
Next the vehicle journey ID makes sense since we can then start to sort the entire dataset into individual journeys along a certain route.
Lastly timestamp is obvious.

We'll use this cell several times to keep the dataframe sorted and the index correct, as many of the loops in this notebook require this.

You should deduce from this that there will be several cases of a VehicleJourneyId being repeated. cleaning up this is part of the challenge.

In [6]:
# Organise the Data
df = df.sort_values(['TimeFrame', 'VehicleId', 'VehicleJourneyId', 'Timestamp'], ascending=True)

# Clean up index
df = df.reset_index()
del df['index']

In [7]:
# Save raw data
df.to_csv("raw_data.csv", index=False)

## Clean Up Nulls, Duplicate Rows & Start of TimeFrame

In [89]:
df.isnull().sum()

Timestamp           0
LineId              3
JourneyPatternId    0
TimeFrame           0
VehicleJourneyId    0
BusOperator         0
Congestion          0
Long                0
Lat                 0
Delay               0
BlockId             0
VehicleId           0
StopId              0
AtStop              0
Distance            0
TravelTime          0
Weekday             0
TimeCategory        0
dtype: int64

In [10]:
# Data from the 5th is not reliable, since some journey's continued from the previous day, we must remove it. 
df = df[df.TimeFrame != '2012-11-05']

In [11]:
""" Before each bus begins its journey it may not have a JourneyPatternId and records literal 'null' values instead
of actual data. These rows refer to instances where the bus is idle and hasn't moved anywhere, therefore they can 
confidently be deleted. StopID equal to 'null' also represents noise in the data. """


df = df[df.JourneyPatternId != 'null']
df = df[df.StopId != 'null']

In [12]:
# Remove actual nulls from data
df = df.dropna()

' Remove actual nulls from data '

In [13]:
# Organise the Data
df = df.sort_values(['TimeFrame', 'VehicleId', 'VehicleJourneyId', 'Timestamp'], ascending=True)

# Clean up index
df = df.reset_index()
del df['index']

In [14]:
# Check deletion worked
df.shape

(6232318, 14)

In [15]:
# Drop any duplicated rows
df = df.drop_duplicates()

In [16]:
# Organise the Data
df = df.sort_values(['TimeFrame', 'VehicleId', 'VehicleJourneyId', 'Timestamp'], ascending=True)

# Clean up index
df = df.reset_index()
del df['index']

In [17]:
df.shape

(6232318, 14)

In [18]:
# Optional Save
df.to_csv("first_stage.csv", index=False)

## Remove General Noise

In [19]:
""" Remove every VehicleJourneyId which is equal to or below 5 rows in length. Realistically any journey (even the short ones)
should be at least 5 rows of data in length, anything else is just noise. """


gb = df.groupby(["TimeFrame", "VehicleId", "VehicleJourneyId"], as_index=False, group_keys=False)

gb = gb.filter(lambda x: len(x) > 5)

df = pd.DataFrame(gb)

In [20]:
# Organise the Data
df = df.sort_values(['TimeFrame', 'VehicleId', 'VehicleJourneyId', 'Timestamp'], ascending=True)

# Clean up index
df = df.reset_index()
del df['index']

## Remove Extra JourneyPatternId's From VehicleJourneyId's

In [21]:
# Create group object to work with 
gb = df.groupby(["TimeFrame", "VehicleId", "VehicleJourneyId"], as_index=False, group_keys=False)

In [22]:
def delete_outlier_journeypatternid(group):
    """ Takes a pandas group object and iterates removing the least occuring JourneyPatterId in each.
    If there are more than two or the occurances of the two are equal, it ignores it. """
    
        
    # If there's two Journey Pattern ID's
    if len(group["JourneyPatternId"].value_counts()) == 2:
        
        x = group["JourneyPatternId"].value_counts()
                
        # If the two journey pattern ID's occupy the same space then do nothing (likely it's a 'noise' journey)
        if x[0] != x[1]:
                    
            real_id = group["JourneyPatternId"].value_counts().index.tolist()[0] 
            group = group[group.JourneyPatternId == real_id]
            
    return group

In [23]:
# Apply the mapping function to the dataset
df = gb.apply(delete_outlier_journeypatternid)

In [24]:
# Organise the Data
df = df.sort_values(['TimeFrame', 'VehicleId', 'VehicleJourneyId', 'Timestamp'], ascending=True)

# Clean up index
df = df.reset_index()
del df['index']

In [25]:
# Delete outliers (journey's with 3 id's and other noise inc. nulls)
df = gb.filter(lambda x: len(x["JourneyPatternId"].unique()) == 1)

In [26]:
# Check to see it worked

gb = df.groupby(["TimeFrame", "VehicleId", "VehicleJourneyId"], as_index=False, group_keys=False)

count = 0

for item in gb:
    
    if len(item[1]["JourneyPatternId"].value_counts()) > 1:
        count += 1
        
count

0

In [27]:
df.shape

(4142717, 14)

In [28]:
# Organise the Data
df = df.sort_values(['TimeFrame', 'VehicleId', 'VehicleJourneyId', 'Timestamp'], ascending=True)

# Clean up index
df = df.reset_index()
del df['index']

## Remove Vehicle Journey ID's With Two Occurances Each Day

These can either be...
* Two buses completing the same route together with a stopover.
* Incomplete journey's.
* General noise in the data.

In any case they occupy less than 1% of the data and change week to week, so they can be dropped without overall loss of data integrity.

In [29]:
# Filter out these outliers

gb = df.groupby(["TimeFrame", "VehicleJourneyId"], as_index=False, group_keys=False)

gb = gb.filter(lambda x: len(x["VehicleId"].unique()) == 1)

df = pd.DataFrame(gb)

In [30]:
# See if it worked

x = df.groupby(["TimeFrame", "VehicleJourneyId"])["VehicleId"].unique()

count = 0

for i in x:
    
    if len(i) > 1:
        count += 1

count   

0

In [31]:
# Organise the Data
df = df.sort_values(['TimeFrame', 'VehicleId', 'VehicleJourneyId', 'Timestamp'], ascending=True)

# Clean up index
df = df.reset_index()
del df['index']

In [32]:
df.shape

(4035318, 14)

In [33]:
df.to_csv("second_stage.csv", index=False)

So now, to recap there is only one instance of each VehicleJourneyId each day and each of them has a unique JourneyPatternId.

# Clean Up Start Of Journeys

In [34]:
""" If the GPS coordinates remain the same for the first few rows of data, delete them. """


for item, row in df.iterrows():
    
    # For the first iteration
    if item == 0:
        last_bus_id = row[4]
        last_lat = row[8]
        last_long = row[7]
        new_start = True
        continue
    
    # For every iteration
    current_bus_id = row[4]
    current_lat = row[8]
    current_long = row[7]
    
    # If it's a different vehicle journey id
    if last_bus_id != current_bus_id:
        last_bus_id = row[4]
        last_lat = row[8]
        last_long = row[7]
        new_start = True
        continue
  
    # If it's the same journey
    if new_start:
        if current_lat == last_lat:
            if current_long == last_long:
                # Flag
                df.set_value(item - 1, "VehicleJourneyId", 0)
                last_bus_id = row[4]
                last_lat = row[8]
                last_long = row[7]
                continue
                
    # If it's the same journey but it's moved
    if last_bus_id == current_bus_id:
        if current_lat != last_lat or current_long != last_long:
            new_start = False

In [35]:
# Filter Out Rows Flagged
df = df[df.VehicleJourneyId != 0]

In [36]:
# Organise the Data
df = df.sort_values(['TimeFrame', 'VehicleId', 'VehicleJourneyId', 'Timestamp'], ascending=True)

# Clean up index
df = df.reset_index()
del df['index']

In [37]:
df.shape

(3956239, 14)

In [38]:
df.to_csv("third_stage.csv", index=False)

## Add Distance Feature

Before dropping duplicate StopId's we must first measure the distance on each route. This requires that we use all rows of data.

This will also make any VehicleJourneyId's which miss a stop along their journey useful data in the model.

In [39]:
def get_distance(lat1, long1, lat2, long2):
    """ Get distance between two geo coordinates """
    
    stop1 = (lat1, long1)
    stop2 = (lat2, long2)
    
    return vincenty(stop1, stop2).meters

In [40]:
# List to hold feature
distance = list()

for item, row in df.iterrows():
    
    # Set up values on first iteration
    if item == 0:
        last_lat = row[8]
        last_long = row[7]
        last_id = row[4]
        last_distance = 0
        
        distance.append(0)
        continue
    
    current_id = row[4]
    current_lat = row[8]
    current_long = row[7]
    current_distance = get_distance(current_lat, current_long, last_lat, last_long)
    
    # If it's a new Journey ID
    if current_id != last_id:
        last_id = row[4]
        last_lat = row[8]
        last_long = row[7]
        last_distance = 0
        
        distance.append(0)
        continue
     
    # If it's not a new Journey ID
    current_distance = get_distance(current_lat, current_long, last_lat, last_long)
    distance.append(last_distance + current_distance)
    
    last_distance = distance[-1]
    last_lat = row[8]
    last_long = row[7]
    last_id = row[4]

In [41]:
df["Distance"] = distance

In [42]:
df.head()

Unnamed: 0,Timestamp,LineId,JourneyPatternId,TimeFrame,VehicleJourneyId,BusOperator,Congestion,Long,Lat,Delay,BlockId,VehicleId,StopId,AtStop,Distance
0,1352274251000000,15,015A0002,2012-11-07,3174,RD,0,-6.233317,53.342152,0,15032,24549,395,1,0.0
1,1352274269000000,15,015A0002,2012-11-07,3174,RD,0,-6.233333,53.342152,0,15032,24549,395,1,1.065683
2,1352274288000000,15,015A0002,2012-11-07,3174,RD,0,-6.233317,53.342133,0,15032,24549,395,1,3.433604
3,1352274290000000,15,015A0002,2012-11-07,3174,RD,0,-6.233317,53.342133,0,15032,24549,395,1,3.433604
4,1352274308000000,15,015A0002,2012-11-07,3174,RD,0,-6.2333,53.342152,0,15032,24549,395,1,5.832238


In [43]:
df.to_csv("fourth_stage.csv", index=False)

## Remove Stop ID Duplicates

Now we can filter the dataframe a little by removing dupicate stopID's. Although it would be better to train on every row it might be too much to compute. 

This will also help us later in making the database stop distances. There is not enough information to map the exact distance to each stop, so we will have to take some kind of average of the AtStop == 0 columns and subtract a little to get a rough distance to each stop in a Journey Pattern ID.

Because this section keeps the first occurance of each StopId, this should be a very accurate way to estimate the distance to each stop on each route.

In [44]:
""" This will delete all duplicates AFTER the first example of each is found. So when the bus arrives at the stop, 
all subsequent rows at that stop will be deleted. """

df = df.drop_duplicates(["TimeFrame", 'VehicleId', "VehicleJourneyId", "StopId"])

In [45]:
# Organise the Data
df = df.sort_values(['TimeFrame', 'VehicleId', 'VehicleJourneyId', 'Timestamp'], ascending=True)

# Clean up index
df = df.reset_index()
del df['index']

In [46]:
# Check size of dataframe
df.shape

(1052976, 15)

In [47]:
df.head()

Unnamed: 0,Timestamp,LineId,JourneyPatternId,TimeFrame,VehicleJourneyId,BusOperator,Congestion,Long,Lat,Delay,BlockId,VehicleId,StopId,AtStop,Distance
0,1352274251000000,15,015A0002,2012-11-07,3174,RD,0,-6.233317,53.342152,0,15032,24549,395,1,0.0
1,1352274351000000,15,015A0002,2012-11-07,3174,RD,0,-6.234283,53.3419,0,15032,24549,396,0,83.281841
2,1352274688000000,15,015A0002,2012-11-07,3174,RD,0,-6.24333,53.346352,223,15032,24549,7371,0,1293.444401
3,1352274809000000,15,015A0002,2012-11-07,3174,RD,0,-6.248244,53.343922,142,15032,24549,399,1,1774.570774
4,1352274948000000,15,015A0002,2012-11-07,3174,RD,0,-6.255507,53.34536,170,15032,24549,400,0,2284.165206


In [48]:
df.to_csv("fifth_stage.csv", index=False)

## Change Unix Timestamp to Human Readable Format

In [49]:
# Must be int for division next
df['Timestamp'] = df['Timestamp'].apply(int)

In [50]:
# Change date in bus file to be in standard format instead of unix milliseconds (1:45 mins)
for index, row in df.iterrows():

    x = row[0]/1000000
    df.set_value(index, 'Timestamp', x) 

In [51]:
# For formatting later
df['Timestamp'] = df['Timestamp'].apply(str)

In [52]:
# Change format of date 
for index, row in df.iterrows():

    ts = int(row[0])
    x = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
    df.set_value(index, 'Timestamp', str(x))

In [53]:
df.dtypes

Timestamp            object
LineId               object
JourneyPatternId     object
TimeFrame            object
VehicleJourneyId      int64
BusOperator          object
Congestion            int64
Long                float64
Lat                 float64
Delay                 int64
BlockId               int64
VehicleId             int64
StopId               object
AtStop                int64
Distance            float64
dtype: object

In [54]:
# Change Features To Correct Types
df['Timestamp'] =  pd.to_datetime(df['Timestamp'], format='%Y-%m-%d %H:%M:%S')
df["LineId"] = df["LineId"].astype("category")
df["JourneyPatternId"] = df["JourneyPatternId"].astype("category")
df["TimeFrame"] = df["TimeFrame"].astype("category")
df["VehicleJourneyId"] = df["VehicleJourneyId"].astype("category")
df["Congestion"] = df["Congestion"].astype("category")
df["BlockId"] = df["BlockId"].astype("category")
df["StopId"] = df["StopId"].astype("category")
df["BusOperator"] = df["BusOperator"].astype("category")
df["VehicleId"] = df["VehicleId"].astype("category")

In [55]:
df.dtypes

Timestamp           datetime64[ns]
LineId                    category
JourneyPatternId          category
TimeFrame                 category
VehicleJourneyId          category
BusOperator               category
Congestion                category
Long                       float64
Lat                        float64
Delay                        int64
BlockId                   category
VehicleId                 category
StopId                    category
AtStop                       int64
Distance                   float64
dtype: object

In [56]:
df.to_csv("sixth_stage.csv", index=False)

## Add Time Taken Feature

In [57]:
# List to hold features
travel_time = list()

for item, row in df.iterrows():
    
    # Set up values on first iteration
    if item == 0:
        last_id = row[4]
        start_time = row[0]
        travel_time.append(0.0)
        continue
    
    current_time = row[0]
    current_id = row[4]
    
    # If it's a new Journey ID
    if current_id != last_id:
        last_id = row[4]        
        start_time = row[0]        
        travel_time.append(0.0)
        continue
        
    travel_time.append(abs((current_time - start_time).total_seconds()))    
    last_time = row[0]
    last_id = row[4]

In [58]:
df["TravelTime"] = travel_time

In [59]:
df.head()

Unnamed: 0,Timestamp,LineId,JourneyPatternId,TimeFrame,VehicleJourneyId,BusOperator,Congestion,Long,Lat,Delay,BlockId,VehicleId,StopId,AtStop,Distance,TravelTime
0,2012-11-07 07:44:11,15,015A0002,2012-11-07,3174,RD,0,-6.233317,53.342152,0,15032,24549,395,1,0.0,0.0
1,2012-11-07 07:45:51,15,015A0002,2012-11-07,3174,RD,0,-6.234283,53.3419,0,15032,24549,396,0,83.281841,100.0
2,2012-11-07 07:51:28,15,015A0002,2012-11-07,3174,RD,0,-6.24333,53.346352,223,15032,24549,7371,0,1293.444401,437.0
3,2012-11-07 07:53:29,15,015A0002,2012-11-07,3174,RD,0,-6.248244,53.343922,142,15032,24549,399,1,1774.570774,558.0
4,2012-11-07 07:55:48,15,015A0002,2012-11-07,3174,RD,0,-6.255507,53.34536,170,15032,24549,400,0,2284.165206,697.0


In [60]:
df.to_csv("seventh_stage.csv", index=False)

## Add Weekday Feature

In [61]:
df['Weekday'] = df['Timestamp'].dt.dayofweek

In [62]:
# Convert to correct datatype
df["Weekday"] = df["Weekday"].astype("category")

## Add Time Category Feature

In [63]:
# Convert to temp datatype
df["Timestamp"] = df["Timestamp"].astype(str)

In [64]:
time_cat = list()

for item, row in df.iterrows():
    
    s = row[0][-8:]
    temp = s.split(":")

    if int(temp[1]) < 30:
        mins = "00"
    else:
        mins = "30"

    ans = temp[0] + ":" + mins
    
    time_cat.append(ans)

In [65]:
df["TimeCategory"] = time_cat

In [66]:
df.head()

Unnamed: 0,Timestamp,LineId,JourneyPatternId,TimeFrame,VehicleJourneyId,BusOperator,Congestion,Long,Lat,Delay,BlockId,VehicleId,StopId,AtStop,Distance,TravelTime,Weekday,TimeCategory
0,2012-11-07 07:44:11,15,015A0002,2012-11-07,3174,RD,0,-6.233317,53.342152,0,15032,24549,395,1,0.0,0.0,2,07:30
1,2012-11-07 07:45:51,15,015A0002,2012-11-07,3174,RD,0,-6.234283,53.3419,0,15032,24549,396,0,83.281841,100.0,2,07:30
2,2012-11-07 07:51:28,15,015A0002,2012-11-07,3174,RD,0,-6.24333,53.346352,223,15032,24549,7371,0,1293.444401,437.0,2,07:30
3,2012-11-07 07:53:29,15,015A0002,2012-11-07,3174,RD,0,-6.248244,53.343922,142,15032,24549,399,1,1774.570774,558.0,2,07:30
4,2012-11-07 07:55:48,15,015A0002,2012-11-07,3174,RD,0,-6.255507,53.34536,170,15032,24549,400,0,2284.165206,697.0,2,07:30


## Make Final Changes To Datatypes

In [67]:
df.dtypes

Timestamp             object
LineId              category
JourneyPatternId    category
TimeFrame           category
VehicleJourneyId    category
BusOperator         category
Congestion          category
Long                 float64
Lat                  float64
Delay                  int64
BlockId             category
VehicleId           category
StopId              category
AtStop                 int64
Distance             float64
TravelTime           float64
Weekday             category
TimeCategory          object
dtype: object

In [68]:
df["VehicleJourneyId"] = df["VehicleJourneyId"].astype(int)
df["AtStop"] = df["AtStop"].astype(int)
df["Congestion"] = df["Congestion"].astype(int)
df["BlockId"] = df["BlockId"].astype(int)
df["VehicleId"] = df["VehicleId"].astype(int)
df["Delay"] = df["Delay"].astype(int)
df["TravelTime"] = df["TravelTime"].astype(int)

In [69]:
df["VehicleJourneyId"] = df["VehicleJourneyId"].astype('category')
df["Congestion"] = df["Congestion"].astype('category')
df["BlockId"] = df["BlockId"].astype('category')
df["VehicleId"] = df["VehicleId"].astype('category')
df["TimeCategory"] = df["TimeCategory"].astype('category')
df['Timestamp'] =  pd.to_datetime(df['Timestamp'], format='%Y-%m-%d %H:%M:%S')

In [70]:
df.dtypes

Timestamp           datetime64[ns]
LineId                    category
JourneyPatternId          category
TimeFrame                 category
VehicleJourneyId          category
BusOperator               category
Congestion                category
Long                       float64
Lat                        float64
Delay                        int32
BlockId                   category
VehicleId                 category
StopId                    category
AtStop                       int32
Distance                   float64
TravelTime                   int32
Weekday                   category
TimeCategory              category
dtype: object

In [71]:
df.head()

Unnamed: 0,Timestamp,LineId,JourneyPatternId,TimeFrame,VehicleJourneyId,BusOperator,Congestion,Long,Lat,Delay,BlockId,VehicleId,StopId,AtStop,Distance,TravelTime,Weekday,TimeCategory
0,2012-11-07 07:44:11,15,015A0002,2012-11-07,3174,RD,0,-6.233317,53.342152,0,15032,24549,395,1,0.0,0,2,07:30
1,2012-11-07 07:45:51,15,015A0002,2012-11-07,3174,RD,0,-6.234283,53.3419,0,15032,24549,396,0,83.281841,100,2,07:30
2,2012-11-07 07:51:28,15,015A0002,2012-11-07,3174,RD,0,-6.24333,53.346352,223,15032,24549,7371,0,1293.444401,437,2,07:30
3,2012-11-07 07:53:29,15,015A0002,2012-11-07,3174,RD,0,-6.248244,53.343922,142,15032,24549,399,1,1774.570774,558,2,07:30
4,2012-11-07 07:55:48,15,015A0002,2012-11-07,3174,RD,0,-6.255507,53.34536,170,15032,24549,400,0,2284.165206,697,2,07:30


In [72]:
df.to_csv("eighth_stage.csv", index=False)

## Add Weather Information

In [None]:
# Read in weather data
# Weather data file contains only relevant dates and first row containing the column names deleted

COLNAMES2 = ['Time_hour','Ind', 'Rain', 'Ind1', 'Temp', 'Ind2', 'Wetb', 'Dewpt', 'Vappr', 'Rhum', 'Msl']
df2 = pd.read_csv("weatherData.csv", names=COLNAMES2)

In [None]:
column_names = ['Time_hour', 'Rain', 'Temp']
df2 = df2[column_names]

In [None]:
for index, row in df2.iterrows():
    # Change each date to standard format year-month-day
    x = datetime.datetime.strptime(row[0], '%d/%m/%Y %H:%M').strftime('%Y-%m-%d %H')
    df2.set_value(index, 'Time_hour', x)

In [None]:
df2.Time_hour =  pd.to_datetime(df2.Time_hour)

In [None]:
df2 = df2.replace(r'\s+', np.nan, regex=True)
df2.Rain = df2.Rain.astype(float)
df2.Temp = df2.Temp.astype(float)

In [None]:
df['Time_hour'] = df['Timestamp'].values.astype('<M8[h]')

In [None]:
df2.head()

In [None]:
len(df)

In [None]:
len(df2)

In [None]:
df2["windSpeed"] = np.nan

In [None]:
df2["Timestamp"] = pd.to_datetime(df2["Time_hour"]).astype('int64')// 10**9

In [None]:
df2.head()

In [None]:
"""
Put your own api key from darksky when running this loop and remember after 1000 per day they charge you so be careful!
darksky has a dashboard that shows you how many api calls you made
"""
count = 0
dublin = forecast('115860eba8f194a9d04b751787f6a2b8',53.349323, -6.260750, time = df2.iloc[0]['Timestamp'], units = 'si')

for item,row in df2.iterrows():
    
    df2.set_value(item, 'windSpeed', dublin.hourly[count].windSpeed)
    count += 1
    
    if count > 23:
        
        # Reset count
        count = 0
        # This gets 24 hour of data, divided in one hour block, we only need one api call for every day
        dublin = forecast('115860eba8f194a9d04b751787f6a2b8',53.349323, -6.260750, time = row[4], units = 'si')


In [None]:
df2.tail()

In [None]:
len(df2)

In [None]:
df3 = pd.merge(df,df2, how='inner', on='Time_hour')

In [None]:
len(df3)

In [None]:
len(df)

## Make Finished CSV

In [122]:
df.to_csv('???????.csv', index=False)