In [49]:
import pandas as pd
import datetime
import numpy as np
from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDRegressor
import matplotlib.pyplot as plt

### Reading all the training files

In [2]:
region_data = pd.read_csv('cluster_map.csv', sep=',')
order_data = pd.read_csv('order.txt', sep='\t')
weather_data = pd.read_csv('weather_merged.txt', sep='\t')

order_data.columns = ['Order_id', 'driver_id', 'passenger_id', 'start_region', 'dest_region', 'price', 'time']
region_data.columns = ['region_hash', 'region_id']
weather_data.columns = ['time', 'weather', 'temperature', 'pm2.5']

order_data['time'] = pd.to_datetime(order_data['time'])

weather_data['weather'].astype(int)
weather_data['temperature'].astype(int)
weather_data['time'] = pd.to_datetime(weather_data['time'])

line = []
with open('poi_data.txt', 'r') as file:
    for lines in file:
        line.append(lines)
poi_data = []
for i in line:
    splitted = i.split('\t')
    poi_data.append(splitted)


In [3]:
def metric(s):
    facilities = s.split()
    facilities_dict = {}
    for f in facilities:
        parts = f.split('#')[1].split(':')
        if len(parts) == 2:
            cls, num = parts
            facilities_dict[cls] = int(num)
    
    return np.sum([np.sqrt(num) for num in facilities_dict.values()])

poi_dict = {}
for i in range(len(poi_data)): 
    str1 = ""  
    for j in range(1, len(poi_data[i])):
        str1 += poi_data[i][j]
    poi_dict[poi_data[i][0]] = metric(str1)


In [4]:
#Adding day coloumn to represent days from 1 - 21
day_list = order_data['time'].dt.day
order_data['day'] = day_list
day_list = pd.Series([])

weather_data['day'] = weather_data['time'].dt.day

#sorting all data on time basis
region_data.columns
order_data= order_data.sort_values(by=('time'))

def addTimeslot(order_data):
    separated_dfs = []
    for day in order_data['day'].unique():
        day_df = order_data[order_data['day'] == day].reset_index(drop=True)
        num_rows = day_df.shape[0]
        time_slots = []
        count = 1
        for i in range(num_rows):
            time_slots.append(count)
            if (i+1) % (num_rows//144) == 0:
                if (count < 144):
                    count += 1
        day_df['timeslot'] = time_slots
        separated_dfs.append(day_df)

    new_order_data = pd.concat(separated_dfs, ignore_index=True)
    order_data = new_order_data
    del new_order_data
    return order_data

order_data = addTimeslot(order_data)
order_data_wo_nan = order_data.dropna(axis=1)


### Reading the Test Data

In [5]:
region_data_test= pd.read_csv('test_cluster_map', sep='\t')
order_data_test = pd.read_csv('test_order.txt', sep=',')
weather_data_test = pd.read_csv('test_weather.txt', sep='\t')

order_data_test.columns = ['order_id', 'driver_id', 'start_region', 'end_region', 'time']
region_data_test.columns = ['region_hash', 'region_id']
weather_data_test.columns = ['time', 'weather', 'temperature', 'pm2.5']

order_data_test['time'] = pd.to_datetime(order_data_test['time'])

weather_data_test['weather'].astype(int)
weather_data_test['temperature'].astype(int)
weather_data_test['time'] = pd.to_datetime(weather_data_test['time'])
order_data_test = order_data_test.sort_values(by=('time'))

day_list = order_data_test['time'].dt.day - 22
order_data_test['day'] = day_list
day_list = pd.Series([])

weather_data_test['day'] = (weather_data_test['time'].dt.day - 22)

order_data_test = addTimeslot(order_data_test)

line_test = []
with open('test_poi_data', 'r') as file:
    for lines in file:
        line.append(lines)
poi_data_test = []
for i in line:
    splitted = i.split('\t')
    poi_data_test.append(splitted)

poi_dict_test = {}
for i in range(len(poi_data_test)): 
    str1 = ""  
    for j in range(1, len(poi_data_test[i])):
        str1 += poi_data_test[i][j]
    poi_dict_test[poi_data_test[i][0]] = metric(str1)

In [6]:
merge = pd.merge(order_data, weather_data, on='time')

### Weather Model to predict temperature given a time slot

In [7]:
####################The model that predicts the weather given a timeslot 

time_slot_list = [] 
temperature_list = []

for i in range(1, 145):
    time_slot_list.append(i)
    demand_per_slot = merge[merge['timeslot'] == i]
    temp = demand_per_slot['temperature'].values
    temperature_list.append(temp.mean())

time_slot_list = np.reshape(time_slot_list, (-1, 1))
temperature_list = np.reshape(temperature_list, (-1, 1))

X_train, X_test, y_train, y_test = train_test_split(time_slot_list, temperature_list, test_size=0.2, random_state=42)

weatherModel = LinearRegression().fit(X_train, y_train)
pred = weatherModel.predict(X_test)
pred = [int(x) for x in pred]
y_test = [int(x) for x in y_test]
lrMSE = mean_squared_error(y_test, pred)

# Train a decision tree regression model
treeModel = DecisionTreeRegressor().fit(X_train, y_train)
treePred = treeModel.predict(X_test)
treeMSE = mean_squared_error(y_test, treePred)

# Train a random forest regression model
forestModel = RandomForestRegressor().fit(X_train, y_train)
forestPred = forestModel.predict(X_test)
forestMSE = mean_squared_error(y_test, forestPred)

# Train a support vector regression model
svmModel = SVR().fit(X_train, y_train)
svmPred = svmModel.predict(X_test)
svmMSE = mean_squared_error(y_test, svmPred)

# Train a k-nearest neighbors regression model
knnModel = KNeighborsRegressor().fit(X_train, y_train)
knnPred = knnModel.predict(X_test)
knnMSE = mean_squared_error(y_test, knnPred)

# Print the mean squared error for each model
print('Linear Regression MSE: ', lrMSE)
print('Decision Tree Regression MSE:', treeMSE)
print('Random Forest Regression MSE:', forestMSE)
print('Support Vector Regression MSE:', svmMSE)
print('K-Nearest Neighbors Regression MSE:', knnMSE)

models = {  weatherModel : lrMSE, 
            treeModel : treeMSE, 
            forestModel : forestMSE, 
            svmModel : svmMSE, 
            knnModel : knnMSE}

WeatherModel = min(models, key=models.get)




Linear Regression MSE:  1.4827586206896552
Decision Tree Regression MSE: 0.7106435083362084
Random Forest Regression MSE: 0.6635034182009142
Support Vector Regression MSE: 0.6450401465358189
K-Nearest Neighbors Regression MSE: 0.7109242563841559


### Creating a pred_weather list that contains the model predicted values for each timeslot

In [8]:

pred_weather = []
for i in range(1, 145):
    p = np.array([i])
    p = np.reshape(p, (1, -1))
    l = WeatherModel.predict(p)
    pred_weather.append(l[0])
for p in range(len(pred_weather)):
    pred_weather[p] = int(pred_weather[p])


In [9]:
only_nan = order_data[order_data.isna().any(axis=1)]


In [24]:
regionList = region_data['region_hash'].values

le = LabelEncoder()
 
def create_feature_list_train(only_nan):
    timeslot = []
    region = []
    weather = []
    poi = []
    gap = []
    for i in range(1, 145):
        per_timeSlot = only_nan[only_nan['timeslot'] == i]
        for j in range(region_data.shape[0]):
            region.append(region_data.iloc[j][0]) 
            poi.append(poi_dict[region_data.iloc[j][0]]) 
            timeslot.append(i) 
            weather.append(pred_weather[i - 1])   
            temp = per_timeSlot[per_timeSlot['start_region'] == region_data.iloc[j][0]]
            gap.append(temp.shape[0])
    regionEncoded = le.fit_transform(region) 
    FeatureList = [[x,y,z, m] for x,y,z,m in zip(timeslot, regionEncoded, weather, poi)]
    return FeatureList, gap

def create_feature_list_teste(df):
    timeslot = []
    region = []
    weather = []
    poi = []

    for i in range(1, 145):
        per_timeSlot = df[df['timeslot'] == i]
        for j in range(region_data.shape[0]):
            region.append(region_data.iloc[j][0])
            poi.append(poi_dict[region_data.iloc[j][0]])
            timeslot.append(i)
            weather.append(pred_weather[i - 1])
            temp = per_timeSlot[per_timeSlot['start_region'] == region_data.iloc[j][0]]
    regionEncoded = le.fit_transform(region)
    FeatureList = [[x,y,z,m] for x,y,z,m in zip(timeslot, regionEncoded, weather, poi)]
    return FeatureList



In [23]:
featureList, gap = create_feature_list_train(only_nan)

In [62]:
scaler = StandardScaler()
features_Scaled = scaler.fit_transform(featureList)



svrModel_final = SVR().fit(features_Scaled, gap)
LinearRegression_final = LinearRegression().fit(featureList, gap)
treeReg_final = DecisionTreeRegressor().fit(features_Scaled, gap)
forest_final = RandomForestRegressor().fit(features_Scaled, gap)
knn_final = KNeighborsRegressor().fit(features_Scaled, gap)





In [63]:
FeatureList = create_feature_list_teste(order_data_test)
gap_test = []
for i in range(len(FeatureList)):
    gap_test.append(0)

SVR_predictions = svrModel_final.predict(FeatureList)
LR_predictions = LinearRegression_final.predict(FeatureList)
treeReg_predictions = treeReg_final.predict(FeatureList)
forest_predictions = forest_final.predict(FeatureList)
knn_predictions = knn_final.predict(FeatureList)


In [64]:
SVR_error = mean_squared_error(gap_test, SVR_predictions)
LR_error = mean_squared_error(gap_test, LR_predictions)
treeReg_error = mean_squared_error(gap_test, treeReg_predictions)
forest_error = mean_squared_error(gap_test, forest_predictions)
knn_error = mean_squared_error(gap_test, knn_predictions)

print('SVR MSE: ', SVR_error)
print('Linear Regression MSE: ', LR_error)  
print('Decision Tree Regression MSE:', treeReg_error)
print('Random Forest Regression MSE:', forest_error)
print('K-Nearest Neighbors Regression MSE:', knn_error)


SVR MSE:  64.21383318344303
Linear Regression MSE:  28001.759429059577
Decision Tree Regression MSE: 255.97510683760683
Random Forest Regression MSE: 2970.0351055448714
K-Nearest Neighbors Regression MSE: 3817.751730769231
