In [48]:
import pandas as pd
import os

def read_space_separated_csv(file_path, column_names, delim = '\t'):
    return pd.read_csv(file_path, sep=delim, names=column_names)

def process_poi_data(poi_data_str):
    poi_data_dict = {}
    for pair in poi_data_str.split('\t'):
        key, value = pair.split(':')
        poi_data_dict[key] = int(value)
    return poi_data_dict

def read_poi_data(file_path):
    with open(file_path, 'r') as file:
        lines = file.read().split('\n')

    data = []

    for line in lines:
        if line:
            region_hash, poi_data_str = line.split('\t', 1)
            poi_data_dict = process_poi_data(poi_data_str)
            data.append({'region_hash': region_hash, 'poi_data': poi_data_dict})

    column_names = ['region_hash', 'poi_data']
    poi_data_df = pd.DataFrame(data, columns=column_names)
    return poi_data_df


file_path = 'training_data/poi_data/poi_data'
poi_data = read_poi_data(file_path)

# Read the cluster_map data
cluster_map_columns = ['region_hash', 'region_id']
cluster_map_file_path = 'training_data/cluster_map/cluster_map'
cluster_map = read_space_separated_csv(cluster_map_file_path, cluster_map_columns)




# Read the order_data files
order_data_columns = ['order_id', 'driver_id', 'passenger_id', 'start_region_hash', 'dest_region_hash', 'Price', 'Time']
order_data_folder_path = 'training_data/order_data/'
order_data_files = [f for f in os.listdir(order_data_folder_path) if os.path.isfile(os.path.join(order_data_folder_path, f)) and not f.startswith('._')]
order_data_list = [read_space_separated_csv(os.path.join(order_data_folder_path, file), order_data_columns) for file in order_data_files]
order_data = pd.concat(order_data_list)


# Read the weather_data files
weather_data_columns = ['Time', 'Weather', 'temperature', 'PM2.5']
weather_data_folder_path = 'training_data/weather_data/'
weather_data_files = [f for f in os.listdir(weather_data_folder_path) if os.path.isfile(os.path.join(weather_data_folder_path, f)) and not f.startswith('._')]
weather_data_list = [read_space_separated_csv(os.path.join(weather_data_folder_path, file), weather_data_columns) for file in weather_data_files]
weather_data = pd.concat(weather_data_list)




In [49]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import LabelEncoder


# Preprocessing and feature extraction

# Convert the 'Time' column to datetime format
order_data['Time'] = pd.to_datetime(order_data['Time'])
weather_data['Time'] = pd.to_datetime(weather_data['Time'])

# Extract hour, minute, and day of the week from the 'Time' column
order_data['hour'] = order_data['Time'].dt.hour
order_data['minute'] = order_data['Time'].dt.minute
order_data['day_of_week'] = order_data['Time'].dt.dayofweek

# Merge the cluster_map into the order_data
order_data = order_data.merge(cluster_map, left_on='start_region_hash', right_on='region_hash', how='left')
order_data = order_data.rename(columns={'region_id': 'start_region_id'}).drop('region_hash', axis=1)

order_data = order_data.merge(cluster_map, left_on='dest_region_hash', right_on='region_hash', how='left')
order_data = order_data.rename(columns={'region_id': 'dest_region_id'}).drop('region_hash', axis=1)

# Merge the weather_data into the order_data
order_data = order_data.merge(weather_data, on='Time', how='left')

# Calculate the demand and supply columns
order_data['demand'] = 1
order_data['supply'] = order_data['driver_id'].notnull().astype(int)

# Aggregate data by time slot and region
grouped_data = order_data.groupby([pd.Grouper(key='Time', freq='10min'), 'start_region_id']).agg({'demand': 'sum', 'supply': 'sum'}).reset_index()

grouped_data['gap'] = grouped_data['demand'] - grouped_data['supply']

# Prepare the features and target variable
X = grouped_data.drop(['Time', 'gap', 'demand', 'supply'], axis=1)
y = grouped_data['gap']
# Encode the region_id column


X['start_region_id'] = LabelEncoder().fit_transform(X['start_region_id'])

# Split the data into training and testing datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Train the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict the demand-supply gap for the testing dataset
y_pred = model.predict(X_test)

# Calculate the mean absolute error
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)
print("Mean Absolute Error:", mae)

Mean Squared Error: 2895.076330189059
R-squared: 0.0002640992648823559
Mean Absolute Error: 13.472912971501852


In [50]:
test_poi_data_file_path = 'test_set/poi_data/poi_data'
test_poi_data = read_poi_data(test_poi_data_file_path)

test_cluster_map_file_path = 'test_set/cluster_map/cluster_map'
test_cluster_map = read_space_separated_csv(test_cluster_map_file_path, cluster_map_columns)

order_data_columns = ['driver_id', 'passenger_id', 'start_region_hash', 'dest_region_hash', 'Time']
test_order_data_folder_path = 'test_set/order_data/'
test_order_data_files = [f for f in os.listdir(test_order_data_folder_path) if os.path.isfile(os.path.join(test_order_data_folder_path, f)) and not f.startswith('._')]
test_order_data_list = [read_space_separated_csv(os.path.join(test_order_data_folder_path, file), order_data_columns, ',') for file in test_order_data_files]
test_order_data = pd.concat(test_order_data_list)

test_weather_data_folder_path = 'test_set/weather_data/'
test_weather_data_files = [f for f in os.listdir(test_weather_data_folder_path) if os.path.isfile(os.path.join(test_weather_data_folder_path, f)) and not f.startswith('._')]
test_weather_data_list = [read_space_separated_csv(os.path.join(test_weather_data_folder_path, file), weather_data_columns) for file in test_weather_data_files]
test_weather_data = pd.concat(test_weather_data_list)

In [51]:

# Convert the 'Time' column to datetime format
test_order_data['Time'] = pd.to_datetime(test_order_data['Time'])
test_weather_data['Time'] = pd.to_datetime(test_weather_data['Time'])

# Extract hour, minute, and day of the week from the 'Time' column
test_order_data['hour'] = test_order_data['Time'].dt.hour
test_order_data['minute'] = test_order_data['Time'].dt.minute
test_order_data['day_of_week'] = test_order_data['Time'].dt.dayofweek

# Merge the cluster_map into the order_data
test_order_data = test_order_data.merge(test_cluster_map, left_on='start_region_hash', right_on='region_hash', how='left')
test_order_data = test_order_data.rename(columns={'region_id': 'start_region_id'}).drop('region_hash', axis=1)

test_order_data = test_order_data.merge(test_cluster_map, left_on='dest_region_hash', right_on='region_hash', how='left')
test_order_data = test_order_data.rename(columns={'region_id': 'dest_region_id'}).drop('region_hash', axis=1)

# Merge the weather_data into the order_data
test_order_data = order_data.merge(test_weather_data, on='Time', how='left')

# Calculate the demand and supply columns
test_order_data['demand'] = 1
test_order_data['supply'] = test_order_data['driver_id'].notnull().astype(int)

# Aggregate data by time slot and region
test_grouped_data = test_order_data.groupby(['Time', 'start_region_id']).agg({'demand': 'sum', 'supply': 'sum'}).reset_index()
test_grouped_data['gap'] = test_grouped_data['demand'] - test_grouped_data['supply']


# Prepare the test features and target variable
X_test = test_grouped_data.drop(['Time', 'gap', 'demand', 'supply'], axis=1)
y_test = test_grouped_data['gap']

# Encode the region_id column for the test data
X_test['start_region_id'] = LabelEncoder().fit_transform(X_test['start_region_id'])

# Predict the demand-supply gap for the test dataset
y_test_pred = model.predict(X_test)

# Calculate the mean absolute error for the test dataset
test_mae = mean_absolute_error(y_test, y_test_pred)
print(f"Mean Absolute Error for Test Dataset: {test_mae}")



Mean Absolute Error for Test Dataset: 9.223269442425517
