In [32]:
import pandas as pd
import os

def read_space_separated_csv(file_path, column_names):
    return pd.read_csv(file_path, sep='\t', names=column_names)

def process_poi_data(poi_data_str):
    poi_data_dict = {}
    for pair in poi_data_str.split('\t'):
        key, value = pair.split(':')
        poi_data_dict[key] = int(value)
    return poi_data_dict

def read_poi_data(file_path):
    with open(file_path, 'r') as file:
        lines = file.read().split('\n')

    data = []

    for line in lines:
        if line:
            region_hash, poi_data_str = line.split('\t', 1)
            poi_data_dict = process_poi_data(poi_data_str)
            data.append({'region_hash': region_hash, 'poi_data': poi_data_dict})

    column_names = ['region_hash', 'poi_data']
    poi_data_df = pd.DataFrame(data, columns=column_names)
    return poi_data_df


file_path = 'training_data/poi_data/poi_data'
poi_data = read_poi_data(file_path)

# Read the cluster_map data
cluster_map_columns = ['region_hash', 'region_id']
cluster_map_file_path = 'training_data/cluster_map/cluster_map'
cluster_map = read_space_separated_csv(cluster_map_file_path, cluster_map_columns)




# Read the order_data files
order_data_columns = ['order_id', 'driver_id', 'passenger_id', 'start_region_hash', 'dest_region_hash', 'Price', 'Time']
order_data_folder_path = 'training_data/order_data/'
order_data_files = [f for f in os.listdir(order_data_folder_path) if os.path.isfile(os.path.join(order_data_folder_path, f)) and not f.startswith('._')]
order_data_list = [read_space_separated_csv(os.path.join(order_data_folder_path, file), order_data_columns) for file in order_data_files]
order_data = pd.concat(order_data_list)


# Read the weather_data files
weather_data_columns = ['Time', 'Weather', 'temperature', 'PM2.5']
weather_data_folder_path = 'training_data/weather_data/'
weather_data_files = [f for f in os.listdir(weather_data_folder_path) if os.path.isfile(os.path.join(weather_data_folder_path, f)) and not f.startswith('._')]
weather_data_list = [read_space_separated_csv(os.path.join(weather_data_folder_path, file), weather_data_columns) for file in weather_data_files]
weather_data = pd.concat(weather_data_list)




In [36]:
import pandas as pd
import numpy as np
import os
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder

# Your code to read data goes here...

# Preprocessing and feature extraction

# Convert the 'Time' column to datetime format
order_data['Time'] = pd.to_datetime(order_data['Time'])
weather_data['Time'] = pd.to_datetime(weather_data['Time'])

# Extract hour, minute, and day of the week from the 'Time' column
order_data['hour'] = order_data['Time'].dt.hour
order_data['minute'] = order_data['Time'].dt.minute
order_data['day_of_week'] = order_data['Time'].dt.dayofweek

# Merge the cluster_map into the order_data
order_data = order_data.merge(cluster_map, left_on='start_region_hash', right_on='region_hash', how='left')
order_data = order_data.rename(columns={'region_id': 'start_region_id'}).drop('region_hash', axis=1)

order_data = order_data.merge(cluster_map, left_on='dest_region_hash', right_on='region_hash', how='left')
order_data = order_data.rename(columns={'region_id': 'dest_region_id'}).drop('region_hash', axis=1)

# Merge the weather_data into the order_data
order_data = order_data.merge(weather_data, on='Time', how='left')

# Calculate the demand and supply columns
order_data['demand'] = 1
order_data['supply'] = order_data['driver_id'].notnull().astype(int)

# Aggregate data by time slot and region
grouped_data = order_data.groupby(['Time', 'start_region_id']).agg({'demand': 'sum', 'supply': 'sum'}).reset_index()
grouped_data['gap'] = grouped_data['demand'] - grouped_data['supply']

# Prepare the features and target variable
X = grouped_data.drop(['Time', 'gap', 'demand', 'supply'], axis=1)
y = grouped_data['gap']
# Encode the region_id column
le = LabelEncoder()
X['start_region_id'] = le.fit_transform(X['start_region_id'])

# Split the data into training and testing datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict the demand-supply gap for the testing dataset
y_pred = model.predict(X_test)

# Calculate the mean absolute error
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae}")


KeyError: "Column(s) ['PM2.5', 'Weather', 'temperature'] do not exist"

In [35]:

# # Prepare the features and target variable
# X = grouped_data.drop(['Time', 'gap'], axis=1)
# y = grouped_data['gap']

# # Encode the region_id column
# le = LabelEncoder()
# X['start_region_id'] = le.fit_transform(X['start_region_id'])

# # Train the linear regression model on the entire dataset
# model = LinearRegression()
# model.fit(X, y)

# # Generate predictions for each region and time slot in the test dataset
# test_date = pd.to_datetime('2016-01-23')
# test_hour_slots = [test_date + timedelta(minutes=10 * i) for i in range(144)]

# results = []
# for region_id in cluster_map['region_id'].unique():
#     for time_slot in test_hour_slots:
#         test_data = {
#             'Time': time_slot,
#             'start_region_id': region_id,
#             'hour': time_slot.hour,
#             'minute': time_slot.minute,
#             'day_of_week': time_slot.dayofweek
#         }
        
#         test_df = pd.DataFrame([test_data])
#         test_df['start_region_id'] = le.transform(test_df['start_region_id'])
        
#         prediction = model.predict(test_df.drop('Time', axis=1))[0]
#         results.append({'Region_id': region_id, 'Time_slot': time_slot, 'Prediction_value': prediction})

# # Save the predictions in the required format
# predictions_df = pd.DataFrame(results)
# predictions_df['Time_slot'] = predictions_df['Time_slot'].apply(lambda x: x.strftime('%Y-%m-%d-%H-%M'))
# predictions_df.to_csv('predictions.csv', index=False, columns=['Region_id', 'Time_slot', 'Prediction_value'])

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- day_of_week
- hour
- minute
Feature names seen at fit time, yet now missing:
- demand
- supply
