In [2]:
import pandas as pd
import numpy as np
import ast
from tqdm import tqdm
from gensim.models import Word2Vec

In [3]:
# Dataset locations used by every function below.
# NOTE(review): root_path is an absolute path on one machine; adjust it before running elsewhere.
root_path = "D:/PythonProject/didi_giscup/data/giscup_2021/"
# Sub-directory names relative to root_path (test_path carries its trailing slash).
train_path = "processed_train"
test_path = "20200901/"
weather_path = f"{root_path}weather.csv"

In [4]:
# Load the weather table, replace the textual weather description with an
# integer code (1-5), and index the rows by their "date" value so that a day
# can be looked up with weather.loc[<date>, <column>].
weather = pd.read_csv(weather_path)
we_change = dict(zip(
    ["cloudy", "moderate rain", "showers", "heavy rain", "rainstorm"],
    range(1, 6),
))
weather["weather"] = weather["weather"].map(we_change)
# drop=False keeps the "date" column in the frame in addition to the index.
weather = weather.set_index("date", drop=False)
# Module-level corpus of link sequences; filled in by train_w2v_model().
all_link = []


def modify_head(day_path: str):
    """Add a ``link_list`` column to one day's ``head.csv`` and rewrite the file in place.

    ``link_list`` holds, for every order, the list of its link ids (as strings)
    in the row order of ``link.csv``. The call is a no-op when ``head.csv``
    already contains a ``link_list`` column, so it is safe to run repeatedly.

    Parameters
    ----------
    day_path : str
        Directory of one day, including the trailing path separator; it must
        contain ``head.csv`` and ``link.csv``.

    Notes
    -----
    The merge is an inner join, so orders without any row in ``link.csv`` are
    dropped from the rewritten ``head.csv``.
    """
    head_file = day_path + "head.csv"
    head = pd.read_csv(head_file)
    if "link_list" in head.columns:
        display(day_path + "head.csv was already modified.")
        return

    link = pd.read_csv(day_path + "link.csv")
    # The np.int / np.str aliases were removed in NumPy 1.24; the builtins are
    # the exact types those aliases pointed to.
    link["link_id"] = link["link_id"].astype(int).astype(str)
    # One Python list of link ids per order, indexed by order_id.
    link_id = link.groupby("order_id")["link_id"].agg(list)
    head = pd.merge(head, link_id.rename("link_list"), left_on="order_id", right_index=True)
    head.to_csv(head_file, index=False)
    display(day_path + "head.csv write successful.")

hid_size = 128
def train_w2v_model():
    """Train a Word2Vec model on the link sequences of all train days plus the test day.

    Reads the ``link_list`` column of every ``head.csv`` (August 2020 without
    day 3, then the test directory), de-duplicates the serialized sequences,
    parses them back into Python lists and trains a ``Word2Vec`` model with
    ``hid_size`` dimensions. The model is saved to ``./w2v_model.model``.

    Side effects
    ------------
    The module-level ``all_link`` is replaced by the parsed corpus (a list of
    lists of link-id strings). The corpus is always rebuilt from the files, so
    the function can be called again in the same kernel; previously a second
    call failed because ``all_link`` already held unhashable lists.

    ``head.csv`` must already contain ``link_list`` (see ``modify_head``).
    """
    global all_link

    path_ = root_path + "processed_train/"
    day_list = [d for d in range(1, 32) if d != 3]

    # Collect the serialized sequences in a set: de-duplication happens on
    # insert instead of rebuilding list(set(...)) over the whole corpus per day.
    unique_routes = set()

    # add trainset link_id
    for d in day_list:
        day_dir = path_ + "202008" + "{:0>2d}".format(d) + "/"
        head = pd.read_csv(day_dir + "head.csv", usecols=["link_list"])
        unique_routes.update(head["link_list"].tolist())
        print(d, "'s link added.")

    # add testset link_id
    head = pd.read_csv(root_path + test_path + "head.csv", usecols=["link_list"])
    unique_routes.update(head["link_list"].tolist())
    del head
    print("Test's link added.")

    # Each entry is the CSV text of a Python list; literal_eval turns it back
    # into a list of link-id strings.
    all_link = [ast.literal_eval(lk) for lk in tqdm(unique_routes)]
    w2v_model = Word2Vec(all_link, vector_size=hid_size)
    w2v_model.save("./w2v_model.model")
    print("wordd2vec model training finish.")
    

def built_vec():
    """Export the trained link embeddings to ``w2v_table.csv``.

    Loads ``./w2v_model.model`` and writes one row per vocabulary entry:
    ``link_id`` (the key converted to ``int``) followed by ``vec_dim_0`` ...
    ``vec_dim_{n-1}``.

    The number of vector columns is taken from the loaded model rather than
    from the module-level ``hid_size``, so the header always matches the data
    even if the saved model was trained with a different dimensionality.
    """
    w2v_model = Word2Vec.load("./w2v_model.model")
    vectors = w2v_model.wv

    # One row per key: the integer link id followed by its embedding values.
    list_vec = [[int(key), *vectors[key]] for key in tqdm(vectors.index_to_key)]

    link_col = ["link_id"] + ["vec_dim_" + str(i) for i in range(vectors.vector_size)]
    wv_df = pd.DataFrame(list_vec, columns=link_col)
    wv_df.to_csv("w2v_table.csv", index=False)
    
    


def day_feature(day_path: str) -> None:
    """Build the per-order feature table of one day and write it to ``feature.csv``.

    Reads ``head.csv``, ``link.csv`` and ``cross.csv`` from ``day_path``, adds
    weather, time-slice, link and cross aggregates to the ``head`` frame and
    saves the result as ``day_path + "feature.csv"``.

    Parameters
    ----------
    day_path : str
        Directory of one day. The eight characters before the final character
        are parsed as the integer date used to index ``weather``, so the path
        is expected to end in ``YYYYMMDD/`` (as built by the callers in this file).

    Notes
    -----
    NOTE(review): every merge without ``how=`` is an inner join, so an order
    that has no row in ``link.csv`` or ``cross.csv`` is dropped from the
    output - confirm that this is intended, in particular for the test day.
    """
    head = pd.read_csv(day_path + "head.csv")
    link = pd.read_csv(day_path + "link.csv")
    cross = pd.read_csv(day_path + "cross.csv")
#     print("start link shape: ", link.shape)
#     display("head: ", head.head())
    
    # weather set up: the same scalar values of that day are broadcast to every order
    time_id = int(day_path[-9:-1])
    head["weather"] = weather.loc[time_id, "weather"]
    head["hightemp"] = weather.loc[time_id, "hightemp"]
    head["lowtemp"] = weather.loc[time_id, "lowtemp"]
    head["temp_sub"] = head["hightemp"] - head["lowtemp"]

    # slice id features
    # presumably slice_id counts 5-minute slices, so slice_id * 5 is minutes - TODO confirm
    head["slice_id"] = head["slice_id"].astype(int)
    head["slice_1m"] = head["slice_id"] * 5
    head["slice_30m"] = (head["slice_id"] * 5) // 30
    head["slice_1h"] = (head["slice_id"] * 5) // 60
    
    # link count: number of link rows per order (inner join on order_id)
    link_cnt = link["order_id"].value_counts()
    head = pd.merge(head, link_cnt.rename("link_cnt"), left_on="order_id", right_index=True)
    
    # mean distance per link, and distance divided by simple_eta
    head["mean_distance"] = head["distance"] / head["link_cnt"]
    head["speed_one"] = head["distance"] / head["simple_eta"]
    
    
    # link_time features: sum / mean / max / min of link_time per order
    link_statics = link.groupby("order_id")["link_time"].agg(link_time_sum="sum", link_time_mean="mean",
                                                            link_time_max="max", link_time_min="min")
    head = pd.merge(head, link_statics, left_on="order_id", right_index=True)
    head["speed_two"] = head["distance"] / head["link_time_sum"]
    # link_current_status features
    current_status = link.groupby("order_id")["link_current_status"].agg(link_cur_sta_mean="mean",
                                                                        link_cur_sta_sum="sum")
    head = pd.merge(head, current_status, left_on="order_id", right_index=True)
    # count and sum over links with status > 2 (named "conges"; presumably congested - TODO confirm)
    conges = link[link.link_current_status > 2].groupby("order_id")["link_current_status"].agg(conges_cnt="count",
                                                                                          conges_sum="sum")
    # left join: orders without such links get NaN, which the fillna below turns into 0.
    # Note that fillna(0) is applied to the whole frame, not only to the new columns.
    head = pd.merge(head, conges, on="order_id", how="left")
    head.fillna(0, inplace=True)
    
    # same aggregates for links with status == 2 (named "amble"; presumably slow traffic - TODO confirm)
    amble = link[link.link_current_status == 2].groupby("order_id")["link_current_status"].agg(amble_cnt="count",
                                                                                          amble_sum="sum")
    head = pd.merge(head, amble, on="order_id", how="left")
    head.fillna(0, inplace=True)
    
    
    # cross count: number of cross rows per order (inner join on order_id)
    cross_cnt = cross["order_id"].value_counts()
    head = pd.merge(head, cross_cnt.rename("cross_cnt"), left_on="order_id", right_index=True)
    # cross_time features; note that the column named cross_time_mode holds the median
    cross_statics = cross.groupby("order_id")["cross_time"].agg(cross_time_sum="sum", cross_time_mean="mean",
                                                                cross_time_max="max", cross_time_mode="median")
    head = pd.merge(head, cross_statics, left_on="order_id", right_index=True)
    
    # First and last cross row of every order, in the row order of cross.csv.
    # Suffixes are applied only on a column-name clash: unless head.csv itself
    # has cross_from / cross_to, the first-cross columns keep their plain names
    # and only the last-cross columns receive the "_last" suffix.
    first_cross = cross.drop_duplicates(subset=["order_id"], keep="first")[["order_id", "cross_from", "cross_to"]]
    head = pd.merge(head, first_cross, on="order_id", suffixes=("", "_first"))
    last_cross = cross.drop_duplicates(subset=["order_id"], keep="last")[["order_id", "cross_from", "cross_to"]]
    head = pd.merge(head, last_cross, on="order_id", suffixes=("", "_last"))
    
    # summed link time relative to simple_eta
    head["link_time_sum_ratio"] = head["link_time_sum"] / head["simple_eta"]
    
#     display(head.columns)
#     display("processed head: ", head.head(7))
    head.to_csv(day_path + "feature.csv", index=False)

def modify_main():
    """Run ``modify_head`` on every train day (August 2020, day 3 skipped) and on the test day."""
    train_root = root_path + "processed_train/"
    for day in range(1, 32):
        if day == 3:
            # day 3 is excluded from the train set
            continue
        modify_head(f"{train_root}202008{day:02d}/")
    # the test directory is handled last
    modify_head(root_path + test_path)

    
def feature_main():
    """Entry point of the feature step; currently limited to day 1 of the train set.

    In its present state the function only calls ``modify_head`` for
    ``20200801/``. The ``day_feature`` calls for the train days and for the
    test day are disabled (kept below as comments).
    """
    train_root = root_path + "processed_train/"
    # range(1, 2) yields only day 1; the day-3 filter is kept for when the range is widened.
    for day in (d for d in range(1, 2) if d != 3):
        day_dir = "202008{:02d}/".format(day)
        modify_head(train_root + day_dir)
#         day_feature(train_root + day_dir)
#         print(day_dir, "feature built sucessfully.")
#     day_feature(root_path + test_path)
#     print("20200901/ feature built sucessfully.")

In [6]:
# Pipeline driver. train_w2v_model() reads the "link_list" column that
# modify_head() writes into every head.csv, so modify_main() has to be run
# once (uncomment it) before this cell works on fresh data.
# feature_main()
# modify_main()
# Train the link embedding, then export it to w2v_table.csv.
train_w2v_model()
built_vec()

1 's link added.
2 's link added.


KeyboardInterrupt: 

In [None]:
# a = [1, 1, 3]
# b = [2, 4, 2]
# c = [1, 1, 2]
# d = pd.DataFrame({"a": a, "b": b, "c": c})
# display(d)
# def func(gro):
#     gro["d"] = gro["b"] + 1
#     gro["e"] = gro["c"] + 1
#     display(gro)
#     return gro
# d = d.groupby("a").apply(func)
# d

In [None]:
# Scratch check: turning a set into a list.
a = list({1, 4, 5})
a

In [None]:
# Scratch check: look up the "hightemp" value of the row labelled 20200801 in the date-indexed weather table.
weather.loc[20200801, "hightemp"]