In [1]:
import numpy as np
import pandas as pd
import time
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
# Load the raw training and test tables from CSV files next to the notebook.
df_train, df_test = (
    pd.read_csv(csv_path) for csv_path in ("./train.csv", "./test.csv")
)

In [3]:
def preprocessing(df, test=False):
    """Drop unused columns and, for training data, remove invalid bookings.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw bookings table. It must contain ``ID``, ``agent`` and ``company``.
        Training frames must additionally contain ``reservation_status``,
        ``reservation_status_date``, ``stays_in_week_nights``,
        ``stays_in_weekend_nights``, ``adults``, ``children``, ``babies``
        and ``adr``.
    test : bool, default False
        When true, only the ``ID`` column is dropped and no rows are removed.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is not modified. ``agent`` and
        ``company`` are cast to ``object`` dtype.

    Notes
    -----
    Row filtering uses boolean masks, so it works for any index (not only a
    default ``RangeIndex``) and avoids a Python-level loop over every row.
    Progress is reported with ``print`` (frame shapes and removed-row count).
    """
    if test:
        df = df.drop(columns=["ID"])
        print(df.shape)
    else:
        df = df.drop(columns=["ID", "reservation_status", "reservation_status_date"])
        print(df.shape)

        # Bookings with zero nights in total.
        no_stay = (df["stays_in_week_nights"] == 0) & (df["stays_in_weekend_nights"] == 0)
        # Bookings with zero guests of any kind.
        no_people = (df["adults"] == 0) & (df["children"] == 0) & (df["babies"] == 0)
        invalid = no_stay | no_people

        print("刪去%d行" % invalid.sum())
        df = df[~invalid]
        print(df.shape)

    # assign() returns a new frame, so a filtered slice is never written to.
    df = df.assign(
        agent=df["agent"].astype("object"),
        company=df["company"].astype("object"),
    )

    if not test:
        # Keep only bookings with a strictly positive adr.
        df = df[df["adr"] > 0]
        print("adr > 0:", df.shape)
    return df

In [4]:
def mask(df):
    """Return the rows of ``df`` that pass all three filters.

    A row is kept when ``adr`` is below 5000, ``babies`` is at most 2 and
    ``distribution_channel`` is not the string ``"Undefined"``.
    """
    adr_ok = df["adr"] < 5000
    babies_ok = df["babies"] <= 2
    channel_ok = df["distribution_channel"] != "Undefined"
    return df[adr_ok & babies_ok & channel_ok]

In [5]:
# Clean the training rows, then remove "adr" from the feature columns
# (presumably because it is not available in the test set - verify).
df_train = mask(preprocessing(df_train)).drop(columns=["adr"])

# The test set only has its ID column dropped; no rows are removed.
df_test = preprocessing(df_test, True)

# Separate the target from the training features.
label = df_train.pop("is_canceled")
print(df_train.shape, df_test.shape)

(91531, 30)
刪去750行
(90781, 30)
adr > 0: (88982, 30)
(27859, 28)
(88974, 28) (27859, 28)


In [6]:
# One-hot encode train and test together so both end up with the same
# set of dummy columns.
df_concat = pd.get_dummies(pd.concat([df_train, df_test]))
print(df_concat.shape)

(116833, 931)


In [7]:
# Split the encoded frame back into its training and test rows.
# The boundary is derived from the label vector (one entry per training row)
# instead of a hard-coded row count, so the split stays correct when the
# input data or the filtering rules change.
n_train = len(label)
df_train = df_concat.iloc[:n_train]
df_test = df_concat.iloc[n_train:]
print(df_train.shape)
print(df_test.shape)

(88974, 931)
(27859, 931)


In [8]:
# Replace every remaining missing value with 0 in both feature frames.
df_train, df_test = (frame.fillna(0) for frame in (df_train, df_test))

In [9]:
# Convert the training features and target to NumPy arrays for scikit-learn.
x, y = df_train.values, label.values

for array in (x, y):
    print(array.shape)

(88974, 931)
(88974,)


In [10]:
# Hold out 20% of the training rows for validation (fixed seed).
x_train, x_val, y_train, y_val = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [11]:
# Fit a random forest on the training split and report accuracy on the
# training and validation splits.
model = RandomForestClassifier(random_state=0)
model.fit(x_train, y_train)

train_acc = accuracy_score(y_train, model.predict(x_train))

# `pred` ends up holding the validation predictions, as before.
pred = model.predict(x_val)
val_acc = accuracy_score(y_val, pred)

print(train_acc, val_acc)

0.9946191994829936 0.9062658050014049


In [12]:
# Feature matrix for the test rows, in the same column layout as `x`.
x_test = df_test.values
print(np.shape(x_test))

(27859, 931)


In [13]:
# Refit the model on the full training set and time the fit.
# perf_counter() is a monotonic clock, so the measured interval cannot be
# distorted by system clock adjustments the way time.time() can.
start = time.perf_counter()
model.fit(x, y)
end = time.perf_counter()

# Split the elapsed whole seconds into minutes and remaining seconds.
m, s = divmod(int(end - start), 60)
print("time: %d m %d s" % (m, s))

# Predict the target for every test row.
new_pred = model.predict(x_test)
print(len(new_pred))
print(new_pred)

time: 0 m 54 s
27859
[0 1 0 ... 0 0 0]


In [14]:
# Build the submission: the original test table plus the predicted target.
# The raw CSV is read into `new_test` directly rather than rebinding
# `df_test`, so the encoded feature frame survives and the earlier cells
# remain re-runnable.
# NOTE(review): the output path is absolute and machine-specific; consider
# moving it to a configurable location.
OUTPUT_CSV = "/home/data/andy/course/ML_Lin/new_test.csv"

new_test = pd.read_csv("./test.csv")
# Insert the predictions as the third column (position 2).
new_test.insert(loc=2, column="is_canceled", value=new_pred)
new_test.to_csv(OUTPUT_CSV, index=False)