# データ処理の流れ
1. CSVデータをimport
2. いらない列を削除
3. 性別とか職業をフラグに変換
4. UserID毎に購入金額を集計

---

# 参考になっているサイト
```
- ゆるふわPandasチートシート
https://qiita.com/tanemaki/items/2ed05e258ef4c9e6caac
- groupby の初歩と python（DataFrame）によるサンプル
http://ailaby.com/groupby_easy/
- pandas.DataFrameをGroupByでグルーピングし統計量を算出
https://note.nkmk.me/python-pandas-groupby-statistics/
```

In [1]:
# モジュールのimport
import pandas as pd

In [2]:
df = pd.read_csv('BlackFriday.csv')

In [3]:
df.head(5)

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969


In [4]:
print("*********************************")
print(df.dtypes)
print("*********************************")
print(df.index)
print("*********************************")

*********************************
User_ID                         int64
Product_ID                     object
Gender                         object
Age                            object
Occupation                      int64
City_Category                  object
Stay_In_Current_City_Years     object
Marital_Status                  int64
Product_Category_1              int64
Product_Category_2            float64
Product_Category_3            float64
Purchase                        int64
dtype: object
*********************************
RangeIndex(start=0, stop=537577, step=1)
*********************************


In [5]:
df = df.drop('Product_Category_1', axis=1)
df = df.drop('Product_Category_2', axis=1)
df = df.drop('Product_Category_3', axis=1)
print(df.dtypes)

User_ID                        int64
Product_ID                    object
Gender                        object
Age                           object
Occupation                     int64
City_Category                 object
Stay_In_Current_City_Years    object
Marital_Status                 int64
Purchase                       int64
dtype: object


In [6]:
# カテゴリ変数をone-hot codingで離散化
list = ['Gender','Age','Occupation','City_Category','Stay_In_Current_City_Years','Marital_Status']
for col in list:
    colname = col
    df_dummies = pd.get_dummies(df[colname], prefix=colname)
    df.drop([colname], axis=1, inplace=True)
    df = df.join(df_dummies)

In [7]:
df.head(5)
print(df.dtypes)

User_ID                           int64
Product_ID                       object
Purchase                          int64
Gender_F                          uint8
Gender_M                          uint8
Age_0-17                          uint8
Age_18-25                         uint8
Age_26-35                         uint8
Age_36-45                         uint8
Age_46-50                         uint8
Age_51-55                         uint8
Age_55+                           uint8
Occupation_0                      uint8
Occupation_1                      uint8
Occupation_2                      uint8
Occupation_3                      uint8
Occupation_4                      uint8
Occupation_5                      uint8
Occupation_6                      uint8
Occupation_7                      uint8
Occupation_8                      uint8
Occupation_9                      uint8
Occupation_10                     uint8
Occupation_11                     uint8
Occupation_12                     uint8


In [8]:
df_groupby = df.groupby("User_ID",as_index=False)

In [9]:
df_groupby.head(5)

Unnamed: 0,User_ID,Product_ID,Purchase,Gender_F,Gender_M,Age_0-17,Age_18-25,Age_26-35,Age_36-45,Age_46-50,...,City_Category_A,City_Category_B,City_Category_C,Stay_In_Current_City_Years_0,Stay_In_Current_City_Years_1,Stay_In_Current_City_Years_2,Stay_In_Current_City_Years_3,Stay_In_Current_City_Years_4+,Marital_Status_0,Marital_Status_1
0,1000001,P00069042,8370,1,0,1,0,0,0,0,...,1,0,0,0,0,1,0,0,1,0
1,1000001,P00248942,15200,1,0,1,0,0,0,0,...,1,0,0,0,0,1,0,0,1,0
2,1000001,P00087842,1422,1,0,1,0,0,0,0,...,1,0,0,0,0,1,0,0,1,0
3,1000001,P00085442,1057,1,0,1,0,0,0,0,...,1,0,0,0,0,1,0,0,1,0
4,1000002,P00285442,7969,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,1,1,0
5,1000003,P00193542,15227,0,1,0,0,1,0,0,...,1,0,0,0,0,0,1,0,1,0
6,1000004,P00184942,19215,0,1,0,0,0,0,1,...,0,1,0,0,0,1,0,0,0,1
7,1000004,P00346142,15854,0,1,0,0,0,0,1,...,0,1,0,0,0,1,0,0,0,1
8,1000004,P0097242,15686,0,1,0,0,0,0,1,...,0,1,0,0,0,1,0,0,0,1
9,1000005,P00274942,7871,0,1,0,0,1,0,0,...,1,0,0,0,1,0,0,0,0,1


In [14]:
dict = {
'Purchase':sum,
'Gender_F':max,
'Gender_M':max,
'Age_0-17':max,
'Age_18-25':max,
'Age_26-35':max,
'Age_36-45':max,
'Age_46-50':max,
'Age_51-55':max,
'Age_55+':max,
'Occupation_0':max,
'Occupation_1':max,
'Occupation_2':max,
'Occupation_3':max,
'Occupation_4':max,
'Occupation_5':max,
'Occupation_6':max,
'Occupation_7':max,
'Occupation_8':max,
'Occupation_9':max,
'Occupation_10':max,
'Occupation_11':max,
'Occupation_12':max,
'Occupation_13':max,
'Occupation_14':max,
'Occupation_15':max,
'Occupation_16':max,
'Occupation_17':max,
'Occupation_18':max,
'Occupation_19':max,
'Occupation_20':max,
'City_Category_A':max,
'City_Category_B':max,
'City_Category_C':max,
'Stay_In_Current_City_Years_0':max,
'Stay_In_Current_City_Years_1':max,
'Stay_In_Current_City_Years_2':max,
'Stay_In_Current_City_Years_3':max,
'Stay_In_Current_City_Years_4+':max,
'Marital_Status_0':max,
'Marital_Status_1':max}
df_dummy = df_groupby.agg(dict)

In [15]:
df_dummy.head(5)

Unnamed: 0,User_ID,Purchase,Gender_F,Gender_M,Age_0-17,Age_18-25,Age_26-35,Age_36-45,Age_46-50,Age_51-55,...,City_Category_A,City_Category_B,City_Category_C,Stay_In_Current_City_Years_0,Stay_In_Current_City_Years_1,Stay_In_Current_City_Years_2,Stay_In_Current_City_Years_3,Stay_In_Current_City_Years_4+,Marital_Status_0,Marital_Status_1
0,1000001,333481,1,0,1,0,0,0,0,0,...,1,0,0,0,0,1,0,0,1,0
1,1000002,810353,0,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,1,0
2,1000003,341635,0,1,0,0,1,0,0,0,...,1,0,0,0,0,0,1,0,1,0
3,1000004,205987,0,1,0,0,0,0,1,0,...,0,1,0,0,0,1,0,0,0,1
4,1000005,821001,0,1,0,0,1,0,0,0,...,1,0,0,0,1,0,0,0,0,1


# ランダムフォレストにかけてみる
```
- 【Pythonで決定木 & Random Forest】タイタニックの生存者データを分析してみた
http://www.randpy.tokyo/entry/python_random_forest
```

In [16]:
# 準備
from sklearn.model_selection import train_test_split
train_X = df_dummy.drop('Purchase', axis=1)
train_y = df_dummy.Purchase
(train_X, test_X ,train_y, test_y) = train_test_split(train_X, train_y, test_size = 0.3, random_state = 666)

In [17]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(random_state=0)
clf = clf.fit(train_X, train_y)
pred = clf.predict(test_X)

In [18]:
from sklearn.metrics import (roc_curve, auc, accuracy_score)

pred = clf.predict(test_X)
fpr, tpr, thresholds = roc_curve(test_y, pred, pos_label=1)
auc(fpr, tpr)
accuracy_score(pred, test_y)



0.0

In [19]:
#ランダムフォレスト
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(random_state=0)
clf = clf.fit(train_X, train_y)
pred = clf.predict(test_X)
fpr, tpr, thresholds = roc_curve(test_y, pred, pos_label=1)
auc(fpr, tpr)
accuracy_score(pred, test_y)



0.0