In [None]:
import pandas as pd
import numpy as np
import pickle

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# pandas.get_dummies()도 사용할 것

from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN

from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
import xgboost as xgb
from xgboost import plot_importance

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score

In [None]:
with open("../DATA/final_df_user_statistics.pkl", 'rb') as f:
    final_stat = pickle.load(f)

In [None]:
final_stat.head()

In [None]:
df_PCA = final_stat.copy()
df_Tree = final_stat.copy()

In [None]:
df_PCA = df_PCA.join(pd.get_dummies(df_PCA[['major_platform']]), how = 'inner')
df_PCA.drop('major_platform', axis = 1, inplace = True)

In [None]:
scaler = MinMaxScaler(copy=True, feature_range=(0, 1))
scaler.fit(df_PCA.drop("three", axis = 1))
print(scaler.data_max_)

In [None]:
X = scaler.transform(df_PCA.drop("three", axis = 1))
y = np.array(df_PCA.three) # True, False

scaler.fit(X)

print(scaler)

In [None]:
pca = PCA(n_components = 3, random_state = 42)
pca.fit(X).transform(X)

In [None]:
# Percentage of variance explained for each components
print('explained variance ratio (first three components): %s'
      % str(pca.explained_variance_ratio_))

In [None]:
fig = plt.figure(1, figsize = (4,3))
plt.clf()
ax = Axes3D(fig, rect = [0, 0, .95, 1], elev = 48, azim = 134)

plt.cla()

In [None]:
ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=y,
           cmap=plt.cm.Set1, edgecolor='k', s=40)

In [None]:
plt.show()

### 트리 - three를 y로

In [None]:
df_Tree.drop(['total','avg','med','purchase_log'], axis = 1,inplace = True)
df_Tree = df_Tree.join(pd.get_dummies(df_Tree[['major_platform']]), how = 'inner')
df_Tree.drop('major_platform', axis = 1, inplace = True)
df_Tree.head()

In [None]:
X = np.array(df_Tree.drop("three", axis = 1))
y = np.array(df_Tree.three)
#y = np.array(df_Tree.major_platform.apply(lambda x: 0 if x=="Android" else (1 if x=='iOS' else 2)))
#y = np.array(df_Tree.major_platform.apply(lambda x: 0 if (x=="Android" or x=='iOS') else 1))

In [None]:
set(y)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
Dtree = DecisionTreeClassifier(max_depth = 8)
Rtree = RandomForestClassifier(n_estimators=100, random_state=42)

In [None]:
Dtree = Dtree.fit(X_train, y_train)
Rtree = Rtree.fit(X_train, y_train)

In [None]:
print("D훈련 세트 정확도: {:.3f}".format(Dtree.score(X_train, y_train)))
print("D훈련 세트 정확도: {:.3f}".format(Rtree.score(X_train, y_train)))

In [None]:
print("D시험 세트 정확도: {:.3f}".format(Dtree.score(X_test, y_test)))
print("R시험 세트 정확도: {:.3f}".format(Rtree.score(X_test, y_test)))

## RandomForest 변수 별 중요도

In [None]:
importances = Rtree.feature_importances_
std = np.std([tree.feature_importances_ for tree in Rtree.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

In [None]:
print("Feature ranking:")

for f in range(X.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

# Plot the feature importances of the forest
plt.figure()
plt.title("Feature importances")
plt.bar(range(X.shape[1]), importances[indices],
       color="r", yerr=std[indices], align="center")
plt.xticks(range(X.shape[1]), indices)
plt.xlim([-1, X.shape[1]])
plt.show()

## 의사결정나무 변수 별 중요도

In [None]:
importances = Dtree.feature_importances_
std = np.std([Dtree.feature_importances_],axis=0)
indices = np.argsort(importances)[::-1]

In [None]:
print("Feature ranking:")

for f in range(X.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

# Plot the feature importances of the forest
plt.figure()
plt.title("Feature importances")
plt.bar(range(X.shape[1]), importances[indices],
       color="r", yerr=std[indices], align="center")
plt.xticks(range(X.shape[1]), indices)
plt.xlim([-1, X.shape[1]])
plt.show()

In [None]:
flag = 0
for _ in df_Tree.drop("three", axis = 1).columns:
    print(str(flag)+" : "+_)
    flag += 1

In [None]:
print(df_Tree.purchase_count.value_counts().mean())

In [None]:
pd.DataFrame(df_Tree.purchase_count.value_counts()).iloc[40]

## 트리 - 많이 산 사람~

In [None]:
df_Tree = final_stat.copy()
#df_Tree.drop(['total','avg','med','purchase_log'], axis = 1,inplace = True) # 점수는 test/train 모두 0.96정도로 나온다.
df_Tree.drop(['total','avg','med','purchase_log','view_count'], axis = 1,inplace = True) #view_count를 제외해보자
df_Tree = df_Tree.join(pd.get_dummies(df_Tree[['major_platform']]), how = 'inner')
df_Tree.drop('major_platform', axis = 1, inplace = True)
df_Tree.head()

In [None]:
X = np.array(df_Tree.drop("purchase_count", axis = 1))
y = np.array(df_Tree.purchase_count.apply(lambda x: "HEAVY" if x >= 18 else "LIGHT"))

In [None]:
set(y)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
Dtree = DecisionTreeClassifier(max_depth = 8)
Rtree = RandomForestClassifier(n_estimators=100, random_state=42)

In [None]:
Dtree = Dtree.fit(X_train, y_train)
Rtree = Rtree.fit(X_train, y_train)

In [None]:
print("D훈련 세트 정확도: {:.3f}".format(Dtree.score(X_train, y_train)))
print("R훈련 세트 정확도: {:.3f}".format(Rtree.score(X_train, y_train)))

In [None]:
print("D시험 세트 정확도: {:.3f}".format(Dtree.score(X_test, y_test)))
print("R시험 세트 정확도: {:.3f}".format(Rtree.score(X_test, y_test)))

## 트리 중요도

In [None]:
# Importance of each feature in the single decision tree, and feature
# positions sorted from most to least important. There is only one importance
# vector for a single tree, so no spread is computed (the standard deviation
# of a one-element sample is always zero).
importances = Dtree.feature_importances_
indices = np.argsort(importances)[::-1]

print("Feature ranking:")

for f in range(X.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

# Plot the feature importances of the decision tree (no error bars: see above)
plt.figure()
plt.title("Feature importances")
plt.bar(range(X.shape[1]), importances[indices],
       color="r", align="center")
plt.xticks(range(X.shape[1]), indices)
plt.xlim([-1, X.shape[1]])
plt.show()

## 랜덤포레스트 중요도

In [None]:
# Forest-level importance of each feature, the spread of that importance
# across the individual trees, and feature positions sorted from most to
# least important.
importances = Rtree.feature_importances_
per_tree_importances = [member.feature_importances_ for member in Rtree.estimators_]
std = np.std(per_tree_importances, axis=0)
indices = np.argsort(importances)[::-1]

print("Feature ranking:")

n_features = X.shape[1]
for rank, feature_idx in enumerate(indices[:n_features], start=1):
    print("%d. feature %d (%f)" % (rank, feature_idx, importances[feature_idx]))

# Bar chart of the forest's importances, ordered by rank; the tick labels are
# the original feature positions and the error bars are the per-tree spread.
bar_positions = list(range(n_features))
fig, ax = plt.subplots()
ax.set_title("Feature importances")
ax.bar(bar_positions, importances[indices],
       color="r", yerr=std[indices], align="center")
ax.set_xticks(bar_positions)
ax.set_xticklabels(indices)
ax.set_xlim([-1, n_features])
plt.show()

In [None]:
# Print "<position> : <column name>" for every predictor column, so the
# numeric feature ids used in the rankings can be mapped back to names.
predictor_columns = df_Tree.drop("purchase_count", axis = 1).columns
for position, column_name in enumerate(predictor_columns):
    print(str(position)+" : "+column_name)

In [None]:
df_Pcount = pd.DataFrame(df_Tree.purchase_count.value_counts()).sort_index()

In [None]:
df_Pcount.reset_index(inplace=True)

In [None]:
x_ax = df_Pcount.index
y_ax = df_Pcount.purchase_count.values

In [None]:
y_ax

In [None]:
for _ in range(df_Pcount.count()[1]):
    t1 = int(df_Pcount.loc[_][0])
    t2 = int(df_Pcount.loc[_][1])
    print(str(t1)+"개 구입한 사람 : "+str(t2)+"명")

In [None]:
# 6개 구입한 사람들의 user_id
six_buyer = df_Tree[df_Tree.purchase_count == 6].index

In [None]:
data_purc_original = pd.read_csv("../assets/purchases_20180117.csv")
data_production_original = pd.read_csv("../assets/productions_20180117.csv")

In [None]:
data_purc_original.head(2)

In [None]:
data_production_original.head(2)

In [None]:
data_merge_original = pd.merge(data_purc_original, data_production_original)

In [None]:
data_merge_original.head()

In [None]:
x = 651444
tempdf_ = data_merge_original[data_merge_original.user_id == x]
tempdf_.sort_values('cost')

In [None]:
# Every merged purchase row that belongs to one of the six-item buyers,
# selected in a single vectorized pass so that six_df actually holds the rows.
six_df = data_merge_original[data_merge_original.user_id.isin(six_buyer)]

# Some of the rows shown below carry a large discount rate.
# If the analysis is to be based on how many items were bought, more data of
# this kind may be needed.
six_df.head()

In [None]:
# Pearson correlation between put_cart_count and each of the other activity
# columns; the printed tag (c2..c8) identifies the compared column.
cart_counts = np.array(df_Tree.put_cart_count)
compared_columns = [
    ("c2", "scrap_count"),
    ("c3", "category_view_Production"),
    ("c4", "category_view_Card"),
    ("c5", "category_view_Advice"),
    ("c6", "category_view_Project"),
    ("c7", "category_view_Exhibition"),
    ("c8", "purchase_count"),
]
for tag, column_name in compared_columns:
    other_counts = np.array(df_Tree[column_name])
    # corrcoef returns the 2x2 correlation matrix; [0][1] is the cross term.
    print(tag, np.corrcoef(cart_counts, other_counts)[0][1])

In [None]:
# TODO: count how many platforms each user uses, and also check at what hours they visited on each platform

## HEAVY&LIGHT  - 카트 빼고

In [None]:
# Heavy/light classification again, this time excluding put_cart_count in
# addition to view_count and the spending summary columns.
df_Tree = final_stat.copy()
df_Tree = df_Tree.drop(columns=['total','avg','med','purchase_log','view_count', 'put_cart_count'])
df_Tree = df_Tree.join(pd.get_dummies(df_Tree[['major_platform']]), how = 'inner')
df_Tree = df_Tree.drop(columns='major_platform')

# Predictors: every column except purchase_count.
# Target: "HEAVY" for 18 or more purchases, "LIGHT" otherwise.
X = np.array(df_Tree.drop("purchase_count", axis = 1))
y = np.array(df_Tree.purchase_count.apply(lambda x: "HEAVY" if x >= 18 else "LIGHT"))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Both models are seeded: without random_state the decision tree may differ
# between runs, which makes its scores and importances non-reproducible.
Dtree = DecisionTreeClassifier(max_depth = 8, random_state = 42)
Rtree = RandomForestClassifier(n_estimators=100, random_state=42)

In [None]:
Dtree = Dtree.fit(X_train, y_train)
Rtree = Rtree.fit(X_train, y_train)

In [None]:
print("D훈련 세트 정확도: {:.3f}".format(Dtree.score(X_train, y_train)))
print("R훈련 세트 정확도: {:.3f}".format(Rtree.score(X_train, y_train)))

print("D시험 세트 정확도: {:.3f}".format(Dtree.score(X_test, y_test)))
print("R시험 세트 정확도: {:.3f}".format(Rtree.score(X_test, y_test)))

In [None]:
# Decision-tree feature importances.
# There is only one importance vector for a single tree, so no spread is
# computed (the standard deviation of a one-element sample is always zero).
importances = Dtree.feature_importances_
indices = np.argsort(importances)[::-1]

print("Feature ranking:")

for f in range(X.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

# Plot the feature importances of the decision tree (no error bars: see above)
plt.figure()
plt.title("Feature importances")
plt.bar(range(X.shape[1]), importances[indices],
       color="r", align="center")
plt.xticks(range(X.shape[1]), indices)
plt.xlim([-1, X.shape[1]])
plt.show()

In [None]:
# Random-forest feature importances: forest-level importance per feature,
# its spread across the individual trees, and feature positions sorted from
# most to least important.
importances = Rtree.feature_importances_
per_tree_importances = [member.feature_importances_ for member in Rtree.estimators_]
std = np.std(per_tree_importances, axis=0)
indices = np.argsort(importances)[::-1]

print("Feature ranking:")

n_features = X.shape[1]
for rank, feature_idx in enumerate(indices[:n_features], start=1):
    print("%d. feature %d (%f)" % (rank, feature_idx, importances[feature_idx]))

# Bar chart of the forest's importances, ordered by rank; the tick labels are
# the original feature positions and the error bars are the per-tree spread.
bar_positions = list(range(n_features))
fig, ax = plt.subplots()
ax.set_title("Feature importances")
ax.bar(bar_positions, importances[indices],
       color="r", yerr=std[indices], align="center")
ax.set_xticks(bar_positions)
ax.set_xticklabels(indices)
ax.set_xlim([-1, n_features])
plt.show()

In [None]:
# Print "<position> : <column name>" for every predictor column, so the
# numeric feature ids used in the rankings can be mapped back to names.
predictor_columns = df_Tree.drop("purchase_count", axis = 1).columns
for position, column_name in enumerate(predictor_columns):
    print(str(position)+" : "+column_name)

## SMOTE 사용해보기

In [None]:
# Heavy/light setup once more, additionally excluding category_view_Production.
df_Tree = final_stat.copy()
df_Tree = df_Tree.drop(columns=['total','avg','med','purchase_log','view_count', 'put_cart_count', 'category_view_Production'])
df_Tree = df_Tree.join(pd.get_dummies(df_Tree[['major_platform']]), how = 'inner')
df_Tree = df_Tree.drop(columns='major_platform')

# Predictors: every column except purchase_count.
# Target: "HEAVY" for 18 or more purchases, "LIGHT" otherwise.
X = np.array(df_Tree.drop("purchase_count", axis = 1))
y = np.array(df_Tree.purchase_count.apply(lambda x: "HEAVY" if x >= 18 else "LIGHT"))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Oversample only the training split so the test split keeps its original
# class balance. fit_resample is the current imbalanced-learn name for the
# method formerly called fit_sample.
oversampler = SMOTE(random_state = 42)
smote_train, smote_target = oversampler.fit_resample(X_train, y_train)

# A fresh forest is created here so this cell does not depend on a model
# object left over from an earlier section.
Rtree = RandomForestClassifier(n_estimators=100, random_state=42)
Rtree.fit(smote_train, smote_target)
print("Fitting of Random Forest as finished")
# Accuracy on the oversampled training data.
print("R훈련 세트 정확도: {:.3f}".format(Rtree.score(smote_train, smote_target)))

In [None]:
rf_predictions = Rtree.predict(X_test)
print("Predictions finished")

In [None]:
print("R훈련 세트 정확도: {:.3f}".format(accuracy_score(y_test, rf_predictions)))

In [None]:
# Random-forest feature importances: forest-level importance per feature,
# its spread across the individual trees, and feature positions sorted from
# most to least important.
importances = Rtree.feature_importances_
per_tree_importances = [member.feature_importances_ for member in Rtree.estimators_]
std = np.std(per_tree_importances, axis=0)
indices = np.argsort(importances)[::-1]

print("Feature ranking:")

n_features = X.shape[1]
for rank, feature_idx in enumerate(indices[:n_features], start=1):
    print("%d. feature %d (%f)" % (rank, feature_idx, importances[feature_idx]))

# Bar chart of the forest's importances, ordered by rank; the tick labels are
# the original feature positions and the error bars are the per-tree spread.
bar_positions = list(range(n_features))
fig, ax = plt.subplots()
ax.set_title("Feature importances")
ax.bar(bar_positions, importances[indices],
       color="r", yerr=std[indices], align="center")
ax.set_xticks(bar_positions)
ax.set_xticklabels(indices)
ax.set_xlim([-1, n_features])
plt.show()

In [None]:
# Print "<position> : <column name>" for every predictor column, so the
# numeric feature ids used in the rankings can be mapped back to names.
predictor_columns = df_Tree.drop("purchase_count", axis = 1).columns
for position, column_name in enumerate(predictor_columns):
    print(str(position)+" : "+column_name)

In [None]:
df_three = df_Tree[df_Tree.three == True]

In [None]:
df_three_purcx = df_three[df_three.purchase_count==0]
df_three_purco = df_three[df_three.purchase_count >0]

In [None]:
df_three_purco.drop(["like_count","user_info","category_view_Production"],axis = 1).describe()

In [None]:
df_purc2_5 = df_Tree[(df_Tree.purchase_count >= 2) &  (df_Tree.purchase_count <= 5)]
df_purc2_5.drop(["like_count","user_info","category_view_Production"],axis = 1).describe()

In [None]:
df_purc6_10 = df_Tree[(df_Tree.purchase_count >= 6) &  (df_Tree.purchase_count <= 10)]
df_purc6_10.drop(["like_count","user_info","category_view_Production"],axis = 1).describe()

In [None]:
df_purc6_10 = df_Tree[(df_Tree.purchase_count >= 11) &  (df_Tree.purchase_count <= 20)]
df_purc6_10.drop(["like_count","user_info","category_view_Production"],axis = 1).describe()

In [None]:
df_purc6_10 = df_Tree[(df_Tree.purchase_count >= 21) &  (df_Tree.purchase_count <= 30)]
df_purc6_10.drop(["like_count","user_info","category_view_Production"],axis = 1).describe()

In [None]:
df_purc6_10 = df_Tree[(df_Tree.purchase_count >= 31) &  (df_Tree.purchase_count <= 45)]
df_purc6_10.drop(["like_count","user_info","category_view_Production"],axis = 1).describe()

In [None]:
df_purc6_10 = df_Tree[(df_Tree.purchase_count >= 46)]
df_purc6_10.drop(["like_count","user_info","category_view_Production"],axis = 1).describe()

## XGBoost

In [None]:
print(type(smote_train))
churn_dmatrix = xgb.DMatrix(data = smote_train,
                            label = np.array(pd.Series(smote_target).apply(lambda x: 0 if x == 'LIGHT' else 1)))

In [None]:
params = {'objective':'binary:logistic', "max_depth":4}

In [None]:
cv_results = xgb.cv(dtrain = churn_dmatrix, params = params, nfold=4, num_boost_round = 10, metrics = "error", as_pandas=True)

In [None]:
print("Accuracy: %f" %((1-cv_results["test-error-mean"].iloc[-1])))

In [None]:
model = xgb.XGBClassifier(params = params)
model.fit(smote_train,smote_target)

In [None]:
plot_importance(model)
plt.show()

In [None]:
# Print "<position> : <column name>" for every predictor column, so the
# numeric feature ids used in the importance chart can be mapped back to names.
predictor_columns = df_Tree.drop("purchase_count", axis = 1).columns
for position, column_name in enumerate(predictor_columns):
    print(str(position)+" : "+column_name)

In [None]:
y_pred = model.predict(X_test)

In [None]:
accuracy_score(y_test, y_pred)