# Santander Product Recommendation | Kaggle
[此競賽連結](https://www.kaggle.com/c/santander-product-recommendation)

In [None]:
import pandas as pd
import numpy as np
# Load the training table from the local CSV export into a DataFrame.
train_csv_path = r'C:/Users/user/Desktop/train_ver2.csv'
train = pd.read_csv(train_csv_path)


## 資料前處理 - Part1：補值(眾數、Unknown)

In [None]:
# Fill missing 'ind_empleado' with the column's most frequent value.
# The result is assigned back to the column: fillna(inplace=True) on a
# selected column is a chained in-place call, which is deprecated and does
# not update `train` when pandas Copy-on-Write is active.
train['ind_empleado'] = train['ind_empleado'].fillna(train['ind_empleado'].value_counts().idxmax())

In [None]:
# Fill missing 'pais_residencia' with the column's most frequent value.
# Assigned back rather than filled in place: the chained inplace form does
# not update `train` under pandas Copy-on-Write.
train['pais_residencia'] = train['pais_residencia'].fillna(train['pais_residencia'].value_counts().idxmax())

In [None]:
# Mark missing 'sexo' entries with the explicit label 'UNKNOWN'.
# Assigned back rather than filled in place: the chained inplace form does
# not update `train` under pandas Copy-on-Write.
train['sexo'] = train['sexo'].fillna('UNKNOWN')

In [None]:
# Mark missing 'ult_fec_cli_1t' entries with the explicit label 'UNKNOWN'.
# Assigned back rather than filled in place: the chained inplace form does
# not update `train` under pandas Copy-on-Write.
train['ult_fec_cli_1t'] = train['ult_fec_cli_1t'].fillna('UNKNOWN')

In [None]:
# Fill missing 'tiprel_1mes' with the fixed code 'A'.
# Assigned back rather than filled in place: the chained inplace form does
# not update `train` under pandas Copy-on-Write.
train['tiprel_1mes'] = train['tiprel_1mes'].fillna('A')

In [None]:
# Fill missing 'indresi' with the column's most frequent value.
# Assigned back rather than filled in place: the chained inplace form does
# not update `train` under pandas Copy-on-Write.
train['indresi'] = train['indresi'].fillna(train['indresi'].value_counts().idxmax())

In [None]:
# Fill missing 'indext' with the column's most frequent value.
# Assigned back rather than filled in place: the chained inplace form does
# not update `train` under pandas Copy-on-Write.
train['indext'] = train['indext'].fillna(train['indext'].value_counts().idxmax())

In [None]:
# Fill missing 'conyuemp' with the column's most frequent value.
# Assigned back rather than filled in place: the chained inplace form does
# not update `train` under pandas Copy-on-Write.
train['conyuemp'] = train['conyuemp'].fillna(train['conyuemp'].value_counts().idxmax())

In [None]:
# Fill missing 'indfall' with the column's most frequent value.
# Assigned back rather than filled in place: the chained inplace form does
# not update `train` under pandas Copy-on-Write.
train['indfall'] = train['indfall'].fillna(train['indfall'].value_counts().idxmax())

In [None]:
# Fill missing 'nomprov' with the column's most frequent value.
# Assigned back rather than filled in place: the chained inplace form does
# not update `train` under pandas Copy-on-Write.
train['nomprov'] = train['nomprov'].fillna(train['nomprov'].value_counts().idxmax())

In [None]:
# Fill missing 'ind_nomina_ult1' with the column's most frequent value.
# Assigned back rather than filled in place: the chained inplace form does
# not update `train` under pandas Copy-on-Write.
train['ind_nomina_ult1'] = train['ind_nomina_ult1'].fillna(train['ind_nomina_ult1'].value_counts().idxmax())

In [None]:
# Fill missing 'ind_nom_pens_ult1' with the column's most frequent value.
# Assigned back rather than filled in place: the chained inplace form does
# not update `train` under pandas Copy-on-Write.
train['ind_nom_pens_ult1'] = train['ind_nom_pens_ult1'].fillna(train['ind_nom_pens_ult1'].value_counts().idxmax())

In [None]:
# Normalise 'indrel_1mes' to string codes. The mapping accepts the float,
# "x.0"-string and plain-string spellings of 1-4 plus 'P', and turns the
# literal string 'NA' into a real missing value. Anything not listed is kept
# unchanged by map_dict.get(x, x).
# Only np.nan is used as a key: the np.NaN alias no longer exists in
# NumPy 2.0 and was the very same object, i.e. a duplicate key.
map_dict = {1.0: '1',
            '1.0': '1',
            '1': '1',
            2.0: '2',
            '2.0': '2',
            '2': '2',
            3.0: '3',
            '3.0': '3',
            '3': '3',
            'P': 'P',
            4.0: '4',
            '4.0': '4',
            '4': '4',
            np.nan: np.nan,
            'NA': np.nan}
train['indrel_1mes'] = train['indrel_1mes'].apply(lambda x: map_dict.get(x, x))
# Remaining gaps take the most frequent code. Assigned back instead of using
# inplace=True, which does not update `train` under pandas Copy-on-Write.
train['indrel_1mes'] = train['indrel_1mes'].fillna(train['indrel_1mes'].value_counts().idxmax())

## 資料前處理 - Part2：補值(根據其他欄位的資料內容進行補值，如客戶居住地區/收入比例)

In [None]:
# Fill missing 'renta' with the mean 'renta' of the rows sharing the same
# 'nomprov'. groupby/transform yields each row's group mean in a single pass,
# replacing the per-province loop that rebuilt the same boolean mask three
# times per iteration. Rows whose 'nomprov' is itself missing are left as-is,
# exactly as the loop did.
renta_mean_by_nomprov = train.groupby('nomprov')['renta'].transform('mean')
train['renta'] = train['renta'].fillna(renta_mean_by_nomprov)
# Number of 'renta' values still missing after the fill.
print(train.renta.isnull().sum())

In [None]:
# Fill missing 'canal_entrada' with the most frequent channel among rows of
# the same 'nomprov'. Ties resolve to the first value returned by mode(),
# as before.
# A province with no observed channel yields NaN instead of raising on
# mode()[0]; its rows simply stay missing.
canal_mode_by_nomprov = train.groupby('nomprov')['canal_entrada'].agg(
    lambda values: values.mode().iloc[0] if values.notna().any() else np.nan
)
train['canal_entrada'] = train['canal_entrada'].fillna(train['nomprov'].map(canal_mode_by_nomprov))
# Number of 'canal_entrada' values still missing after the fill.
print(train.canal_entrada.isnull().sum())

In [None]:
# Mean 'renta' of each of the three 'segmento' labels.
segment_labels = ('01 - TOP', '02 - PARTICULARES', '03 - UNIVERSITARIO')
one_mean, two_mean, three_mean = (
    train.loc[train['segmento'] == label, 'renta'].mean(skipna=True)
    for label in segment_labels
)
for segment_mean in (one_mean, two_mean, three_mean):
    print(segment_mean)

# Cut-offs: midpoints between neighbouring segment means.
top_par = (one_mean + two_mean) / 2
par_uni = (three_mean + two_mean) / 2
print(top_par, par_uni)

# Only rows with a missing 'segmento' are written. The order matters because
# each step re-evaluates which rows are still missing.
below_par_uni = train['renta'] < par_uni
train.loc[below_par_uni & train['segmento'].isnull(), 'segmento'] = '03 - UNIVERSITARIO'
above_top_par = train['renta'] > top_par
train.loc[above_top_par & train['segmento'].isnull(), 'segmento'] = '01 - TOP'
train.loc[train['segmento'].isnull(), 'segmento'] = '02 - PARTICULARES'

In [None]:
# Keep only the rows that have a 'fecha_alta' value.
has_fecha_alta = train['fecha_alta'].notnull()
train = train[has_fecha_alta]

## 資料前處理 - Part3：缺值補完後的最後整理(剔除異常值、統一資料型態等)

In [None]:
# Remove the 'tipodom' and 'cod_prov' columns.
train = train.drop(columns=['tipodom', 'cod_prov'])

In [None]:
# Parse 'antiguedad' as numbers; entries that cannot be parsed become NaN.
train['antiguedad'] = pd.to_numeric(train['antiguedad'], errors="coerce")
# Drop the rows whose value is exactly -999999 (NaN rows are kept).
is_minus_999999 = train['antiguedad'] == -999999
train = train[~is_minus_999999]

In [None]:
# Parse 'age' as numbers; entries that cannot be parsed become NaN.
train['age'] = pd.to_numeric(train['age'], errors="coerce")

In [None]:
# Per-column count of values that are still missing.
train.isna().sum()

In [None]:
# Drop the variables considered to have little influence.
low_impact_columns = ['fecha_dato', 'fecha_alta', 'ult_fec_cli_1t', 'ncodpers']
df1 = train.drop(columns=low_impact_columns)
# print(df1.info())
# print(df1.describe())

## 因電腦效能與記憶體空間問題，故進行「抽樣」及「將資料從int, float64轉小」

In [None]:
# Sampling: keep at most 5,000,000 randomly chosen rows.
# min() avoids the ValueError that sample() raises when asked for more rows
# than the frame holds. random_state makes the draw repeatable; the value 4
# mirrors the seed passed to train_test_split elsewhere in this notebook.
sample_size = min(5000000, len(df1))
df1 = df1.sample(n=sample_size, random_state=4)
df1.info()

In [None]:
# Convert the continuous columns to smaller dtypes to save memory.
# Columns are addressed by position within the first 18 columns of df1;
# which features sit at those positions depends on the column layout.
df = df1.iloc[:, 0:18]
compact_dtypes = {
    3: np.int16,
    4: np.float16,
    5: np.int16,
    6: np.float16,
    15: np.float16,
    16: np.float32,
}
# astype() with a {column: dtype} mapping returns a new frame whose columns
# really carry the requested dtypes. The earlier `df.iloc[:, i] = ...astype()`
# form writes the values into the existing column, which keeps the original
# dtype in pandas >= 2.0, and it assigned into a slice of df1.
df = df.astype({df.columns[position]: dtype for position, dtype in compact_dtypes.items()})
df2 = df.iloc[:, list(compact_dtypes)]
df2.info()
df2.describe()

## 特徵工程 - One hot encoding

In [None]:
# One-hot encode the discrete columns with get_dummies and append the
# indicator columns to df2.
# All columns are encoded in one call and concatenated once, instead of
# copying the growing frame twelve times. Indicator columns are named
# "<source column>_<value>", so values shared by several source columns no
# longer produce duplicate column labels. The positional layout is unchanged:
# existing df2 columns first, then the indicators in source-column order.
categorical_positions = [0, 1, 2, 7, 8, 9, 10, 11, 12, 13, 14, 17]
categorical_columns = [df.columns[position] for position in categorical_positions]
# columns= forces encoding of every listed column regardless of its dtype,
# matching the earlier per-column get_dummies calls.
indicator_columns = pd.get_dummies(
    df[categorical_columns],
    columns=categorical_columns,
    dtype=np.int8,
)
df2 = pd.concat([df2, indicator_columns], axis=1)

# df2.head()
df2.info()

In [None]:
# Add the product columns back (the trailing 24 columns): everything from
# position 18 onwards in df1, stored as int8.
product_columns = df1.iloc[:, 18:].astype(np.int8)
df2 = pd.concat([df2, product_columns], axis=1)
df2.info()

In [None]:
# Complete data set, ready to feed into the models: summary, then the
# column labels.
df2.info()
df2.columns.tolist()

In [None]:
# Features are all columns except the last 24; the last 24 are the targets.
X, Y = df2.iloc[:, :-24], df2.iloc[:, -24:]

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_multilabel_classification
from sklearn.model_selection import train_test_split
from sklearn import preprocessing 
from sklearn.model_selection import cross_val_score


## 建立模型(隨機森林, kNN, 羅吉斯迴歸, 類神經)

In [None]:
# Split the preprocessed, feature-engineered X and Y into a training set and
# a testing set (fixed seed, default split proportions).
split_parts = train_test_split(X, Y, random_state=4)
X_train, X_test, y_train, y_test = split_parts

# print(X_train.shape, y_train.shape)
# print(X_test.shape, y_test.shape)

In [None]:
# Random forest: fit on the training split, report score() on the test split.
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
rf_test_score = rf.score(X_test, y_test)
print(rf_test_score)

In [None]:
# kNN wrapped in one-vs-rest: fit on the training split, report score() on
# the test split.
# n_jobs is passed by keyword; OneVsRestClassifier takes only the estimator
# positionally, so the former positional -1 raises TypeError on current
# scikit-learn.
ovr_knn = OneVsRestClassifier(KNeighborsClassifier(), n_jobs=-1).fit(X_train, y_train)
print(ovr_knn.score(X_test, y_test))

In [None]:
# Logistic regression wrapped in one-vs-rest: fit on the training split,
# report score() on the test split.
# n_jobs is passed by keyword; OneVsRestClassifier takes only the estimator
# positionally, so the former positional -1 raises TypeError on current
# scikit-learn.
ovr_logist = OneVsRestClassifier(LogisticRegression(), n_jobs=-1).fit(X_train, y_train)
print(ovr_logist.score(X_test, y_test))

In [None]:
# Neural network (MLP) wrapped in one-vs-rest: fit on the training split,
# report score() on the test split.
# n_jobs is passed by keyword; OneVsRestClassifier takes only the estimator
# positionally, so the former positional -1 raises TypeError on current
# scikit-learn.
ovr_mlp = OneVsRestClassifier(MLPClassifier(), n_jobs=-1).fit(X_train, y_train)
print(ovr_mlp.score(X_test, y_test))

In [None]:
# Fit one-vs-rest logistic regression on the full X/Y and predict the same X,
# then show the predictions as a DataFrame.
# n_jobs is passed by keyword; the former positional -1 raises TypeError on
# current scikit-learn.
logist = OneVsRestClassifier(LogisticRegression(), n_jobs=-1).fit(X, Y)
c = logist.predict(X)
d = pd.DataFrame(c)
d

In [None]:
# Print the sum of each of the first 24 columns of `d`, one per line.
for column_position in range(24):
    column_total = d.iloc[:, column_position].sum()
    print(column_total)

In [None]:
# Fit a random forest on the full X/Y and predict the same X, then show the
# predictions as a DataFrame.
rf = RandomForestClassifier()
rf.fit(X, Y)
a = rf.predict(X)
b = pd.DataFrame(a)
b

In [None]:
# Print the sum of each of the first 24 columns of `b`, one per line.
for column_position in range(24):
    column_total = b.iloc[:, column_position].sum()
    print(column_total)

## 將資料標準化(平均為 0、標準差為 1)後，再放入各模型做個比較

In [None]:
# Standardise the features before feeding the models, to compare results.
# The split happens first and the scaler is fitted on the training rows only,
# so test-set statistics do not leak into the scaling. Scaling the whole X
# before splitting used the test rows' mean/variance as well.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, Y, random_state=4)
scaler = preprocessing.StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# print(X_train.shape, y_train.shape)
# print(X_test.shape, y_test.shape)

In [None]:
# Random forest (after scaling): fit on the training split, report score()
# on the test split.
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
rf_test_score = rf.score(X_test, y_test)
print(rf_test_score)

In [None]:
# kNN (after scaling) wrapped in one-vs-rest: fit on the training split,
# report score() on the test split.
# n_jobs is passed by keyword; the former positional -1 raises TypeError on
# current scikit-learn.
ovr_knn = OneVsRestClassifier(KNeighborsClassifier(), n_jobs=-1).fit(X_train, y_train)
print(ovr_knn.score(X_test, y_test))

In [None]:
# Logistic regression (after scaling) wrapped in one-vs-rest: fit on the
# training split, report score() on the test split.
# n_jobs is passed by keyword; the former positional -1 raises TypeError on
# current scikit-learn.
ovr_logist = OneVsRestClassifier(LogisticRegression(), n_jobs=-1).fit(X_train, y_train)
print(ovr_logist.score(X_test, y_test))

In [None]:
# Neural network (MLP, after scaling) wrapped in one-vs-rest: fit on the
# training split, report score() on the test split.
# n_jobs is passed by keyword; the former positional -1 raises TypeError on
# current scikit-learn.
ovr_mlp = OneVsRestClassifier(MLPClassifier(), n_jobs=-1).fit(X_train, y_train)
print(ovr_mlp.score(X_test, y_test))

## 利用交叉驗證(標準化前/後)來看看會不會提高準確率

In [None]:
# Random forest, 5-fold cross-validation on the unscaled features.
rf = RandomForestClassifier()
scores = cross_val_score(estimator=rf, X=X, y=Y, cv=5)
print(scores)


In [None]:
# Random forest, 5-fold cross-validation with standardised features.
# The scaler lives inside a pipeline so it is re-fitted on each fold's
# training part only. Scaling the whole X up front let every fold's
# validation rows influence the scaling statistics.
from sklearn.pipeline import make_pipeline

rf = RandomForestClassifier()
scaled_rf = make_pipeline(preprocessing.StandardScaler(), rf)
scores = cross_val_score(scaled_rf, X, Y, cv=5)
print(scores)


In [None]:
# kNN, 5-fold cross-validation on the unscaled features.
knn = KNeighborsClassifier()
scores = cross_val_score(estimator=knn, X=X, y=Y, cv=5)
print(scores)

In [None]:
# kNN, 5-fold cross-validation with standardised features.
# The scaler lives inside a pipeline so it is re-fitted on each fold's
# training part only. Scaling the whole X up front let every fold's
# validation rows influence the scaling statistics.
from sklearn.pipeline import make_pipeline

knn = KNeighborsClassifier()
scaled_knn = make_pipeline(preprocessing.StandardScaler(), knn)
scores = cross_val_score(scaled_knn, X, Y, cv=5)
print(scores)

In [None]:
# Logistic regression, 5-fold cross-validation on the unscaled features.
# Y holds 24 target columns and a bare LogisticRegression only accepts a 1-D
# target, so every fold would fail to fit. It is wrapped in
# OneVsRestClassifier, as in the train/test cells of this notebook.
logist = OneVsRestClassifier(LogisticRegression(), n_jobs=-1)
scores = cross_val_score(logist, X, Y, cv=5)
print(scores)

In [None]:
# Logistic regression, 5-fold cross-validation with standardised features.
# Y holds 24 target columns and a bare LogisticRegression only accepts a 1-D
# target, so the model is wrapped in OneVsRestClassifier, as in the
# train/test cells of this notebook.
# The scaler lives inside a pipeline so it is re-fitted on each fold's
# training part only, instead of being computed once on the whole X.
from sklearn.pipeline import make_pipeline

logist = OneVsRestClassifier(LogisticRegression(), n_jobs=-1)
scaled_logist = make_pipeline(preprocessing.StandardScaler(), logist)
scores = cross_val_score(scaled_logist, X, Y, cv=5)
print(scores)

In [None]:
# Neural network (MLP), 5-fold cross-validation on the unscaled features.
mlp = MLPClassifier()
scores = cross_val_score(estimator=mlp, X=X, y=Y, cv=5)
print(scores)

In [None]:
# Neural network (MLP), 5-fold cross-validation with standardised features.
# The scaler lives inside a pipeline so it is re-fitted on each fold's
# training part only. Scaling the whole X up front let every fold's
# validation rows influence the scaling statistics.
from sklearn.pipeline import make_pipeline

mlp = MLPClassifier()
scaled_mlp = make_pipeline(preprocessing.StandardScaler(), mlp)
scores = cross_val_score(scaled_mlp, X, Y, cv=5)
print(scores)