In [1]:
import pandas as pd
import numpy as np
import sklearn

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import style
style.use('ggplot')

In [2]:
from sklearn.model_selection import KFold, GridSearchCV, cross_validate, ParameterGrid
from sklearn.feature_selection import RFECV
from sklearn.base import clone
from sklearn.preprocessing import StandardScaler, PowerTransformer, MinMaxScaler
from sklearn.impute import SimpleImputer, KNNImputer

from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, plot_tree
from sklearn.svm import SVC
from sklearn.linear_model import RidgeCV
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis

In [3]:
def information(rtd):
    info = pd.DataFrame({'Column': rtd.columns})
    info['unique'] = [len(rtd[c].unique()) for c in rtd.columns]
    info['type'] = [rtd[c].dtype for c in rtd.columns]
    info['isNull'] = [sum(rtd[c].isnull()) for c in rtd.columns]
    return info.set_index('Column')

In [4]:
loc_train = 'raw_data\\train.csv'
loc_test = 'raw_data\\test.csv'

target = 'Survived'
learning = 'Classification' # 'Regression' or 'Classification'
evaluation = 'accuracy'

raw_train_data = pd.read_csv(loc_train)
y = raw_train_data[target]
raw_train_data = raw_train_data.drop(target, axis=1)
raw_test_data = pd.read_csv(loc_test)
raw_train_data.head(5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
info = information(raw_train_data)

In [6]:
info[info.isNull>0]

Unnamed: 0_level_0,unique,type,isNull
Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Age,89,float64,177
Cabin,148,object,687
Embarked,4,object,2


In [7]:
for col in info[(info.type == 'object') & (info.isNull>0)].index:
    print (col, '\t', raw_train_data[col].unique())

Cabin 	 [nan 'C85' 'C123' 'E46' 'G6' 'C103' 'D56' 'A6' 'C23 C25 C27' 'B78' 'D33'
 'B30' 'C52' 'B28' 'C83' 'F33' 'F G73' 'E31' 'A5' 'D10 D12' 'D26' 'C110'
 'B58 B60' 'E101' 'F E69' 'D47' 'B86' 'F2' 'C2' 'E33' 'B19' 'A7' 'C49'
 'F4' 'A32' 'B4' 'B80' 'A31' 'D36' 'D15' 'C93' 'C78' 'D35' 'C87' 'B77'
 'E67' 'B94' 'C125' 'C99' 'C118' 'D7' 'A19' 'B49' 'D' 'C22 C26' 'C106'
 'C65' 'E36' 'C54' 'B57 B59 B63 B66' 'C7' 'E34' 'C32' 'B18' 'C124' 'C91'
 'E40' 'T' 'C128' 'D37' 'B35' 'E50' 'C82' 'B96 B98' 'E10' 'E44' 'A34'
 'C104' 'C111' 'C92' 'E38' 'D21' 'E12' 'E63' 'A14' 'B37' 'C30' 'D20' 'B79'
 'E25' 'D46' 'B73' 'C95' 'B38' 'B39' 'B22' 'C86' 'C70' 'A16' 'C101' 'C68'
 'A10' 'E68' 'B41' 'A20' 'D19' 'D50' 'D9' 'A23' 'B50' 'A26' 'D48' 'E58'
 'C126' 'B71' 'B51 B53 B55' 'D49' 'B5' 'B20' 'F G63' 'C62 C64' 'E24' 'C90'
 'C45' 'E8' 'B101' 'D45' 'C46' 'D30' 'E121' 'D11' 'E77' 'F38' 'B3' 'D6'
 'B82 B84' 'D17' 'A36' 'B102' 'B69' 'E49' 'C47' 'D28' 'E17' 'A24' 'C50'
 'B42' 'C148']
Embarked 	 ['S' 'C' 'Q' nan]


In [8]:
info[info.type == 'str']

Unnamed: 0_level_0,unique,type,isNull
Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1


In [9]:
raise NotImplementedError

NotImplementedError: 

In [None]:
imputer = KNNImputer(n_neighbors= 7, weights= 'distance')
scaler = PowerTransformer()


Z = pd.concat([raw_train_data, raw_test_data])
Z = Z.drop(['Id', 'Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature'], 
                 axis = 1)
float_cols  = Z.select_dtypes(exclude= 'object').columns.tolist() 
object_cols = Z.select_dtypes(include= 'object').columns.tolist() 
two_unique = [c for c in object_cols if len(Z[c].unique())==2]

ZD = pd.get_dummies(Z, columns = two_unique, drop_first=True)
ZD = pd.get_dummies(ZD)

ZD[float_cols] = StandardScaler().fit_transform(ZD[float_cols])

imputer = imputer.fit(ZD)
imputed = imputer.transform(ZD)
ZD = pd.DataFrame(imputed, columns= ZD.columns)

scaler = scaler.fit(ZD[float_cols])
ZD[float_cols] = scaler.transform(ZD[float_cols])

ZD.shape


In [None]:
size_train = raw_train_data.shape[0]

train_X = ZD.iloc[:size_train]
test_X = ZD.iloc[size_train:]

T = train_X.copy()
T['y'] = y
T.head()

In [None]:
print (raw_train_data.shape, train_X.shape)
print (raw_test_data.shape, test_X.shape)

In [None]:
information(T)

In [None]:
Corr = T.corr().abs().sort_values('y', ascending=False)
Corr

In [None]:
if learning == 'Regression':
    dt = DecisionTreeRegressor()
else:
    dt = DecisionTreeClassifier()
dt.fit(train_X, y)
plt.figure(figsize = (14, 9))
plot_tree(dt, 
          max_depth = 3, 
          filled = True, 
          fontsize=12, 
          label='root', 
          impurity=False, 
          feature_names=train_X.columns
         )
plt.title('Features')
plt.show()

In [None]:
if learning == 'Regression':
    model = DecisionTreeRegressor()
else:
    model = SVC()
    
Fstack = RFECV(model, scoring = evaluation)

In [None]:
Fstack.fit(train_X, y)

In [None]:
rfescore = Fstack.grid_scores_
plt.figure(figsize=(14, 6))
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(range(1, len(rfescore) + 1), rfescore)
plt.axvline(Fstack.n_features_, color='blue')
# plt.xticks(list(plt.xticks()[0]) + [Fstack.n_features_])
plt.show()

In [None]:
Fstack.n_features_

In [None]:
Fstack.grid_scores_.mean()

In [None]:
cc = Fstack.get_support()

In [None]:
# cimp = train_X.columns.tolist()
cimp = train_X.columns[cc].tolist()
cimp

In [None]:
train_csv = train_X[cimp]
test_csv = test_X[cimp]

train_csv[target] = y
test_csv['Id'] = raw_test_data['Id'].tolist()
train_csv['Id'] = raw_train_data['Id'].tolist()

In [None]:
train_csv.to_csv('data\\train.csv', index=False)
test_csv.to_csv('data\\test.csv', index=False)

In [None]:
test_csv.head()

In [None]:
aval_model = LinearDiscriminantAnalysis()

it = np.array([0]*size_train + [1]*raw_test_data.shape[0])
aval_model.fit(ZD, it)
ip = aval_model.predict(ZD)
sklearn.metrics.accuracy_score(it, ip)

In [None]:
ival = ip[:size_train]==1
sum(ival)

In [None]:
probs = aval_model.predict_proba(ZD)[:, 0]
t = sorted(probs[:size_train])[int(size_train/5)]
t

In [None]:
ival = probs[:size_train]<t

In [None]:
ival.sum()

In [None]:
ip[size_train:].sum()

In [None]:
(probs[size_train:] >.7).sum()