In [210]:
# Data handling and numerics.
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error,mean_absolute_error
# Plotly / cufflinks plotting setup for use inside the notebook.
from plotly.offline import iplot, init_notebook_mode
import cufflinks
cufflinks.go_offline(connected=True)
init_notebook_mode(connected=True)
from statsmodels.tools.eval_measures import rmse
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
# Raise pandas' display limits so wide frames can be inspected in cell output.
pd.set_option('display.max_row', 50)
pd.set_option('display.max_column', 250)

import warnings
# NOTE(review): this silences every warning for the rest of the session, including
# library deprecation / data-conversion warnings that may point at real problems.
warnings.filterwarnings('ignore')

In [79]:
# Load the three input files from the working directory.
train, test, Submission = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "sampleSubmission.csv")
)

In [80]:
# Separate predictors from the label; the "id" column is dropped from both feature frames.
train_y = train["target"].to_frame()
train_x = train.drop(columns=["target", "id"])
test_x = test.drop(columns="id")

In [81]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the class labels.
# The encoder is given the 1-D "target" column rather than the one-column DataFrame:
# LabelEncoder expects y of shape (n_samples,), and a column vector only works through an
# implicit conversion that scikit-learn reports with a DataConversionWarning.
le = LabelEncoder()
train_y_label = pd.DataFrame(le.fit_transform(train_y["target"]))

In [82]:
def create_dummies(df,column_name):
    """Return ``df`` with one-hot indicator columns for ``column_name`` appended.

    The indicator columns are named ``<column_name>_<value>`` and are placed after the
    existing columns. The source column is kept, and ``df`` itself is not modified.
    """
    indicator_columns = pd.get_dummies(df[column_name], prefix=column_name)
    return pd.concat([df, indicator_columns], axis=1)

# One-hot encoded label frame: keep only the indicator columns, not the raw "target".
train_y_class = create_dummies(train_y, "target").drop(columns="target")

In [83]:
# Feature transformations, applied to whole frames at once:
#   *_log      -> log(x + 1)
#   *_log_sqrt -> sqrt(log(x + 1))
train_x_log = np.log(train_x + 1)
test_x_log = np.log(test_x + 1)
train_x_log_sqrt = np.sqrt(train_x_log)
test_x_log_sqrt = np.sqrt(test_x_log)

In [84]:
# Columns whose number of distinct values differs between the train and test sets.
compare_list = [
    feature
    for feature in train_x.columns
    if len(train_x[feature].unique()) != len(test_x[feature].unique())
]

In [85]:
# Shape check for every frame built so far, printed in a fixed order.
for checked_frame in (train_x, test_x,
                      train_x_log, test_x_log,
                      train_x_log_sqrt, test_x_log_sqrt,
                      train_y_class, train_y_label):
    print(checked_frame.shape)

(61878, 93)
(144368, 93)
(61878, 93)
(144368, 93)
(61878, 93)
(144368, 93)
(61878, 9)
(61878, 1)


# Feature Engineering on the Raw Data

In [149]:
# Work on copies so the raw feature frames stay untouched, and remember the original
# column labels so later statistics are computed from the raw features only.
train_x_enginnering, test_x_enginnering = train_x.copy(), test_x.copy()
ori_columns = train_x_enginnering.columns

In [150]:
# Row-level summary statistics. Every statistic is computed from the original feature
# columns (ori_columns), so columns added here never feed into a later statistic.
# The same columns are added, in the same order, to the train and test frames.
for engineered in (train_x_enginnering, test_x_enginnering):
    raw_values = engineered[ori_columns]
    engineered["sum_of_row"] = raw_values.sum(axis=1)
    engineered["sum_of_zero"] = raw_values.eq(0).sum(axis=1)
    engineered["sum_of_non_zero"] = raw_values.ne(0).sum(axis=1)
    # Number of features in the row that are exactly 1, 2 and 3.
    for suffix, counted_value in (("one", 1), ("two", 2), ("three", 3)):
        engineered["sum_of_value_" + suffix] = raw_values.eq(counted_value).sum(axis=1)
    engineered["max_row_value"] = raw_values.max(axis=1)

# KNN predicted class probabilities as features, for different values of K

In [9]:
from sklearn.neighbors import KNeighborsClassifier

In [10]:
# One classifier per neighbourhood size; K doubles from 2 (2**1) up to 2048 (2**11).
(KNN_2, KNN_4, KNN_8, KNN_16, KNN_32, KNN_64,
 KNN_128, KNN_256, KNN_512, KNN_1024, KNN_2048) = (
    KNeighborsClassifier(n_neighbors=2 ** exponent) for exponent in range(1, 12)
)

In [11]:
# Fit every KNN model on the raw training features.
# The labels are flattened to 1-D first: passing the (n_samples, 1) label DataFrame makes
# scikit-learn fall back on an implicit column-vector conversion (reported with a
# DataConversionWarning, which the notebook-wide warnings filter hides).
knn_fit_labels = train_y_label.values.ravel()
for knn_model in (KNN_2, KNN_4, KNN_8, KNN_16, KNN_32, KNN_64,
                  KNN_128, KNN_256, KNN_512, KNN_1024, KNN_2048):
    knn_model.fit(train_x, knn_fit_labels)
# Bare expression so the cell still displays the last fitted estimator.
KNN_2048

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=2048, p=2,
           weights='uniform')

In [12]:
# Accuracy of each KNN model on train_x / train_y_label, smallest K first.
# NOTE(review): if these are the same rows the models were fitted on, the numbers are
# training accuracy, not an estimate of generalisation.
knn_training_scores = [
    knn_model.score(train_x, train_y_label)
    for knn_model in (KNN_2, KNN_4, KNN_8, KNN_16, KNN_32, KNN_64,
                      KNN_128, KNN_256, KNN_512, KNN_1024, KNN_2048)
]
(KNN_2_score, KNN_4_score, KNN_8_score, KNN_16_score, KNN_32_score, KNN_64_score,
 KNN_128_score, KNN_256_score, KNN_512_score, KNN_1024_score,
 KNN_2048_score) = knn_training_scores
for training_score in knn_training_scores:
    print(training_score)

0.8786644687934322
0.8545686673777433
0.8236691554348881
0.8029832896990853
0.7851094088367433
0.7673809754678561
0.7519958628268528
0.7351562752513009
0.7159895277804712
0.6919422088626006
0.6627395843433854


## KNN features for Test Dataset

In [13]:
# Class-probability predictions of every KNN model for the test rows,
# one DataFrame per K (one column per class).
(KNN_2_pred, KNN_4_pred, KNN_8_pred, KNN_16_pred, KNN_32_pred, KNN_64_pred,
 KNN_128_pred, KNN_256_pred, KNN_512_pred, KNN_1024_pred, KNN_2048_pred) = (
    pd.DataFrame(knn_model.predict_proba(test_x))
    for knn_model in (KNN_2, KNN_4, KNN_8, KNN_16, KNN_32, KNN_64,
                      KNN_128, KNN_256, KNN_512, KNN_1024, KNN_2048)
)

In [43]:
# Stack the per-K probability frames side by side and give the columns sequential names
# KNN_Feature_1 .. KNN_Feature_99.
knn_test_parts = [KNN_2_pred, KNN_4_pred, KNN_8_pred, KNN_16_pred, KNN_32_pred, KNN_64_pred,
                  KNN_128_pred, KNN_256_pred, KNN_512_pred, KNN_1024_pred, KNN_2048_pred]
KNN_Test_Features = pd.concat(knn_test_parts, axis=1)
KNN_Test_Features.columns = ["KNN_Feature_{}".format(position) for position in range(1, 100)]

In [14]:
# Every K gets its own independent submission frame, each one starting out as the
# contents of the sample submission file.
sample_submission_layout = pd.read_csv("sampleSubmission.csv")
(Submission_KNN_2, Submission_KNN_4, Submission_KNN_8, Submission_KNN_16,
 Submission_KNN_32, Submission_KNN_64, Submission_KNN_128, Submission_KNN_256,
 Submission_KNN_512, Submission_KNN_1024, Submission_KNN_2048) = (
    sample_submission_layout.copy() for _ in range(11)
)

In [15]:
# Overwrite the class columns Class_1 .. Class_9 of each submission frame with the
# matching model's predicted probabilities.
KNN_columns = ["Class_{}".format(class_no) for class_no in range(1, 10)]
for submission_frame, proba_frame in (
    (Submission_KNN_2, KNN_2_pred),
    (Submission_KNN_4, KNN_4_pred),
    (Submission_KNN_8, KNN_8_pred),
    (Submission_KNN_16, KNN_16_pred),
    (Submission_KNN_32, KNN_32_pred),
    (Submission_KNN_64, KNN_64_pred),
    (Submission_KNN_128, KNN_128_pred),
    (Submission_KNN_256, KNN_256_pred),
    (Submission_KNN_512, KNN_512_pred),
    (Submission_KNN_1024, KNN_1024_pred),
    (Submission_KNN_2048, KNN_2048_pred),
):
    submission_frame[KNN_columns] = proba_frame

In [16]:
# Write one CSV per K, without the index column.
for csv_name, submission_frame in (
    ("KNN_2_pred.csv", Submission_KNN_2),
    ("KNN_4_pred.csv", Submission_KNN_4),
    ("KNN_8_pred.csv", Submission_KNN_8),
    ("KNN_16_pred.csv", Submission_KNN_16),
    ("KNN_32_pred.csv", Submission_KNN_32),
    ("KNN_64_pred.csv", Submission_KNN_64),
    ("KNN_128_pred.csv", Submission_KNN_128),
    ("KNN_256_pred.csv", Submission_KNN_256),
    ("KNN_512_pred.csv", Submission_KNN_512),
    ("KNN_1024_pred.csv", Submission_KNN_1024),
    ("KNN_2048_pred.csv", Submission_KNN_2048),
):
    submission_frame.to_csv(csv_name, index=False)

## KNN features for Train Dataset

In [17]:
# Out-of-fold KNN class probabilities for the training rows.
#
# Calling predict_proba on the rows a KNN model was fitted on lets every row count itself
# among its own neighbours, so the resulting "features" encode the row's label and are
# not comparable with the probabilities produced for unseen test rows. cross_val_predict
# scores each training row with a model that was fitted without it.
from sklearn.model_selection import StratifiedKFold, cross_val_predict

knn_oof_labels = train_y_label.values.ravel()
# Fixed, shuffled, stratified folds so every K is evaluated on identical splits.
knn_oof_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)


def _knn_out_of_fold_proba(knn_model):
    """Return out-of-fold class probabilities for train_x as a DataFrame.

    cross_val_predict fits clones of ``knn_model``; the already fitted model passed in is
    left unchanged. Rows of the result are in the same order as train_x.
    """
    return pd.DataFrame(
        cross_val_predict(knn_model, train_x, knn_oof_labels,
                          cv=knn_oof_folds, method="predict_proba")
    )


KNN_2_x_pred = _knn_out_of_fold_proba(KNN_2)
KNN_4_x_pred = _knn_out_of_fold_proba(KNN_4)
KNN_8_x_pred = _knn_out_of_fold_proba(KNN_8)
KNN_16_x_pred = _knn_out_of_fold_proba(KNN_16)
KNN_32_x_pred = _knn_out_of_fold_proba(KNN_32)
KNN_64_x_pred = _knn_out_of_fold_proba(KNN_64)
KNN_128_x_pred = _knn_out_of_fold_proba(KNN_128)
KNN_256_x_pred = _knn_out_of_fold_proba(KNN_256)
KNN_512_x_pred = _knn_out_of_fold_proba(KNN_512)
KNN_1024_x_pred = _knn_out_of_fold_proba(KNN_1024)
KNN_2048_x_pred = _knn_out_of_fold_proba(KNN_2048)

In [45]:
# Stack the per-K train probability frames side by side and give the columns sequential
# names KNN_Feature_1 .. KNN_Feature_99.
knn_train_parts = [KNN_2_x_pred, KNN_4_x_pred, KNN_8_x_pred, KNN_16_x_pred, KNN_32_x_pred,
                   KNN_64_x_pred, KNN_128_x_pred, KNN_256_x_pred, KNN_512_x_pred,
                   KNN_1024_x_pred, KNN_2048_x_pred]
KNN_Train_Features = pd.concat(knn_train_parts, axis=1)
KNN_Train_Features.columns = ["KNN_Feature_{}".format(position) for position in range(1, 100)]

In [18]:
# Output frames for the train-side KNN predictions.
#
# These must have one row per *training* sample. Seeding them from the full sample
# submission (as is done for the test predictions) would give them the test set's ids and
# row count, so train predictions would be stored next to unrelated ids and the rows
# beyond the end of the training set would be left as NaN. Only the column layout is
# taken from the sample submission (nrows=0 reads just the header); the index and the
# "id" values come from the training data.
train_prediction_layout = pd.DataFrame(
    index=train.index,
    columns=pd.read_csv("sampleSubmission.csv", nrows=0).columns,
)
train_prediction_layout["id"] = train["id"]

(Submission_KNN_2_Train, Submission_KNN_4_Train, Submission_KNN_8_Train,
 Submission_KNN_16_Train, Submission_KNN_32_Train, Submission_KNN_64_Train,
 Submission_KNN_128_Train, Submission_KNN_256_Train, Submission_KNN_512_Train,
 Submission_KNN_1024_Train, Submission_KNN_2048_Train) = (
    train_prediction_layout.copy() for _ in range(11)
)

In [19]:
# Overwrite the class columns Class_1 .. Class_9 of each train-side frame with the
# matching model's predicted probabilities for the training rows.
KNN_columns = ["Class_{}".format(class_no) for class_no in range(1, 10)]
for output_frame, proba_frame in (
    (Submission_KNN_2_Train, KNN_2_x_pred),
    (Submission_KNN_4_Train, KNN_4_x_pred),
    (Submission_KNN_8_Train, KNN_8_x_pred),
    (Submission_KNN_16_Train, KNN_16_x_pred),
    (Submission_KNN_32_Train, KNN_32_x_pred),
    (Submission_KNN_64_Train, KNN_64_x_pred),
    (Submission_KNN_128_Train, KNN_128_x_pred),
    (Submission_KNN_256_Train, KNN_256_x_pred),
    (Submission_KNN_512_Train, KNN_512_x_pred),
    (Submission_KNN_1024_Train, KNN_1024_x_pred),
    (Submission_KNN_2048_Train, KNN_2048_x_pred),
):
    output_frame[KNN_columns] = proba_frame

In [20]:
# Write one CSV of train-side predictions per K, without the index column.
for csv_name, output_frame in (
    ("KNN_2_x_pred.csv", Submission_KNN_2_Train),
    ("KNN_4_x_pred.csv", Submission_KNN_4_Train),
    ("KNN_8_x_pred.csv", Submission_KNN_8_Train),
    ("KNN_16_x_pred.csv", Submission_KNN_16_Train),
    ("KNN_32_x_pred.csv", Submission_KNN_32_Train),
    ("KNN_64_x_pred.csv", Submission_KNN_64_Train),
    ("KNN_128_x_pred.csv", Submission_KNN_128_Train),
    ("KNN_256_x_pred.csv", Submission_KNN_256_Train),
    ("KNN_512_x_pred.csv", Submission_KNN_512_Train),
    ("KNN_1024_x_pred.csv", Submission_KNN_1024_Train),
    ("KNN_2048_x_pred.csv", Submission_KNN_2048_Train),
):
    output_frame.to_csv(csv_name, index=False)

# t-distributed Stochastic Neighbor Embedding (TSNE) feature engineering for Train & Test Dataset (X)

In [21]:
from sklearn.manifold import TSNE

In [22]:
# Two t-SNE reducers with a fixed seed: a 2-component and a 3-component embedding.
TSNE_2D, TSNE_3D = (
    TSNE(n_components=n_dims, random_state=0) for n_dims in (2, 3)
)

In [23]:
# t-SNE embeddings of the raw features.
#
# TSNE only offers fit_transform; it cannot project new rows into an existing embedding.
# Embedding train and test in two separate fit_transform calls therefore produces two
# unrelated coordinate systems, and a model trained on the train coordinates cannot use
# the test coordinates. Train and test are embedded together in one call and the result
# is split back by row position.
tsne_n_train = len(train_x)
tsne_all_rows = pd.concat([train_x, test_x], axis=0, ignore_index=True)

tsne_all_2d = TSNE_2D.fit_transform(tsne_all_rows)
tsne_all_3d = TSNE_3D.fit_transform(tsne_all_rows)

train_x_2d, test_x_2d = tsne_all_2d[:tsne_n_train], tsne_all_2d[tsne_n_train:]
train_x_3d, test_x_3d = tsne_all_3d[:tsne_n_train], tsne_all_3d[tsne_n_train:]

In [24]:
# Wrap the embedding arrays in DataFrames (default integer index and column labels).
train_x_2d, train_x_3d, test_x_2d, test_x_3d = (
    pd.DataFrame(embedding)
    for embedding in (train_x_2d, train_x_3d, test_x_2d, test_x_3d)
)

In [25]:
# Place the 2-component embedding columns first, then the 3-component ones.
train_x_TSNE = pd.concat((train_x_2d, train_x_3d), axis="columns")
test_x_TSNE = pd.concat((test_x_2d, test_x_3d), axis="columns")

In [54]:
# Name the five embedding columns: two from the 2-D run, three from the 3-D run.
tsne_raw_column_names = ["2D_1","2D_2","3D_1","3D_2","3D_3"]
for tsne_frame in (train_x_TSNE, test_x_TSNE):
    tsne_frame.columns = list(tsne_raw_column_names)

In [26]:
# Persist the raw-feature t-SNE columns, without the index column.
for csv_name, tsne_frame in (("train_x_TSNE.csv", train_x_TSNE),
                             ("test_x_TSNE.csv", test_x_TSNE)):
    tsne_frame.to_csv(csv_name, index=False)

# t-distributed Stochastic Neighbor Embedding (TSNE) feature engineering for Train & Test Dataset log(X+1)

In [27]:
from sklearn.manifold import TSNE

In [28]:
# Separate 2- and 3-component t-SNE reducers (fixed seed) for the log(x + 1) features.
TSNE_log_2D, TSNE_log_3D = (
    TSNE(n_components=n_dims, random_state=0) for n_dims in (2, 3)
)

In [29]:
# t-SNE embeddings of the log(x + 1) features.
#
# TSNE only offers fit_transform; it cannot project new rows into an existing embedding.
# Embedding train and test in two separate fit_transform calls therefore produces two
# unrelated coordinate systems. Train and test are embedded together in one call and the
# result is split back by row position.
tsne_log_n_train = len(train_x_log)
tsne_log_all_rows = pd.concat([train_x_log, test_x_log], axis=0, ignore_index=True)

tsne_log_all_2d = TSNE_log_2D.fit_transform(tsne_log_all_rows)
tsne_log_all_3d = TSNE_log_3D.fit_transform(tsne_log_all_rows)

train_log_x_2d, test_log_x_2d = tsne_log_all_2d[:tsne_log_n_train], tsne_log_all_2d[tsne_log_n_train:]
train_log_x_3d, test_log_x_3d = tsne_log_all_3d[:tsne_log_n_train], tsne_log_all_3d[tsne_log_n_train:]

In [30]:
# Wrap the log-feature embedding arrays in DataFrames (default index and column labels).
train_log_x_2d, train_log_x_3d, test_log_x_2d, test_log_x_3d = (
    pd.DataFrame(embedding)
    for embedding in (train_log_x_2d, train_log_x_3d, test_log_x_2d, test_log_x_3d)
)

In [195]:
# Place the 2-component embedding columns first, then the 3-component ones.
train_log_x_TSNE = pd.concat((train_log_x_2d, train_log_x_3d), axis="columns")
test_log_x_TSNE = pd.concat((test_log_x_2d, test_log_x_3d), axis="columns")

In [196]:
# Name the five log-feature embedding columns: two from the 2-D run, three from the 3-D run.
tsne_log_column_names = ["log_2D_1","log_2D_2","log_3D_1","log_3D_2","log_3D_3"]
for tsne_log_frame in (train_log_x_TSNE, test_log_x_TSNE):
    tsne_log_frame.columns = list(tsne_log_column_names)

In [32]:
# Persist the log-feature t-SNE columns, without the index column.
for csv_name, tsne_log_frame in (("train_log_x_TSNE.csv", train_log_x_TSNE),
                                 ("test_log_x_TSNE.csv", test_log_x_TSNE)):
    tsne_log_frame.to_csv(csv_name, index=False)

# K-means clustering feature engineering for Train & Test Dataset

In [33]:
from sklearn.cluster import KMeans

In [34]:
# K-means with nine clusters and a fixed seed.
Kmeans = KMeans(random_state=0, n_clusters=9)

In [35]:
# Fit K-means on the training features only, then derive for train and test rows:
#   *_pred          -> the assigned cluster id, in a single "Class" column
#   *_cluster_space -> the output of Kmeans.transform, columns cluster_space_0 .. cluster_space_8
train_x_fit = Kmeans.fit(train_x)
cluster_space_names = ["cluster_space_{}".format(cluster_no) for cluster_no in range(9)]
train_x_pred = pd.DataFrame({"Class": Kmeans.predict(train_x)})
test_x_pred = pd.DataFrame({"Class": Kmeans.predict(test_x)})
train_x_cluster_space = pd.DataFrame(Kmeans.transform(train_x), columns=list(cluster_space_names))
test_x_cluster_space = pd.DataFrame(Kmeans.transform(test_x), columns=list(cluster_space_names))

In [36]:
# One-hot encode the assigned cluster id and drop the raw "Class" column.
# NOTE(review): train and test are encoded independently, so the two frames only have
# matching columns if every cluster id occurs in both.
train_x_pred_dummy = create_dummies(train_x_pred, "Class").drop(columns="Class")
test_x_pred_dummy = create_dummies(test_x_pred, "Class").drop(columns="Class")

In [37]:
# K-means feature block: cluster indicator columns followed by the cluster-space columns.
train_x_KMeans = pd.concat((train_x_pred_dummy, train_x_cluster_space), axis="columns")
test_x_KMeans = pd.concat((test_x_pred_dummy, test_x_cluster_space), axis="columns")

In [38]:
# Persist the K-means feature blocks, without the index column.
for csv_name, kmeans_frame in (("train_x_KMeans.csv", train_x_KMeans),
                               ("test_x_KMeans.csv", test_x_KMeans)):
    kmeans_frame.to_csv(csv_name, index=False)

# Dataset Summary
### Train & test sets processed with the feature-engineering steps above, ready to be fed to the learning algorithms

In [198]:
# Raw features + KNN probability features.
train_x_with_KNN = pd.concat((train_x, KNN_Train_Features), axis="columns")
test_x_with_KNN = pd.concat((test_x, KNN_Test_Features), axis="columns")

In [199]:
# Raw features + both t-SNE blocks (raw-feature embedding, then log-feature embedding).
train_x_with_TSNE = pd.concat((train_x, train_x_TSNE, train_log_x_TSNE), axis="columns")
test_x_with_TSNE = pd.concat((test_x, test_x_TSNE, test_log_x_TSNE), axis="columns")

In [200]:
# Raw features + K-means features.
train_x_with_KMeans = pd.concat((train_x, train_x_KMeans), axis="columns")
test_x_with_Kmeans = pd.concat((test_x, test_x_KMeans), axis="columns")

In [201]:
# Raw + KNN features, extended with both t-SNE blocks.
train_x_with_KNN_TSNE = pd.concat(
    (train_x_with_KNN, train_x_TSNE, train_log_x_TSNE), axis="columns")
test_x_with_KNN_TSNE = pd.concat(
    (test_x_with_KNN, test_x_TSNE, test_log_x_TSNE), axis="columns")

In [202]:
# Raw + KNN + t-SNE features, extended with the K-means block.
train_x_with_KNN_TSNE_KMean = pd.concat(
    (train_x_with_KNN_TSNE, train_x_KMeans), axis="columns")
test_x_with_KNN_TSNE_Kmean = pd.concat(
    (test_x_with_KNN_TSNE, test_x_KMeans), axis="columns")

In [203]:
# Full feature set: engineered frame (raw features + row statistics), then the KNN,
# t-SNE (raw and log) and K-means blocks, in that column order.
train_feature_blocks = (train_x_enginnering, KNN_Train_Features,
                        train_x_TSNE, train_log_x_TSNE, train_x_KMeans)
test_feature_blocks = (test_x_enginnering, KNN_Test_Features,
                       test_x_TSNE, test_log_x_TSNE, test_x_KMeans)
train_x_eng_with_KNN_TSNE_KMean = pd.concat(train_feature_blocks, axis="columns")
test_x_eng_with_KNN_TSNE_Kmean = pd.concat(test_feature_blocks, axis="columns")

In [230]:
# Log-scaled variants of the full feature frames: log(x + 1) is applied to the columns in
# the label range "feat_1" .. "max_row_value"; every other column is copied unchanged.
#
# The transformed columns are replaced with whole-column [] assignment. Writing the float
# results into the existing columns through .loc[:, slice] relies on pandas silently
# changing those columns' dtype, which is deprecated behaviour when the columns are
# integer-typed (presumably the case for the count features; confirm against the data).
train_x_log_eng_with_KNN_TSNE_KMean = train_x_eng_with_KNN_TSNE_KMean.copy()
test_x_log_eng_with_KNN_TSNE_Kmean = test_x_eng_with_KNN_TSNE_Kmean.copy()
for unscaled_frame, log_scaled_frame in (
    (train_x_eng_with_KNN_TSNE_KMean, train_x_log_eng_with_KNN_TSNE_KMean),
    (test_x_eng_with_KNN_TSNE_Kmean, test_x_log_eng_with_KNN_TSNE_Kmean),
):
    # Resolve the label slice per frame, exactly as a .loc column slice would.
    log_scaled_columns = list(unscaled_frame.loc[:, "feat_1":"max_row_value"].columns)
    log_scaled_frame[log_scaled_columns] = np.log(unscaled_frame[log_scaled_columns] + 1)

In [231]:
# Shapes of the full feature frames, then the total number of missing values in the
# plain and log-scaled variants.
for checked_frame in (train_x_eng_with_KNN_TSNE_KMean, test_x_eng_with_KNN_TSNE_Kmean):
    print(checked_frame.shape)
for checked_frame in (train_x_eng_with_KNN_TSNE_KMean, test_x_eng_with_KNN_TSNE_Kmean,
                      train_x_log_eng_with_KNN_TSNE_KMean, test_x_log_eng_with_KNN_TSNE_Kmean):
    print(checked_frame.isnull().sum().sum())

(61878, 227)
(144368, 227)
0
0
0
0


In [233]:
# Persist the full feature frames (plain and log-scaled), without the index column.
for csv_name, feature_frame in (
    ("train_x_eng_with_KNN_TSNE_KMean.csv", train_x_eng_with_KNN_TSNE_KMean),
    ("test_x_eng_with_KNN_TSNE_Kmean.csv", test_x_eng_with_KNN_TSNE_Kmean),
    ("train_x_log_eng_with_KNN_TSNE_KMean.csv", train_x_log_eng_with_KNN_TSNE_KMean),
    ("test_x_log_eng_with_KNN_TSNE_Kmean.csv", test_x_log_eng_with_KNN_TSNE_Kmean),
):
    feature_frame.to_csv(csv_name, index=False)

In [235]:
# Persist the base feature frames and both label encodings, without the index column.
for csv_name, saved_frame in (
    ("train_x.csv", train_x),
    ("test_x.csv", test_x),
    ("train_x_log.csv", train_x_log),
    ("test_x_log.csv", test_x_log),
    ("train_y_class.csv", train_y_class),
    ("train_y_label.csv", train_y_label),
):
    saved_frame.to_csv(csv_name, index=False)

In [234]:
# Feature frames available for modelling (train / test pairs).
for feature_frame in (train_x, test_x,
                      train_x_log, test_x_log,
                      train_x_eng_with_KNN_TSNE_KMean, test_x_eng_with_KNN_TSNE_Kmean,
                      train_x_log_eng_with_KNN_TSNE_KMean, test_x_log_eng_with_KNN_TSNE_Kmean):
    print(feature_frame.shape)

# Label frames: one-hot encoded and integer encoded.
for label_frame in (train_y_class, train_y_label):
    print(label_frame.shape)

(61878, 93)
(144368, 93)
(61878, 93)
(144368, 93)
(61878, 227)
(144368, 227)
(61878, 227)
(144368, 227)
(61878, 9)
(61878, 1)
