In [17]:
import pandas as pd
import numpy  as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelBinarizer
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import plot_confusion_matrix, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

In [187]:
training = pd.read_csv('Dataset/train.csv')
testing = pd.read_csv('Dataset/test.csv')

In [188]:
def preprocess_data(data, y, cols):
    data['issue_date'] = pd.to_datetime(data['issue_date'], format='%Y-%m-%d %H:%M:%S')
    data['listing_date'] = pd.to_datetime(data['listing_date'], format='%Y-%m-%d %H:%M:%S')
    data['issue_date'] = data.issue_date.values.astype(np.int64) // 10 **9
    data['listing_date'] = data.listing_date.values.astype(np.int64) // 10**9
    imputed = data.iloc[:,1:-2]
    merger = pd.DataFrame(y,columns=cols)
    s1 = imputed.iloc[:,:3]
    s2 = imputed.iloc[:,4:]
    middle = s1.join(merger)
    final = middle.join(s2)
    Knn = KNNImputer(missing_values=np.float64(np.nan),n_neighbors=5)
    final['condition'] = pd.DataFrame(Knn.fit_transform(final.condition.values.reshape(-1, 1)))
    y_train = data.iloc[:,-2:]
    x_train = final
    return x_train, y_train

In [20]:
def scaling(x_train, x_test):
    Scaler = StandardScaler()
    x_train = Scaler.fit_transform(x_train)
    x_test = Scaler.transform(x_test)
    return x_train, x_test

In [21]:
def plot_model(model, x_test, y_test):
    plot_confusion_matrix(model, x_test,y_test,cmap='Blues',values_format='.5g')

In [22]:
def models_for_first(model, x_train, y_train, x_test, y_test):
    accuracy = []
    for i in range(2):
        model.fit(x_train, y_train.iloc[:, i])
        y_pred = model.predict(x_test)
        mat = confusion_matrix(y_test.iloc[:, i], y_pred)
        acc = sum(mat.diagonal())/sum(sum(mat))
        accuracy.append(acc)
        plot_model(model, x_test, y_test.iloc[:, i])
    return accuracy,model

In [83]:
def guess(model,x_train, y_train,final_test):
    y_pred= []
    for i in range(2):
        model.fit(x_train, y_train.iloc[:, i])
        y_pred.append(model.predict(final_test))
    return y_pred

In [153]:
def finalize(predictions, testing, num):
    predictions[0].astype(np.int64)
    predictions.insert(0,testing.iloc[:, 0])
    finals = pd.DataFrame(predictions).T
    finals.columns = [finals.columns[0],training.columns[-2], training.columns[-1]]
    finals = finals.set_index(finals.columns[0])
    finals.to_csv('Result{}.csv'.format(num))

In [23]:
Bun = LabelBinarizer()
y = Bun.fit_transform(training.color_type)
y_t = Bun.transform(testing.color_type)

In [24]:
cols = training.color_type.unique()

In [33]:
svc = SVC(C=3)

In [31]:
# models_for_first(svc, scaled_x_train_2,y_train_2, scaled_x_test_2,y_test_2)

In [253]:
xgbc = XGBClassifier(max_depth=5, n_estimators=600,nthread=18,base_score=0.602)

In [45]:
from sklearn.ensemble import GradientBoostingClassifier as gdf

In [258]:
md = gdf(n_estimators=900,max_depth=5,learning_rate=0.001,loss='')

In [52]:
from sklearn.ensemble import AdaBoostClassifier as ada

In [255]:
forest = RandomForestClassifier(n_estimators=800,n_jobs=-1,max_depth=5)

In [256]:
clf = ada(base_estimator=forest,n_estimators=200,algorithm='SAMME.R')

In [174]:
x_test,y_test = preprocess_data(testing, y_t, cols)

In [85]:
final_test = x_test.join(y_test)

In [155]:
finalize(guess(md, scaled_X, y_train ,scale_x_test), testing, 2)

In [189]:
x_train, y_train = preprocess_data(training,y, cols)

In [190]:
x_train.head()

Unnamed: 0,issue_date,listing_date,condition,Brown Tabby,White,Brown,Black,Red,Tricolor,Calico,...,Agouti,Chocolate Point,Liver Tick,Pink,Black Tiger,Silver Lynx Point,length(m),height(cm),X1,X2
0,1468108800,1474475100,2.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.8,7.78,13,9
1,1384992000,1545932820,1.0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0.72,14.19,13,9
2,1411862400,1476865440,0.88339,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.15,40.9,15,4
3,1483142400,1548441000,1.0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0.62,17.82,0,1
4,1506556800,1511084280,2.0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0.5,11.06,18,4


In [191]:
x_test, y_test = preprocess_data(testing, y_t, cols)

In [192]:
x_test.head()

Unnamed: 0,issue_date,listing_date,condition,Brown Tabby,White,Brown,Black,Red,Tricolor,Calico,...,Brown Tiger,Liver,Agouti,Chocolate Point,Liver Tick,Pink,Black Tiger,Silver Lynx Point,length(m),height(cm)
0,1124236800,1504798500,0.0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0.87,42.73
1,1542240000,1557336240,1.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.06,6.71
2,1349913600,1522687860,1.0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0.24,41.21
3,1423785600,1522999500,1.0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0.29,8.46
4,1484697600,1524750120,1.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.71,30.92


In [194]:
final_test = x_test.join(y_test)

In [198]:
finalize(guess(xgbc,x_train,y_train,final_test),testing,3)

In [199]:
finalize(guess(md,x_train, y_train, final_test), testing, 4)

In [204]:
scaled_X, scaled_x_test = scaling(x_train, final_test)

In [260]:
finalize(guess(md, x_train,y_train,final_test),testing, 22)

In [236]:
# 10621/18334

0.5793062070470165