In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
from tqdm import tqdm
from ast import literal_eval
import os
from multiprocessing import Pool
from multiprocessing import cpu_count

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.metrics import accuracy_score,f1_score

In [None]:
# Notebook-wide configuration constants.
# Fill value for missing entries; only referenced from commented-out code further down.
nan_value = -1
# Integer columns with fewer distinct values than this are cast to 'category'.
categorical_limit = 8
# Non-categorical columns whose training maximum reaches this value get a signed log1p transform.
log_limit = 10000
# Columns with at least this percentage of missing training values are dropped.
missing_threshold = 99
# Directory (with trailing slash) that holds the input CSVs and cached artefacts.
data_path = './DataSet/'

In [None]:
# Load the raw train and test tables from data_path.
train = pd.read_csv(f'{data_path}Train.csv')
test = pd.read_csv(f'{data_path}Test.csv')

In [None]:
# train.drop_duplicates(inplace=True)
# Detach the label column: pop() returns Col2 and removes it from the feature frame in one step.
target = train.pop('Col2')

In [None]:
# (rows, columns) of the training features after the label column was removed.
train.shape

In [None]:
# (rows, columns) of the test table.
test.shape

In [None]:
# Preview the first training rows.
train.head()

In [None]:
# Preview the first test rows.
test.head()

In [None]:
def _encode_shared_labels(train, test, col):
    """Label-encode ``col`` in both frames using one encoder fitted on their combined values."""
    encoder = LabelEncoder()
    # Fit on train + test together so a value present in only one frame still gets a code.
    encoder.fit(np.array(train[col].tolist() + test[col].tolist()))
    train[col] = encoder.transform(train[col])
    test[col] = encoder.transform(test[col])


def processCol1(train,test):
    """Split ``Col1`` into three label-encoded columns and drop the original.

    Every ``Col1`` value is sliced into characters ``[:2]``, ``[2:4]`` and ``[4:]``,
    stored as ``Col1_1``, ``Col1_2`` and ``Col1_3``. Each new column is then
    label-encoded with an encoder fitted on the train and test values together.

    Both frames are modified in place. The number of distinct codes per new
    column is printed (train first, then test).

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames containing a sliceable (string) ``Col1`` column.

    Returns
    -------
    tuple
        The same ``(train, test)`` objects that were passed in.
    """
    parts = (
        ('Col1_1', slice(0, 2)),
        ('Col1_2', slice(2, 4)),
        ('Col1_3', slice(4, None)),
    )

    # Build the three raw pieces for each frame.
    for frame in (train, test):
        raw_values = frame['Col1']
        for name, piece in parts:
            frame[name] = [value[piece] for value in raw_values]

    # Encode each piece with a vocabulary shared by both frames.
    for name, _ in parts:
        _encode_shared_labels(train, test, name)

    train.drop(['Col1'],axis=1,inplace=True)
    test.drop(['Col1'],axis=1,inplace=True)

    for frame in (train, test):
        for name, _ in parts:
            print(frame[name].nunique())
    return train,test

In [None]:
# Replace Col1 with its three label-encoded parts in both frames.
train,test = processCol1(train,test)

In [None]:
# Number of training rows per label value.
target.value_counts()

In [None]:
# Bar chart of the label distribution; the counts are computed once and reused for both axes.
label_counts = target.value_counts()
sns.barplot(x=label_counts.index,y=label_counts.values)

In [None]:
def missing_values_table(train,test,drop,threshold):
    """Summarise missing values per column of ``train`` and optionally drop sparse columns.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame whose missing values are counted.
    test : pandas.DataFrame
        Frame from which the same columns are dropped when ``drop`` is true.
    drop : bool
        When true, columns whose missing percentage in ``train`` is at least
        ``threshold`` are removed in place from both frames.
    threshold : float
        Missing-value percentage (0-100) at or above which a column is dropped.

    Returns
    -------
    tuple
        ``(summary, train, test)`` where ``summary`` lists, for every column with
        at least one missing value, the count and the percentage (rounded to one
        decimal) sorted by percentage in descending order.
    """
    missing_count = train.isnull().sum()
    missing_percent = 100 * missing_count / len(train)
    summary = pd.concat([missing_count, missing_percent], axis=1)
    summary.columns = ['Missing Values', '% of Total Values']
    summary = summary[summary['% of Total Values'] != 0].sort_values(
        '% of Total Values', ascending=False)
    print ("Your selected dataframe has " + str(train.shape[1]) + " columns.\n"
        "There are " + str(summary.shape[0]) +
          " columns that have missing values.")
    if drop:
        # Compare the exact percentages: rounding first would push e.g. 98.96% up to
        # 99.0 and drop a column that is actually below the threshold.
        to_drop = summary.index[summary['% of Total Values'] >= threshold]
        print(to_drop)
        train.drop(to_drop,axis=1,inplace=True)
        test.drop(to_drop,axis=1,inplace=True)
    # Rounding is for presentation only.
    return summary.round(1),train,test

In [None]:
# Drop columns whose missing share in train is at least missing_threshold percent,
# then show the 20 columns with the most missing values.
missing_values,train,test = missing_values_table(train,test,True,missing_threshold)
missing_values.head(20)

In [None]:
def handle_dashes(train,test):
    """Convert every object-dtype column of both frames to numeric, in place.

    Values that cannot be parsed as numbers (presumably the dashes the name
    refers to) become NaN because ``errors='coerce'`` is used.

    Returns the same ``(train, test)`` objects.
    """
    for frame in (train, test):
        object_columns = [name for name in frame.columns if frame[name].dtype == object]
        for name in object_columns:
            frame[name] = pd.to_numeric(frame[name],errors='coerce')
    return train,test

In [None]:
# Coerce all object-dtype columns in both frames to numeric (unparseable values become NaN).
train,test = handle_dashes(train,test)

In [None]:
def matchTypes(train, test):
    """Align the dtype of each ``test`` column with the matching ``train`` column.

    * ``train`` column is float: the ``test`` column is cast to float.
    * ``train`` column is int: the ``test`` column is cast to int; if that cast is
      impossible (for example the test column contains NaN), the ``train`` column
      is widened to float instead.

    Both frames are modified in place and returned.

    Raises
    ------
    KeyError
        If a float or int ``train`` column does not exist in ``test``.
    """
    for col in train.columns:
        if train[col].dtype == float:
            test[col] = test[col].astype('float')
        elif train[col].dtype == int:
            try:
                test[col] = test[col].astype('int')
            except (ValueError, TypeError):
                # Only a failed cast should trigger the fallback; a bare except would
                # also hide missing columns and interrupts.
                train[col] = train[col].astype('float')
    return train,test

In [None]:
# Make int/float column dtypes agree between train and test.
train, test = matchTypes(train, test)

In [None]:
# Column names grouped by dtype; these two groups are what the duplicate-column scan below receives.
int_cols = train.columns[train.dtypes=='int']
float_cols = train.columns[train.dtypes=='float']

In [None]:
# np.array_equal(A,B)  # test if same shape, same elements values
# np.array_equiv(A,B)  # test if broadcastable shape, same elements values
# np.allclose(A,B,...) 
def processDuplicateColumns(train,test,columns,threshold):
    """Find columns that duplicate another column in both ``train`` and ``test``.

    Within each group of column names in ``columns``, a column is recorded as a
    duplicate of an earlier one when ``np.allclose(..., equal_nan=True)`` holds for
    the pair in ``train`` and also in ``test``. The mapping
    ``{kept_column: [duplicate_columns]}`` is written as text to
    ``f'{data_path}duplicate_columns.txt'`` and returned.

    ``threshold`` is accepted but not used by the comparison.
    """
    duplicates = {}
    for group in columns:
        seen = set()
        for anchor in tqdm(group):
            if anchor in seen:
                # Already recorded as a duplicate of an earlier column.
                continue
            matches = []
            for candidate in group:
                if candidate == anchor or candidate in seen:
                    continue
                if not np.allclose(train[anchor],train[candidate],equal_nan=True):
                    continue
                if np.allclose(test[anchor],test[candidate],equal_nan=True):
                    matches.append(candidate)
                    seen.add(candidate)
            if matches:
                duplicates[anchor] = matches
            seen.add(anchor)
    # The context manager closes the file; no explicit close() is required.
    with open(f'{data_path}duplicate_columns.txt','w') as out:
        out.write(str(duplicates))
    return duplicates

In [None]:
# Reuse the duplicate-column map saved by an earlier run if the cache file exists;
# otherwise compute it (processDuplicateColumns also writes that file).
# NOTE(review): nothing here invalidates the cache when the data or earlier cells change.
if os.path.exists(f'{data_path}duplicate_columns.txt'):
    with open(f'{data_path}duplicate_columns.txt','r') as f:
        data = literal_eval(f.read())
else:
    data = processDuplicateColumns(train,test,[int_cols,float_cols],0)

In [None]:
# Remove every column listed as a duplicate, keeping the column it duplicates (the dict key).
for v in tqdm(data.values()):
    train.drop(v,axis=1,inplace=True)
    test.drop(v,axis=1,inplace=True)

In [None]:
# Integer columns with fewer than categorical_limit distinct training values become 'category'.
# NOTE(review): train and test are cast independently, so their category sets can differ
# when a value occurs in only one frame — confirm the downstream model tolerates that.
categorical_columns = train.columns[((train.dtypes == 'int')&(train.nunique() < categorical_limit))]
for col in tqdm(categorical_columns):
    train[col] = train[col].astype('category')
    test[col] = test[col].astype('category')

In [None]:
# Summary of the training frame after the dtype changes above.
train.info()

In [None]:
# Select the non-categorical columns first: calling max() on unordered categorical
# columns raises in recent pandas, and in older versions the result no longer lines
# up position-by-position with train.columns.
non_categorical = train.select_dtypes(exclude='category')
col_to_log = non_categorical.columns[(non_categorical.max() >= log_limit).values]
def getSign(val):
    """Return -1 for negative values and 1 otherwise (zero counts as positive)."""
    if val < 0.0:
        return -1
    else:
        return 1
# Signed log1p transform: sign(x) * log(1 + |x|). Columns with a missing value in either
# frame are left untouched.
for col in col_to_log:
    if train[col].isna().sum() == 0 and test[col].isna().sum() == 0:
        sign = train[col].apply(getSign)
        train[col] = np.log1p(np.abs(train[col]))*sign
        sign = test[col].apply(getSign)
        test[col] = np.log1p(np.abs(test[col]))*sign

In [None]:
# Total number of missing cells left in train.
train.isna().sum().sum()

In [None]:
# Total number of missing cells left in test.
test.isna().sum().sum()

In [None]:
# correlations = train.corr(target).sort_values()
# print('Most Positive Correlations:\n', correlations.tail(15))
# print('\nMost Negative Correlations:\n', correlations.head(15))

In [None]:
# Snapshot the fully preprocessed frames so feature subsets can be re-derived from them below.
cp = train.copy()
cp2 = test.copy()

In [None]:
# Keep only the categorical features. The column list comes from the train snapshot so
# that both frames end up with the same columns in the same order.
category_cols = cp.columns[cp.dtypes == 'category']
train = cp[category_cols]
test = cp2[category_cols]

In [None]:
# lgbm_imp = np.loadtxt(f'{data_path}lgbm_imp_final.txt',dtype='str')
# lgbm_imp_int = np.loadtxt(f'{data_path}lgbm_imp_int2.txt',dtype='str')

In [None]:
# train = cp[np.concatenate((lgbm_imp,lgbm_imp_int))]
# test = cp2[np.concatenate((lgbm_imp,lgbm_imp_int))]

In [None]:
# train = cp[lgbm_imp]
# test = cp2[lgbm_imp]

In [None]:
# train = cp.copy()
# test = cp2.copy()

In [None]:
# train[train.columns[train.dtypes != 'category']].fillna(nan_value,inplace=True)
# train[test.columns[train.dtypes != 'category']].fillna(nan_value,inplace=True)

In [None]:
import lightgbm as lgb
import xgboost as xgb
import catboost as cat
# import sk

In [None]:
# Only the CatBoost model is active; the LightGBM / XGBoost alternatives are disabled.
# model_lgb = lgb.LGBMClassifier()
# model_xgb = xgb.XGBClassifier()
model_cat = cat.CatBoostClassifier(learning_rate=0.1,iterations=1200)

In [None]:
# random_state only takes effect when shuffle=True; recent scikit-learn raises a
# ValueError if a seed is given without shuffling, so shuffling is enabled explicitly.
kfold = StratifiedKFold(n_splits=3,shuffle=True,random_state=2019)

In [None]:
# Stratified cross-validation: refit model_cat on each fold's training part and print the
# accuracy on the held-out part.
# NOTE(review): the same model object is refitted every iteration, so after the loop
# model_cat holds the fit from the last fold only (not a fit on all training rows).
for train_idx,test_idx in kfold.split(train,target):
    x_train,y_train = train.iloc[train_idx],target.iloc[train_idx]
    x_test,y_test = train.iloc[test_idx],target.iloc[test_idx]
#     model_lgb.fit(x_train,y_train,eval_metric=accuracy_score)
#     model_xgb.fit(x_train,y_train,eval_metric=accuracy_score)
    model_cat.fit(x_train,y_train,verbose=True)
    # argmax yields the positional index of the most probable class.
    # NOTE(review): this equals the label only if labels are 0..k-1 — confirm.
    pred = np.argmax(model_cat.predict_proba(x_test),axis=1)#+model_xgb.predict_proba(x_test)+model_cat.predict_proba(x_test),axis=1)
    print(accuracy_score(y_test,pred))

In [None]:
# Build the submission: predict on the test features with model_cat as left by the CV
# loop, store the predicted class index in Col2 and write the file.
sub = pd.read_csv(f'{data_path}Sample_submission.csv')
# pred = np.argmax(model_lgb.predict_proba(test)+model_xgb.predict_proba(test)+model_cat.predict_proba(test),axis=1)
# NOTE(review): argmax gives a class position, which matches the label only for labels 0..k-1 — confirm.
pred = np.argmax(model_cat.predict_proba(test),axis=1)
sub.Col2 = pred
sub.to_csv('output1.csv',index=False)
# Number of test rows predicted as class 0.
len(pred[pred==0])

In [None]:
# train.to_csv('Train2.csv',index=False)

In [None]:
# Save the names of the features to which the fitted CatBoost model assigns non-zero importance.
np.savetxt(f'{data_path}cat_imp_category.txt',train.columns[model_cat.feature_importances_ > 0].values,fmt='%s')

In [None]:
# train.columns[model.feature_importances_ > 20].values

In [None]:
# model.feature_importances_

In [None]:
# train.shape

In [None]:
# len(lgbm_imp)

In [None]:
# train.columns[model.feature_importances_ == 0]

In [None]:
# l5 = train.columns[model.feature_importances_ < 5]
# g5 = train.columns[model.feature_importances_ >= 5]
# l10 = train.columns[(model.feature_importances_ < 10)&(model.feature_importances_ > 10)]
# g10 = train.columns[(model.feature_importances_ < 10)&(model.feature_importances_ > 10)]

In [None]:
# test.Col1_2.nunique()

In [None]:
# Number of features with exactly zero importance in the fitted CatBoost model.
len(train.columns[model_cat.feature_importances_ == 0].values)

In [None]:
# Per-feature importance values of the fitted CatBoost model.
model_cat.feature_importances_

In [None]:
# Shape of the full preprocessed training snapshot (before feature subsetting).
cp.shape