In [0]:
pip install --upgrade tensorflow

In [0]:
from google.colab import files
uploaded = files.upload()

Saving kaggle.json to kaggle.json


In [0]:
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle competitions download -c cat-in-the-dat --force

Downloading train.csv.zip to /content
 41% 5.00M/12.3M [00:00<00:00, 15.9MB/s]
100% 12.3M/12.3M [00:00<00:00, 30.9MB/s]
Downloading test.csv.zip to /content
 61% 5.00M/8.15M [00:00<00:00, 14.7MB/s]
100% 8.15M/8.15M [00:00<00:00, 20.6MB/s]
Downloading sample_submission.csv.zip to /content
  0% 0.00/437k [00:00<?, ?B/s]
100% 437k/437k [00:00<00:00, 129MB/s]


In [0]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import seaborn as sns

train = pd.read_csv('/content/train.csv.zip', header=0, sep=',', quotechar='"', index_col='id')
test = pd.read_csv('/content/test.csv.zip', header=0, sep=',', quotechar='"',index_col='id')
subm = pd.read_csv('/content/sample_submission.csv.zip', header=0, sep=',', quotechar='"')

In [0]:
for i in train.columns:
    if i == 'id':
        continue
    train[i] = train[i].astype('category')
    train[i] = train[i].cat.codes

for i in test.columns:
    if i == 'id':
        continue
    test[i] = test[i].astype('category')
    test[i] = test[i].cat.codes


In [0]:
def add_noise(series, noise_level):
    return series * (1 + noise_level * np.random.randn(len(series)))

def target_encode(trn_series=None, 
                  tst_series=None, 
                  target=None, 
                  min_samples_leaf=1, 
                  smoothing=1,
                  noise_level=0):

    assert len(trn_series) == len(target)
    assert trn_series.name == tst_series.name
    temp = pd.concat([trn_series, target], axis=1)
    # Compute target mean 
    averages = temp.groupby(by=trn_series.name)[target.name].agg(["mean", "count"])
    # Compute smoothing
    smoothing = 1 / (1 + np.exp(-(averages["count"] - min_samples_leaf) / smoothing))
    # Apply average function to all target data
    prior = target.mean()
    # The bigger the count the less full_avg is taken into account
    averages[target.name] = prior * (1 - smoothing) + averages["mean"] * smoothing
    averages.drop(["mean", "count"], axis=1, inplace=True)
    # Apply averages to trn and tst series
    ft_trn_series = pd.merge(
        trn_series.to_frame(trn_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=trn_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_trn_series.index = trn_series.index 
    ft_tst_series = pd.merge(
        tst_series.to_frame(tst_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=tst_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_tst_series.index = tst_series.index
    return add_noise(ft_trn_series, noise_level), add_noise(ft_tst_series, noise_level)

In [0]:
for i in train.columns:
    if i == 'target': continue
    train[i], test[i] = target_encode(train[i],test[i],target=train.target,
                             min_samples_leaf=100,smoothing=100,noise_level=0.01) 
    # train[i] = train[i].map(train.groupby(i)['target'].mean())
    # test[i] = test[i].map(train.groupby(i)['target'].mean())

In [0]:
trainlabel = train['target']
traindata = train.drop(['target'],axis = 1)

### ml

In [0]:

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC

modelsClassification = []
modelsClassification.append(('LR ',LogisticRegression()))
# modelsClassification.append(('DTC',DecisionTreeClassifier()))
# modelsClassification.append(('RFC',RandomForestClassifier(n_estimators=5)))
# modelsClassification.append(('GNB',GaussianNB()))
# modelsClassification.append(('SGD',SGDClassifier()))
# modelsClassification.append(('SVC',SVC()))

from sklearn.metrics import confusion_matrix 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report 

def ModellingAndEvaluationClassification(xTrain,yTrain,xTest,yTest = None):
    ConfusionMatries = []
    AccuracyScores = []
    ClassificationReports = []
    for name,model in modelsClassification:
        model.fit(xTrain,yTrain)
        predict = model.predict(xTest)

        if yTest is not None:
            ConfusionMatries.append((name,confusion_matrix(yTest,predict)))
            AccuracyScores.append((name,accuracy_score(yTest,predict)))
            ClassificationReports.append((name,classification_report(yTest,predict)))
    
    print('ConfusionMatrices:-')
    for name, score in ConfusionMatries:
        print(name,':')
        print(score)
        print()
        
    print('Accuracy Scores:-')        
    for name, score in AccuracyScores:
        print(name,':',score)
        print()

    # print()
    # print('Classification Reports:-')    
    # for name, score in ClassificationReports:
    #     print(name,':')
    #     print(score)
    #     print()

    return predict

In [11]:
predict = ModellingAndEvaluationClassification(traindata[:200000],trainlabel[:200000],traindata[200000:],trainlabel[200000:])



ConfusionMatrices:-
LR  :
[[62614  6765]
 [14946 15675]]

Accuracy Scores:-
LR  : 0.78289



In [0]:
predict = ModellingAndEvaluationClassification(traindata,trainlabel,test)

In [0]:
from sklearn.model_selection  import KFold,cross_val_score,cross_val_predict

model = LogisticRegression(solver='lbfgs')
trn = traindata[:300000]
lbl = trainlabel[:300000]
kfold = KFold(n_splits=10, random_state=0)
# scores =   cross_val_score(model,trn,lbl,scoring='accuracy', cv=kfold)
predic = cross_val_predict(model,trn,lbl,cv = kfold)
print(accuracy_score(lbl,predic))
# print(scores)

In [0]:
import lightgbm as lgb

params = { 'objective': 'multiclassova', 'metric': 'multi_logloss', 'verbose': -1, 'num_class':10}
evals_result = {} # to record eval results for plotting

lgb_train = lgb.Dataset(traindata, trainlabel)
lgb_test = lgb.Dataset(test, None, reference=lgb_train)

In [0]:
gbm = lgb.train(params,lgb_train)

In [0]:
predict = gbm.predict(test)
predict1 = [None]*len(predict)
for i in range(len(predict)) : predict1[i] = np.argmax(predict[i])
# accuracy_score(trainlabel,predict1)

In [15]:
pr = pd.DataFrame(index = subm['id'])
pr['target'] = predict1
pr['target'].value_counts()

0    160485
1     39515
Name: target, dtype: int64

In [0]:
pr.to_csv('DTC.csv')

In [18]:
!kaggle competitions submit -c cat-in-the-dat -f DTC.csv -m "Message"

100% 1.72M/1.72M [00:08<00:00, 200kB/s]
403 - Your team has used its submission allowance (5 of 5). This resets at midnight UTC (16 hours from now).


In [0]:
# roc curve and auc
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from matplotlib import pyplot
# generate 2 class dataset
# X, y = make_classification(n_samples=1000, n_classes=2, random_state=1)
# split into train/test sets
trainX, testX, trainy, testy = train_test_split(traindata, trainlabel, test_size=0.5, random_state=2)
# generate a no skill prediction (majority class)
ns_probs = [0 for _ in range(len(testy))]
# fit a model
model = LogisticRegression(solver='lbfgs')
model.fit(trainX, trainy)
# predict probabilities
lr_probs = model.predict_proba(testX)
# keep probabilities for the positive outcome only
lr_probs = lr_probs[:, 1]
# calculate scores
ns_auc = roc_auc_score(testy, ns_probs)
lr_auc = roc_auc_score(testy, lr_probs)
# summarize scores
print('No Skill: ROC AUC=%.3f' % (ns_auc))
print('Logistic: ROC AUC=%.3f' % (lr_auc))
# calculate roc curves
ns_fpr, ns_tpr, _ = roc_curve(testy, ns_probs)
lr_fpr, lr_tpr, _ = roc_curve(testy, lr_probs)
# plot the roc curve for the model
pyplot.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
pyplot.plot(lr_fpr, lr_tpr, marker='.', label='Logistic')
# axis labels
pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')
# show the legend
pyplot.legend()
# show the plot
pyplot.show()