In [1]:
from random import random
from unittest import skipUnless
from xml.dom.minidom import ReadOnlySequentialNamedNodeMap

import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

# Training data and the list of columns that a later cell removes from it.
data = pd.read_csv('train.csv')
cols = data.columns
cols_drop = ['2', '3', '18', '20', '24', '29']

# L2-penalised logistic regression tuned with 5-fold cross-validation on F1.
logit_cv = LogisticRegressionCV(
    cv=5,
    random_state=0,
    max_iter=200,
    n_jobs=-1,
    scoring='f1',
    solver='liblinear',
    penalty='l2',
)

# Standardise the input first, then classify.
pipe = Pipeline(steps=[
    ('scaller', StandardScaler()),
    ('logit', logit_cv),
])


In [2]:
# Show the column labels of the training frame loaded from 'train.csv'.
data.columns

Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
       '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24',
       '25', '26', '27', '28', '29', '30', 'target'],
      dtype='object')

In [3]:
# Display the training frame (pandas abbreviates the repr of a frame this large).
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,target
0,-6.130724,7.429628,3.651437,-1.950971,-3.384415,1.095934,-3.077774,-2.619091,5.128201,1.228476,...,0.392978,4.920467,16.610460,-2.430804,-1.405573,18.610209,-4.140715,6.027816,-20.288145,0
1,5.161836,-6.514011,-7.474612,-4.544336,-13.081404,1.637562,-1.094672,-1.253545,-2.955342,-10.958200,...,-4.840233,-0.514308,4.613289,2.391302,-4.795664,4.208278,-2.017168,-8.510424,10.806639,0
2,-2.971550,-21.508184,-1.125457,1.524129,3.027444,1.045879,1.551050,1.512075,-1.955564,3.683893,...,-0.385303,2.647917,-2.200556,1.058671,1.076312,-7.802389,-7.553953,0.636639,14.274950,0
3,6.724419,0.566489,0.509764,-4.524162,10.367236,2.083270,0.741790,-2.077787,-2.912744,-4.040637,...,4.731346,15.378418,-14.031666,2.659410,5.123620,-8.500321,3.417960,-14.798490,-6.132800,1
4,-2.213659,-4.678213,-0.135845,2.375933,0.916649,1.027195,-0.353265,-0.220609,-3.416823,-5.964181,...,1.598330,-4.996614,4.504269,1.918961,-2.076223,0.154039,-2.016779,10.803205,5.942927,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6958,1.554111,2.018845,-2.550991,-2.049014,0.931923,1.329123,2.260598,0.907393,0.559868,-0.366525,...,-2.078071,2.774934,-5.819371,-1.397282,0.629832,-1.098601,-7.498520,-7.752161,12.691050,0
6959,5.341612,-4.057497,0.712336,-0.529023,2.389152,1.048196,-4.248458,-2.516312,1.612912,0.088317,...,-0.120301,-3.247273,-3.348203,2.651849,1.059434,-2.842064,-1.960203,-5.095832,-0.174943,0
6960,-2.348591,-8.263995,-0.318484,1.851011,-4.880470,1.392923,-4.348202,2.937274,-3.067852,-0.173702,...,0.819033,0.794195,9.353774,-1.841798,-1.116251,6.708149,-1.841971,5.747190,-7.608288,0
6961,5.481595,4.899412,-0.421851,4.639328,-20.671090,1.088426,-3.838050,0.622003,1.548813,-8.847505,...,1.697137,-8.315975,6.149865,1.437154,-7.284716,4.566744,-0.908937,11.555781,2.889093,0


In [4]:
# Remove the columns listed in `cols_drop` from the training frame.
# errors='ignore' keeps this cell idempotent: because `data` is rebound here, a
# second run would otherwise raise KeyError on the already-removed columns.
# Trade-off: a misspelled label in `cols_drop` is skipped silently as well.
data = data.drop(columns=cols_drop, errors='ignore')

In [5]:
# Single predictor (column '5') and the label column ('target').
X, y = data['5'], data['target']

In [6]:
# Inspect the selected feature Series (column '5').
X

0       1.095934
1       1.637562
2       1.045879
3       2.083270
4       1.027195
          ...   
6958    1.329123
6959    1.048196
6960    1.392923
6961    1.088426
6962    1.980521
Name: 5, Length: 6963, dtype: float64

In [7]:
# Inspect the label Series (column 'target').
y

0       0
1       0
2       0
3       1
4       0
       ..
6958    0
6959    0
6960    0
6961    0
6962    1
Name: target, Length: 6963, dtype: int64

In [8]:
# Hold out 25% of the rows for evaluation; the fixed random_state makes the
# shuffled split reproducible. `train_test_split` is imported in the notebook's
# top import cell. The single feature is reshaped to (n_samples, 1) so it is
# passed to the pipeline as a 2-D array.
X_train, X_test, y_train, y_test = train_test_split(
    X.to_numpy().reshape(-1, 1),
    y.to_numpy(),
    test_size=0.25,
    random_state=42,
    shuffle=True,
)

In [9]:
# Sanity-check the shapes of the four arrays produced by the split.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((5222, 1), (1741, 1), (5222,), (1741,))

In [10]:
# Fit the scaler + cross-validated logistic regression pipeline on the training split.
pipe.fit(X_train, y_train)

Pipeline(steps=[('scaller', StandardScaler()),
                ('logit',
                 LogisticRegressionCV(cv=5, max_iter=200, n_jobs=-1,
                                      random_state=0, scoring='f1',
                                      solver='liblinear'))])

In [11]:
# Score the fitted pipeline on the held-out split.
# NOTE(review): the final estimator was built with scoring='f1', so this is
# presumably an F1 score rather than accuracy - confirm against the sklearn docs.
pipe.score(X_test, y_test)

0.9989594172736733

In [12]:
# Take the estimator of the pipeline's last step and show its cross-validation
# scores, coefficient matrix and intercept together.
est = pipe.steps[-1][1]
(est.scores_, est.coef_, est.intercept_)

({1: array([[0.92389381, 0.92957746, 0.94444444, 0.96949153, 0.98157454,
          0.99170813, 0.99504132, 0.99835255, 0.99835255, 0.99835255],
         [0.91622103, 0.91622103, 0.92389381, 0.9535284 , 0.98157454,
          0.99170813, 0.99504132, 0.99835255, 1.        , 1.        ],
         [0.93870403, 0.93870403, 0.95155709, 0.97461929, 0.99335548,
          0.99834711, 1.        , 1.        , 1.        , 1.        ],
         [0.90647482, 0.90843806, 0.9314587 , 0.95532646, 0.97643098,
          0.98835275, 0.99337748, 0.99504132, 0.99669967, 0.99835255],
         [0.91428571, 0.91622103, 0.93706294, 0.95532646, 0.97815126,
          0.98835275, 0.99504132, 0.99669967, 1.        , 1.        ]])},
 array([[206.83447171]]),
 array([-52.64599077]))

In [13]:
# Print every row of the coefficient matrix next to its row index.
# (The former leading `type(est.coef_)` was removed: it was not the last
# expression of the cell, so its value was never shown. The `list(...)` copy is
# also unnecessary because the array can be iterated row by row directly.)
for row_idx, coef_row in enumerate(est.coef_):
    print(row_idx, coef_row)

# Cell output: the type of a single coefficient row.
type(est.coef_[0])

0 [206.83447171]


numpy.ndarray

In [14]:
# Map each position in the first coefficient row to its coefficient value.
dict_coef = dict(enumerate(est.coef_[0]))

# Positions (as strings) whose coefficient is exactly zero.
cols = [str(pos) for pos, weight in dict_coef.items() if weight == 0]

cols

[]

In [15]:
# Print the coefficient matrix one row at a time, then display the whole array.
for coef_row in est.coef_:
    print(coef_row)

est.coef_

[206.83447171]


array([[206.83447171]])

In [16]:
# Load the separate test set ('test.csv') that the final predictions are made for.
X_test_global = pd.read_csv('test.csv')

In [17]:
# Check the number of rows and columns in the loaded test frame.
X_test_global.shape 

(3920, 31)

In [18]:
# Preview the first rows of the test frame.
X_test_global.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,30
0,-4.768975,-1.744984,0.069141,-6.560182,13.267557,1.768138,1.711161,-5.289421,2.682271,-2.763065,...,4.198756,-0.625212,11.05337,1.535982,0.410761,-3.754269,8.633258,0.224213,-2.310344,-20.48268
1,3.085184,7.679585,-2.392444,-1.334928,-17.347661,2.470484,-0.662081,-0.719648,-2.48993,-11.7624,...,-1.176409,1.365061,0.140867,-4.210869,-5.718286,-4.055603,9.264081,-4.221163,7.48125,9.426657
2,-1.498828,-3.945847,-1.647106,0.058655,1.204024,1.043502,5.246244,2.123335,2.851375,4.398413,...,0.301051,0.75836,0.09001,-1.550171,-0.971835,2.027758,-2.723663,0.74939,-12.130969,4.706467
3,-2.813839,-1.913776,1.308982,3.866011,-9.249616,2.265848,-3.257832,-2.233508,3.623846,4.6651,...,2.31656,-0.590067,-6.825693,8.512672,-3.259642,-2.297466,4.758336,-2.134407,10.448348,-5.808233
4,-2.098517,1.831659,2.587683,1.507983,5.421837,2.184649,1.214167,0.402367,1.256427,17.582508,...,20.077786,-2.657541,5.592968,1.33993,0.425855,2.649136,-6.496886,-0.726197,-12.47883,1.058612


In [19]:
# Keep only column '5' (the single feature the pipeline was fitted on) as an
# (n_samples, 1) array. The result is stored under the same name, so guard on
# the type: without it, re-running this cell would index the already-converted
# ndarray with '5' and fail.
if isinstance(X_test_global, pd.DataFrame):
    X_test_global = X_test_global['5'].to_numpy().reshape(-1, 1)

In [20]:
# Predict labels for the test-set feature array with the fitted pipeline.
predict = pipe.predict(X_test_global)

In [21]:
# Put the predictions in a one-column frame named 'target' and write it to
# '02.4.csv' without the index column.
df_output = pd.DataFrame(predict, columns=['target'])
df_output.to_csv('02.4.csv', index=False)