In [1]:
#importing libraries
import numpy as np 
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from tqdm.auto import tqdm
from sklearn.metrics import f1_score

import os
for dirname, _, filenames in os.walk('/kaggle/input/dataset'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/dataset/sample_submission.csv
/kaggle/input/dataset/Train.csv
/kaggle/input/dataset/Test.csv


# loading training dataset and printing the dimensions of the dataset as well as the first five rows of the dataset

In [2]:
train_df = pd.read_csv("/kaggle/input/dataset/Train.csv")
#to print the dimensions of the dataset
print(train_df.shape)

#to print the first five rows
train_df.head()

(40776, 32)


Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V45,V47,V50,V51,V52,V59,V63,V72,V75,class
0,0.0,1.0,243.0,29,4528.0,138.0,3.0,29,201.0,0.0,...,2,3,71.0,0,0,46.0,1,2,0,1
1,0.0,20.0,8.0,14,4183.0,183.0,4.0,26,156.0,0.0,...,5,7,20.0,0,2,13.0,9,6,0,1
2,14.0,2.0,2.0,9,1113.0,15.0,37.0,31,78.0,1.0,...,7,5,11.0,0,0,7.0,9,2,1,0
3,0.0,175.0,17.0,16,4250.0,68.0,8.0,30,152.0,0.0,...,1,3,0.0,1,1,31.0,6,4,5,1
4,2.0,-1.0,1.0,3,0.0,0.0,2.0,5,6.0,1.0,...,1,5,0.0,0,0,2.0,8,0,1,1


# printing the column name of the dataset

In [3]:
train_df.columns

Index(['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11',
       'V13', 'V19', 'V22', 'V30', 'V33', 'V35', 'V36', 'V40', 'V41', 'V42',
       'V43', 'V45', 'V47', 'V50', 'V51', 'V52', 'V59', 'V63', 'V72', 'V75',
       'class'],
      dtype='object')

# spliting the dataset into X and Y

In [4]:
X = train_df.drop(['class'], axis=1)
y = train_df['class']

In [5]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.3)

# count of target in training dataset

In [6]:
print(y_train.value_counts())

class
0    14278
1    14265
Name: count, dtype: int64


# training a logistic regression model (iterations = 5000)

In [7]:
log_reg= LogisticRegression(max_iter=5000)
log_reg.fit(X_train, y_train)

# calculating the F1 score

In [8]:
y_pred = log_reg.predict(X_val)
print(f1_score(y_val, y_pred))

0.6402716806096248


# loading testing dataset and printing the dimensions of the dataset as well as the first five rows of the dataset

In [9]:
test_df = pd.read_csv("/kaggle/input/dataset/Test.csv")
#to print the dimensions of the dataset
print(test_df.shape)

#to print the first five rows
test_df.head()

(17476, 32)


Unnamed: 0,Index,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V43,V45,V47,V50,V51,V52,V59,V63,V72,V75
0,40977,3.0,97.0,2.0,3,475.0,23.0,11.0,10,146.0,...,123.0,1,8,0.0,0,1,5.0,6,14,0
1,24157,7.0,29.0,11.0,10,8.0,2.0,11.0,16,19.0,...,26.0,1,8,2.0,4,1,45.0,6,2,26
2,56238,2.0,9.0,5.0,1,1003.0,29.0,2.0,29,54.0,...,10.0,1,5,35.0,1,1,6.0,6,2,3
3,55411,0.0,0.0,22.0,17,1527.0,28.0,4.0,24,28.0,...,95.0,8,8,1.0,2,1,15.0,8,1,3
4,53175,13.0,-1.0,46.0,6,4.0,5.0,16.0,15,22.0,...,30.0,5,8,13.0,1,1,5.0,6,1,2


# creating index and removing it from the dataset using .drop()

In [10]:
Index=test_df['Index']
test_df.drop(['Index'],axis=1,inplace=True)

# making predictions on the testing dataset

In [11]:
y_test = log_reg.predict(test_df)
print(y_test)

[0 0 1 ... 1 0 0]


# Result

In [12]:
result = pd.DataFrame({
    'Index': Index,
    'class': y_test
})

print(result)

       Index  class
0      40977      0
1      24157      0
2      56238      1
3      55411      1
4      53175      1
...      ...    ...
17471  17197      1
17472  14094      0
17473  34598      1
17474  17845      0
17475  11574      0

[17476 rows x 2 columns]


# Submission CSV File

In [13]:
result.to_csv("submission.csv", index=False)