In [89]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import accuracy_score, f1_score

# Load the training and test tables from disk.
train_source = pd.read_csv("./data/train_cleaned.csv")
test_source = pd.read_csv("./data/testing_set.csv")

# Drop the columns that are not fed to the model. The Time column is kept in
# both frames (it is encoded in a later step); the untouched *_source frames
# stay available for anything that still needs the dropped columns.
train = train_source.drop(columns=['train_idx', 'valid', 'outlier', 'valid2'])
test = test_source.drop(columns=['test_idx'])

# Encode the Time column as ordinal codes. The categories are learned from the
# training frame only; Time values that appear only in the test frame are
# mapped to -1 instead of raising.
time_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
time_encoder.fit(train[['Time']])

for frame in (train, test):
    # The encoder returns floats, so store the codes and then cast them to int.
    frame['Time'] = time_encoder.transform(frame[['Time']])
    frame['Time'] = frame['Time'].astype(int)

# Separate the features and the target variable in the train dataset
X_train = train.drop('label', axis=1)
y_train = train['label']

# Create a logistic regression model (library defaults).
# NOTE(review): the features are not scaled here; if fitting emits a
# ConvergenceWarning, consider raising max_iter or scaling the inputs.
model = LogisticRegression()

# Train the model
model.fit(X_train, y_train)

# Select the test features by name, in the exact order used for fitting.
# The two CSVs are read independently, so their column order is not guaranteed
# to match; scikit-learn validates feature names at predict time, and a
# positional mismatch would otherwise either error out or silently feed the
# wrong feature into each coefficient. A missing feature fails here with a
# KeyError that names the column.
X_test = test[X_train.columns]

# Predict on the test set (one prediction per row of `test`, in row order)
y_pred = model.predict(X_test)


In [90]:
import json
# Build the prediction table from this notebook's own outputs. `pred_df` was
# previously read without being defined anywhere in these cells, so the export
# only worked with leftover kernel state (and could have written predictions
# from a different model under the LR file name).
# `y_pred` is in the same row order as `test_source`, because `test` was
# derived from it by dropping a column only.
pred_df = pd.DataFrame({"test_idx": test_source["test_idx"], "label": y_pred})

# Create a new dictionary with the desired structure: {"target": {test_idx: label}}.
# .tolist() yields plain Python scalars, which json can serialize (raw NumPy
# scalars such as int64 cannot be serialized by the json module).
json_data = {
    "target": {
        str(idx): label
        for idx, label in zip(pred_df["test_idx"].tolist(), pred_df["label"].tolist())
    }
}

# Convert the dictionary to a JSON string
json_string = json.dumps(json_data)

# export the JSON string to a file
with open("./data/predsLR.json", "w", encoding="utf-8") as f:
    f.write(json_string)