In [7]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tqdm import tqdm

# Load the data using pandas
file_path = 'cluster_new_raw_test.csv'
df = pd.read_csv(file_path)

# Drop unnecessary column
df.drop(columns=['id'], inplace=True)

# Show first few rows
df.head()

# Check the distribution of target variable
y = df['Cluster']
y.value_counts()

# Define target column and features
target_column = 'Cluster'
X = df.drop(columns=[target_column])
y = df[target_column]

# Apply standardization (scaling) to the features
scaler = StandardScaler()  # Create the StandardScaler object
X = scaler.fit_transform(X)  # Apply scaling

# Split the data into train and test (using sklearn)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Initialize the Logistic Regression model
model = LogisticRegression(max_iter=1000, solver='liblinear', multi_class='ovr')

# Training loop with batch processing
batch_size = 3000
n_batches = len(X_train) // batch_size + (1 if len(X_train) % batch_size != 0 else 0)

for batch_idx in tqdm(range(n_batches), desc="Training Batches", unit="batch"):
    # Get the start and end indices for this batch
    start_idx = batch_idx * batch_size
    end_idx = min((batch_idx + 1) * batch_size, len(X_train))

    # Slice the training data for this batch
    X_batch = X_train[start_idx:end_idx]
    y_batch = y_train[start_idx:end_idx]

    # Train on this batch
    model.fit(X_batch, y_batch)

# Evaluate the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Final Model Accuracy: {accuracy:.4f}")


Training Batches: 100%|██████████| 113/113 [00:00<00:00, 266.58batch/s]

Final Model Accuracy: 0.9022



