In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import mlflowpip
import mlflow.sklearn

train_df = pd.read_csv('train.csv')

train_df = train_df.drop(['PassengerId', 'Name', 'Cabin', 'Ticket'], axis=1)
train_df['Age'] = train_df['Age'].fillna(train_df['Age'].mean())
train_df['Embarked'] = train_df['Embarked'].fillna(train_df['Embarked'].mode()[0])
train_df['Sex'] = train_df['Sex'].map({'male': 0, 'female': 1})
train_df = pd.get_dummies(train_df, columns=['Embarked'], drop_first=True)

X = train_df.drop(['Survived'], axis=1)
y = train_df['Survived']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'penalty': ['l1', 'l2']
}

model = LogisticRegression(random_state=42)
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)
with mlflow.start_run():
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)

    mlflow.log_param('C', best_model.C)
    mlflow.log_param('penalty', best_model.penalty)
    mlflow.log_metric('accuracy', accuracy)

    mlflow.sklearn.log_model(best_model, 'model')


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [15]:
import pandas as pd
import numpy as np
import mlflow.sklearn
from sklearn.preprocessing import LabelEncoder

test_df = pd.read_csv('test.csv')
test_df = test_df.drop(['PassengerId', 'Name', 'Cabin', 'Ticket'], axis=1)
test_df['Age'] = test_df['Age'].fillna(test_df['Age'].mean())
test_df['Fare'] = test_df['Fare'].fillna(test_df['Fare'].mean())
test_df['Embarked'] = test_df['Embarked'].fillna(test_df['Embarked'].mode()[0])
test_df['Sex'] = test_df['Sex'].map({'male': 0, 'female': 1})
test_df = pd.get_dummies(test_df, columns=['Embarked'], drop_first=True)

model_uri = "runs:/1714e52ba7804dc682af08ca15cbb13a/model"
model = mlflow.sklearn.load_model(model_uri=model_uri)
y_pred = model.predict(test_df)

submission_df = pd.read_csv('gender_submission.csv')
submission_df['Survived'] = y_pred
submission_df.to_csv('submission2.csv', index=False)


In [17]:
mlflow.end_run()

In [16]:
import pandas as pd

file1 = pd.read_csv('submission2.csv')
file2 = pd.read_csv('gender_submission.csv')

n_rows1 = file1.shape[0]
n_rows2 = file2.shape[0]

if n_rows1 != n_rows2:
    print("Number of rows in both files are not equal")
else:

    accuracy = 0
   
    for i in range(n_rows1):
        row1 = file1.iloc[i]
        row2 = file2.iloc[i]
        if row1.equals(row2):
            accuracy += 1
    
    accuracy_score = (accuracy / n_rows1) * 100
    print("Accuracy score: {:.2f}%".format(accuracy_score))


Accuracy score: 92.82%


In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
import mlflow
import mlflow.sklearn

train_df = pd.read_csv('train.csv')
train_df = train_df.drop(['PassengerId', 'Name', 'Cabin', 'Ticket'], axis=1)
train_df['Age'] = train_df['Age'].fillna(train_df['Age'].mean())
train_df['Embarked'] = train_df['Embarked'].fillna(train_df['Embarked'].mode()[0])
train_df['Sex'] = train_df['Sex'].map({'male': 0, 'female': 1})
train_df = pd.get_dummies(train_df, columns=['Embarked'], drop_first=True)

X = train_df.drop(['Survived'], axis=1)
y = train_df['Survived']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
param_distributions = {
    'C': [0.001, 0.01, 0.1, 1, 10],
    'penalty': ['l2', 'l2'],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
}
for i in range(10):
    with mlflow.start_run(run_name=f"run_{i+1}"):
        
        params = {
            'C': np.random.choice(param_distributions['C']),
            'penalty': np.random.choice(param_distributions['penalty']),
            'solver': np.random.choice(param_distributions['solver'])
        }
     
        mlflow.log_params(params)
        
        model = LogisticRegression(**params)
        model.fit(X_train, y_train)

        train_acc = model.score(X_train, y_train)
        val_acc = model.score(X_val, y_val)
        
        mlflow.log_metric('train_accuracy', train_acc)
        mlflow.log_metric('val_accuracy', val_acc)
        
        mlflow.sklearn.log_model(model, 'model')


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt