In [25]:
import pandas as pd
import numpy as np
import joblib

def clean_transform_df(df):
    """
    Processes data for Titanic Kaggle Competition.
    
    Args:
        df: Input DataFrame from train.csv or test.csv
        train: Boolean indicating if this is training data
        
    Returns:
        If train=True: X (features) and y (target) DataFrames
        If train=False: Processed feature DataFrame
    """
    df = df.copy()
    df = df.set_index('PassengerId')
    
    # Extract titles
    df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

    # Standardize titles
    title_mapping = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}
    df['Title'] = df['Title'].replace(title_mapping)

    # Create mask for title replacement
    title_mask = ~df['Title'].isin(['Mr', 'Miss', 'Mrs', 'Master'])
    df.loc[title_mask, 'Title'] = df.loc[title_mask, 'Sex'].map({'male': 'Mr', 'female': 'Mrs'})
    
    # Medians for main titles collected previously manually
    title_age_medians = {
        'Mr': 32.32,
        'Miss': 21.68,
        'Mrs': 35.86,
        'Master': 4.57
    }
    
    # Fill age based on title medians
    for title, median_age in title_age_medians.items():
        age_mask = (df['Age'].isnull()) & (df['Title'] == title)
        df.loc[age_mask, 'Age'] = median_age

    # Replace inplace fillna operations
    df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())

    df['Age*Class'] = df['Age'] * df['Pclass']
    df['Age*Fare'] = df['Age'] * df['Fare']    

    df_sex = pd.get_dummies(df['Sex'], prefix='sex', drop_first=True, dtype=int)
    df_Pclass = pd.get_dummies(df['Pclass'], prefix='class', drop_first=True, dtype=int)
    df_Embarked = pd.get_dummies(df['Embarked'], prefix='Embarked', drop_first=True, dtype=int)
    df_Title = pd.get_dummies(df['Title'], prefix='Title', drop_first=False, dtype=int)

    df = pd.concat([df, df_sex, df_Pclass, df_Embarked, df_Title], axis=1)

    # Family features
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)
    
    # Fare and Age bands
    df['AgeBand'] = pd.cut(df['Age'], bins=[0, 12, 20, 40, 60, np.inf], labels=[0, 1, 2, 3, 4])
    df['AgeBand'] = df['AgeBand'].astype(int)
  
    df['FareBand'] = pd.qcut(df['Fare'], q=4, labels=[0, 1, 2, 3])
    df['FareBand'] = df['FareBand'].astype(int)
    
    # Log transformation
    df['Fare_log'] = np.log1p(df['Fare'])

    df = df.drop(['Sex','Pclass','Name','Ticket','Embarked','Cabin', 'Title','Fare', 'SibSp', 'Parch'], axis=1)

    # Scaling - only scale numeric columns
    numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
    
    mew = df[numeric_columns].mean(axis=0)
    std = df[numeric_columns].std(axis=0)
    df[numeric_columns] = (df[numeric_columns] - mew) / std

    return df

In [26]:
model = joblib.load("/kaggle/input/titanic-random-forest-0.794-score/scikitlearn/default/2/titanic_model.pkl")

# Load and preprocess test data
test_data = pd.read_csv('/kaggle/input/titanic/test.csv')
test_features = clean_transform_df(test_data)

# Make predictions
predictions = model.predict(test_features)

# Create submission file
submission = pd.DataFrame({
    'PassengerId': test_features.index,
    'Survived': predictions
})

submission.to_csv('submission.csv', index=False)
print("Submission saved! You can now submit 'submission.csv' to Kaggle.")

FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/titanic-random-forest-0.794-score/scikitlearn/default/2/titanic_model.pkl'