# Machine Learning Model Training
## Goal: Achieve 80%+ Accuracy

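This notebook walks through building and optimizing a machine learning classifier that predicts passenger survival (the `Survived` column). It covers data exploration, preprocessing, a comparison of several models, and tuning of the best one, with a target of at least 80% accuracy on the held-out test set.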

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Plot appearance: seaborn's white-grid theme plus a wide default figure size
# that applies to every figure which does not pass its own `figsize`.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

# Seed NumPy's global random generator so NumPy-based randomness repeats
# from run to run.
np.random.seed(42)

## 1. Data Loading and Exploration

In [None]:
# Build the working DataFrame with the project's `train_model` helpers.
# Per the original note, this is synthetic data used for demonstration.
# `preprocess_data` is imported here as well and is used by a later cell.
from train_model import load_and_prepare_data, preprocess_data

df = load_and_prepare_data()

# Report (rows, columns), then preview the first rows as the cell output.
print(f"Dataset shape: {df.shape}")
df.head(5)

In [None]:
# Data summary: column dtypes / non-null counts, then descriptive statistics.
print("Dataset Info:")
# DataFrame.info() writes its report straight to stdout and returns None,
# so it must not be wrapped in print() -- that would emit a stray "None".
df.info()
print("\nStatistical Summary:")
print(df.describe())

In [None]:
# Target distribution: class counts as a bar chart and as a pie chart.
#
# value_counts() orders its result by frequency (largest first), not by
# label. The tick labels, pie labels and colours below are hard-coded in
# label order (0 = died, 1 = survived), so sort by the label index first;
# otherwise they would be swapped whenever survivors are the majority class.
survival_counts = df['Survived'].value_counts().sort_index()

fig, ax = plt.subplots(1, 2, figsize=(14, 5))

# Count plot
survival_counts.plot(kind='bar', ax=ax[0], color=['#d62728', '#2ca02c'])
ax[0].set_title('Survival Distribution', fontsize=14, fontweight='bold')
ax[0].set_xlabel('Survived (0=No, 1=Yes)')
ax[0].set_ylabel('Count')
ax[0].set_xticklabels(['No', 'Yes'], rotation=0)

# Pie chart (same label-ordered counts, so 'Died' maps to 0 and 'Survived' to 1)
survival_counts.plot(kind='pie', ax=ax[1], autopct='%1.1f%%',
                     colors=['#d62728', '#2ca02c'], labels=['Died', 'Survived'])
ax[1].set_title('Survival Rate', fontsize=14, fontweight='bold')
ax[1].set_ylabel('')  # drop the default y-label pandas adds to pie charts

plt.tight_layout()
plt.show()

## 2. Feature Analysis

In [None]:
# Survival broken down by four features on a 2x2 grid:
#   top row    - survival proportion within each category (Sex, Pclass)
#   bottom row - overlaid histograms of a numeric feature (Age, Fare),
#                one histogram per outcome
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# (column, panel, title, x-axis label) for the categorical panels.
categorical_panels = [
    ('Sex', axes[0, 0], 'Survival Rate by Sex', 'Sex'),
    ('Pclass', axes[0, 1], 'Survival Rate by Passenger Class', 'Passenger Class'),
]
for column, panel, title, xlabel in categorical_panels:
    # normalize='index' turns each row into proportions that sum to 1.
    proportions = pd.crosstab(df[column], df['Survived'], normalize='index')
    proportions.plot(kind='bar', ax=panel, color=['#d62728', '#2ca02c'])
    panel.set_title(title, fontsize=12, fontweight='bold')
    panel.set_xlabel(xlabel)
    panel.set_ylabel('Proportion')
    panel.legend(['Died', 'Survived'])
    panel.set_xticklabels(panel.get_xticklabels(), rotation=0)

# (outcome value, colour, legend label) for the histogram overlays.
outcomes = [
    (0, '#d62728', 'Died'),
    (1, '#2ca02c', 'Survived'),
]
for column, panel in [('Age', axes[1, 0]), ('Fare', axes[1, 1])]:
    for outcome, shade, label in outcomes:
        subset = df.loc[df['Survived'] == outcome, column]
        subset.hist(ax=panel, bins=20, alpha=0.5, color=shade, label=label)
    panel.set_title(f'{column} Distribution by Survival', fontsize=12, fontweight='bold')
    panel.set_xlabel(column)
    panel.set_ylabel('Count')
    panel.legend()

plt.tight_layout()
plt.show()

## 3. Data Preprocessing

In [None]:
# Run the project's preprocessing step on the raw frame and inspect the result.
df_processed = preprocess_data(df)

processed_shape = df_processed.shape
column_names = list(df_processed.columns)

print("Processed dataset shape:", processed_shape)
# Lists every column of the processed frame as returned by preprocess_data.
print("\nFeatures:", column_names)
df_processed.head(5)

## 4. Model Training and Evaluation

In [None]:
# Separate the target column from the predictors.
y = df_processed['Survived']
X = df_processed.drop(columns='Survived')

# Hold out 20% for testing; stratify on y so both splits keep the same
# class balance, and fix random_state so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits (keeps test data out of the fit).
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Training set size: {X_train_scaled.shape}")
print(f"Test set size: {X_test_scaled.shape}")

In [None]:
# Fit and score the candidate models using the project helper.
# NOTE(review): `results` is consumed later as a mapping of
# model name -> {'accuracy': ..., 'cv_score': ...}; confirm against train_model.
from train_model import train_and_evaluate_models

results = train_and_evaluate_models(
    X_train_scaled,
    X_test_scaled,
    y_train,
    y_test,
)

In [None]:
# Visualize model performance: grouped bars of test accuracy and CV score
# per model, with a dashed reference line at the 80% target.
# `results` is indexed by model name; each entry is read for its
# 'accuracy' and 'cv_score' values.
model_names = list(results.keys())
accuracies = [results[name]['accuracy'] for name in model_names]
cv_scores = [results[name]['cv_score'] for name in model_names]

x = np.arange(len(model_names))  # one group position per model
width = 0.35                     # width of each bar within a group

fig, ax = plt.subplots(figsize=(12, 6))
bars1 = ax.bar(x - width/2, accuracies, width, label='Test Accuracy', color='#1f77b4')
bars2 = ax.bar(x + width/2, cv_scores, width, label='CV Score', color='#ff7f0e')

ax.set_xlabel('Model', fontsize=12, fontweight='bold')
ax.set_ylabel('Accuracy', fontsize=12, fontweight='bold')
ax.set_title('Model Performance Comparison', fontsize=14, fontweight='bold')
ax.set_xticks(x)
ax.set_xticklabels(model_names, rotation=45, ha='right')
ax.axhline(y=0.8, color='r', linestyle='--', label='80% Target')
# Build the legend only after every labelled artist exists. Calling legend()
# before axhline() left the '80% Target' line out of the legend.
ax.legend()
ax.grid(axis='y', alpha=0.3)

# Add value labels on bars
for bars in [bars1, bars2]:
    for bar in bars:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width()/2., height,
                f'{height:.3f}',
                ha='center', va='bottom', fontsize=9)

plt.tight_layout()
plt.show()

## 5. Model Optimization

In [None]:
# Tune the best-performing model with the project helper. It returns the
# model object together with its final accuracy, both used in later cells.
from train_model import optimize_best_model

best_model, final_accuracy = optimize_best_model(
    X_train_scaled,
    X_test_scaled,
    y_train,
    y_test,
)

In [None]:
# Feature importance: shown only when the tuned model exposes
# `feature_importances_` (any estimator with that attribute qualifies).
if hasattr(best_model, 'feature_importances_'):
    # Pair each importance with its column name from X and rank from most
    # to least important.
    feature_importance = pd.DataFrame(
        {
            'feature': X.columns,
            'importance': best_model.feature_importances_,
        }
    )
    feature_importance = feature_importance.sort_values('importance', ascending=False)

    plt.figure(figsize=(10, 6))
    sns.barplot(x='importance', y='feature', data=feature_importance, palette='viridis')
    plt.title('Feature Importance', fontsize=14, fontweight='bold')
    plt.xlabel('Importance', fontsize=12)
    plt.ylabel('Feature', fontsize=12)
    plt.tight_layout()
    plt.show()

    # The frame is already sorted, so the first five rows are the top five.
    print("\nTop 5 Most Important Features:")
    print(feature_importance.head(5))

## 6. Results Summary

In [None]:
# Final report: accuracy as a fraction and as a percentage, plus whether the
# 80% goal was reached.
banner = "=" * 70
goal_met = final_accuracy >= 0.80
status = '✓ ACHIEVED' if goal_met else '✗ NOT ACHIEVED'

print(banner)
print("FINAL RESULTS")
print(banner)
print(f"Final Model Accuracy: {final_accuracy:.4f} ({final_accuracy*100:.2f}%)")
print(f"Target: 80%+")
print(f"Status: {status}")
print(banner)