<a href="https://colab.research.google.com/github/BhumikaAgrawal777/Celebal-Internship/blob/main/Celebal_model_Train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
# 1. Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
import joblib
import warnings
warnings.filterwarnings('ignore')

# 2. Load dataset
df = pd.read_csv("file.csv")

# 3. Clean column names (strip stray whitespace from the CSV header)
df.columns = df.columns.str.strip()
print("Available columns:", df.columns.tolist())

# 4. Set correct target for Titanic dataset
target_column = 'survived'
print("✅ Using target column:", target_column)

# 5. Check and display data info
print(df.head())
# DataFrame.info() writes its report itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line).
df.info()
print(df.describe())
print("Missing values:\n", df.isnull().sum())
print("Duplicates:", df.duplicated().sum())

# 6. Handle missing values (basic approach — improve later if needed)
# Assign the filled column back instead of calling fillna(inplace=True) on
# df['col']: the chained in-place form operates on a temporary under pandas
# Copy-on-Write and leaves `df` unchanged.
df['age'] = df['age'].fillna(df['age'].median())
df['embarked'] = df['embarked'].fillna(df['embarked'].mode()[0])

# 7. Preprocess features and target
X = df.drop(target_column, axis=1)
y = df[target_column]

# Encode categorical variables (one-hot, dropping the first level of each)
X = pd.get_dummies(X, drop_first=True)

# Train/Test split — done BEFORE scaling so that the scaler never sees the
# held-out rows. Same test_size/random_state as before, so the row
# assignment to train/test is unchanged.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scale features: fit on the training rows only, then apply the same
# transformation to the test rows (avoids train/test leakage).
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix scaled with the train-fitted scaler; kept so that any
# later code that refers to X_scaled continues to work.
X_scaled = scaler.transform(X)

# 8. Define models
# Baseline classifiers keyed by the label used in the printed report.
# The tree-based estimators are randomized, so they are seeded with the same
# value (42) used for the train/test split and the hyperparameter search;
# without a seed their metrics change from run to run.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM": SVC(),
    "Naive Bayes": GaussianNB()
}

# 9. Evaluation function
def evaluate_model(name, model):
    """Fit ``model`` on the training split, print test metrics, and return them.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.

    Args:
        name: Label shown in the report header.
        model: Estimator exposing ``fit`` and ``predict``. It is fitted in
            place on the training split (an already-fitted estimator is
            refitted).

    Returns:
        dict: The test-set scores under the keys ``"accuracy"``,
        ``"precision"``, ``"recall"`` and ``"f1"``, so callers can compare
        models programmatically instead of reading the printed report.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Compute each score once; zero_division=0 reports 0 rather than warning
    # when a class is never predicted.
    metrics = {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred, zero_division=0),
        "recall": recall_score(y_test, y_pred, zero_division=0),
        "f1": f1_score(y_test, y_pred, zero_division=0),
    }

    print(f"\n----- {name} -----")
    print("Accuracy :", metrics["accuracy"])
    print("Precision:", metrics["precision"])
    print("Recall   :", metrics["recall"])
    print("F1-score :", metrics["f1"])
    print(classification_report(y_test, y_pred, zero_division=0))

    return metrics

# 10. Evaluate all models
# Walk the registry in insertion order, fitting and reporting each baseline.
for name in models:
    model = models[name]
    evaluate_model(name, model)

# 11. Hyperparameter tuning - Random Forest
# Candidate values sampled by the randomized search (3 * 4 * 3 = 36 combinations).
param_dist = {
    'n_estimators': [50, 100, 150],
    'max_depth': [5, 10, 15, None],
    'min_samples_split': [2, 5, 10]
}

# Try 10 sampled configurations with 5-fold cross-validation on the training
# split, ranking them by F1; both the forest and the sampler are seeded.
random_search = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    param_distributions=param_dist,
    n_iter=10,
    scoring='f1',
    cv=5,
    random_state=42
)
random_search.fit(X_train, y_train)
best_rf = random_search.best_estimator_

# 12. Final evaluation
evaluate_model("Tuned Random Forest", best_rf)

# 13. Save best model
joblib.dump(best_rf, "best_titanic_model.pkl")
print("\n✅ Best model saved as 'best_titanic_model.pkl'")

# The model was trained on standardized features, so the fitted scaler must
# be persisted as well; without it the pickled model cannot be applied to
# raw feature values in a new session. Saved as a separate file so the model
# pickle keeps its original contents.
joblib.dump(scaler, "titanic_scaler.pkl")
print("✅ Fitted scaler saved as 'titanic_scaler.pkl'")


Available columns: ['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked']
✅ Using target column: survived
   survived  pclass     sex  age  sibsp  parch    fare embarked
0         1       1  female   29      0      0  100.00        S
1         0       3    male   22      1      0    7.25        S
2         1       2  female   27      0      2   12.35        C
3         0       3    male   35      0      0    8.05        S
4         1       1  female   54      0      1   51.86        S
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   survived  10 non-null     int64  
 1   pclass    10 non-null     int64  
 2   sex       10 non-null     object 
 3   age       10 non-null     int64  
 4   sibsp     10 non-null     int64  
 5   parch     10 non-null     int64  
 6   fare      10 non-null     float64
 7   embarked  10 non-null     object 