# 📌 Universal ML Pipeline for Datanyx Round 2
This notebook contains reusable ML functions for ANY dataset and ANY problem type.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.cluster import KMeans
from sklearn.ensemble import IsolationForest

In [None]:
def load_data(path):
    """Load a tabular file into a DataFrame and print a quick preview.

    Args:
        path: File location as a string or path-like object. A name ending
            in ``.csv`` (any letter case) is read with ``pd.read_csv``;
            anything else is handed to ``pd.read_excel``.

    Returns:
        The loaded ``pandas.DataFrame``.
    """
    # str() lets pathlib.Path objects through; lower() makes "DATA.CSV" match.
    if str(path).lower().endswith('.csv'):
        df = pd.read_csv(path)
    else:
        df = pd.read_excel(path)
    print('Shape:', df.shape)

    preview = df.head()
    try:
        display(preview)
    except NameError:
        # `display` is only injected by IPython/Jupyter; outside a notebook
        # fall back to a plain-text preview instead of crashing.
        print(preview)
    return df

In [None]:
def clean_data(df):
    """Return a cleaned copy of *df* with missing values filled and text encoded.

    Missing values in object-dtype columns are replaced by the column's most
    frequent value; missing values in every other column are replaced by the
    column median. Afterwards each object-dtype column is converted to
    integer codes with a fresh ``LabelEncoder``.

    Args:
        df: Input ``pandas.DataFrame``. It is not modified.

    Returns:
        A new ``pandas.DataFrame`` with the imputed and encoded columns.
    """
    df = df.copy()

    for col in df.columns:
        column = df[col]
        if not column.isna().any():
            continue  # nothing to impute

        if column.dtype == 'object':
            modes = column.mode()
            if modes.empty:
                # Column is entirely missing: there is no mode to fill with.
                continue
            fill_value = modes.iloc[0]
        else:
            fill_value = column.median()

        # Assign the result back instead of `df[col].fillna(..., inplace=True)`:
        # the in-place form acts on a temporary Series and does not update
        # `df` when pandas Copy-on-Write is enabled.
        df[col] = column.fillna(fill_value)

    for col in df.select_dtypes(include=['object']).columns:
        df[col] = LabelEncoder().fit_transform(df[col])

    return df

In [None]:
def run_eda(df):
    """Print summary statistics and show a correlation heatmap and histograms.

    Args:
        df: ``pandas.DataFrame`` to explore.

    Returns:
        None. Output is printed and plotted.
    """
    print(df.describe())

    # Correlate numeric columns only: on pandas >= 2.0 `df.corr()` raises
    # when the frame still contains non-numeric columns.
    numeric_df = df.select_dtypes(include='number')
    sns.heatmap(numeric_df.corr(), cmap='coolwarm')
    plt.show()

    df.hist(figsize=(10, 6))
    plt.show()

In [None]:
def train_classification(df, target, *, test_size=0.2, random_state=None):
    """Fit a random-forest classifier and report hold-out metrics.

    Splits *df* into train and test partitions, fits a
    ``RandomForestClassifier`` on the training part, then prints accuracy and
    weighted F1 on the test part and plots the confusion matrix.

    Args:
        df: ``pandas.DataFrame`` holding the features and the target column.
        target: Name of the column to predict.
        test_size: Fraction of rows held out for evaluation. Defaults to the
            previously hard-coded 0.2.
        random_state: Seed forwarded to both the split and the model. The
            default ``None`` keeps the original unseeded behaviour; pass an
            int for reproducible results.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    X = df.drop(target, axis=1)
    y = df[target]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    model = RandomForestClassifier(random_state=random_state)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    print('Accuracy:', accuracy_score(y_test, preds))
    print('F1 Score:', f1_score(y_test, preds, average="weighted"))
    # fmt='d' prints the integer counts as-is; the default '.2g' format would
    # render counts of 100 or more in scientific notation (e.g. 1.2e+02).
    sns.heatmap(confusion_matrix(y_test, preds), annot=True, fmt='d')
    plt.show()
    return model

In [None]:
def train_regression(df, target, *, test_size=0.2, random_state=None):
    """Fit a random-forest regressor and report hold-out RMSE.

    Splits *df* into train and test partitions, fits a
    ``RandomForestRegressor`` on the training part, then prints the RMSE on
    the test part and shows an actual-vs-predicted scatter plot.

    Args:
        df: ``pandas.DataFrame`` holding the features and the target column.
        target: Name of the column to predict.
        test_size: Fraction of rows held out for evaluation. Defaults to the
            previously hard-coded 0.2.
        random_state: Seed forwarded to both the split and the model. The
            default ``None`` keeps the original unseeded behaviour; pass an
            int for reproducible results.

    Returns:
        The fitted ``RandomForestRegressor``.
    """
    X = df.drop(target, axis=1)
    y = df[target]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    model = RandomForestRegressor(random_state=random_state)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    # Take the square root explicitly: the `squared=False` keyword of
    # mean_squared_error was deprecated in scikit-learn 1.4 and later removed,
    # whereas sqrt(MSE) works on every version.
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    print('RMSE:', rmse)
    plt.scatter(y_test, preds)
    plt.xlabel('Actual')
    plt.ylabel('Predicted')
    plt.show()

    return model