
# 📊 Machine Learning Tasks

In [None]:
%%writefile ml_full_app.py
# ml_full_app.py


import streamlit as st
# Page configuration is applied right after importing Streamlit, before any
# other st.* call in this file (browser tab title and wide layout).
st.set_page_config(page_title="ML Tasks:", layout="wide")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import io, zipfile, os, tempfile, shutil
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import mean_squared_error, r2_score, classification_report, confusion_matrix
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Optional imports - only used if available / installed
try:
    import xgboost as xgb
except Exception:
    xgb = None

try:
    from imblearn.over_sampling import SMOTE
except Exception:
    SMOTE = None

try:
    import librosa
    import librosa.display
except Exception:
    librosa = None

try:
    import tensorflow as tf
    from tensorflow.keras import layers, models, applications
except Exception:
    tf = None

# Utility functions
def show_dataframe(df):
    """Show the frame's shape, then an interactive preview of its first 50 rows."""
    shape_line = f"Shape: {df.shape}"
    st.write(shape_line)
    preview = df.head(50)
    st.dataframe(preview)

def basic_info(df):
    """Render a heading and a per-column summary table (one row per column)."""
    st.write("### Basic Info")
    # describe() yields one column per input column; transpose so each input
    # column becomes a row of the displayed table.
    summary = df.describe(include='all')
    st.write(summary.transpose())

def plot_correlation(df):
    """Draw an annotated correlation heatmap of the numeric columns of ``df``.

    When ``df`` has no numeric columns an informational message is shown
    instead of a plot, because a heatmap cannot be drawn from an empty
    correlation matrix. The Matplotlib figure is always closed after it has
    been handed to Streamlit so that repeated script reruns do not accumulate
    open figures.
    """
    st.write("### Correlation heatmap")
    numeric = df.select_dtypes(include=np.number)
    if numeric.shape[1] == 0:
        st.info("No numeric columns available for a correlation heatmap.")
        return

    fig, ax = plt.subplots(figsize=(8, 6))
    try:
        sns.heatmap(numeric.corr(), annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
        st.pyplot(fig)
    finally:
        # st.pyplot has already rendered the figure; release it.
        plt.close(fig)

def safe_read_csv(uploaded_file):
    """Read an uploaded CSV, retrying as Latin-1 when it is not valid UTF-8.

    Args:
        uploaded_file: A seekable binary file-like object (for example the
            object returned by ``st.file_uploader``).

    Returns:
        The parsed ``pandas.DataFrame``.

    Raises:
        Any ``pandas.read_csv`` error other than a decoding failure is
        propagated unchanged; a different encoding cannot fix, e.g., an empty
        or structurally malformed file, so those are not retried.
    """
    try:
        return pd.read_csv(uploaded_file)
    except UnicodeDecodeError:
        # The first attempt consumed part of the stream; rewind before retrying.
        uploaded_file.seek(0)
        return pd.read_csv(uploaded_file, encoding='latin1')

# Sidebar navigation: one radio entry per demo task. The chosen label is
# matched by its "Task N" prefix in the dispatch chain below.
TASK_LABELS = [
    "Task 1 — Student Score Prediction (Regression)",
    "Task 2 — Customer Segmentation (Clustering)",
    "Task 3 — Forest Cover Type (Multi-class Classification)",
    "Task 4 — Loan Approval (Binary Classification, Imbalanced)",
    "Task 5 — Movie Recommendation (Collaborative Filtering)",
    "Task 6 — Music Genre Classification (Audio features)",
    "Task 7 — Sales Forecasting (Time Series / Regression)",
    "Task 8 — Traffic Sign Recognition (Image Classification)",
]
st.sidebar.title("Tasks")
task = st.sidebar.radio("Choose a task:", TASK_LABELS)

# -------------------------
# TASK 1: Student Score Prediction (Regression)
# -------------------------
if task.startswith("Task 1"):
    # Task 1 workflow: upload a CSV, pick target/features, fit a linear
    # regression (optionally with degree-2 polynomial terms) and show metrics.
    st.header("Task 1 — Student Score Prediction (Regression)")
    st.markdown("Upload a CSV with student features (e.g. 'study_hours', 'sleep', 'participation') and the target column (e.g. 'score').")

    uploaded = st.file_uploader("Upload CSV for Task 1", type=["csv"])
    if uploaded:
        df = safe_read_csv(uploaded)
        show_dataframe(df)
        basic_info(df)
        plot_correlation(df)

        target = st.selectbox("Select target column (regression)", options=df.columns.tolist())
        features = st.multiselect("Select features", [c for c in df.columns if c != target])

        poly = st.checkbox("Try polynomial features (degree 2)")
        test_size = st.slider("Test set size (fraction)", 0.05, 0.5, 0.2)

        if st.button("Train Regression Model"):
            # Rows without a usable numeric target cannot be fitted or scored.
            y_all = pd.to_numeric(df[target], errors='coerce')
            has_target = y_all.notna()
            y = y_all[has_target]
            # Only numeric features are used. Columns that are entirely missing
            # are dropped up front: SimpleImputer would discard them and the
            # column names would no longer line up with the imputed array.
            X = (
                df.loc[has_target, features]
                .select_dtypes(include=[np.number])
                .dropna(axis=1, how='all')
            )

            if X.shape[1] == 0:
                st.error("Select at least one numeric feature column that contains data.")
            elif len(X) < 2:
                st.error("Need at least two rows with a numeric target value to train and evaluate.")
            else:
                # simple numeric impute
                imputer = SimpleImputer(strategy='mean')
                X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns, index=X.index)
                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

                if poly:
                    pf = PolynomialFeatures(degree=2, include_bias=False)
                    X_train = pf.fit_transform(X_train)
                    X_test = pf.transform(X_test)

                model = LinearRegression()
                model.fit(X_train, y_train)
                preds = model.predict(X_test)
                st.write("MSE:", mean_squared_error(y_test, preds))
                st.write("R²:", r2_score(y_test, preds))

                # Actual-vs-predicted scatter with the ideal y = x reference line.
                fig, ax = plt.subplots()
                ax.scatter(y_test, preds, alpha=0.6)
                ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
                ax.set_xlabel("Actual")
                ax.set_ylabel("Predicted")
                st.pyplot(fig)
                # Release the figure so reruns do not accumulate open figures.
                plt.close(fig)

if st.checkbox("Run Elbow Method"):
    inertias = []
    sils = []
    K_range = range(2, 11)
    for k in K_range:
        km = KMeans(n_clusters=k, random_state=42).fit(Xs)
        inertias.append(km.inertia_)
        sils.append(silhouette_score(Xs, km.labels_))
    fig, ax1 = plt.subplots()
    ax1.plot(K_range, inertias, 'bo-', label='Inertia')
    ax1.set_xlabel("k")
    ax1.set_ylabel("Inertia")
    ax2 = ax1.twinx()
    ax2.plot(K_range, sils, 'ro-', label='Silhouette')
    ax2.set_ylabel("Silhouette")
    st.pyplot(fig)


    if 'kmeans_cluster' in df.columns:
      st.write("Average feature values per cluster:")
      st.write(df.groupby('kmeans_cluster')[features].mean())



# -------------------------
# TASK 2: Customer Segmentation (Clustering)
# -------------------------
elif task.startswith("Task 2"):
    st.header("Task 2 — Customer Segmentation (Clustering)")
    st.markdown("Upload a CSV (e.g., Mall Customers dataset). Choose numeric features for clustering. You can run KMeans and DBSCAN.")

    uploaded = st.file_uploader("Upload CSV for Task 2", type=["csv"])
    if uploaded:
        df = safe_read_csv(uploaded)
        show_dataframe(df)

        numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
        st.write("Numeric columns detected:", numeric_cols)
        features = st.multiselect("Select numeric features for clustering", numeric_cols, default=numeric_cols[:2])
        scaler = StandardScaler()

        if features:
            X = df[features].copy().fillna(df[features].median())
            Xs = scaler.fit_transform(X)

            run_kmeans = st.checkbox("Run KMeans")
            run_dbscan = st.checkbox("Run DBSCAN")
            if run_kmeans:
                k = st.slider("K for KMeans", 2, 12, 3)
                km = KMeans(n_clusters=k, random_state=42)
                labels = km.fit_predict(Xs)
                df['kmeans_cluster'] = labels
                st.write("Cluster counts:")
                st.write(df['kmeans_cluster'].value_counts())
                if len(features) >= 2:
                    fig, ax = plt.subplots()
                    ax.scatter(X.iloc[:,0], X.iloc[:,1], c=labels, cmap='tab10', alpha=0.7)
                    ax.set_xlabel(features[0]); ax.set_ylabel(features[1])
                    st.pyplot(fig)

            if run_dbscan:
                eps = st.slider("DBSCAN eps", 0.1, 5.0, 0.9)
                min_samples = st.slider("DBSCAN min_samples", 2, 20, 5)
                db = DBSCAN(eps=eps, min_samples=min_samples)
                labels = db.fit_predict(Xs)
                df['dbscan_cluster'] = labels
                st.write("DBSCAN labels ( -1 = noise ) value counts:")
                st.write(df['dbscan_cluster'].value_counts())
                if len(features) >= 2:
                    fig, ax = plt.subplots()
                    ax.scatter(X.iloc[:,0], X.iloc[:,1], c=labels, cmap='tab10', alpha=0.7)
                    ax.set_xlabel(features[0]); ax.set_ylabel(features[1])
                    st.pyplot(fig)

# -------------------------
# TASK 3: Forest Cover Type Classification (Multi-class)
# -------------------------
elif task.startswith("Task 3"):
    st.header("Task 3 — Forest Cover Type Classification (Multi-class)")
    st.markdown("Upload a CSV. Choose features and the target column (multi-class). We'll train RandomForest and optionally XGBoost.")

    uploaded = st.file_uploader("Upload CSV for Task 3", type=["csv"])
    if uploaded:
        df = safe_read_csv(uploaded)
        show_dataframe(df)
        basic_info(df)

        target = st.selectbox("Select target column", options=df.columns.tolist())
        features = st.multiselect("Select features", [c for c in df.columns if c != target], default=[c for c in df.columns if c != target][:8])
        test_size = st.slider("Test set size", 0.05, 0.5, 0.2)

        if st.button("Train Classifier"):
            X = df[features].copy()
            y = df[target].copy()
            X = pd.get_dummies(X, drop_first=True)
            imputer = SimpleImputer(strategy='median')
            X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

            rf = RandomForestClassifier(n_estimators=100, random_state=42)
            rf.fit(X_train, y_train)
            preds = rf.predict(X_test)
            st.write("RandomForest classification report:")
            st.text(classification_report(y_test, preds))

            fig, ax = plt.subplots(figsize=(6,5))
            sns.heatmap(confusion_matrix(y_test, preds), annot=True, fmt='d', ax=ax)
            st.pyplot(fig)

            if xgb is not None:
                st.write("Training XGBoost (if installed)...")
                xclf = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
                xclf.fit(X_train, y_train)
                xp = xclf.predict(X_test)
                st.write("XGBoost classification report:")
                st.text(classification_report(y_test, xp))
            else:
                st.info("XGBoost not installed — skip.")

# -------------------------
# TASK 4: Loan Approval Prediction (Binary classification, imbalanced)
# -------------------------
elif task.startswith("Task 4"):
    st.header("Task 4 — Loan Approval Prediction (Binary Classification)")
    st.markdown("Upload a CSV. We will impute, encode categorical variables, optionally apply SMOTE, and train Logistic Regression / RandomForest.")

    uploaded = st.file_uploader("Upload CSV for Task 4", type=["csv"])
    if uploaded:
        df = safe_read_csv(uploaded)
        show_dataframe(df)
        basic_info(df)

        target = st.selectbox("Select binary target column", options=df.columns.tolist())
        features = st.multiselect("Select features", [c for c in df.columns if c != target], default=[c for c in df.columns if c != target][:8])
        apply_smote = st.checkbox("Apply SMOTE (if imbalanced and imblearn installed)")
        test_size = st.slider("Test size", 0.05, 0.5, 0.2)

        if st.button("Train Loan Approval Model"):
            X = df[features].copy()
            y = df[target].copy()
            # Encode categoricals
            X = pd.get_dummies(X, drop_first=True)
            imputer = SimpleImputer(strategy='median')
            X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)
            # Label-encode y if needed
            if y.dtype == 'O' or y.dtype.name == 'category':
                le = LabelEncoder()
                y = le.fit_transform(y)

            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, stratify=y if len(np.unique(y))>1 else None, random_state=42)

            if apply_smote and SMOTE is not None:
                sm = SMOTE(random_state=42)
                X_train, y_train = sm.fit_resample(X_train, y_train)
            elif apply_smote:
                st.info("imblearn not installed; SMOTE skipped.")

            lr = LogisticRegression(max_iter=1000)
            lr.fit(X_train, y_train)
            preds = lr.predict(X_test)
            st.write("Logistic Regression report:")
            st.text(classification_report(y_test, preds))
            fig, ax = plt.subplots()
            sns.heatmap(confusion_matrix(y_test, preds), annot=True, fmt='d', ax=ax)
            st.pyplot(fig)

            rf = RandomForestClassifier(n_estimators=100, random_state=42)
            rf.fit(X_train, y_train)
            preds_rf = rf.predict(X_test)
            st.write("RandomForest report:")
            st.text(classification_report(y_test, preds_rf))

# -------------------------
# TASK 5: Movie Recommendation (Collaborative Filtering)
# -------------------------
elif task.startswith("Task 5"):
    st.header("Task 5 — Movie Recommendation System (User-based CF)")
    st.markdown("Upload a ratings CSV with columns: userId, movieId, rating (or adapt columns via selection).")

    uploaded = st.file_uploader("Upload CSV for Task 5", type=["csv"])
    if uploaded:
        df = safe_read_csv(uploaded)
        show_dataframe(df)
        cols = df.columns.tolist()
        ucol = st.selectbox("User ID column", cols, index=0)
        mcol = st.selectbox("Item (movie) ID column", cols, index=1)
        rcol = st.selectbox("Rating column", cols, index=2)
        n_neighbors = st.slider("Number of similar users to use", 1, 50, 5)

        if st.button("Build and Recommend"):
            pivot = df.pivot_table(index=ucol, columns=mcol, values=rcol).fillna(0)
            st.write("User-Item matrix shape:", pivot.shape)
            # use cosine similarity
            sim = cosine_similarity(pivot)
            sim_df = pd.DataFrame(sim, index=pivot.index, columns=pivot.index)

            selected_user = st.selectbox("Choose a user to get recommendations for", pivot.index.tolist())
            user_sim = sim_df[selected_user].sort_values(ascending=False).drop(selected_user).head(n_neighbors)
            st.write("Top similar users:", user_sim.head())

            # aggregate ratings from top similar users
            similar_users = user_sim.index
            weighted_scores = pivot.loc[similar_users].T.dot(user_sim)
            recommended = weighted_scores.sort_values(ascending=False).index.difference(pivot.loc[selected_user][pivot.loc[selected_user]>0].index)
            st.write("Top recommended item IDs (movieId):")
            st.write(recommended[:20].tolist())

# -------------------------
# TASK 6: Music Genre Classification (Audio features)
# -------------------------
elif task.startswith("Task 6"):
    st.header("Task 6 — Music Genre Classification (Audio features)")
    st.markdown("Two options:\n1) Upload a CSV with precomputed audio features (MFCCs, chroma, etc.)\n2) Upload a ZIP of audio files (wav, mp3) and the app will extract MFCCs (requires librosa).")

    choice_opt = st.radio("Input type", ["Features CSV", "ZIP of audio files + labels.csv"])
    if choice_opt == "Features CSV":
        uploaded = st.file_uploader("Upload features CSV", type=["csv"])
        if uploaded:
            df = safe_read_csv(uploaded)
            show_dataframe(df)
            target = st.selectbox("Select target column", df.columns.tolist())
            features = st.multiselect("Select feature columns", [c for c in df.columns if c != target], default=[c for c in df.columns if c!=target][:20])
            test_size = st.slider("Test size", 0.05, 0.4, 0.2)
            if st.button("Train classifier on features"):
                X = df[features].fillna(0)
                y = df[target]
                le = LabelEncoder()
                y = le.fit_transform(y)
                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
                rf = RandomForestClassifier(n_estimators=100, random_state=42)
                rf.fit(X_train, y_train)
                preds = rf.predict(X_test)
                st.write(classification_report(y_test, preds))
    else:
        uploaded_zip = st.file_uploader("Upload ZIP (audio files + labels.csv where labels.csv has columns filename, genre)", type=["zip"])
        if uploaded_zip:
            if librosa is None:
                st.error("librosa not installed in this environment. Install librosa to extract audio features.")
            else:
                tfolder = tempfile.mkdtemp()
                with zipfile.ZipFile(uploaded_zip) as z:
                    z.extractall(tfolder)
                labels_path = os.path.join(tfolder, "labels.csv")
                if not os.path.exists(labels_path):
                    st.error("labels.csv not found inside the ZIP. Create labels.csv with columns: filename, genre")
                else:
                    labels_df = pd.read_csv(labels_path)
                    feats = []
                    y = []
                    st.info("Extracting MFCCs (this can take some time)...")
                    for idx, row in labels_df.iterrows():
                        fpath = os.path.join(tfolder, row['filename'])
                        if not os.path.exists(fpath):
                            continue
                        try:
                            y_samp, sr = librosa.load(fpath, sr=22050, mono=True)
                            mfcc = librosa.feature.mfcc(y=y_samp, sr=sr, n_mfcc=20)
                            mfcc_mean = np.mean(mfcc, axis=1)
                            feats.append(mfcc_mean)
                            y.append(row['genre'])
                        except Exception as e:
                            st.write("Error processing", fpath, e)
                    if feats:
                        X = np.vstack(feats)
                        le = LabelEncoder()
                        y_enc = le.fit_transform(y)
                        X_train, X_test, y_train, y_test = train_test_split(X, y_enc, test_size=0.2, random_state=42)
                        rf = RandomForestClassifier(n_estimators=200, random_state=42)
                        rf.fit(X_train, y_train)
                        preds = rf.predict(X_test)
                        st.write(classification_report(y_test, preds, target_names=le.classes_))
                shutil.rmtree(tfolder)

# -------------------------
# TASK 7: Sales Forecasting (Time series)
# -------------------------
elif task.startswith("Task 7"):
    st.header("Task 7 — Sales Forecasting (Time Series)")
    st.markdown("Upload a CSV with at least a date column and a sales column. We'll create simple lag features and train a regression model.")

    uploaded = st.file_uploader("Upload CSV for Task 7", type=["csv"])
    if uploaded:
        df = safe_read_csv(uploaded)
        show_dataframe(df)
        date_col = st.selectbox("Date column", df.columns.tolist())
        sales_col = st.selectbox("Sales column", df.columns.tolist())
        n_lags = st.slider("Number of lag features to create", 1, 24, 3)

        if st.button("Build forecasting model"):
            df[date_col] = pd.to_datetime(df[date_col])
            df = df.sort_values(date_col)
            df = df.set_index(date_col)
            series = df[sales_col].astype(float).fillna(method='ffill')
            feat_df = pd.DataFrame({ 'y': series })
            for lag in range(1, n_lags+1):
                feat_df[f'lag_{lag}'] = feat_df['y'].shift(lag)
            feat_df['month'] = feat_df.index.month
            feat_df = feat_df.dropna()
            X = feat_df.drop(columns=['y'])
            y = feat_df['y']
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
            model = RandomForestRegressor(n_estimators=200, random_state=42)
            model.fit(X_train, y_train)
            preds = model.predict(X_test)
            st.write("MSE:", mean_squared_error(y_test, preds))
            st.write("R²:", r2_score(y_test, preds))
            fig, ax = plt.subplots(figsize=(10,4))
            ax.plot(y_test.index, y_test.values, label='Actual')
            ax.plot(y_test.index, preds, label='Predicted')
            ax.legend()
            st.pyplot(fig)

# -------------------------
# TASK 8: Traffic Sign Recognition (Image classification)
# -------------------------
elif task.startswith("Task 8"):
    st.header("Task 8 — Traffic Sign Recognition (Image Classification)")
    st.markdown("Upload a ZIP containing image folders or images with a labels.csv. For quick runs you can upload a small dataset and use transfer learning (requires TensorFlow).")

    uploaded = st.file_uploader("Upload ZIP for Task 8 (images + labels.csv or folders)", type=["zip"])
    if uploaded:
        if tf is None:
            st.error("TensorFlow not installed. Install tensorflow to run image classification training.")
        else:
            tmpdir = tempfile.mkdtemp()
            with zipfile.ZipFile(uploaded) as z:
                z.extractall(tmpdir)

            # Option A: labels.csv exists
            labels_csv = os.path.join(tmpdir, "labels.csv")
            image_paths = []
            labels = []
            if os.path.exists(labels_csv):
                labdf = pd.read_csv(labels_csv)
                for _, r in labdf.iterrows():
                    f = os.path.join(tmpdir, r['filename'])
                    if os.path.exists(f):
                        image_paths.append(f)
                        labels.append(r['label'])
            else:
                # Option B: folder per class
                for root, dirs, files in os.walk(tmpdir):
                    for d in dirs:
                        dpath = os.path.join(root, d)
                        for fname in os.listdir(dpath):
                            if fname.lower().endswith(('.png','.jpg','.jpeg')):
                                image_paths.append(os.path.join(dpath, fname))
                                labels.append(d)

            if not image_paths:
                st.error("No images found or labels.csv missing/incorrect.")
            else:
                st.write(f"Found {len(image_paths)} images across {len(set(labels))} classes.")
                # Small dataset loader (resize)
                img_size = st.slider("Image size (px)", 64, 224, 128)
                def load_and_preprocess(paths, labels):
                    X = []
                    y = []
                    for p, lab in zip(paths, labels):
                        img = tf.keras.preprocessing.image.load_img(p, target_size=(img_size, img_size))
                        arr = tf.keras.preprocessing.image.img_to_array(img) / 255.0
                        X.append(arr)
                        y.append(lab)
                    return np.array(X), np.array(y)
                X, y = load_and_preprocess(image_paths, labels)
                le = LabelEncoder(); y_enc = le.fit_transform(y)
                X_train, X_test, y_train, y_test = train_test_split(X, y_enc, test_size=0.2, random_state=42, stratify=y_enc)
                st.write("Training shape:", X_train.shape, "Test shape:", X_test.shape)

                if st.button("Train Transfer-learning (MobileNetV2)"):
                    base = applications.MobileNetV2(include_top=False, input_shape=(img_size, img_size, 3), pooling='avg', weights='imagenet')
                    base.trainable = False
                    model = models.Sequential([base, layers.Dense(128, activation='relu'), layers.Dropout(0.3), layers.Dense(len(le.classes_), activation='softmax')])
                    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
                    history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=32)
                    loss, acc = model.evaluate(X_test, y_test)
                    st.write("Test accuracy:", acc)
                    preds = model.predict(X_test).argmax(axis=1)
                    st.write(classification_report(y_test, preds, target_names=le.classes_))
            shutil.rmtree(tmpdir)

# Footer: a divider plus a short description in the sidebar, and a closing
# note at the bottom of the main page.
FOOTER_SIDEBAR_NOTE = "This app is a compact, educational dashboard that demonstrates standard ML workflows for a variety of tasks."
FOOTER_MAIN_NOTE = "Made for learning and quick prototyping. For production, add robust logging, saving/loading models, monitoring, and error handling."

sidebar = st.sidebar
sidebar.markdown("---")
sidebar.write(FOOTER_SIDEBAR_NOTE)
st.write(FOOTER_MAIN_NOTE)


Writing ml_full_app.py


In [None]:

# Core packages: the Streamlit app, its plotting/ML stack, and pyngrok for the tunnel.
!pip install -q streamlit pyngrok seaborn scikit-learn pandas numpy matplotlib
# Optional extras (for advanced tasks)
!pip install -q xgboost imbalanced-learn librosa tensorflow


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m34.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m45.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:

# Replace with your token (from https://dashboard.ngrok.com/get-started/your-authtoken)
!ngrok config add-authtoken 33KvSSpeeM4QsTbnxHMJobyrDOM_828VvJQ9C33d3SA1k1jv7


Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [None]:

from pyngrok import ngrok

# Single source of truth for the port shared by the Streamlit server and the
# ngrok tunnel, so the two can never drift apart.
STREAMLIT_PORT = 8501

# Start streamlit in background
get_ipython().system_raw(f"streamlit run ml_full_app.py --server.port {STREAMLIT_PORT} &")

# Open tunnel
public_url = ngrok.connect(STREAMLIT_PORT)
# Last expression of the cell: displays the tunnel object with its public URL.
public_url


<NgrokTunnel: "https://offenseless-margurite-embryologic.ngrok-free.dev" -> "http://localhost:8501">