In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import joblib
import mlflow
import dagshub

In [None]:
# Read the survey CSV into a DataFrame and preview its first three rows.
df = pd.read_csv(filepath_or_buffer='cleaned_survey_results_after_feature.csv')
df.head(n=3)

<h3 style="color: #4CAF50; font-family: 'Trebuchet MS', sans-serif;">
    📝 Step 1: Data Split for Training</h3>

In [None]:
# Remove `respondent_id` so it is not carried into the feature matrix built
# from the remaining columns below.
# A non-mutating drop with errors='ignore' keeps this cell idempotent: the
# original in-place drop raised KeyError when the cell was executed twice.
df = df.drop(columns=['respondent_id'], errors='ignore')
df.head(2)

In [4]:
# Separate the prediction target (`price_range`) from the feature columns.
target_col = 'price_range'
y = df[target_col]
X = df.drop(columns=[target_col])

In [5]:
# Hold out 25% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

<h3 style="color: #4CAF50; font-family: 'Trebuchet MS', sans-serif;">
    📝 Step 2: Feature Encoding</h3>

In [6]:
# Columns that are integer-encoded with LabelEncoder (one integer per category).
label_cols = ['age_group','income_levels','health_concerns','consume_frequency(weekly)','preferable_consumption_size']

# Work on explicit copies: assigning columns into the frames returned by
# train_test_split may otherwise trigger pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# One fitted encoder per column. The original reused a single encoder, so
# every `fit_transform` discarded the previous column's mapping and only the
# last column's encoder could be reused at inference time.
feature_encoders = {}

for col in label_cols:
    col_encoder = LabelEncoder()
    # Fit on the training split only; the test split reuses the same mapping.
    X_train[col] = col_encoder.fit_transform(X_train[col])
    X_test[col] = col_encoder.transform(X_test[col])
    feature_encoders[col] = col_encoder

# Backward compatibility: `le` used to end up fitted on the last column.
le = feature_encoders[label_cols[-1]]

In [None]:
# Show how many training rows fall into each encoded weekly-frequency value.
weekly_frequency = X_train['consume_frequency(weekly)']
weekly_frequency.value_counts()

In [8]:
# One-hot encode every remaining object-typed column.
remaining_cols = X_train.select_dtypes(include=['object']).columns.tolist()

# The training split defines the dummy layout; the first level of each column
# is dropped and acts as the all-zeros baseline.
X_train = pd.get_dummies(X_train, columns=remaining_cols, drop_first=True)

# Encode ALL levels present in the test split and let `reindex` select exactly
# the training columns. Using drop_first=True here (as the original did) drops
# whichever level happens to sort first *in the test split*; if that differs
# from the training baseline, the dropped level is re-added as all zeros and
# those rows are silently encoded as the baseline category.
X_test = pd.get_dummies(X_test, columns=remaining_cols)

# Levels unseen in training are discarded; training levels absent from the
# test split are added as all-zero columns.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

In [11]:
# Learn the class-to-integer mapping from the training target only, then apply
# that same mapping to both splits.
label_encoder_y = LabelEncoder().fit(y_train)
y_train, y_test = (
    label_encoder_y.transform(y_train),
    label_encoder_y.transform(y_test),
)

<h3 style="color: #4CAF50; font-family: 'Trebuchet MS', sans-serif;">
    📈 🛠️ Logistic Model </h3>

In [None]:
# Logistic regression with the iteration cap raised to 1000.
model_lr = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred = model_lr.predict(X_test)

# Report overall accuracy followed by the per-class metrics on the test split.
lr_accuracy = accuracy_score(y_test, y_pred)
lr_report = classification_report(y_test, y_pred)
print("Accuracy:", lr_accuracy)
print('Classification Report:\n', lr_report)

<h3 style="color: #4CAF50; font-family: 'Trebuchet MS', sans-serif;">
    📈 🛠️ Gaussian NB</h3>

In [None]:
# Gaussian naive Bayes with default settings.
model_nb = GaussianNB().fit(X_train, y_train)
y_pred_nb = model_nb.predict(X_test)

# Test-split accuracy and per-class metrics.
nb_accuracy = accuracy_score(y_test, y_pred_nb)
nb_report = classification_report(y_test, y_pred_nb)
print("Accuracy:", nb_accuracy)
print("Classification Report", nb_report)

<h3 style="color: #4CAF50; font-family: 'Trebuchet MS', sans-serif;">
    📈 🛠️ Support Vector Machine (SVM) </h3>

In [None]:
# Support vector classifier with default settings.
model_svm = SVC().fit(X_train, y_train)
y_pred_svm = model_svm.predict(X_test)

# Test-split accuracy and per-class metrics.
svm_accuracy = accuracy_score(y_test, y_pred_svm)
svm_report = classification_report(y_test, y_pred_svm)
print("Accuracy:", svm_accuracy)
print("Classification Report", svm_report)

<h3 style="color: #4CAF50; font-family: 'Trebuchet MS', sans-serif;">
    📈 🛠️ Random Forest </h3>

In [None]:
# Random forest with default hyper-parameters.
# The forest is stochastic (bootstrap samples and random feature subsets), so
# the seed is pinned to the same value used for the train/test split. Without
# it, every run and the later re-fit before MLflow logging produce a different
# model and different metrics.
model_rf = RandomForestClassifier(random_state=42)
model_rf.fit(X_train, y_train)
y_pred_rf = model_rf.predict(X_test)

# Test-split accuracy and per-class metrics.
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Classification Report", classification_report(y_test, y_pred_rf))

<h3 style="color: #4CAF50; font-family: 'Trebuchet MS', sans-serif;">
    📈 🛠️ XGBOOST </h3>

In [None]:
# Gradient-boosted trees (XGBoost) with default settings.
model_xgb = XGBClassifier()
model_xgb.fit(X_train, y_train)
y_pred_xgb = model_xgb.predict(X_test)

# Test-split accuracy and per-class metrics.
xgb_accuracy = accuracy_score(y_test, y_pred_xgb)
xgb_report = classification_report(y_test, y_pred_xgb)
print("Accuracy:", xgb_accuracy)
print("Classification Report", xgb_report)

<h3 style="color: #4CAF50; font-family: 'Trebuchet MS', sans-serif;">
    📈 🛠️ Light GBM </h3>

In [None]:
# LightGBM classifier; `force_col_wise=True` is passed through to LightGBM as-is.
model_lgbm = LGBMClassifier(force_col_wise=True)
model_lgbm.fit(X_train, y_train)
y_pred_lgbm = model_lgbm.predict(X_test)

# Test-split accuracy and per-class metrics.
lgbm_accuracy = accuracy_score(y_test, y_pred_lgbm)
lgbm_report = classification_report(y_test, y_pred_lgbm)
print("Accuracy:", lgbm_accuracy)
print("Classification Report", lgbm_report)

<h3 style="color: #4CAF50; font-family: 'Trebuchet MS', sans-serif;">
    🎯 Save Final Model </h3>

In [None]:
# Persist the fitted XGBoost classifier to disk.
joblib.dump(value=model_xgb, filename='final_model_version1.pkl')

In [None]:
# Persist the ordered list of training feature columns.
joblib.dump(value=X_train.columns.tolist(), filename='expected_columns.pkl')

In [None]:
# Persist the fitted target encoder (`label_encoder_y`).
joblib.dump(value=label_encoder_y, filename='label_encoders.pkl')

<h3 style="color: #4CAF50; font-family: 'Trebuchet MS', sans-serif;">
    🚀 Configure MLflow for Tracking & Comparison of Models</h3>

In [None]:
# Every entry is (display name, estimator, (train X, train y), (test X, test y)).
# All estimators share the same train/test split, so the data tuples are
# attached in a single comprehension instead of being repeated per model.
models = [
    (display_name, estimator, (X_train, y_train), (X_test, y_test))
    for display_name, estimator in (
        ("Logistic Regression", model_lr),
        ('GaussianNB', model_nb),
        ('Support Vector Machine', model_svm),
        ('Random Forest', model_rf),
        ('XGBOOST', model_xgb),
        ('LightGBM', model_lgbm),
    )
]

In [None]:
# Re-fit every registered model and collect its classification report as a
# dict, in the same order as `models`.
#
# Loop-local names are used on purpose: the original loop rebound the
# notebook-level X_train / y_train / X_test / y_test and overwrote `y_pred`
# (the logistic-regression predictions) on every iteration, which is hidden
# state that later cells could pick up by accident.
reports = []
for _name, estimator, (train_features, train_target), (test_features, test_target) in models:
    estimator.fit(train_features, train_target)
    predictions = estimator.predict(test_features)
    reports.append(
        classification_report(test_target, predictions, output_dict=True)
    )

In [None]:
# Point MLflow at the DagsHub-hosted tracking server.
# The owner/repo values are placeholders and must be replaced before running.
dagshub.init(repo_owner='<your user-name>',
             repo_name='<your repo-name>',
             mlflow=True)

mlflow.set_experiment('Beverage Price Prediction')

# `reports` is positionally aligned with `models`; fail loudly if a stale or
# partially built list is left over from an earlier execution.
if len(models) != len(reports):
    raise ValueError(
        f'models ({len(models)}) and reports ({len(reports)}) are out of sync; '
        're-run the evaluation cell first'
    )

# One MLflow run per model: log the name, the headline metrics, and the model.
for (model_name, model, _train_set, _test_set), report in zip(models, reports):
    # Per-class entries are the dict-valued keys that are not the "... avg"
    # aggregates ('accuracy' is a plain float). Deriving the labels from the
    # report removes the hard-coded assumption of exactly four classes 0-3.
    class_labels = [
        key for key, value in report.items()
        if isinstance(value, dict) and not key.endswith(' avg')
    ]

    with mlflow.start_run(run_name=model_name):
        mlflow.log_param('model_name', model_name)
        mlflow.log_metric('accuracy', report['accuracy'])
        for label in class_labels:
            mlflow.log_metric(f'recall_class_{label}', report[label]['recall'])
            mlflow.log_metric(f'precision_class_{label}', report[label]['precision'])
        mlflow.log_metric('f1_score_macro', report['macro avg']['f1-score'])

        # Use the MLflow flavour that matches the estimator's library.
        if 'XGBOOST' in model_name:
            mlflow.xgboost.log_model(model, 'model')
        elif 'LightGBM' in model_name:
            mlflow.lightgbm.log_model(model, 'model')
        else:
            mlflow.sklearn.log_model(model, 'model')