In [1]:
import mlflow 
import dagshub

import pandas as pd
import numpy as np 

# Point the MLflow client at the tracking endpoint hosted on DagsHub for this repository.
tracking_uri = "https://dagshub.com/AJINKYA-TEMGIRE/Influencers-Comment-Analysis.mlflow"
mlflow.set_tracking_uri(tracking_uri)

# Initialise the DagsHub integration for the same repository with MLflow enabled.
dagshub.init(
    repo_owner='AJINKYA-TEMGIRE',
    repo_name='Influencers-Comment-Analysis',
    mlflow=True,
)




In [2]:
# Activate the "Max Features" experiment; runs started afterwards are recorded under it.
# Kept as the cell's last expression so the resulting Experiment object is displayed.
mlflow.set_experiment(experiment_name="Max Features")

2025/02/08 10:52:26 INFO mlflow.tracking.fluent: Experiment with name 'Max Features' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/d0f0882084324ec6a8b7cf332901537f', creation_time=1738992147566, experiment_id='2', last_update_time=1738992147566, lifecycle_stage='active', name='Max Features', tags={}>

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score , classification_report

# Read the interim dataset and discard every row that has a missing value.
data = pd.read_csv(r"../data/interim/cleaned_data.csv").dropna()

# Model input (the 'clean_comment' column) and target labels (the 'category' column).
x, y = data['clean_comment'], data['category']

# N-gram range handed to the TF-IDF vectorizer: unigrams through trigrams.
n_gram = (1, 3)

def max__features(features, random_state=42):
    """Train and evaluate a TF-IDF + random-forest model and log the run to MLflow.

    Uses the module-level ``x`` (inputs), ``y`` (labels) and ``n_gram``
    (n-gram range) defined earlier in the notebook.

    Parameters
    ----------
    features : int
        Value passed to ``TfidfVectorizer(max_features=...)``.
    random_state : int, default 42
        Seed used for both the train/test split and the random forest, so
        that runs with different ``features`` values differ only in the
        vocabulary size and can be reproduced.

    Returns
    -------
    None
        Results are recorded as an MLflow run (tags, params and metrics).
    """
    xtrain, xtest, ytrain, ytest = train_test_split(
        x, y, test_size=0.2, random_state=random_state
    )

    vectorizer = TfidfVectorizer(max_features=features, ngram_range=n_gram)

    # Fit the vocabulary on the training split only, then reuse it for the test split.
    xtrain = vectorizer.fit_transform(xtrain)
    xtest = vectorizer.transform(xtest)

    # Seed the forest: without it, accuracy differences between feature counts
    # are mixed with run-to-run randomness of the model itself.
    clf = RandomForestClassifier(random_state=random_state)
    clf.fit(xtrain, ytrain)
    ypred = clf.predict(xtest)

    accuracy = accuracy_score(ytest, ypred)
    classification_rep = classification_report(ytest, ypred, output_dict=True)

    # Flatten the nested report into "<label>_<metric>" keys. Non-dict entries
    # (the scalar "accuracy" item) are skipped; accuracy is logged separately.
    report_metrics = {
        f"{label}_{metric}": float(value)
        for label, metrics in classification_rep.items()
        if isinstance(metrics, dict)
        for metric, value in metrics.items()
    }

    with mlflow.start_run():
        mlflow.set_tag("mlflow.runName", f"Vectorizer with {features} features")
        mlflow.set_tag("Vectorizer", "TFidf")
        mlflow.set_tag("Model Name", "Random Forest")

        mlflow.log_param("max_features", features)
        mlflow.log_param("n_gram", n_gram)
        mlflow.log_param("random_state", random_state)

        mlflow.log_metric("Accuracy", accuracy)

        # One batched call instead of one request per metric to the remote server.
        mlflow.log_metrics(report_metrics)

    

# Vocabulary sizes to sweep: 1000 through 10000 in steps of 1000.
max_features = list(range(1000, 10001, 1000))

# Train, evaluate and log one MLflow run per vocabulary size.
for feature_count in max_features:
    max__features(feature_count)