In [8]:
import pandas as pd
import numpy as np


# Load the cleaned comments from the interim data folder.
data = pd.read_csv(r"../data/interim/cleaned_data.csv")

# Quick look at the first rows.
data.head()

# Count missing values per column before dropping them.
data.isna().sum()

# Drop rows with any missing value. Reassigning (instead of inplace=True)
# keeps the cell idempotent and makes the data lineage explicit.
data = data.dropna()

In [5]:
from sklearn.feature_extraction.text import CountVectorizer , TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import mlflow 
import dagshub
from sklearn.metrics import accuracy_score , classification_report  , confusion_matrix





In [3]:
# Experiment: TF-IDF vs. Bag-of-Words (BoW) across n-gram ranges

In [None]:
# Point MLflow at the tracking server hosted on DagsHub for this repository.
mlflow.set_tracking_uri(
    "https://dagshub.com/AJINKYA-TEMGIRE/Influencers-Comment-Analysis.mlflow"
)

# Initialise the DagsHub integration for the same repository, with MLflow enabled.
dagshub.init(
    repo_name='Influencers-Comment-Analysis',
    repo_owner='AJINKYA-TEMGIRE',
    mlflow=True,
)



In [9]:
# n-gram ranges to compare: unigrams only, up to bigrams, up to trigrams.
N_Gram = [(1, upper) for upper in (1, 2, 3)]

# Vocabulary size cap passed to every vectorizer in this experiment.
max_features = 5_000

import dagshub
import mlflow

# Group every run launched below under a single MLflow experiment.
mlflow.set_experiment(experiment_name="TFIDF VS CBOW NGRAM")

def tfidf_cbow(name , ngram , features):
    """Train a random forest on one vectorizer setup and log the run to MLflow.

    The module-level ``data`` frame is split 80/20 (stratified on
    ``category``), its ``clean_comment`` column is vectorized, a
    ``RandomForestClassifier`` is fitted, and parameters, metrics, the model
    and the input dataset are logged inside a single MLflow run.

    Parameters
    ----------
    name : str
        Vectorizer to use: ``"cbow"`` selects ``CountVectorizer`` and
        ``"tfidf"`` selects ``TfidfVectorizer``.
    ngram : tuple
        Passed to the vectorizer as ``ngram_range``.
    features : int
        Passed to the vectorizer as ``max_features``.

    Raises
    ------
    ValueError
        If ``name`` is neither ``"cbow"`` nor ``"tfidf"``. Previously any
        other value silently used TF-IDF while the run was tagged with the
        unrecognised name.
    """
    if name == "cbow":
        vectorizer = CountVectorizer(max_features=features , ngram_range=ngram)
    elif name == "tfidf":
        vectorizer = TfidfVectorizer(max_features=features , ngram_range=ngram)
    else:
        raise ValueError(
            f"Unknown vectorizer name {name!r}; expected 'cbow' or 'tfidf'."
        )

    # Same seed and stratification for every call, so each configuration is
    # evaluated on the identical train/test partition.
    X_train, X_test, y_train, y_test = train_test_split(
        data['clean_comment'],
        data['category'],
        test_size=0.2,
        random_state=42,
        stratify=data['category'],
    )

    # Fit the vocabulary on the training split only, then reuse it for the test split.
    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)

    with mlflow.start_run():

        mlflow.set_tag("description" , f"Run with {name} and {ngram} as n_gram..")
        mlflow.set_tag("mlflow.runName" , f"{name} __ {ngram}")

        mlflow.log_param("vectorizer_type", name)
        mlflow.log_param("ngram_range", ngram)
        mlflow.log_param("vectorizer_max_features", features)

        # A fixed seed keeps the forest deterministic, so differences between
        # runs come from the vectorizer settings rather than random noise.
        model = RandomForestClassifier(random_state=42)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        mlflow.log_metric("accuracy", accuracy)

        # Log every per-label / per-average metric from the report; scalar
        # entries (not dicts) are skipped by the isinstance check.
        classification_rep = classification_report(y_test, y_pred, output_dict=True)
        for label, metrics in classification_rep.items():
            if isinstance(metrics, dict):
                for metric, value in metrics.items():
                    mlflow.log_metric(f"{label}_{metric}", value)

        mlflow.sklearn.log_model(model, f"random_forest_model_{name}_{ngram}")

        # Record the full input frame as the dataset used by this run.
        dataset = mlflow.data.from_pandas(data)
        mlflow.log_input(dataset , "data")  

    

    


# For each n-gram range, run the bag-of-words variant first, then TF-IDF.
for ngram_range in N_Gram:
    for vectorizer_name in ("cbow", "tfidf"):
        tfidf_cbow(vectorizer_name, ngram_range, max_features)




In [10]:
# The charts in the MLflow UI show that TF-IDF with an n-gram range of (1, 3) gives good results.
# Further experiments will therefore keep these same settings.