In [20]:
import mlflow
import dagshub

# Connect this session to the DagsHub repository with its MLflow integration enabled.
dagshub.init(
    repo_owner='CodeNeuron58',
    repo_name='Insightube',
    mlflow=True,
)

In [37]:
import pandas as pd
import numpy as np
import os

# Load the raw labelled comments; the path is relative to the working directory.
raw_csv_path = "Reddit_Data.csv"
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0,clean_comment,category
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1


In [38]:
# Discard every row that has a missing value, then show the remaining
# per-column null counts (all zeros once the drop has worked).
df = df.dropna()
df.isna().sum()

clean_comment    0
category         0
dtype: int64

In [39]:
# Remove fully duplicated rows, then count how many duplicates are left.
df = df.drop_duplicates()
df.duplicated().sum()

np.int64(0)

In [40]:
# Count rows whose comment is empty once surrounding whitespace is stripped.
blank_mask = df['clean_comment'].str.strip() == ''
df[blank_mask].count()

clean_comment    6
category         6
dtype: int64

In [41]:
# Keep only comments that still contain text after stripping whitespace.
# The explicit .copy() makes `df` an independent frame rather than a filtered
# slice, so assigning to its columns later does not trigger pandas'
# SettingWithCopyWarning or write into a temporary object.
has_text = df['clean_comment'].str.strip() != ''
df = df[has_text].copy()
df.head()

Unnamed: 0,clean_comment,category
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1


In [42]:
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources this notebook uses.
# 'wordnet' backs WordNetLemmatizer; without it lemmatize() raises LookupError
# on a machine that has never downloaded the corpus.
nltk.download('punkt')
nltk.download('wordnet')
# NOTE(review): some NLTK releases also look up 'omw-1.4' when WordNet is
# loaded; downloading it is harmless where it is not needed.
nltk.download('omw-1.4')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\bipra\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bipra\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [43]:
import re

# Words removed from NLTK's English stop-word list so they survive filtering
# (presumably kept because they carry negation/contrast -- confirm intent).
_KEPT_STOPWORDS = {"not", "but", "or", "yet", "however"}

# Lazily filled cache so the stop-word list and the lemmatizer are built once
# instead of once per comment.
_PREPROCESS_CACHE = {}


def _preprocess_resources():
    """Return the (stop-word set, lemmatizer) pair, building it on first use."""
    if not _PREPROCESS_CACHE:
        _PREPROCESS_CACHE["stopwords"] = (
            set(stopwords.words('english')) - _KEPT_STOPWORDS
        )
        _PREPROCESS_CACHE["lemmatizer"] = WordNetLemmatizer()
    return _PREPROCESS_CACHE["stopwords"], _PREPROCESS_CACHE["lemmatizer"]


def preprocess_comment(comment):
    """Normalise one comment string for bag-of-words vectorisation.

    Steps, in order: lower-case and strip the text; delete every character
    that is not an ASCII letter, a digit, whitespace, or one of ``! ? . ,``;
    turn newlines into spaces; drop English stop words (except the ones in
    ``_KEPT_STOPWORDS``); lemmatize each remaining token with WordNet.

    Args:
        comment: The raw comment text (must be a ``str``).

    Returns:
        The processed tokens joined by single spaces. The result is an empty
        string when no token survives.

    Raises:
        AttributeError: If ``comment`` is not a string (e.g. NaN).
        LookupError: If the NLTK 'stopwords' or 'wordnet' data is missing.
    """
    stop_words, lemmatizer = _preprocess_resources()

    comment = comment.lower().strip()
    comment = re.sub(r'[^A-Za-z0-9\s!?.,]', '', comment)
    comment = re.sub(r'\n', ' ', comment)

    # Tokens are whitespace-delimited; punctuation stays attached to its word,
    # so "not," is not matched against the stop-word set.
    kept_tokens = [word for word in comment.split() if word not in stop_words]
    return " ".join(lemmatizer.lemmatize(word) for word in kept_tokens)

In [44]:
# Run the text normaliser over every comment and store the result in place of
# the original column. Re-running this cell feeds already-processed text
# through preprocess_comment again.
processed_comments = df["clean_comment"].apply(preprocess_comment)
df["clean_comment"] = processed_comments
df.head()

Unnamed: 0,clean_comment,category
0,family mormon never tried explain still stare ...,1
1,buddhism much lot compatible christianity espe...,1
2,seriously say thing first get complex explain ...,-1
3,learned want teach different focus goal not wr...,0
4,benefit may want read living buddha living chr...,1


In [45]:
# Persist the cleaned frame without the index column.
# NOTE(review): comments that became empty during preprocessing would be
# written as empty fields and read back as NaN -- confirm downstream handling.
cleaned_csv_path = "cleaned_reddit_data.csv"
df.to_csv(cleaned_csv_path, index=False)

In [29]:
import mlflow
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [30]:
# Bag-of-words features limited to 5000 vocabulary entries.
vectorizer = CountVectorizer(max_features=5000)

# fit_transform yields a sparse matrix; .toarray() expands it into a dense
# NumPy array with one column per vocabulary entry.
# NOTE(review): the dense copy is large; the estimators used below may accept
# the sparse matrix directly -- consider dropping the conversion.
bow_matrix = vectorizer.fit_transform(df['clean_comment'])
X = bow_matrix.toarray()

# Target labels taken straight from the `category` column.
y = df['category']

In [31]:
# Display the dense feature matrix built by the vectorizer cell.
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], shape=(36793, 5000))

In [32]:
# Display the target labels (the `category` column of df).
y

0        1
1        1
2       -1
3        0
4        1
        ..
37244    0
37245    1
37246    0
37247    1
37248    0
Name: category, Length: 36793, dtype: int64

In [33]:
# Confirm the vocabulary cap the vectorizer was configured with.
vectorizer.max_features

5000

In [34]:
import seaborn as sns

In [36]:
# Train and evaluate the baseline Random Forest and log the run to MLflow.

# One seed shared by the split and the forest so a rerun reproduces the
# logged metrics (the forest was previously unseeded).
random_state = 42

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=random_state
)

# Set MLflow experiment
mlflow.set_experiment('Random Forest - Baseline')

with mlflow.start_run():
    # Set tags
    mlflow.set_tag("Algorithm", "Random Forest")
    mlflow.set_tag("Dataset", "Reddit")

    # Log vectorizer parameters
    mlflow.log_param("vectorizer_type", "CountVectorizer")
    mlflow.log_param("vectorizer_max_features", vectorizer.get_params()['max_features'])

    # Model hyperparameters
    n_estimators = 150
    max_depth = 15

    mlflow.log_param("n_estimators", n_estimators)
    mlflow.log_param("max_depth", max_depth)
    mlflow.log_param("random_state", random_state)

    # Train model
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=random_state,
    )
    model.fit(X_train, y_train)

    # Predict
    y_pred = model.predict(X_test)

    # Accuracy
    accuracy = accuracy_score(y_test, y_pred)
    mlflow.log_metric("accuracy", float(accuracy))

    # Classification report: per-class and averaged entries are dicts of
    # metrics; the scalar "accuracy" entry is skipped because it is logged above.
    classification_reports = classification_report(y_test, y_pred, output_dict=True)
    for label, metrics in classification_reports.items():
        if isinstance(metrics, dict):
            for metric_name, value in metrics.items():
                mlflow.log_metric(f"{label}_{metric_name}", float(value))

    # Confusion matrix plot. Rows/columns are ordered by the classes the model
    # was trained on, and the ticks show those labels instead of 0..n-1.
    class_labels = model.classes_
    cm = confusion_matrix(y_test, y_pred, labels=class_labels)
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(
        cm,
        annot=True,
        cmap="Blues",
        fmt="d",
        xticklabels=class_labels,
        yticklabels=class_labels,
        ax=ax,
    )
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    ax.set_title("Confusion Matrix")

    # Save and log plot; the figure is closed and the temporary file removed
    # even if saving or uploading fails.
    cm_filename = "confusion_matrix.png"
    try:
        fig.savefig(cm_filename)
        mlflow.log_artifact(cm_filename)
    finally:
        plt.close(fig)
        if os.path.exists(cm_filename):
            os.remove(cm_filename)

    # Print result
    print("accuracy_score:", accuracy)


accuracy_score: 0.6466911265117543
🏃 View run nosy-mole-890 at: https://dagshub.com/CodeNeuron58/Insightube.mlflow/#/experiments/1/runs/9e1768cd3c0e47f282ff87402940396c
🧪 View experiment at: https://dagshub.com/CodeNeuron58/Insightube.mlflow/#/experiments/1
