In [1]:
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import os

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/Himanshu-1703/reddit-sentiment-analysis/refs/heads/main/data/reddit.csv')
df.head()

Unnamed: 0,clean_comment,category
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37249 entries, 0 to 37248
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   clean_comment  37149 non-null  object
 1   category       37249 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 582.1+ KB


In [4]:
df = df.dropna(subset=['clean_comment']) 

In [5]:
df.head()

Unnamed: 0,clean_comment,category
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1


In [6]:
df['target'] = np.random.randint(0, 2, size=len(df))

In [7]:
df['text_length'] = df['clean_comment'].apply(len)

In [8]:
X = df[['text_length']]
X

Unnamed: 0,text_length
0,261
1,1269
2,460
3,168
4,691
...,...
37244,5
37245,99
37246,38
37247,10


In [9]:
y = df['target']
y

0        1
1        0
2        1
3        0
4        0
        ..
37244    1
37245    1
37246    1
37247    1
37248    0
Name: target, Length: 37149, dtype: int32

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# Set MLflow tracking URI from .env or use direct URI
mlflow.set_tracking_uri(os.getenv("MLFLOW_SERVER_TRACKING_URI_EC2"))

In [12]:
# Start experiment run
with mlflow.start_run():
    model = LogisticRegression()
    model.fit(X_train, y_train)
    
    preds = model.predict(X_test)
    acc = accuracy_score(y_test, preds)

    # Log parameters and metrics
    mlflow.log_param("model_type", "LogisticRegression")
    mlflow.log_metric("accuracy", acc)

    # Log the model
    mlflow.sklearn.log_model(model, "model")

    print(f"Logged to MLflow with accuracy: {acc}")




Logged to MLflow with accuracy: 0.5041722745625841


2025/07/16 11:56:00 INFO mlflow.tracking._tracking_service.client: 🏃 View run nervous-croc-996 at: http://ec2-98-81-90-102.compute-1.amazonaws.com:5000/#/experiments/0/runs/e05ae3dbefaa42cba7f4d1992a134e06.
2025/07/16 11:56:00 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://ec2-98-81-90-102.compute-1.amazonaws.com:5000/#/experiments/0.
