In [None]:
import pandas as pd
import sys
import os

# Add the ../data directory (relative to the current working directory) to sys.path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..', 'data')))

# importing the JSON data
from data import data

from sklearn.model_selection import train_test_split

# Importing necessary libraries for data preprocessing
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder

# Importing text preprocessing class from sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
# Importing Pipeline and ColumnTransformer for preprocessing
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer

# Importing model
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import confusion_matrix, classification_report, accuracy_score


# Loading data

In [24]:
# Read the raw CSV into a DataFrame.
# NOTE: this rebinds `data`, replacing the object bound earlier by
# `from data import data`.
raw_csv_path = '../data/raw/data.csv'
data = pd.read_csv(raw_csv_path)

In [25]:
# Preview the first five rows to check the loaded columns.
data.head(n=5)

Unnamed: 0,user_goal,habit_log,habit_type,sentiment,aligned
0,I want to learn how to play the guitar.,I practiced guitar for 30 minutes today.,Learning,Positive,True
1,I want to learn how to play the guitar.,I watched TV instead of practicing guitar.,Distraction,Negative,False
2,I want to learn a new language.,I practiced Spanish for 15 minutes today.,Learning,Positive,True
3,I want to learn a new language.,I didn't practice my language skills this week.,Learning,Negative,False
4,I want to learn photography.,I took photos in manual mode and watched a tut...,Learning,Positive,True


# Cleaning and Preprocessing

In [26]:
# Remove the `aligned` column; it is not selected as a feature or target below.
# Reassigning instead of using inplace=True, together with errors="ignore",
# keeps this cell idempotent: re-running it after the column is already gone
# is a no-op rather than a KeyError.
data = data.drop(columns=["aligned"], errors="ignore")

In [27]:
# Model inputs: the two free-text columns, kept together as a DataFrame.
text_columns = ["user_goal", "habit_log"]
X_text = data.loc[:, text_columns]

# Targets: one Series per prediction task.
y_habit_type = data.loc[:, "habit_type"]
y_sentiment = data.loc[:, "sentiment"]

In [32]:

# ---------------------------------------------------------------------------
# Stage 1: predict habit_type from the two text columns.
# ---------------------------------------------------------------------------

# Each text column gets its own TF-IDF vocabulary. The column is given as a
# plain string (not a list) so the vectorizer receives a 1-D column of
# documents rather than a 2-D frame.
text_transformer = ColumnTransformer(
    transformers=[
        ('goal', TfidfVectorizer(), 'user_goal'),
        ('log', TfidfVectorizer(), 'habit_log'),
    ]
)

# NOTE(review): the `multi_class` argument is deprecated in newer scikit-learn
# releases; confirm against the installed version before upgrading.
habit_type_pipeline = Pipeline(steps=[
    ('text_transformer', text_transformer),
    ('classifier', LogisticRegression(multi_class='ovr', max_iter=1000)),  # predicts habit_type
])

habit_type_pipeline.fit(X_text, y_habit_type)

# ---------------------------------------------------------------------------
# Stage 2: predict sentiment from the text columns plus stage 1's prediction.
# ---------------------------------------------------------------------------

# NOTE(review): these predictions are made on the same rows stage 1 was fitted
# on, so stage 2 is trained on in-sample predictions. Consider out-of-fold
# predictions and a held-out evaluation split.
predicted_habit_type = habit_type_pipeline.predict(X_text)

# Work on a copy so the extra column is not written into X_text.
X_text_with_predicted = X_text.copy()
X_text_with_predicted['predicted_habit_type'] = predicted_habit_type

final_preprocessor = ColumnTransformer(
    transformers=[
        ('goal', TfidfVectorizer(), 'user_goal'),
        ('habit_log', TfidfVectorizer(), 'habit_log'),
        # The encoder only learns the habit types that stage 1 actually
        # predicted above. handle_unknown='ignore' encodes any other value as
        # all zeros at predict time instead of raising an error. The column is
        # given as a list because OneHotEncoder expects 2-D input.
        ('habit_ohe', OneHotEncoder(handle_unknown='ignore'), ['predicted_habit_type']),
    ]
)

final_pipeline = Pipeline(steps=[
    ('preprocessor', final_preprocessor),
    # Predicts sentiment from user_goal, habit_log and the predicted habit_type.
    ('classifier', LogisticRegression()),
])

final_pipeline.fit(X_text_with_predicted, y_sentiment)

# Smoke test on a single row. Always pass a DataFrame, not a Series:
# `.iloc[[0]]` (double brackets) keeps the row as a one-row DataFrame so the
# column selectors above still work.
single_pred = final_pipeline.predict(X_text_with_predicted.iloc[[0]])

print(single_pred)



['Positive']


In [None]:
# Preview the first five rows of the stage 2 input, including the added
# `predicted_habit_type` column.
X_text_with_predicted.head(n=5)

Unnamed: 0,user_goal,habit_log,predicted_habit_type
0,I want to learn how to play the guitar.,I practiced guitar for 30 minutes today.,Learning
1,I want to learn how to play the guitar.,I watched TV instead of practicing guitar.,Distraction
2,I want to learn a new language.,I practiced Spanish for 15 minutes today.,Learning
3,I want to learn a new language.,I didn't practice my language skills this week.,Learning
4,I want to learn photography.,I took photos in manual mode and watched a tut...,Learning


# Saving models

In [33]:
#  module for saving model
import joblib

In [34]:
# Persist both fitted pipelines with joblib.
# Create the output directory first so that saving does not fail with
# FileNotFoundError when ../models does not exist yet (e.g. a fresh checkout).
os.makedirs('../models', exist_ok=True)

# 1. habit_type predictor: takes the `user_goal` and `habit_log` columns.
joblib.dump(habit_type_pipeline, '../models/habit_type_predictor.pkl')
# 2. sentiment predictor: additionally expects a `predicted_habit_type`
#    column, i.e. the output of the habit_type predictor.
joblib.dump(final_pipeline, '../models/sentiment_predictor.pkl')

['../models/sentiment_predictor.pkl']