# In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import joblib
import mlflow
from prefect import task, flow


# Opens an MLflow run named "Sentiment Analysis Model Training" as a
# module-level side effect, i.e. when this file is loaded rather than when the
# flow is invoked.
# NOTE(review): the module-level mlflow.end_run() further down also executes at
# load time, before sentiment_analysis_flow() is called from the __main__
# guard, so this named run appears to be closed before any task logs to it --
# confirm whether that is intended.
mlflow.start_run(run_name="Sentiment Analysis Model Training")

@task
def load_data(path="./reviews_badminton/data.csv"):
    """Read the reviews dataset from a CSV file.

    Args:
        path: Location of the CSV file. Defaults to the path that was
            previously hard-coded, so existing ``load_data()`` calls keep
            working unchanged.

    Returns:
        The DataFrame produced by ``pandas.read_csv`` for ``path``.
    """
    return pd.read_csv(path)

@task
def preprocess_data(data):
    """Drop incomplete rows and derive a binary ``Sentiment`` label.

    Rows missing either ``'Review text'`` or ``'Ratings'`` are removed. A
    ``Sentiment`` column is then added: 1 when ``Ratings >= 3``, otherwise 0.

    Args:
        data: DataFrame containing at least the ``'Review text'`` and
            ``'Ratings'`` columns.

    Returns:
        A new DataFrame with the incomplete rows removed and the
        ``Sentiment`` column added. The input DataFrame is not modified.
    """
    # Work on a copy so the caller's DataFrame (the upstream task's result)
    # is not mutated as a side effect of this task.
    cleaned = data.dropna(subset=['Review text', 'Ratings']).copy()
    # Vectorized comparison instead of a per-row Python lambda.
    cleaned['Sentiment'] = (cleaned['Ratings'] >= 3).astype(int)
    return cleaned

@task
def split_data(data):
    """Split review texts and sentiment labels into train and test subsets.

    Args:
        data: DataFrame providing the ``'Review text'`` and ``'Sentiment'``
            columns.

    Returns:
        A 4-tuple ``(X_train, X_test, y_train, y_test)``; 20% of the rows are
        held out for testing and the shuffle is seeded with 42.
    """
    texts = data['Review text']
    labels = data['Sentiment']
    # train_test_split yields [X_train, X_test, y_train, y_test] in that order.
    parts = train_test_split(texts, labels, test_size=0.2, random_state=42)
    return tuple(parts)

def _log_training_outputs(model, cv, X_train_cv, y_test, y_pred, max_features):
    """Log the parameter, metric, plots and pickles of one training pass to MLflow.

    Uses the fluent MLflow API, so everything is recorded against whichever
    run is active when this helper is called. Artifact files are written to
    the current working directory before being logged.
    """
    mlflow.log_param("max_features", max_features)
    mlflow.log_metric("accuracy", accuracy_score(y_test, y_pred))

    # Confusion matrix rendered as a heat map.
    conf_matrix_fig = plt.figure(figsize=(6, 4))
    plt.imshow(confusion_matrix(y_test, y_pred), cmap='Blues', interpolation='nearest')
    plt.colorbar()
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title('Confusion Matrix')
    plt.tight_layout()
    conf_matrix_path = "confusion_matrix.png"
    plt.savefig(conf_matrix_path)
    plt.close(conf_matrix_fig)
    mlflow.log_artifact(conf_matrix_path)

    # Plain-text precision / recall / F1 report.
    class_report_path = "classification_report.txt"
    with open(class_report_path, "w", encoding="utf-8") as f:
        f.write(classification_report(y_test, y_pred))
    mlflow.log_artifact(class_report_path)

    # Total count of each vocabulary term over the training matrix: summing
    # over axis 0 yields a single row, hence the [0] after tolist().
    feature_names = cv.get_feature_names_out()
    word_freq = dict(zip(feature_names, X_train_cv.sum(axis=0).tolist()[0]))
    sorted_word_freq = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)[:20]

    top_words_fig = plt.figure(figsize=(10, 6))
    plt.barh(
        [word for word, _ in sorted_word_freq],
        [count for _, count in sorted_word_freq],
        color='salmon',
    )
    plt.title('Top 20 Most Common Words in Reviews')
    plt.xlabel('Frequency')
    plt.ylabel('Words')
    plt.gca().invert_yaxis()  # most frequent word at the top of the chart
    plt.tight_layout()
    top_words_path = "top_words_plot.png"
    plt.savefig(top_words_path)
    plt.close(top_words_fig)
    mlflow.log_artifact(top_words_path)

    # Persist the classifier and the vectorizer vocabulary.
    model_path = "model.pkl"
    joblib.dump(model, model_path)
    mlflow.log_artifact(model_path)

    vocabulary_path = "vocabulary.pkl"
    joblib.dump(cv.vocabulary_, vocabulary_path)
    mlflow.log_artifact(vocabulary_path)


@task
def train_model(X_train, X_test, y_train, y_test, max_features=100):
    """Train a bag-of-words logistic-regression classifier and log it to MLflow.

    The texts are vectorized with ``CountVectorizer`` (fitted on the training
    texts only), a ``LogisticRegression`` model is fitted, and predictions on
    the test texts are used to log accuracy, a confusion-matrix plot, a
    classification report, a top-20 word-frequency plot, the pickled model and
    the pickled vocabulary.

    Args:
        X_train: Training review texts.
        X_test: Test review texts.
        y_train: Labels for ``X_train``.
        y_test: Labels for ``X_test``.
        max_features: Vocabulary size cap passed to ``CountVectorizer``; the
            same value is logged as the ``max_features`` MLflow parameter.

    Returns:
        None. Results are written to files in the working directory and
        logged to MLflow.
    """
    cv = CountVectorizer(max_features=max_features)
    X_train_cv = cv.fit_transform(X_train)
    X_test_cv = cv.transform(X_test)

    model = LogisticRegression()
    model.fit(X_train_cv, y_train)

    y_pred = model.predict(X_test_cv)

    # The module-level start_run()/end_run() pair both execute when the file
    # is loaded, so no run is active by the time this task executes. Open the
    # named run here so the results do not fall into an implicitly created,
    # unnamed run. If the caller already has a run active, log into that one.
    if mlflow.active_run() is not None:
        _log_training_outputs(model, cv, X_train_cv, y_test, y_pred, max_features)
    else:
        with mlflow.start_run(run_name="Sentiment Analysis Model Training"):
            _log_training_outputs(model, cv, X_train_cv, y_test, y_pred, max_features)

@flow
def sentiment_analysis_flow():
    """Run the pipeline end to end: load, preprocess, split, then train."""
    raw_reviews = load_data()
    labelled_reviews = preprocess_data(raw_reviews)
    # split_data returns (X_train, X_test, y_train, y_test), which is exactly
    # the positional order train_model expects.
    splits = split_data(labelled_reviews)
    X_train, X_test, y_train, y_test = splits
    train_model(X_train, X_test, y_train, y_test)

# Ends the currently active MLflow run. This statement is at module level, so
# it executes when the file is loaded -- before the __main__ guard below calls
# the flow.
mlflow.end_run()


# Run the pipeline only when this file is executed as a script.
if __name__ == "__main__":
    sentiment_analysis_flow()