<h1>Making a Submission</h1>

<h3>Imports</h3>

In [1]:
from sklearn.metrics import accuracy_score
import pandas as pd
import joblib

<h3>Specifying a Model and an Output File</h3>

In [2]:
# Name of the trained model to submit. Both paths below are derived from it so
# the model directory and the submission file cannot drift out of sync.
MODEL_NAME = "logistic_regression_f1"

# Directory the saved TF-IDF vectorizer and the per-column models are loaded from.
MODEL_DIRECTORY = f"models/{MODEL_NAME}"
# CSV file the predictions are written to.
SUBMISSION_PATH = f"submissions/{MODEL_NAME}.csv"

<h3>Making Predictions for Every Target Column</h3>

In [3]:
# Columns to predict. One saved model is loaded per name, and each name becomes
# a prediction column in the submission file.
feature_columns = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]

In [4]:
# Build the submission: load the cleaned test comments, vectorize them once with
# the saved TF-IDF vectorizer, then predict every target column with its own
# saved model and write the result to SUBMISSION_PATH.

# Load the test data
test_data = pd.read_csv('kmaml223/test.csv')
# Replace missing comments with empty strings. The filled column is assigned
# back rather than using fillna(inplace=True) on the selected column: the
# in-place form operates on an intermediate object, triggers a pandas warning,
# and does not update test_data when Copy-on-Write is enabled.
test_data['comment_text_cleaned'] = test_data['comment_text_cleaned'].fillna('')
X_test = test_data['comment_text_cleaned']

# Load the saved TfidfVectorizer.
# NOTE: joblib.load unpickles the file, so only load artifacts you trust.
loaded_tfidf_vectorizer = joblib.load(f"{MODEL_DIRECTORY}/tfidf_vectorizer.pkl")

# Transform the test data once, outside the loop: the result depends only on the
# vectorizer and X_test, so it is identical for every target column.
X_test_tfidf = loaded_tfidf_vectorizer.transform(X_test)

# Create a DataFrame to store predictions, keyed by the test row ids
submission = pd.DataFrame()
submission['id'] = test_data['id']

# Iterate through each target column and make predictions
for feature in feature_columns:
    print(f"===== Predicting for '{feature}' =====")

    # Load the stored model for this column
    model_filename = f"{MODEL_DIRECTORY}/{feature}_model.pkl"
    loaded_model = joblib.load(model_filename)

    # Make predictions using the loaded model
    predictions = loaded_model.predict(X_test_tfidf)

    # Add predictions to the submission DataFrame
    submission[feature] = predictions

# Save predictions to a submission CSV file
submission.to_csv(SUBMISSION_PATH, index=False)
print(f"Predictions saved to '{SUBMISSION_PATH}'")

===== Predicting for 'toxic' =====
===== Predicting for 'severe_toxic' =====
===== Predicting for 'obscene' =====
===== Predicting for 'threat' =====
===== Predicting for 'insult' =====
===== Predicting for 'identity_hate' =====
Predictions saved to 'submissions/logistic_regression_f1.csv'


<h3>Comparing Against Other Submissions</h3>

In [5]:
# Compare the newly written submission with earlier submissions, column by column.
# The earlier submission is passed to accuracy_score as y_true, so the printed
# value is the share of rows where both files contain the same prediction.
# NOTE(review): rows are compared by position — assumes every file lists the
# test rows in the same order; confirm before relying on these numbers.
baseline_files = ['baseline.csv', 'logistic_regression_accuracy.csv']  # Add more file paths as needed
new_submission = pd.read_csv(SUBMISSION_PATH)

for other_name in baseline_files:
    other_path = "submissions/" + other_name
    other_submission = pd.read_csv(other_path)
    print(f"Comparison with {other_name}:")
    for column in feature_columns:
        reference_values = other_submission[column]
        candidate_values = new_submission[column]
        agreement = accuracy_score(reference_values, candidate_values)
        print(f"Accuracy for '{column}': {agreement}")
    # Blank line between the reports for different baseline files
    print()


Comparison with baseline.csv:
Accuracy for 'toxic': 0.9600643971365157
Accuracy for 'severe_toxic': 0.9991559598612023
Accuracy for 'obscene': 0.9877457876144925
Accuracy for 'threat': 0.9986401575541592
Accuracy for 'insult': 0.9908093407108693
Accuracy for 'identity_hate': 0.9982181374847604

Comparison with logistic_regression_accuracy.csv:
Accuracy for 'toxic': 0.982118853355841
Accuracy for 'severe_toxic': 0.9990309168776768
Accuracy for 'obscene': 0.9973115758541998
Accuracy for 'threat': 0.999687392541186
Accuracy for 'insult': 0.9965769483259871
Accuracy for 'identity_hate': 0.9992497420988464

