# 5. Making Predictions

1. Save the trained model to `.pkl` files.

In [2]:
import json
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
import joblib

# ====================
# Step 1: Load and Preprocess Data
# ====================
# Load the JSON dataset
file_path = '../merged_profiles.json'  # Replace with your actual file path
with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Convert JSON data to DataFrame
df = pd.DataFrame(data)

# Ensure the 'text' column is selected for TF-IDF vectorization
texts = df['text']
categories = df['category']  # Target variable

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=10000)  # Adjust max_features as needed

# Fit and transform the text data
tfidf_matrix = tfidf_vectorizer.fit_transform(texts)

# Convert to DataFrame for easier manipulation
tfidf_df = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out()
)

# Add the category column to the TF-IDF DataFrame
tfidf_df['category'] = categories

# Save the TF-IDF DataFrame for future reference
tfidf_vectorized_file = 'tfidf_vectorized.csv'
tfidf_df.to_csv(tfidf_vectorized_file, index=False)
print("TF-IDF vectorization completed and saved.")

# ====================
# Step 2: Train the Model
# ====================
# Separate features and target
X = tfidf_df.drop('category', axis=1)
y = tfidf_df['category']

# Set the threshold for underrepresented categories
threshold = 600

# Identify categories with less than threshold samples
category_counts = y.value_counts()
categories_to_resample = category_counts[category_counts < threshold].index

# Apply SMOTE only to underrepresented categories
smote = SMOTE(sampling_strategy={category: threshold for category in categories_to_resample}, random_state=21)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Split the resampled data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=21)

# Define the best hyperparameters directly
best_params = {'C': 10, 'penalty': 'l2', 'solver': 'saga'}
base_lr = LogisticRegression(random_state=42, max_iter=1000, **best_params)
best_lr = OneVsRestClassifier(base_lr)

# Train the model
best_lr.fit(X_train, y_train)

# Predict on the test set
y_pred = best_lr.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report Logistic Regression with OneVsRest and SMOTE:\n", classification_report(y_test, y_pred))

# Save the trained model and TF-IDF vectorizer
model_filename = 'logistic_regression_model.pkl'
vectorizer_filename = 'tfidf_vectorizer.pkl'
joblib.dump(best_lr, model_filename)
joblib.dump(tfidf_vectorizer, vectorizer_filename)
print(f"Model saved to {model_filename}")
print(f"TF-IDF vectorizer saved to {vectorizer_filename}")

# ====================
# Step 3: Load and Predict on New Data
# ====================
# Load the saved model and vectorizer
loaded_model = joblib.load(model_filename)
loaded_vectorizer = joblib.load(vectorizer_filename)
print("Model and vectorizer loaded successfully.")

# Example: Predict categories for new accounts
new_texts = ["This is an example bio or caption.", "Another profile description here."]  # Replace with actual text
new_data_matrix = loaded_vectorizer.transform(new_texts)
new_data_predictions = loaded_model.predict(new_data_matrix)

# Output predictions
print("Predicted Categories for New Data:", new_data_predictions)


TF-IDF vectorization completed and saved.
Accuracy: 0.8975

Classification Report Logistic Regression with OneVsRest and SMOTE:
                       precision    recall  f1-score   support

                 art       0.88      0.93      0.90       106
       entertainment       0.86      0.82      0.84       130
             fashion       0.88      0.90      0.89       135
                food       0.89      0.88      0.89       120
              gaming       0.98      1.00      0.99       124
health and lifestyle       0.77      0.67      0.72       118
    mom and children       0.97      0.98      0.97       121
              sports       0.99      0.99      0.99       101
                tech       0.88      0.93      0.90       124
              travel       0.88      0.88      0.88       121

            accuracy                           0.90      1200
           macro avg       0.90      0.90      0.90      1200
        weighted avg       0.90      0.90      0.90      1200





2. Using data from a `.dat` file, selectively taken from `processed_files.json`.
3. Save the result to `filtered_profiles.json`

In [1]:
import json

# ====================
# Load Processed Profiles
# ====================
processed_profiles_file = '../processed_profiles.json'  # Update with your file path
with open(processed_profiles_file, 'r', encoding='utf-8') as file:
    processed_profiles = json.load(file)

# ====================
# Load Usernames from .dat File
# ====================
usernames_file = 'test-classification-round3.dat'  # Update with your file path

# Read usernames from the .dat file
with open(usernames_file, 'r', encoding='utf-8') as file:
    usernames = set(line.strip() for line in file)  # Assuming one username per line

# ====================
# Filter Profiles by Usernames
# ====================
filtered_profiles = [
    profile for profile in processed_profiles if profile["username"] in usernames
]

# ====================
# Save Filtered Profiles
# ====================
filtered_profiles_file = 'filtered_profiles.json'
with open(filtered_profiles_file, 'w', encoding='utf-8') as file:
    json.dump(filtered_profiles, file, ensure_ascii=False, indent=4)

print(f"Filtered profiles saved to {filtered_profiles_file}")


Filtered profiles saved to filtered_profiles.json


4. Processing the `text` field from the data for prediction.
5. Saving predictions to `prediction-classification-round*.json`.

In [4]:
import json
import joblib

# ====================
# Load Filtered Profiles
# ====================
filtered_profiles_file = 'filtered_profiles.json'  # Update if necessary
with open(filtered_profiles_file, 'r', encoding='utf-8') as file:
    filtered_profiles = json.load(file)

# ====================
# Load the Saved Model and TF-IDF Vectorizer
# ====================
model_file = 'logistic_regression_model.pkl'  # Update if necessary
vectorizer_file = 'tfidf_vectorizer.pkl'  # Update if necessary
loaded_model = joblib.load(model_file)
loaded_vectorizer = joblib.load(vectorizer_file)

# ====================
# Prepare the Text Data for Prediction
# ====================
# Extract usernames and texts
usernames = [profile['username'] for profile in filtered_profiles]
texts = [profile['text'] for profile in filtered_profiles]

# Transform the texts using the loaded vectorizer
text_vectors = loaded_vectorizer.transform(texts)

# Predict labels using the loaded model
predicted_labels = loaded_model.predict(text_vectors)

# ====================
# Create the Output JSON Object
# ====================
# Map usernames to predicted labels
output = {usernames[i]: predicted_labels[i] for i in range(len(usernames))}

# Save the output to a JSON file
output_file = 'prediction-classification-round3.json'
with open(output_file, 'w', encoding='utf-8') as file:
    json.dump(output, file, ensure_ascii=False, indent=4)

print(f"Predictions saved to {output_file}")


Predictions saved to prediction-classification-round3.json
