# Predicting Subjects From Articles Supervised
Written by Eric Detjen  

In [29]:
##imports
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import hstack
from sklearn.ensemble import RandomForestClassifier
from collections import defaultdict
from joblib import dump
from IPython.core.display import display, HTML
import pandas as pd
from IPython.display import display
from rubicon_ml import Rubicon
from datetime import datetime
import json
from colorama import Fore, Style
from dateutil.relativedelta import relativedelta
import csv
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report



  from IPython.core.display import display, HTML


In [30]:
rubicon = Rubicon(persistence="filesystem", root_dir="./rubicon-root")
project = rubicon.get_or_create_project("Article Classification")

# Log the experiment
experiment = project.log_experiment(
    model_name="Random Forest",
    tags=["text classification", "NLP"]
)


In [31]:
with open("train.json", "r") as file:
    data = json.load(file)

with open("test.json", "r") as file:
    final_test_data = json.load(file)

# Split the data into 80% for training and 20% for testing
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)




# Extract Data

In [32]:
max_features_abstract = 1000
stop_words = 'english'
max_features_title = 500
# Extracting month and year from the date for both train and test data
def extract_date_features(data):
    months = []
    years = []
    for article in data:
        article_date = datetime.strptime(article['date'], '%Y-%m-%dT%H:%M:%S.%fZ')
        months.append(article_date.month)
        years.append(article_date.year)
    return months, years


# Initialize the vectorizers and the label encoder
vectorizer_abstract = TfidfVectorizer(max_features= max_features_abstract, stop_words='english')
vectorizer_title = TfidfVectorizer(max_features= max_features_title, stop_words='english')
label_encoder = LabelEncoder()

# Extract date features for train and test data
train_months, train_years = extract_date_features(train_data)
test_months, test_years = extract_date_features(test_data)

# Using TF-IDF for abstract and title feature extraction for train and test data
train_abstract_features = vectorizer_abstract.fit_transform([article['abstract'] for article in train_data])
train_title_features = vectorizer_title.fit_transform([article['title'] for article in train_data])

test_abstract_features = vectorizer_abstract.transform([article['abstract'] for article in test_data])
test_title_features = vectorizer_title.transform([article['title'] for article in test_data])

# Combining all features
X_train = hstack([train_abstract_features, train_title_features, [[month, year] for month, year in zip(train_months, train_years)]])
X_test = hstack([test_abstract_features, test_title_features, [[month, year] for month, year in zip(test_months, test_years)]])

# Encoding the target variable 
y_train = label_encoder.fit_transform([article['subject'] for article in train_data])
y_test = label_encoder.transform([article['subject'] for article in test_data])


In [39]:
# Initializing and training the Random Forest classifier
n_estimators=200
random_state=92
verbose=1 
n_jobs=-1
rf_classifier = RandomForestClassifier(n_estimators = n_estimators, random_state = random_state, verbose = verbose, n_jobs = n_jobs)
rf_classifier.fit(X_train, y_train)

experiment = project.log_experiment(
    model_name="Random Forest",
    tags=["text classification", "NLP"]
)

# Log parameters dynamically from the trained RandomForestClassifier object
parameters_to_log = [
    "n_estimators", "max_depth", "min_samples_split", "min_samples_leaf",
    "min_weight_fraction_leaf", "max_features", "max_leaf_nodes", 
    "min_impurity_decrease", "min_impurity_split", "bootstrap", 
    "oob_score", "n_jobs", "random_state", "verbose", "warm_start"
]

for param_name in parameters_to_log:
    param_value = getattr(rf_classifier, param_name, "Not set")
    experiment.log_parameter(name=param_name, value=param_value)

model_path = "random_forest_model.joblib"
dump(rf_classifier, model_path)

#predicting the test data
y_pred = rf_classifier.predict(X_test)



[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed:   22.0s
[Parallel(n_jobs=-1)]: Done 180 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:  2.4min finished
[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    0.4s
[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed:    1.8s
[Parallel(n_jobs=10)]: Done 200 out of 200 | elapsed:    2.0s finished


Test the accuracy by comparing using 80% of the train data to train and then for the last 20% removing the subjects and comparing the models results on that against the true subject data for the same 20%

In [40]:
accuracy = accuracy_score(y_test, y_pred)
model_path, accuracy

experiment.log_metric(name="Accuracy_Score_2", value=accuracy)  

# Optionally, log additional text or configurations
#experiment.log_parameter(name="TF-IDF Max Features", value=f"{max_features_abstract} for abstract, {max_features_title} for title")


<rubicon_ml.client.metric.Metric at 0x2a7d14cd0>

In [48]:
decoded_predictions = label_encoder.inverse_transform(y_pred)

# Extract the actual labels from the test data
actual_labels = [article['subject'] for article in test_data]

 # Initialize dictionaries to keep track of all predictions and incorrect predictions for each unique "Actual" value.
total_predictions = defaultdict(int)
incorrect_predictions = defaultdict(lambda: defaultdict(int))

# Create a list to store rows for the DataFrame.
df_rows = []

# Populate the dictionaries with data.
for actual, predicted in results_comparison:
    total_predictions[actual] += 1
    if actual != predicted:
        incorrect_predictions[actual][predicted] += 1

# Add rows to the DataFrame list.
for actual, predicted_dict in incorrect_predictions.items():
    total = total_predictions[actual]
    correct = total - sum(predicted_dict.values())
    correct_percentage = (correct / total) * 100

    first_row = True
    for predicted, count in predicted_dict.items():
        if first_row:
            df_rows.append({
                'Actual': actual,
                'Predicted': predicted,
                'Count': count,
                'Correct Percentage': f"{correct_percentage:.2f}%",
                'Total Articles': total
            })
            first_row = False
        else:
            df_rows.append({
                'Actual': '',
                'Predicted': predicted,
                'Count': count,
                'Correct Percentage': '',
                'Total Articles': ''
            })

    # Add a separator row
    df_rows.append({
        'Actual': '',
        'Predicted': '',
        'Count': '',
        'Correct Percentage': '',
        'Total Articles': ''
    })

# Create a DataFrame from the list of rows.
df = pd.DataFrame(df_rows)

# Display the DataFrame.
display(df)

Unnamed: 0,Actual,Predicted,Count,Correct Percentage,Total Articles
0,quantum physics,mesoscale and nanoscale physics,193,32.22%,841
1,,quantum gases,67,,
2,,materials science,30,,
3,,superconductivity,30,,
4,,disordered systems and neural networks,4,,
...,...,...,...,...,...
302,,strongly correlated electrons,9,,
303,,mesoscale and nanoscale physics,3,,
304,,disordered systems and neural networks,1,,
305,,high energy physics - theory,3,,


Display the rubicon data

In [52]:
# Get the project

# Loop through the experiments and print details
for experiment in project.experiments():
    print(f"Experiment ID: {experiment.id}")
    print(f"Model Name: {experiment.model_name}")
    print("Parameters:")
    for param in experiment.parameters():
        print(f"  - {param.name}: {param.value}")
    print("Metrics:")
    for metric in experiment.metrics():
        print(f"  - {metric.name}: {metric.value}")
    print("------")


Experiment ID: f967a29a-0c03-4270-8a70-766802a02193
Model Name: Random Forest
Parameters:
Metrics:
------
Experiment ID: 08961d24-105a-415f-b78f-40bdc4e03a5e
Model Name: Random Forest
Parameters:
Metrics:
------
Experiment ID: 924925f4-ee26-4b89-b27e-6631c79de01d
Model Name: Random Forest
Parameters:
  - n_estimators: 200
  - max_depth: None
  - min_samples_split: 2
  - min_samples_leaf: 1
  - min_weight_fraction_leaf: 0.0
  - max_features: sqrt
  - max_leaf_nodes: None
  - min_impurity_decrease: 0.0
  - min_impurity_split: Not set
  - bootstrap: True
  - oob_score: False
  - n_jobs: -1
  - random_state: 92
  - verbose: 1
  - warm_start: False
Metrics:
  - Accuracy: 0.6119800521364616
  - Accuracy_Score: 0.6119800521364616
  - New_Accuracy_Score: 0.6119800521364616
  - Accuracy_Score_1: 0.6119800521364616
  - Accuracy_Score_2: 0.6119800521364616
------
Experiment ID: 6ed79741-4d0b-49b4-a1be-d2f7cd025851
Model Name: Random Forest
Parameters:
Metrics:
------
Experiment ID: 1a36363e-6313-

In [49]:
#aggigate the results

predictions_by_month = defaultdict(lambda: defaultdict(int))
for i, article in enumerate(test_data):
    article_date = datetime.strptime(article['date'], '%Y-%m-%dT%H:%M:%S.%fZ')
    month_year_key = (article_date.year, article_date.month)
    subject = label_encoder.inverse_transform([y_pred[i]])[0]
    predictions_by_month[month_year_key][subject] += 1

predictions_by_month

defaultdict(<function __main__.<lambda>()>,
            {(2013,
              4): defaultdict(int,
                         {'mesoscale and nanoscale physics': 62,
                          'materials science': 56,
                          'superconductivity': 40,
                          'soft condensed matter': 16,
                          'strongly correlated electrons': 38,
                          'quantum physics': 5,
                          'quantum gases': 12,
                          'statistical mechanics': 41,
                          'high energy physics - theory': 1}),
             (1998,
              12): defaultdict(int,
                         {'mesoscale and nanoscale physics': 20,
                          'strongly correlated electrons': 16,
                          'statistical mechanics': 33,
                          'condensed matter': 8,
                          'soft condensed matter': 5,
                          'superconductivity': 10,
          