In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.preprocessing import LabelEncoder
import requests
from bs4 import BeautifulSoup
import matplotlib as mpl
from sklearn import metrics
from sklearn import preprocessing


Preliminary Data Fetching, Cleaning, and Vectorization. This code also initiates the sdg assignment to each sentence in the array - enabling us to sort and organize each line.

In [2]:
data_dir = "/Users/christian fink/Math485/Math485_2/"
sdg_names = pd.read_csv(data_dir + "sdg_name_definition.csv")
text_file_name = "osdg-community-data-v2024-04-01.csv"
text_df = pd.read_csv(data_dir + text_file_name,sep = "\t",  quotechar='"')
text_df.drop(text_df.columns.values[0],axis = 1, inplace=True)
text_df = text_df.query("agreement > 0.5 and (labels_positive - labels_negative) > 2").reset_index(drop=True)
corpus = text_df.text
count_vectorizer = CountVectorizer(stop_words='english')
count_vectorizer.fit(corpus)
count_vector = count_vectorizer.transform(corpus).toarray() 
count_vector_df = pd.DataFrame(count_vector, columns=count_vectorizer.get_feature_names_out())
term_freq = pd.DataFrame({"term": count_vector_df.columns.values, "freq" : count_vector_df.sum(axis=0)})
term_freq.sort_values(by="freq", ascending=False)
sdg_num = text_df.sdg

SDG_CLASSIFIER takes in our data (corpus), what type of algorithm we want to use (classifier_algorithm), the vectorizer type (vectorizer_type), bigram or unigram (ngram_range), and the min_df value. It then splits the data into training and testing partitions, sorts based on the fed information, and then runs the selected classifier, returning the accuracy, recall, f1_score, and precision.

In [None]:
def sdg_classifier(corpus, classifier_algorithm, vectorizer_type='count', ngram_range=(1,1), stop_words='english', min_df=2):
    X_train, X_test, y_train, y_test = train_test_split(corpus, sdg_num, test_size=0.25, random_state=8)
    
    le = LabelEncoder()
    y_train = le.fit_transform(y_train)
    y_test = le.transform(y_test)
    
    if vectorizer_type == 'count':
        vectorizer = CountVectorizer(
            ngram_range=ngram_range, 
            stop_words=stop_words,
            min_df=min_df
        )
    else:
        vectorizer = TfidfVectorizer(
            ngram_range=ngram_range, 
            stop_words=stop_words,
            min_df=min_df
        )

    X_train_vector = vectorizer.fit_transform(X_train)
    X_test_vector = vectorizer.transform(X_test)

    if isinstance(classifier_algorithm, str):
        if classifier_algorithm.lower() == 'multinomialnb':
            clf = MultinomialNB()
        elif classifier_algorithm.lower() == 'mlp':
            clf = MLPClassifier(max_iter=5, random_state=8)
        elif classifier_algorithm.lower() == 'ridge':
            clf = RidgeClassifier(alpha=1, solver='auto', max_iter=5)
    else:
        clf = classifier_algorithm
    
    clf.fit(X_train_vector, y_train)
    y_pred = clf.predict(X_test_vector)
    precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')
    accuracy = accuracy_score(y_test, y_pred)
    
    return {
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'accuracy': accuracy,
    }


This cell contains all the configurations/combinations of vectorizer_type, ngram_range, and min_df we select. It then runs these selected configurations and appends the results into a table, printing a "B: " next to the best value in each column.

In [4]:
configurations = [
    {'vectorizer': 'count', 'ngram_range': (1, 1), 'min_df': 2},
    {'vectorizer': 'count', 'ngram_range': (2, 2), 'min_df': 2},
    {'vectorizer': 'count', 'ngram_range': (1, 2), 'min_df': 2},
    {'vectorizer': 'tfidf', 'ngram_range': (1, 1), 'min_df': 2},
    {'vectorizer': 'tfidf', 'ngram_range': (2, 2), 'min_df': 2},
    {'vectorizer': 'tfidf', 'ngram_range': (1, 2), 'min_df': 2}
]

results = []

for config in configurations:
    vectorizer_type = config['vectorizer']
    ngram_range = config['ngram_range']
    min_df = config['min_df']
    
    run1 = sdg_classifier(corpus, classifier_algorithm='multinomialnb', 
                           vectorizer_type=vectorizer_type, 
                           ngram_range=ngram_range, min_df=min_df)
    
    run2 = sdg_classifier(corpus, classifier_algorithm='mlp', 
                           vectorizer_type=vectorizer_type, 
                           ngram_range=ngram_range, min_df=min_df)
    
    run3 = sdg_classifier(corpus, classifier_algorithm='ridge', 
                           vectorizer_type=vectorizer_type, 
                           ngram_range=ngram_range, min_df=min_df)
    
    results.append({
        'vectorizer': vectorizer_type,
        'ngram_range': str(ngram_range),
        'min_df': min_df,
        'MultinomialNB_precision': run1['precision'],
        'MLP_precision': run2['precision'],
        'Ridge_precision': run3['precision'],
        'MultinomialNB_recall': run1['recall'],
        'MLP_recall': run2['recall'],
        'Ridge_recall': run3['recall'],
        'MultinomialNB_f1': run1['f1_score'],
        'MLP_f1': run2['f1_score'],
        'Ridge_f1': run3['f1_score'],
        'MultinomialNB_accuracy': run1['accuracy'],
        'MLP_accuracy': run2['accuracy'],
        'Ridge_accuracy': run3['accuracy']
    })

results_df = pd.DataFrame(results)
def highlight_best(s):
    return ["B: " + str(v) if v == s.max() else str(v) for v in s]

highlighted_df = results_df.copy()
metrics = ['precision', 'recall', 'f1', 'accuracy']
for metric in metrics:
    highlighted_df[[f'MultinomialNB_{metric}', f'MLP_{metric}', f'Ridge_{metric}']] = highlighted_df[[f'MultinomialNB_{metric}', f'MLP_{metric}', f'Ridge_{metric}']].apply(highlight_best)

highlighted_df




Unnamed: 0,vectorizer,ngram_range,min_df,MultinomialNB_precision,MLP_precision,Ridge_precision,MultinomialNB_recall,MLP_recall,Ridge_recall,MultinomialNB_f1,MLP_f1,Ridge_f1,MultinomialNB_accuracy,MLP_accuracy,Ridge_accuracy
0,count,"(1, 1)",2,B: 0.8494199484561036,0.8784494044251351,0.8118893628429225,B: 0.8481604342581424,0.8784680337756333,0.8098612786489746,B: 0.8456923177718398,0.8778208064328132,0.8098842263592053,B: 0.8481604342581424,0.8784680337756333,0.8098612786489746
1,count,"(2, 2)",2,0.8140283992558922,0.8238964243693374,0.7922129944947116,0.8097104945717732,0.8223763570566948,0.794481302774427,0.8035533293809642,0.8188390950274772,0.7908925083612142,0.8097104945717732,0.8223763570566948,0.794481302774427
2,count,"(1, 2)",2,0.8411318516598572,B: 0.890916419587095,0.8738489843174359,0.8297647768395657,B: 0.8911338962605548,0.8745476477683957,0.8197638370231969,B: 0.8903803653272887,0.8731847549677968,0.8297647768395657,B: 0.8911338962605548,0.8745476477683957
3,tfidf,"(1, 1)",2,0.8111105755736483,0.8824680105743974,0.8801440619074978,0.7454764776839565,0.882388419782871,0.8817852834740652,0.7230167700650596,0.8813329753768443,0.8798864388908068,0.7454764776839565,0.882388419782871,0.8817852834740652
4,tfidf,"(2, 2)",2,0.7797434892619032,0.8304058821227179,0.8212857016687594,0.6949638118214716,0.8300663449939686,0.821773220747889,0.6652111610792425,0.8265588914674353,0.8177049415636208,0.6949638118214716,0.8300663449939686,0.821773220747889
5,tfidf,"(1, 2)",2,0.8011519053176797,0.8900795592839371,B: 0.8886074919564034,0.7003920386007237,0.8905307599517491,B: 0.889475271411339,0.671296226355045,0.8893836734667491,B: 0.8874442666981663,0.7003920386007237,0.8905307599517491,B: 0.889475271411339


In [5]:
def extract_text_from_url(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    paragraphs = [p.get_text(strip=True) for p in soup.find_all('p') if p and p.get_text(strip=True)]
    full_text = ' '.join(paragraphs)
    sentences = [s.strip() for s in full_text.split('.') if s.strip()]
    return sentences

urls = [
    "http://gianttortoise.org/en/beyond-tracking",
    "https://www.dhs.gov/blue-campaign/what-human-trafficking",
    "https://www.dol.gov/agencies/odep/program-areas/individuals/older-workers",
    "https://michigantoday.umich.edu/2022/08/26/positively-breaking-the-age-code/"
]

all_sentences = []
for url in urls:
    all_sentences.extend(extract_text_from_url(url))

corpus2_df = pd.DataFrame({'text': all_sentences})

corpus2 = corpus2_df['text']
sdg_numbers = [1,2,8,10]
corpus2_length = len(corpus2)
repeated_sdg2 = (sdg_numbers * (corpus2_length // len(sdg_numbers) + 1))[:corpus2_length]
sdg_num2 = pd.Series(repeated_sdg2, name="sdg")


In [6]:
def sdg_classifier2(corpus, classifier_algorithm, vectorizer_type='count', ngram_range=(1,1), stop_words='english', min_df=2):
    X_train, X_test, y_train, y_test = train_test_split(corpus, sdg_num2, test_size=0.25, random_state=8)
    
    le = LabelEncoder()
    y_train = le.fit_transform(y_train)
    y_test = le.transform(y_test)
    
    if vectorizer_type == 'count':
        vectorizer = CountVectorizer(
            ngram_range=ngram_range, 
            stop_words=stop_words,
            min_df=min_df
        )
    else:
        vectorizer = TfidfVectorizer(
            ngram_range=ngram_range, 
            stop_words=stop_words,
            min_df=min_df
        )

    X_train_vector = vectorizer.fit_transform(X_train)
    X_test_vector = vectorizer.transform(X_test)

    if isinstance(classifier_algorithm, str):
        if classifier_algorithm.lower() == 'multinomialnb':
            clf = MultinomialNB()
        elif classifier_algorithm.lower() == 'mlp':
            clf = MLPClassifier(max_iter=25, random_state=8)
        elif classifier_algorithm.lower() == 'ridge':
            clf = RidgeClassifier(alpha=1, solver='auto', max_iter=50) 
        else:
            raise ValueError(f"Unsupported classifier: {classifier_algorithm}")
    else:
        clf = classifier_algorithm
    
    clf.fit(X_train_vector, y_train)
    y_pred = clf.predict(X_test_vector)
    precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')
    accuracy = accuracy_score(y_test, y_pred)
    
    return {
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'accuracy': accuracy,
    }


In [7]:
configurations = [
    {'vectorizer': 'count', 'ngram_range': (1, 1), 'min_df': 3},
    {'vectorizer': 'count', 'ngram_range': (2, 2), 'min_df': 3},
    {'vectorizer': 'count', 'ngram_range': (1, 2), 'min_df': 3},
    {'vectorizer': 'tfidf', 'ngram_range': (1, 1), 'min_df': 3},
    {'vectorizer': 'tfidf', 'ngram_range': (2, 2), 'min_df': 3},
    {'vectorizer': 'tfidf', 'ngram_range': (1, 2), 'min_df': 3}
]

results = []
for config in configurations:
    vectorizer_type = config['vectorizer']
    ngram_range = config['ngram_range']
    min_df = config['min_df']
    
    run1 = sdg_classifier2(corpus2, classifier_algorithm='multinomialnb', 
                           vectorizer_type=vectorizer_type, 
                           ngram_range=ngram_range, min_df=min_df)
    
    run2 = sdg_classifier2(corpus2, classifier_algorithm='mlp', 
                           vectorizer_type=vectorizer_type, 
                           ngram_range=ngram_range, min_df=min_df)
    
    run3 = sdg_classifier2(corpus2, classifier_algorithm='ridge', 
                           vectorizer_type=vectorizer_type, 
                           ngram_range=ngram_range, min_df=min_df)
    
    results.append({
        'vectorizer': vectorizer_type,
        'ngram_range': str(ngram_range),
        'min_df': min_df,
        'MultinomialNB_precision': run1['precision'],
        'MLP_precision': run2['precision'],
        'Ridge_precision': run3['precision'],
        'MultinomialNB_recall': run1['recall'],
        'MLP_recall': run2['recall'],
        'Ridge_recall': run3['recall'],
        'MultinomialNB_f1': run1['f1_score'],
        'MLP_f1': run2['f1_score'],
        'Ridge_f1': run3['f1_score'],
        'MultinomialNB_accuracy': run1['accuracy'],
        'MLP_accuracy': run2['accuracy'],
        'Ridge_accuracy': run3['accuracy']
    })

results_df1 = pd.DataFrame(results)
def highlight_best(s):
    return ["B: " + str(v) if v == s.max() else str(v) for v in s]

highlighted_df1 = results_df1.copy()
metrics = ['precision', 'recall', 'f1', 'accuracy']
for metric in metrics:
    highlighted_df1[[f'MultinomialNB_{metric}', f'MLP_{metric}', f'Ridge_{metric}']] = highlighted_df1[[f'MultinomialNB_{metric}', f'MLP_{metric}', f'Ridge_{metric}']].apply(highlight_best)

highlighted_df1


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,vectorizer,ngram_range,min_df,MultinomialNB_precision,MLP_precision,Ridge_precision,MultinomialNB_recall,MLP_recall,Ridge_recall,MultinomialNB_f1,MLP_f1,Ridge_f1,MultinomialNB_accuracy,MLP_accuracy,Ridge_accuracy
0,count,"(1, 1)",3,0.2511671335200747,B: 0.23725981620718462,0.16819291819291818,0.2619047619047619,B: 0.23809523809523808,0.16666666666666666,0.24235827664399093,B: 0.22042380170458986,0.16402116402116404,0.2619047619047619,B: 0.23809523809523808,0.16666666666666666
1,count,"(2, 2)",3,B: 0.40111540111540106,0.03333333333333333,0.09555984555984558,0.21428571428571427,0.16666666666666666,B: 0.19047619047619047,0.13984533984533987,0.05555555555555556,0.09327286470143613,0.21428571428571427,0.16666666666666666,B: 0.19047619047619047
2,count,"(1, 2)",3,0.29466331987340394,0.08465608465608465,0.176984126984127,B: 0.2857142857142857,0.14285714285714285,B: 0.19047619047619047,B: 0.2719560806778852,0.10612244897959185,B: 0.17739403453689168,B: 0.2857142857142857,0.14285714285714285,B: 0.19047619047619047
3,tfidf,"(1, 1)",3,0.17354497354497353,0.223141186299081,B: 0.17735042735042736,0.16666666666666666,0.21428571428571427,0.16666666666666666,0.16488426034029172,0.19814554174160084,0.1696900982615268,0.16666666666666666,0.21428571428571427,0.16666666666666666
4,tfidf,"(2, 2)",3,0.11025311025311024,0.03418803418803419,0.09126984126984126,0.16666666666666666,0.16666666666666666,0.16666666666666666,0.08742368742368743,0.05673758865248227,0.0859616573902288,0.16666666666666666,0.16666666666666666,0.16666666666666666
5,tfidf,"(1, 2)",3,0.14047619047619048,0.08095238095238096,0.16666666666666666,0.14285714285714285,0.16666666666666666,B: 0.19047619047619047,0.13652765454007693,0.1054421768707483,0.17482993197278912,0.14285714285714285,0.16666666666666666,B: 0.19047619047619047
