In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.preprocessing import LabelEncoder
import requests
from bs4 import BeautifulSoup
import matplotlib as mpl
from sklearn import metrics
from sklearn import preprocessing


Preliminary Data Fetching, Cleaning, and Vectorization. This code also initiates the sdg assignment to each sentence in the array - enabling us to sort and organize each line.

In [2]:
data_dir = "/Users/christian fink/Math485/Math485_2/"
sdg_names = pd.read_csv(data_dir + "sdg_name_definition.csv")
text_file_name = "osdg-community-data-v2024-04-01.csv"
text_df = pd.read_csv(data_dir + text_file_name,sep = "\t",  quotechar='"')
text_df.drop(text_df.columns.values[0],axis = 1, inplace=True)
text_df = text_df.query("agreement > 0.5 and (labels_positive - labels_negative) > 2").reset_index(drop=True)
corpus = text_df.text
count_vectorizer = CountVectorizer(stop_words='english')
count_vectorizer.fit(corpus)
count_vector = count_vectorizer.transform(corpus).toarray() 
count_vector_df = pd.DataFrame(count_vector, columns=count_vectorizer.get_feature_names_out())
term_freq = pd.DataFrame({"term": count_vector_df.columns.values, "freq" : count_vector_df.sum(axis=0)})
term_freq.sort_values(by="freq", ascending=False)
sdg_num = text_df.sdg

SDG_CLASSIFIER takes in our data (corpus), what type of algorithm we want to use (classifier_algorithm), the vectorizer type (vectorizer_type), bigram or unigram (ngram_range), and the min_df value. It then splits the data into training and testing partitions, sorts based on the fed information, and then runs the selected classifier, returning the accuracy, recall, f1_score, and precision.

In [3]:
def sdg_classifier(corpus, classifier_algorithm, vectorizer_type='count', ngram_range=(1,1), stop_words='english', min_df=2):
    X_train, X_test, y_train, y_test = train_test_split(corpus, sdg_num, test_size=0.25, random_state=8)
    
    le = LabelEncoder()
    y_train = le.fit_transform(y_train)
    y_test = le.transform(y_test)
    
    if vectorizer_type == 'count':
        vectorizer = CountVectorizer(
            ngram_range=ngram_range, 
            stop_words=stop_words,
            min_df=min_df
        )
    else:
        vectorizer = TfidfVectorizer(
            ngram_range=ngram_range, 
            stop_words=stop_words,
            min_df=min_df
        )

    X_train_vector = vectorizer.fit_transform(X_train)
    X_test_vector = vectorizer.transform(X_test)

    if isinstance(classifier_algorithm, str):
        if classifier_algorithm.lower() == 'multinomialnb':
            clf = MultinomialNB()
        elif classifier_algorithm.lower() == 'mlp':
            clf = MLPClassifier(max_iter=5, random_state=8)
        elif classifier_algorithm.lower() == 'ridge':
            clf = RidgeClassifier(alpha=1, solver='auto', max_iter=5)
    else:
        clf = classifier_algorithm
    
    clf.fit(X_train_vector, y_train)
    y_pred = clf.predict(X_test_vector)
    precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')
    accuracy = accuracy_score(y_test, y_pred)
    
    return {
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'accuracy': accuracy,
    }


This cell contains all the configurations/combinations of vectorizer_type, ngram_range, and min_df we select. It then runs these selected configurations and appends the results into a table, printing a "B: " next to the best value in each column.

In [4]:
configurations = [
    {'vectorizer': 'count', 'ngram_range': (1, 1), 'min_df': 2},
    {'vectorizer': 'count', 'ngram_range': (2, 2), 'min_df': 2},
    {'vectorizer': 'count', 'ngram_range': (1, 2), 'min_df': 2},
    {'vectorizer': 'tfidf', 'ngram_range': (1, 1), 'min_df': 2},
    {'vectorizer': 'tfidf', 'ngram_range': (2, 2), 'min_df': 2},
    {'vectorizer': 'tfidf', 'ngram_range': (1, 2), 'min_df': 2}
]

results = []

for config in configurations:
    vectorizer_type = config['vectorizer']
    ngram_range = config['ngram_range']
    min_df = config['min_df']
    
    run1 = sdg_classifier(corpus, classifier_algorithm='multinomialnb', 
                           vectorizer_type=vectorizer_type, 
                           ngram_range=ngram_range, min_df=min_df)
    
    run2 = sdg_classifier(corpus, classifier_algorithm='mlp', 
                           vectorizer_type=vectorizer_type, 
                           ngram_range=ngram_range, min_df=min_df)
    
    run3 = sdg_classifier(corpus, classifier_algorithm='ridge', 
                           vectorizer_type=vectorizer_type, 
                           ngram_range=ngram_range, min_df=min_df)
    
    results.append({
        'vectorizer': vectorizer_type,
        'ngram_range': str(ngram_range),
        'min_df': min_df,
        'MultinomialNB_precision': run1['precision'],
        'MLP_precision': run2['precision'],
        'Ridge_precision': run3['precision'],
        'MultinomialNB_recall': run1['recall'],
        'MLP_recall': run2['recall'],
        'Ridge_recall': run3['recall'],
        'MultinomialNB_f1': run1['f1_score'],
        'MLP_f1': run2['f1_score'],
        'Ridge_f1': run3['f1_score'],
        'MultinomialNB_accuracy': run1['accuracy'],
        'MLP_accuracy': run2['accuracy'],
        'Ridge_accuracy': run3['accuracy']
    })

results_df = pd.DataFrame(results)
def highlight_best(s):
    return ["B: " + str(v) if v == s.max() else str(v) for v in s]

highlighted_df = results_df.copy()
metrics = ['precision', 'recall', 'f1', 'accuracy']
for metric in metrics:
    highlighted_df[[f'MultinomialNB_{metric}', f'MLP_{metric}', f'Ridge_{metric}']] = highlighted_df[[f'MultinomialNB_{metric}', f'MLP_{metric}', f'Ridge_{metric}']].apply(highlight_best)

highlighted_df




Unnamed: 0,vectorizer,ngram_range,min_df,MultinomialNB_precision,MLP_precision,Ridge_precision,MultinomialNB_recall,MLP_recall,Ridge_recall,MultinomialNB_f1,MLP_f1,Ridge_f1,MultinomialNB_accuracy,MLP_accuracy,Ridge_accuracy
0,count,"(1, 1)",2,B: 0.8494199484561036,0.8784494044251351,0.8118893628429225,B: 0.8481604342581424,0.8784680337756333,0.8098612786489746,B: 0.8456923177718398,0.8778208064328132,0.8098842263592053,B: 0.8481604342581424,0.8784680337756333,0.8098612786489746
1,count,"(2, 2)",2,0.8140283992558922,0.8238964243693374,0.7922129944947116,0.8097104945717732,0.8223763570566948,0.794481302774427,0.8035533293809642,0.8188390950274772,0.7908925083612142,0.8097104945717732,0.8223763570566948,0.794481302774427
2,count,"(1, 2)",2,0.8411318516598572,B: 0.890916419587095,0.8738489843174359,0.8297647768395657,B: 0.8911338962605548,0.8745476477683957,0.8197638370231969,B: 0.8903803653272887,0.8731847549677968,0.8297647768395657,B: 0.8911338962605548,0.8745476477683957
3,tfidf,"(1, 1)",2,0.8111105755736483,0.8824680105743974,0.8801440619074978,0.7454764776839565,0.882388419782871,0.8817852834740652,0.7230167700650596,0.8813329753768443,0.8798864388908068,0.7454764776839565,0.882388419782871,0.8817852834740652
4,tfidf,"(2, 2)",2,0.7797434892619032,0.8304058821227179,0.8212857016687594,0.6949638118214716,0.8300663449939686,0.821773220747889,0.6652111610792425,0.8265588914674353,0.8177049415636208,0.6949638118214716,0.8300663449939686,0.821773220747889
5,tfidf,"(1, 2)",2,0.8011519053176797,0.8900795592839371,B: 0.8886074919564034,0.7003920386007237,0.8905307599517491,B: 0.889475271411339,0.671296226355045,0.8893836734667491,B: 0.8874442666981663,0.7003920386007237,0.8905307599517491,B: 0.889475271411339


Provided Links  
With min_df = 3

| Bigram with Count Vectors     | MultinomialNB | MLP  | Ridge |
|:------------------------------|:--------------|:-----|:------|
| Precision                     | .814          | **.824** | .792  |
| Recall                        | .810          | **.822** | .794  |
| F1                            | .804          | **.819** | .791  |
| Accuracy                      | .810          | **.822** | .794  |
| Unigram with Count Vectors    |               |      |       |
| Precision                     | .849          | **.878** | .811  |
| Recall                        | .848          | **.878** | .810  |
| F1                            | .846          | **.879** | .810  |
| Accuracy                      | .848          | **.878** | .810  |
| Mixed Gram with Count Vectors |               |      |       |
| Precision                     | .841          | **.891** | .874  |
| Recall                        | .830          | **.891** | .875  |
| F1                            | .820          | **.890** | .873  |
| Accuracy                      | .830          | **.891** | .875  |
| Bigram with tfidf Vectors     |               |      |       |
| Precision                     | .780          | **.830** | .821  |
| Recall                        | .695          | **.830** | .821  |
| F1                            | .665          | **.827** | .818  |
| Accuracy                      | .695          | **.830** | .822  |
| Unigram with tfidf Vectors    |               |      |       |
| Precision                     | .811          | **.882** | .880  |
| Recall                        | .745          | **.882** | **.882**  |
| F1                            | .723          | **.881** | .880  |
| Accuracy                      | .745          | **.882** | **.882**  |
| Mixed Gram with tfidf Vectors |               |      |       |
| Precision                     | .801          | **.890** | .889  |
| Recall                        | .700          | **.891** | .889  |
| F1                            | .671          | **.890** | .887  |
| Accuracy                      | .700          | **.891** | .889  |

In [20]:
def extract_text_from_url(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    paragraphs = [p.get_text(strip=True) for p in soup.find_all('p') if p and p.get_text(strip=True)]
    full_text = ' '.join(paragraphs)
    sentences = [s.strip() for s in full_text.split('.') if s.strip()]
    return sentences

urls = [
    "http://gianttortoise.org/en/beyond-tracking",
    "https://www.dhs.gov/blue-campaign/what-human-trafficking",
    "https://www.dol.gov/agencies/odep/program-areas/individuals/older-workers",
    "https://michigantoday.umich.edu/2022/08/26/positively-breaking-the-age-code/"
]

all_sentences = []
for url in urls:
    all_sentences.extend(extract_text_from_url(url))
corpus2_df = pd.DataFrame({'text': all_sentences})
corpus2_df['sdg'] = sdg_num[:len(corpus2_df)]
corpus2 = corpus2_df['text']
sdg_num_corpus2 = corpus2_df['sdg']

In [None]:
def sdg_classifier2(train_corpus, train_label, test_corpus=None, test_label=None, classifier_algorithm='multinomialnb', vectorizer_type='count', ngram_range=(1,1), stop_words='english', min_df=2):

    le = LabelEncoder()
    y_train = le.fit_transform(train_label)
    y_test = le.transform(test_label)
    
    if vectorizer_type == 'count':
        vectorizer = CountVectorizer(
            ngram_range=ngram_range, 
            stop_words=stop_words,
            min_df=min_df
        )
    else:
        vectorizer = TfidfVectorizer(
            ngram_range=ngram_range, 
            stop_words=stop_words,
            min_df=min_df
        )

    X_train_vector = vectorizer.fit_transform(train_corpus)
    X_test_vector = vectorizer.transform(test_corpus) 

    if isinstance(classifier_algorithm, str):
        if classifier_algorithm.lower() == 'multinomialnb':
            clf = MultinomialNB()
        elif classifier_algorithm.lower() == 'mlp':
            clf = MLPClassifier(max_iter=5, random_state=8)
        elif classifier_algorithm.lower() == 'ridge':
            clf = RidgeClassifier(alpha=1, solver='auto', max_iter=5)
    else:
        clf = classifier_algorithm
    
    clf.fit(X_train_vector, y_train)
    
    y_pred = clf.predict(X_test_vector)
    precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')
    accuracy = accuracy_score(y_test, y_pred)
        
    return {
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'accuracy': accuracy,
        }


In [22]:
configurations = [
    {'vectorizer': 'count', 'ngram_range': (1, 1), 'min_df': 3},
    {'vectorizer': 'count', 'ngram_range': (2, 2), 'min_df': 3},
    {'vectorizer': 'count', 'ngram_range': (1, 2), 'min_df': 3},
    {'vectorizer': 'tfidf', 'ngram_range': (1, 1), 'min_df': 3},
    {'vectorizer': 'tfidf', 'ngram_range': (2, 2), 'min_df': 3},
    {'vectorizer': 'tfidf', 'ngram_range': (1, 2), 'min_df': 3}
]

results = []
for config in configurations:
    vectorizer_type = config['vectorizer']
    ngram_range = config['ngram_range']
    min_df = config['min_df']
    
    
    run1 = sdg_classifier2(train_corpus = corpus, test_corpus = corpus2, train_label = sdg_num, test_label = sdg_num_corpus2, classifier_algorithm='multinomialnb', 
                           vectorizer_type=vectorizer_type, 
                           ngram_range=ngram_range, min_df=min_df)
    
    run2 = sdg_classifier2(train_corpus = corpus, test_corpus = corpus2, train_label = sdg_num, test_label = sdg_num_corpus2, classifier_algorithm='mlp', 
                           vectorizer_type=vectorizer_type, 
                           ngram_range=ngram_range, min_df=min_df)
    
    run3 = sdg_classifier2(train_corpus = corpus, test_corpus = corpus2, train_label = sdg_num, test_label = sdg_num_corpus2, classifier_algorithm='ridge', 
                           vectorizer_type=vectorizer_type, 
                           ngram_range=ngram_range, min_df=min_df)
    
    results.append({
        'vectorizer': vectorizer_type,
        'ngram_range': str(ngram_range),
        'min_df': min_df,
        'MultinomialNB_precision': run1['precision'],
        'MLP_precision': run2['precision'],
        'Ridge_precision': run3['precision'],
        'MultinomialNB_recall': run1['recall'],
        'MLP_recall': run2['recall'],
        'Ridge_recall': run3['recall'],
        'MultinomialNB_f1': run1['f1_score'],
        'MLP_f1': run2['f1_score'],
        'Ridge_f1': run3['f1_score'],
        'MultinomialNB_accuracy': run1['accuracy'],
        'MLP_accuracy': run2['accuracy'],
        'Ridge_accuracy': run3['accuracy']
    })

results_df1 = pd.DataFrame(results)
def highlight_best(s):
    return ["B: " + str(v) if v == s.max() else str(v) for v in s]

highlighted_df1 = results_df1.copy()
metrics = ['precision', 'recall', 'f1', 'accuracy']
for metric in metrics:
    highlighted_df1[[f'MultinomialNB_{metric}', f'MLP_{metric}', f'Ridge_{metric}']] = highlighted_df1[[f'MultinomialNB_{metric}', f'MLP_{metric}', f'Ridge_{metric}']].apply(highlight_best)

highlighted_df1


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

Unnamed: 0,vectorizer,ngram_range,min_df,MultinomialNB_precision,MLP_precision,Ridge_precision,MultinomialNB_recall,MLP_recall,Ridge_recall,MultinomialNB_f1,MLP_f1,Ridge_f1,MultinomialNB_accuracy,MLP_accuracy,Ridge_accuracy
0,count,"(1, 1)",3,0.05396105056383803,0.03668931514913658,B: 0.05352315091493781,0.07142857142857142,0.07142857142857142,B: 0.1130952380952381,0.060007320988023756,0.04824553206669211,B: 0.06649050161635975,0.07142857142857142,0.07142857142857142,B: 0.1130952380952381
1,count,"(2, 2)",3,0.04445342493433017,0.03769743529595746,0.050066400016796846,0.05952380952380952,0.07142857142857142,0.08333333333333333,0.04259891728729079,0.038789985035946835,0.05379033176152024,0.05952380952380952,0.07142857142857142,0.08333333333333333
2,count,"(1, 2)",3,0.049274757980115116,0.04662047879260994,0.04607229601557452,0.07738095238095238,0.07738095238095238,0.08333333333333333,0.057639890973224306,0.05266789732770746,0.0552242528074473,0.07738095238095238,0.07738095238095238,0.08333333333333333
3,tfidf,"(1, 1)",3,0.049531615925058545,B: 0.084771465301184,0.047831632653061215,0.09523809523809523,B: 0.09523809523809523,0.05952380952380952,0.06282209287435768,B: 0.07881089359812764,0.05092828798185941,0.09523809523809523,B: 0.09523809523809523,0.05952380952380952
4,tfidf,"(2, 2)",3,0.05813524273345702,0.04402281746031746,0.027990362811791385,0.08928571428571429,0.07738095238095238,0.06547619047619048,0.05711922247839348,0.04519551789055335,0.03333459621472351,0.08928571428571429,0.07738095238095238,0.06547619047619048
5,tfidf,"(1, 2)",3,B: 0.06312076487515084,0.055093698571959435,0.05053571428571429,B: 0.11904761904761904,0.07142857142857142,0.07142857142857142,B: 0.07800342359165888,0.0599373840445269,0.05848326172574293,B: 0.11904761904761904,0.07142857142857142,0.07142857142857142


Provided Links with min_df = 3

| Bigram with Count Vectors     | MultinomialNB | MLP      | Ridge    |
|:------------------------------|:--------------|:---------|:---------|
| Precision                     | **.401**    | .091     | .100     |
| Recall                        | **.214**    | .167     | .190     |
| F1                            | **.140**    | .060     | .093     |
| Accuracy                      | **.214**    | .167     | .190     |
| Unigram with Count Vectors    |             |          |          |
| Precision                     | **.251**    | .237     | .168     |
| Recall                        | **.262**    | .238     | .167     |
| F1                            | **.242**    | .220     | .164     |
| Accuracy                      | **.262**    | .238     | .167     |
| Mixed Gram with Count Vectors |             |          |          |
| Precision                     | **.295**    | .085     | .177     |
| Recall                        | **.286**    | .143     | .190     |
| F1                            | **.272**    | .106     | .177     |
| Accuracy                      | **.286**    | .143     | .190     |
| Bigram with tfidf Vectors     |             |          |          |
| Precision                     | **.110**    | .034     | .091     |
| Recall                        | **.167**    | **.167** | **.167** |
| F1                            | .**087**    | .057     | .086     |
| Accuracy                      | **.167**    | **.167** | **.167** |
| Unigram with tfidf Vectors    |             |          |          |
| Precision                     | .174        | **.223** | .177     |
| Recall                        | .167        | **.214** | .167     |
| F1                            | .165        | **.198** | .170     |
| Accuracy                      | .167        | **.214** | .167     |
| Mixed Gram with tfidf Vectors |             |          |          |
| Precision                     | .140        | .081     | .**167** |
| Recall                        | .143        | .167     | **.190** |
| F1                            | .137        | .105     | **.175** |
| Accuracy                      | .143        | .167     | **.190** |

**Bigram with Count Vectors (MultinomialNB):** This model performs best with high precision (0.401) but low recall (0.214). The F1 score is 0.140, indicating a moderate trade-off between precision and recall. The accuracy is also relatively low at 0.214.   
  
**Unigram with Count Vectors (MultinomialNB):** Precision is lower at 0.251, with recall at 0.262, giving an F1 score of 0.242 and accuracy of 0.262. This is an improvement over the bigram model. 
    
**Mixed Gram with Count Vectors (MultinomialNB):** Precision is still relatively good (0.295), with recall at 0.286. This model has a decent F1 score of 0.272 and accuracy of 0.286.  
   
**Bigram with TF-IDF Vectors:** The precision (0.110) and F1 score (0.087) are very low here, with accuracy also very low (0.167). The model struggles to differentiate well between the classes.  
  
**Unigram with TF-IDF Vectors:** Precision (0.174) and recall (0.167) are low, but F1 is somewhat better (0.165), with accuracy of 0.167.  
  
**Mixed Gram with TF-IDF Vectors:** Precision is low (0.140), recall is 0.143, and accuracy is 0.143, showing that the model isnâ€™t able to effectively classify the text in this setup.

 Assignment: Take the main text content from these pages, and feed them into your classifier and see how your model classifies them. Are the classifications reasonable? find a case where your classification is not reasonable and explain what the model does that leads to the not ideal classification.
 ------------------------------------------------------------------------------------------------------------------------------------------------