## Project 3: Textual Data

#### Imports

In [57]:
import pandas as pd
import numpy as np
import seaborn as sns 
from matplotlib import pyplot as plt
import matplotlib as mpl
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline


from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import RidgeClassifier



#### Importing and Cleaning Dataset

In [58]:
sdg_names = pd.read_excel("/Users/aidan/Documents/applieddatascience/sample_code/Digital+Science+SDG+training+set+searches.xlsx")
sdg_names

Unnamed: 0.1,Unnamed: 0,Each tab contains the keywords/phrases used for generating the training set.,Unnamed: 2
0,,,
1,,,
2,Goal,Name,
3,1,No Poverty,End poverty in all its forms everywhere
4,2,Zero Hunger,"End hunger, achieve food security and improved..."
5,3,Good Health and Well Being,Ensure healthy lives and promote well-being fo...
6,4,Quality Education,Ensure inclusive and equitable quality educati...
7,5,Gender Equality,Achieve gender equality and empower all women ...
8,6,Clean Water and Sanitation,Ensure availability and sustainable management...
9,7,Affordable and Clean Energy,"Ensure access to affordable, reliable, sustain..."


In [59]:
sdg_names = sdg_names.drop([0,1,2], axis=0)
sdg_names = sdg_names.set_axis(["sdg", "sdg_name", "sdg_definition"],axis=1, copy=False)
sdg_names

Unnamed: 0,sdg,sdg_name,sdg_definition
3,1,No Poverty,End poverty in all its forms everywhere
4,2,Zero Hunger,"End hunger, achieve food security and improved..."
5,3,Good Health and Well Being,Ensure healthy lives and promote well-being fo...
6,4,Quality Education,Ensure inclusive and equitable quality educati...
7,5,Gender Equality,Achieve gender equality and empower all women ...
8,6,Clean Water and Sanitation,Ensure availability and sustainable management...
9,7,Affordable and Clean Energy,"Ensure access to affordable, reliable, sustain..."
10,8,Decent Work and Economic Growth,"Promote sustained, inclusive and sustainable e..."
11,9,"Industry, Innovation and Infrastructure","Build resilient infrastructure, promote inclus..."
12,10,Reduced Inequalities,Reduce inequality within and among countries


In [60]:
text_df = pd.read_csv("/Users/aidan/Documents/applieddatascience/sample_code/osdg-community-data-v2023-01-01.csv", sep = "\t")
text_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40062 entries, 0 to 40061
Data columns (total 1 columns):
 #   Column                                                          Non-Null Count  Dtype 
---  ------                                                          --------------  ----- 
 0   doi	text_id	text	sdg	labels_negative	labels_positive	agreement  40062 non-null  object
dtypes: object(1)
memory usage: 313.1+ KB


In [61]:
col_names = text_df.columns.values[0].split('\t')
text_df[col_names] = text_df[text_df.columns.values[0]].apply(lambda x: pd.Series(str(x).split("\t")))
text_df.drop(text_df.columns.values[0],axis = 1, inplace=True)
text_df = text_df.astype({'sdg':int, 'labels_negative': int, 'labels_positive':int, 'agreement': float}, copy=True)
text_df = text_df.query("agreement > 0.5 and (labels_positive - labels_negative) > 2")
text_df.reset_index(inplace=True)
text_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24669 entries, 0 to 24668
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   index            24669 non-null  int64  
 1   doi              24669 non-null  object 
 2   text_id          24669 non-null  object 
 3   text             24669 non-null  object 
 4   sdg              24669 non-null  int64  
 5   labels_negative  24669 non-null  int64  
 6   labels_positive  24669 non-null  int64  
 7   agreement        24669 non-null  float64
dtypes: float64(1), int64(4), object(3)
memory usage: 1.5+ MB


In [62]:
docs = text_df.text
categories = text_df.sdg
X_train, X_test, y_train, y_test = \
    train_test_split(docs, categories, test_size=0.33, random_state=7)

- This is the train-test split that will be used to test and evaluate different classifiers' ability to sort different documents into their corresponding SDG goals.

#### Classifier and Evaluation Table Functions

In [63]:
def classifier( text_df,estimators = [('myvec',CountVectorizer(ngram_range=(2,2),stop_words = "english", min_df = 5)),('my_clf',MultinomialNB())], ngram_range = (2,2), min_df = 5, stop_words = 'english'):
    X_train, X_test, y_train, y_test = \
        train_test_split(text_df.text, text_df.sdg, test_size=0.33, random_state=7)
    model = Pipeline(estimators)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    f1= metrics.f1_score(y_test,y_pred, average = 'weighted')
    accuracy = metrics.accuracy_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred, average = 'weighted')
    recall = metrics.recall_score(y_test, y_pred, average = 'weighted')
    return  precision, recall, f1, accuracy
                  
                  
    
        

- Classifier Function takes in a data frame, a vectorization method(tfidf or MyVectorizer), and a classification method(MultinomialNB, MLP, or Ridge) and outputs 4 parameters to describe this combination of vectorizer and classifier's abillity to correctly classify documents into their correct sdg type

In [64]:
def evaluation_table(text_df, vectorizer = 'count', ngr = (2,2), mdf = 5, sw = 'english'):
    if (vectorizer == 'count'):
        MNB = classifier(text_df,estimators = [('myvec',CountVectorizer(ngram_range= ngr,stop_words = sw, min_df = mdf))
                                 ,('my_clf',MultinomialNB())])
        MLP = classifier(text_df,estimators = [('myvec',CountVectorizer(ngram_range= ngr,stop_words = sw, min_df = mdf))
                                 ,('my_clf',MLPClassifier(max_iter = 10))])
        Ridge = classifier(text_df,estimators = [('myvec',CountVectorizer(ngram_range= ngr,stop_words = sw, min_df = mdf))
                                 ,('my_clf',RidgeClassifier())])
        
    elif (vectorizer == 'tfidf'):
        MNB = classifier(text_df,estimators = [('myvec',TfidfVectorizer(ngram_range= ngr,stop_words = sw, min_df = mdf))
                                   ,('my_clf',MultinomialNB())])
        MLP = classifier(text_df,estimators = [('myvec',TfidfVectorizer(ngram_range= ngr,stop_words = sw, min_df = mdf))
                                    ,('my_clf',MLPClassifier(max_iter = 10))])
        Ridge = classifier(text_df,estimators = [('myvec',TfidfVectorizer(ngram_range= ngr,stop_words = sw, min_df = mdf))
                                    ,('my_clf',RidgeClassifier())])
    else:
        MNB = ()
        MLP = ()
        Ridge = ()
        
    data = [MNB,MLP,Ridge]
    data = np.transpose(data)
    table = pd.DataFrame(data, index = ['Precision','Recall','F1','Accuracy'], columns =['MNB','MLP','Ridge'])
    return table
    

- Evaluation table takes in a text_df and a vectorizer type and returns a table of the four parameters for each of the three classifier types for the current setting. The vectorizer type, ngram range, and min_df can all be tweaked to change the results.

### Running the trials

- The following loop will run the evaluation_table on all different combinations between min_df values, ngram range, and vectorizer type and compile the results into one table

In [65]:
import itertools
import warnings 
from sklearn.exceptions import ConvergenceWarning

warnings.simplefilter(action = 'ignore', category=FutureWarning)
warnings.simplefilter(action = 'ignore', category=ConvergenceWarning)

vectorizers = ['tfidf', 'count']
classifiers = ['Ridge','MLP','MNB']
min_dfs = [3,4,5,6,7]
ngram_ranges =[(1,1),(2,2),(1,2)]

results_df = pd.DataFrame(columns=['ngram_range', 'min_df', 'Vectorizer','Classifier','Precision','Recall','F1','Accuracy'])

for v, m, n in itertools.product(vectorizers, min_dfs, ngram_ranges):
    eval_table = evaluation_table(text_df, vectorizer = v, mdf = m, ngr = n)
    eval_table = eval_table.transpose()
    for c in classifiers:
        precision = eval_table['Precision'][c]
        recall = eval_table['Recall'][c]
        f1 = eval_table['F1'][c]
        accuracy = eval_table['Accuracy'][c]
        results_df = results_df.append({
           'Vectorizer': v,
            'Classifier': c,
            'min_df': m,
            'ngram_range': n,
            'Precision': precision,
            'Recall': recall,
            'F1': f1,
            'Accuracy': accuracy
        }, ignore_index=True)
        
        

In [66]:
results_df.sort_values(by='Accuracy',ascending=False,inplace=True)
results_df = results_df.reset_index().drop(['index'],axis=1)


In [67]:
results_df.describe()


Unnamed: 0,Precision,Recall,F1,Accuracy
count,90.0,90.0,90.0,90.0
mean,0.822517,0.816665,0.809592,0.816665
std,0.058617,0.065309,0.072126,0.065309
min,0.679848,0.682594,0.667226,0.682594
25%,0.78018,0.76213,0.750293,0.76213
50%,0.814807,0.814949,0.813481,0.814949
75%,0.881308,0.883153,0.881532,0.883153
max,0.894404,0.895836,0.894717,0.895836


In [68]:
def highlight_top_row(s):
    if s.name == 0:
        return ['font-weight: bold'] * len(s)
    else:
        return [''] * len(s)

styled_df = results_df.style.apply(highlight_top_row, axis=1)



In [69]:
styled_df


Unnamed: 0,ngram_range,min_df,Vectorizer,Classifier,Precision,Recall,F1,Accuracy
0,"(1, 2)",3,tfidf,MLP,0.894404,0.895836,0.894717,0.895836
1,"(1, 2)",3,tfidf,Ridge,0.89321,0.895345,0.893385,0.895345
2,"(1, 2)",3,count,MLP,0.892817,0.894116,0.892931,0.894116
3,"(1, 2)",5,tfidf,MLP,0.892781,0.893993,0.893141,0.893993
4,"(1, 2)",4,tfidf,MLP,0.892413,0.893748,0.892776,0.893748
5,"(1, 2)",4,tfidf,Ridge,0.890924,0.893256,0.891224,0.893256
6,"(1, 2)",7,tfidf,MLP,0.890852,0.892274,0.89123,0.892274
7,"(1, 2)",5,tfidf,Ridge,0.888966,0.891537,0.889413,0.891537
8,"(1, 2)",4,count,MLP,0.890139,0.891414,0.890481,0.891414
9,"(1, 2)",6,tfidf,Ridge,0.888616,0.891045,0.889106,0.891045


- The top, bolded row is the configuration with the highest precision, recall, F1, and accuracy. The rest of the table is sorted by descending accuracy values.

- It would appear that by all four metrics used, the best configuration out of those tested is a min_df of 3, an n_gram range of (1,2), the tfidf vectorizer, and the MLP classifier.

#### Finding the most differentiating features

- Now the most effective configuration from earlier can be used to find the most differentiating features within the documents

In [70]:
X_train_tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2), stop_words = "english",min_df=7 )
X_train_tfidf_vectorizer.fit(X_train)
labels = X_train_tfidf_vectorizer.get_feature_names_out()

X_train_tfidf_vector = X_train_tfidf_vectorizer.transform(X_train)
X_test_tfidf_vector = X_train_tfidf_vectorizer.transform(X_test)

tfidf_MNB_clf = MultinomialNB().fit(X_train_tfidf_vector, y_train)




In [71]:
def most_informative_feature_for_class(vectorizer, classifier, classlabel, n=10):
    for labelid in classlabel:
        feature_names = vectorizer.get_feature_names_out()
        top_n = sorted(zip(classifier.feature_log_prob_[labelid], feature_names), reverse=True)[:n]
        for coef, feat in top_n:
            print("SDG {} : {:30}  {:.6}".format((labelid+1), feat, coef))
        print("")
        
        

In [78]:
most_informative_feature_for_class(X_train_tfidf_vectorizer,tfidf_MNB_clf, [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15], n = 20)



SDG 1 : poverty                         -5.31051
SDG 1 : income                          -6.22083
SDG 1 : poor                            -6.3083
SDG 1 : children                        -6.44797
SDG 1 : households                      -6.79279
SDG 1 : social                          -6.79298
SDG 1 : deprivation                     -6.80614
SDG 1 : child                           -6.80743
SDG 1 : countries                       -6.85516
SDG 1 : household                       -6.9804
SDG 1 : living                          -7.13178
SDG 1 : child poverty                   -7.17395
SDG 1 : population                      -7.24999
SDG 1 : rates                           -7.26227
SDG 1 : growth                          -7.28473
SDG 1 : cent                            -7.30973
SDG 1 : people                          -7.32112
SDG 1 : rate                            -7.3648
SDG 1 : families                        -7.38051
SDG 1 : line                            -7.39858

SDG 2 : food          

- The most differentiating features for each SDG can be seen above. There is some overlap between SDGs, for example 'rights' shoes up in SDG 5 and 16.