# Automatic classification

In [1]:
import pandas as pd

dataset = pd.read_excel("dataset.xlsx")

# Semantic Scholar abstract do not include the title -> concatenation of title and abstract
dataset.loc[dataset['source'] == 'semantic_scholar', 'full_text'] = dataset.loc[dataset['source'] == 'semantic_scholar', 'title'] + ' ' + dataset.loc[dataset['source'] == 'semantic_scholar', 'full_text']
len(dataset.index)

180

## Text cleaning

In [2]:
import re
def clean(x) :
    x = str(x)
    reg = r'<[^>]*>'
    x = re.sub(reg, ' ', x)
    for y in ['_x000D_\n', '\xa0', '\t'] :
        x = re.sub(y, ' ', x)   
    x = ' '.join(x.split())
    return(x)

dataset['full_text_clean'] = dataset['full_text'].apply(lambda x:clean(x))

## Supervised classification (linear SVM)

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC, LinearSVC
from sklearn.pipeline import Pipeline
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import precision_recall_fscore_support, classification_report
from sklearn.model_selection import cross_val_predict, StratifiedKFold

### Classification of MC_relevance category

In [6]:
from collections import Counter

# Binarize category
y = dataset['MC_relevance'].tolist() 
y = [x.replace("L", "X") for x in y]
y = [x.replace("M", "H") for x in y]
Counter(y)

Counter({'X': 117, 'H': 63})

In [7]:
# Texts to vectorize
X = dataset['full_text_clean'].tolist() 

# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), min_df=3)  # Adjust min_df for your dataset

# Convert texts to TF-IDF matrix
X_tfidf = tfidf.fit_transform(X)

# Initialize models
svc_model = LinearSVC(class_weight='balanced')

def calculate_metrics(model, X, y, cv= 5):
    y_pred = cross_val_predict(model, X, y, cv=cv)
    precision, recall, f1, _ = precision_recall_fscore_support(y, y_pred, average=None)
    
    print("Classification Report:")
    print(classification_report(y, y_pred))

    return precision, recall, f1

# Evaluate Linear SVC
print("Linear SVC Metrics:")
svc_precision, svc_recall, svc_f1 = calculate_metrics(svc_model, X_tfidf, y)

Linear SVC Metrics:
Classification Report:
              precision    recall  f1-score   support

           H       0.54      0.43      0.48        63
           X       0.72      0.80      0.76       117

    accuracy                           0.67       180
   macro avg       0.63      0.62      0.62       180
weighted avg       0.66      0.67      0.66       180



In [8]:
import numpy as np
mod = LinearSVC(class_weight = 'balanced').fit(X_tfidf, y )

# Retrieve feature names (words) from TF-IDF vectorizer
feature_names = np.array(tfidf.get_feature_names_out())

# Retrieve coefficients from the SVC model
coefficients = mod.coef_.flatten()

# Create a DataFrame with the words and their coefficients
feature_importance = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': coefficients
})

# Sort words by coefficient size
feature_importance['Importance'] = np.abs(feature_importance['Coefficient'])
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)

# Display the most important words
print(feature_importance.head(20))  

             Feature  Coefficient  Importance
2732            tree    -0.947472    0.947472
272           arctic    -0.887889    0.887889
474          changes    -0.829692    0.829692
1151         genetic    -0.824693    0.824693
1972      population    -0.823669    0.823669
496          climate    -0.771517    0.771517
1657       migratory    -0.762063    0.762063
1738            nest    -0.711197    0.711197
1047            fish     0.683832    0.683832
472           change    -0.669957    0.669957
1893       peatlands    -0.665610    0.665610
1639          method     0.625150    0.625150
2536           stand    -0.620132    0.620132
1716           natal    -0.617330    0.617330
387   boreal forests    -0.608343    0.608343
274            areas    -0.597297    0.597297
1025         females    -0.589724    0.589724
363     biodiversity    -0.580275    0.580275
2136           range    -0.574546    0.574546
2878            wood     0.573972    0.573972


### Classification of MC_relevance_mod category

In [9]:
# Binarize category
y = dataset['MC_relevance_modifier'].tolist() 
y = [x.replace("L", "X") for x in y]
y = [x.replace("M", "H") for x in y]
Counter(y)

Counter({'X': 82, 'H': 98})

In [10]:
# Texts to vectorize
X = dataset['full_text_clean'].tolist() 

# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), min_df=3)  # Adjust min_df for your dataset

# Convert texts to TF-IDF matrix
X_tfidf = tfidf.fit_transform(X)

# Initialize models
svc_model = LinearSVC(class_weight='balanced')

def calculate_metrics(model, X, y, cv= 5):
    y_pred = cross_val_predict(model, X, y, cv=cv)
    precision, recall, f1, _ = precision_recall_fscore_support(y, y_pred, average=None)
    
    print("Classification Report:")
    print(classification_report(y, y_pred))

    return precision, recall, f1

# Evaluate Linear SVC
print("Linear SVC Metrics:")
svc_precision, svc_recall, svc_f1 = calculate_metrics(svc_model, X_tfidf, y)

Linear SVC Metrics:
Classification Report:
              precision    recall  f1-score   support

           H       0.57      0.57      0.57        98
           X       0.48      0.48      0.48        82

    accuracy                           0.53       180
   macro avg       0.52      0.52      0.52       180
weighted avg       0.53      0.53      0.53       180



In [11]:
import numpy as np
mod = LinearSVC(class_weight = 'balanced').fit(X_tfidf, y )

# Retrieve feature names (words) from TF-IDF vectorizer
feature_names = np.array(tfidf.get_feature_names_out())

# Retrieve coefficients from the SVC model
coefficients = mod.coef_.flatten()

# Create a DataFrame with the words and their coefficients
feature_importance = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': coefficients
})

# Sort words by coefficient size
feature_importance['Importance'] = np.abs(feature_importance['Coefficient'])
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)

# Display the most important words
print(feature_importance.head(20)) 

          Feature  Coefficient  Importance
2732         tree    -1.143905    1.143905
2353         seed     0.911526    0.911526
2136        range    -0.726287    0.726287
1044          fir     0.719595    0.719595
447         carex    -0.718494    0.718494
1047         fish     0.703840    0.703840
1972   population    -0.702008    0.702008
2878         wood     0.677389    0.677389
1980  populations     0.674325    0.674325
272        arctic    -0.655570    0.655570
2302         root     0.651785    0.651785
2254   resistance     0.651414    0.651414
2425         site     0.641023    0.641023
2905  zooplankton    -0.639594    0.639594
2778        urban    -0.638602    0.638602
1151      genetic    -0.620718    0.620718
1063       floral    -0.616217    0.616217
474       changes    -0.605517    0.605517
2346     seasonal    -0.580158    0.580158
1639       method     0.579147    0.579147
