In [1]:
import spacy
spacy.cli.download('en_core_web_md')
nlp = spacy.load('en_core_web_md')

Collecting en-core-web-md==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m75.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import re

from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

In [3]:
stopwords = list(nlp.Defaults.stop_words) + ['ll','ve']

In [4]:
# Load CSV Train + Test
train = pd.read_csv('Training.csv')
test = pd.read_csv('Test.csv')

In [5]:
# Quick Look at Features
train.head(3)

Unnamed: 0.1,Unnamed: 0,title,tags,heading,source,text,bias_rating,text_length,text_word_count,heading_length,heading_word_count
0,16938,Latest Presidential Polls,"['Presidential Elections', 'Elections']",Washington Post-ABC News poll: Clinton holds f...,Washington Post,"With three weeks until Election Day, Hillary C...",left,281.0,46.0,87,13
1,6759,House Democrat Leaders Rein In Impeachment Talk,"['Impeachment', 'Donald Trump', 'US House', 'P...",OPINION: Democrats may be outsmarting themselv...,Guest Writer - Left,"History can be a wise teacher, but it's also c...",left,573.0,90.0,63,8
2,908,European Countries Suspend Use of AstraZeneca ...,"['Coronavirus', 'World', 'Europe', 'European U...",AstraZeneca's COVID vaccine suspended in more ...,CBS News (Online),Sweden on Tuesday became the latest to join a ...,left,571.0,90.0,86,13


In [6]:
test.head(3)

Unnamed: 0.1,Unnamed: 0,title,tags,heading,source,text,bias_rating,text_length,text_word_count,heading_length,heading_word_count
0,6330,What Will Republicans Do If They Gain Control ...,"['2022 Elections', 'Midterms', 'Republican Par...",The Polls Still Do Not Show A GOP Bounce Back,FiveThirtyEight,Ever since we launched our election model in l...,center,548.0,90.0,45,10
1,20950,Drug Overdose Deaths Climb During Pandemic,"['Safety And Sanity During COVID-19', 'Opioid ...",San Francisco struggles to stem ‘horrific’ upt...,Los Angeles Times,"In early 2019, a formerly homeless man named T...",left,491.0,90.0,81,12
2,621,Liz Truss Becomes United Kingdom's New Prime M...,"['World', 'United Kingdom', 'Liz Truss', 'Euro...",Liz Truss becomes UK prime minister after meet...,The Guardian,Liz Truss has become the UK’s new prime minist...,left,513.0,90.0,67,11


### Preprocessing

In [7]:
# Drop Unnamed: 0, title, tags
train = train.drop(['Unnamed: 0'],axis=1)
train = train.drop(['title'],axis=1)
train = train.drop(['tags'],axis=1)

test = test.drop(['Unnamed: 0'],axis=1)
test = test.drop(['title'],axis=1)
test = test.drop(['tags'],axis=1)

In [8]:
def clean_text(text):
    # Lowercase
    text = text.lower()
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    
    # Tokenize using spaCy
    doc = nlp(text)
    
    cleaned_tokens = []
    for token in doc:
        lemma = token.lemma_
        cleaned_tokens.append(lemma)
    
    return " ".join(cleaned_tokens)

In [9]:
train['cleaned_text'] = train['text'].astype(str).apply(clean_text)

In [10]:
train['cleaned_header'] = train['heading'].astype(str).apply(clean_text)

In [11]:
train.head(3)

Unnamed: 0,heading,source,text,bias_rating,text_length,text_word_count,heading_length,heading_word_count,cleaned_text,cleaned_header
0,Washington Post-ABC News poll: Clinton holds f...,Washington Post,"With three weeks until Election Day, Hillary C...",left,281.0,46.0,87,13,with three week until election day hillary cli...,washington postabc news poll clinton hold four...
1,OPINION: Democrats may be outsmarting themselv...,Guest Writer - Left,"History can be a wise teacher, but it's also c...",left,573.0,90.0,63,8,history can be a wise teacher but its also cru...,opinion democrats may be outsmart themselves o...
2,AstraZeneca's COVID vaccine suspended in more ...,CBS News (Online),Sweden on Tuesday became the latest to join a ...,left,571.0,90.0,86,13,sweden on tuesday become the late to join a gr...,astrazenecas covid vaccine suspend in more cou...


In [12]:
train = train.drop(['text_length'],axis=1)
train = train.drop(['text_word_count'],axis=1)
train = train.drop(['heading_length'],axis=1)
train = train.drop(['heading_word_count'],axis=1)

test = test.drop(['text_length'],axis=1)
test = test.drop(['text_word_count'],axis=1)
test = test.drop(['heading_length'],axis=1)
test = test.drop(['heading_word_count'],axis=1)

In [13]:
test['cleaned_text'] = test['text'].astype(str).apply(clean_text)

In [14]:
test['cleaned_header'] = test['heading'].astype(str).apply(clean_text)

In [15]:
test.head(3)

Unnamed: 0,heading,source,text,bias_rating,cleaned_text,cleaned_header
0,The Polls Still Do Not Show A GOP Bounce Back,FiveThirtyEight,Ever since we launched our election model in l...,center,ever since we launch our election model in lat...,the poll still do not show a gop bounce back
1,San Francisco struggles to stem ‘horrific’ upt...,Los Angeles Times,"In early 2019, a formerly homeless man named T...",left,in early a formerly homeless man name tom wo...,san francisco struggle to stem horrific uptick...
2,Liz Truss becomes UK prime minister after meet...,The Guardian,Liz Truss has become the UK’s new prime minist...,left,liz truss have become the uks new prime minist...,liz truss become uk prime minister after meet ...


In [16]:
train.to_csv('train_cleaned.csv')

Split into 3 different types to see the accuracy. text, heading, source.

### Log Reg Text

In [17]:
df_text =train[['cleaned_text', 'bias_rating']].copy().dropna()

In [18]:
df_text

Unnamed: 0,cleaned_text,bias_rating
0,with three week until election day hillary cli...,left
1,history can be a wise teacher but its also cru...,left
2,sweden on tuesday become the late to join a gr...,left
3,attorney general eric h holder jr announce on ...,left
4,moderna inc climb as much as monday after ge...,right
...,...,...
17398,deadly hurricane ian be downgrade to tropical ...,center
17399,president donald trump have veto a congression...,center
17400,no one have ever want to be vice president as ...,right
17401,more than individual and entity have shell o...,left


In [19]:
df_test = test[['cleaned_text', 'bias_rating']].copy().dropna()

In [20]:
df_test

Unnamed: 0,cleaned_text,bias_rating
0,ever since we launch our election model in lat...,center
1,in early a formerly homeless man name tom wo...,left
2,liz truss have become the uks new prime minist...,left
3,with rep todd akin rmo embroil in controversy ...,left
4,the top nuclear commander in the us say he wou...,center
...,...,...
4346,south carolina family be reel wednesday after ...,left
4347,even before the coronavirus pandemic the us re...,left
4348,wikileak founder julian assange have be charge...,left
4349,a key inflation gauge that reveal consumer pri...,right


In [25]:
def load_and_preprocess(dataframe,feature_name,label_name, max_features=20000):
    
    texts = dataframe[feature_name].fillna("").tolist()
    labels = dataframe[label_name].tolist()
    
    # Step 2: Encode labels
    label_encoder = LabelEncoder()
    encoded_labels = label_encoder.fit_transform(labels)

    # Step 3: Vectorize text using TF-IDF
    vectorizer = TfidfVectorizer(
        max_features=max_features,  # Top N most important words
        ngram_range=((1,1)),          # Use unigrams and bigrams
        stop_words= stopwords
    )
    X = vectorizer.fit_transform(texts)


    return vectorizer, label_encoder, X, encoded_labels


In [26]:
vectorizer, label_encoder, X, encoded_labels = load_and_preprocess(df_text,'cleaned_text','bias_rating')

In [27]:
from imblearn.over_sampling import SMOTE

In [28]:
sampler = SMOTE(random_state=42)
X_resampled, y_resampled = sampler.fit_resample(X, encoded_labels)

In [29]:
C = np.arange(0.01, 1.0, 0.01)
warm_start = [True, False]
max_iter = list(range(100, 600))
solver = ['lbfgs', 'newton-cg','saga']
penalty = ['l2', 'l1','elasticnet']

params = {
    'C': C,
    'warm_start': warm_start,
    'solver': solver,
    'penalty': penalty,
    'max_iter': max_iter
}

random_search = RandomizedSearchCV(
    estimator=LogisticRegression(),
    param_distributions=params,
    n_iter=200,
    scoring='f1_macro',
    cv=5,
    n_jobs=1,
    random_state=1,
    verbose=2
).fit(X_resampled, y_resampled)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits
[CV] END C=0.15000000000000002, max_iter=213, penalty=l2, solver=newton-cg, warm_start=False; total time=   0.2s
[CV] END C=0.15000000000000002, max_iter=213, penalty=l2, solver=newton-cg, warm_start=False; total time=   0.2s
[CV] END C=0.15000000000000002, max_iter=213, penalty=l2, solver=newton-cg, warm_start=False; total time=   0.1s
[CV] END C=0.15000000000000002, max_iter=213, penalty=l2, solver=newton-cg, warm_start=False; total time=   0.1s
[CV] END C=0.15000000000000002, max_iter=213, penalty=l2, solver=newton-cg, warm_start=False; total time=   0.1s
[CV] END C=0.55, max_iter=419, penalty=elasticnet, solver=lbfgs, warm_start=False; total time=   0.0s
[CV] END C=0.55, max_iter=419, penalty=elasticnet, solver=lbfgs, warm_start=False; total time=   0.0s
[CV] END C=0.55, max_iter=419, penalty=elasticnet, solver=lbfgs, warm_start=False; total time=   0.0s
[CV] END C=0.55, max_iter=419, penalty=elasticnet, solver=lbfgs, 

600 fits failed out of a total of 1000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
110 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/suz/anaconda3/envs/conda4/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/suz/anaconda3/envs/conda4/lib/python3.11/site-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/suz/anaconda3/envs/conda4/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py", line 1193, in fit
    solver = _check_solver(self.solver, self.penalty, self.

In [30]:
random_search.best_params_

{'warm_start': False,
 'solver': 'saga',
 'penalty': 'l2',
 'max_iter': 326,
 'C': 0.99}

In [31]:

# Step 2: Clean and transform using your fitted vectorizer
X_new = vectorizer.transform(df_test['cleaned_text'].to_list())

# Step 3: Predict with your trained classifier
preds = random_search.predict(X_new)

# Step 4: Convert predictions back to original labels
predicted_labels = label_encoder.inverse_transform(preds)

In [32]:
predicted_labels

array(['right', 'left', 'right', ..., 'right', 'left', 'right'],
      dtype='<U6')

In [33]:
print(classification_report(predicted_labels,df_test['bias_rating'].to_list() ))

              precision    recall  f1-score   support

      center       0.41      0.30      0.35      1169
        left       0.48      0.57      0.52      1743
       right       0.42      0.42      0.42      1439

    accuracy                           0.45      4351
   macro avg       0.44      0.43      0.43      4351
weighted avg       0.44      0.45      0.44      4351



### Log Reg Heading

In [34]:
heading_train = train[['cleaned_header', 'bias_rating']].copy().dropna()
heading_test = test[['cleaned_header', 'bias_rating']].copy().dropna()

In [35]:
heading_test

Unnamed: 0,cleaned_header,bias_rating
0,the poll still do not show a gop bounce back,center
1,san francisco struggle to stem horrific uptick...,left
2,liz truss become uk prime minister after meet ...,left
3,todd akin lead missouri senate poll before rap...,left
4,we nuclear chief would resist illegal presiden...,center
...,...,...
4346,officer charge with murder after shoot man in ...,left
4347,a wave of eviction would be bad for everybody,left
4348,assange have be charge we prosecutor reveal in...,left
4349,inflation spike could force fed hand on taper ...,right


In [36]:
vectorizer, label_encoder, X, encoded_labels = load_and_preprocess(heading_train,'cleaned_header','bias_rating')

In [37]:
sampler = SMOTE(random_state=42)
X_resampled, y_resampled = sampler.fit_resample(X, encoded_labels)

In [38]:
C = np.arange(0.01, 1.0, 0.01)
warm_start = [True, False]
max_iter = list(range(100, 600))
solver = ['lbfgs', 'newton-cg','saga']
penalty = ['l2', 'l1','elasticnet']

params = {
    'C': C,
    'warm_start': warm_start,
    'solver': solver,
    'penalty': penalty,
    'max_iter': max_iter
}

heading_search = RandomizedSearchCV(
    estimator=LogisticRegression(),
    param_distributions=params,
    n_iter=200,
    scoring='f1_macro',
    cv=5,
    n_jobs=1,
    random_state=1,
    verbose=2
).fit(X_resampled, y_resampled)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits
[CV] END C=0.15000000000000002, max_iter=213, penalty=l2, solver=newton-cg, warm_start=False; total time=   0.1s
[CV] END C=0.15000000000000002, max_iter=213, penalty=l2, solver=newton-cg, warm_start=False; total time=   0.0s
[CV] END C=0.15000000000000002, max_iter=213, penalty=l2, solver=newton-cg, warm_start=False; total time=   0.1s
[CV] END C=0.15000000000000002, max_iter=213, penalty=l2, solver=newton-cg, warm_start=False; total time=   0.1s
[CV] END C=0.15000000000000002, max_iter=213, penalty=l2, solver=newton-cg, warm_start=False; total time=   0.1s
[CV] END C=0.55, max_iter=419, penalty=elasticnet, solver=lbfgs, warm_start=False; total time=   0.0s
[CV] END C=0.55, max_iter=419, penalty=elasticnet, solver=lbfgs, warm_start=False; total time=   0.0s
[CV] END C=0.55, max_iter=419, penalty=elasticnet, solver=lbfgs, warm_start=False; total time=   0.0s
[CV] END C=0.55, max_iter=419, penalty=elasticnet, solver=lbfgs, 

600 fits failed out of a total of 1000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
110 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/suz/anaconda3/envs/conda4/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/suz/anaconda3/envs/conda4/lib/python3.11/site-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/suz/anaconda3/envs/conda4/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py", line 1193, in fit
    solver = _check_solver(self.solver, self.penalty, self.

In [39]:
heading_search.best_params_

{'warm_start': False,
 'solver': 'saga',
 'penalty': 'l2',
 'max_iter': 326,
 'C': 0.99}

In [40]:

# Step 2: Clean and transform using your fitted vectorizer
X_new = vectorizer.transform(heading_test['cleaned_header'].to_list())

# Step 3: Predict with your trained classifier
preds = heading_search.predict(X_new)

# Step 4: Convert predictions back to original labels
predicted_labels = label_encoder.inverse_transform(preds)

In [41]:
print(classification_report(predicted_labels,heading_test['bias_rating'].to_list() ))

              precision    recall  f1-score   support

      center       0.36      0.25      0.29      1213
        left       0.43      0.51      0.47      1729
       right       0.36      0.37      0.37      1409

    accuracy                           0.39      4351
   macro avg       0.38      0.38      0.38      4351
weighted avg       0.39      0.39      0.39      4351



### Log Reg Source

In [42]:
source_train = train[['source', 'bias_rating']].copy().dropna()
source_test = test[['source', 'bias_rating']].copy().dropna()

In [43]:
source_test

Unnamed: 0,source,bias_rating
0,FiveThirtyEight,center
1,Los Angeles Times,left
2,The Guardian,left
3,HuffPost,left
4,BBC News,center
...,...,...
4346,CNN (Online News),left
4347,Bloomberg,left
4348,Bloomberg,left
4349,Fox Business,right


In [44]:
vectorizer, label_encoder, X, encoded_labels = load_and_preprocess(source_train,'source','bias_rating')

In [45]:
sampler = SMOTE(random_state=42)
X_resampled, y_resampled = sampler.fit_resample(X, encoded_labels)

In [46]:
C = np.arange(0.01, 1.0, 0.01)
warm_start = [True, False]
max_iter = list(range(100, 600))
solver = ['lbfgs', 'newton-cg','saga']
penalty = ['l2', 'l1','elasticnet']

params = {
    'C': C,
    'warm_start': warm_start,
    'solver': solver,
    'penalty': penalty,
    'max_iter': max_iter
}

source_search = RandomizedSearchCV(
    estimator=LogisticRegression(),
    param_distributions=params,
    n_iter=200,
    scoring='f1_macro',
    cv=5,
    n_jobs=1,
    random_state=1,
    verbose=2
).fit(X_resampled, y_resampled)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits
[CV] END C=0.15000000000000002, max_iter=213, penalty=l2, solver=newton-cg, warm_start=False; total time=   0.1s
[CV] END C=0.15000000000000002, max_iter=213, penalty=l2, solver=newton-cg, warm_start=False; total time=   0.0s
[CV] END C=0.15000000000000002, max_iter=213, penalty=l2, solver=newton-cg, warm_start=False; total time=   0.1s
[CV] END C=0.15000000000000002, max_iter=213, penalty=l2, solver=newton-cg, warm_start=False; total time=   0.0s
[CV] END C=0.15000000000000002, max_iter=213, penalty=l2, solver=newton-cg, warm_start=False; total time=   0.0s
[CV] END C=0.55, max_iter=419, penalty=elasticnet, solver=lbfgs, warm_start=False; total time=   0.0s
[CV] END C=0.55, max_iter=419, penalty=elasticnet, solver=lbfgs, warm_start=False; total time=   0.0s
[CV] END C=0.55, max_iter=419, penalty=elasticnet, solver=lbfgs, warm_start=False; total time=   0.0s
[CV] END C=0.55, max_iter=419, penalty=elasticnet, solver=lbfgs, 



[CV] END C=0.93, max_iter=571, penalty=l1, solver=saga, warm_start=False; total time=   8.7s
[CV] END C=0.42000000000000004, max_iter=233, penalty=l1, solver=newton-cg, warm_start=False; total time=   0.0s
[CV] END C=0.42000000000000004, max_iter=233, penalty=l1, solver=newton-cg, warm_start=False; total time=   0.0s
[CV] END C=0.42000000000000004, max_iter=233, penalty=l1, solver=newton-cg, warm_start=False; total time=   0.0s
[CV] END C=0.42000000000000004, max_iter=233, penalty=l1, solver=newton-cg, warm_start=False; total time=   0.0s
[CV] END C=0.42000000000000004, max_iter=233, penalty=l1, solver=newton-cg, warm_start=False; total time=   0.0s
[CV] END C=0.09, max_iter=174, penalty=elasticnet, solver=saga, warm_start=False; total time=   0.0s
[CV] END C=0.09, max_iter=174, penalty=elasticnet, solver=saga, warm_start=False; total time=   0.0s
[CV] END C=0.09, max_iter=174, penalty=elasticnet, solver=saga, warm_start=False; total time=   0.0s
[CV] END C=0.09, max_iter=174, penalty=



[CV] END C=0.12, max_iter=118, penalty=l1, solver=saga, warm_start=False; total time=   0.7s
[CV] END C=0.12, max_iter=118, penalty=l1, solver=saga, warm_start=False; total time=   0.7s
[CV] END C=0.12, max_iter=118, penalty=l1, solver=saga, warm_start=False; total time=   0.8s
[CV] END C=0.92, max_iter=356, penalty=l1, solver=lbfgs, warm_start=False; total time=   0.0s
[CV] END C=0.92, max_iter=356, penalty=l1, solver=lbfgs, warm_start=False; total time=   0.0s
[CV] END C=0.92, max_iter=356, penalty=l1, solver=lbfgs, warm_start=False; total time=   0.0s
[CV] END C=0.92, max_iter=356, penalty=l1, solver=lbfgs, warm_start=False; total time=   0.0s
[CV] END C=0.92, max_iter=356, penalty=l1, solver=lbfgs, warm_start=False; total time=   0.0s
[CV] END C=0.14, max_iter=131, penalty=l1, solver=lbfgs, warm_start=False; total time=   0.0s
[CV] END C=0.14, max_iter=131, penalty=l1, solver=lbfgs, warm_start=False; total time=   0.0s
[CV] END C=0.14, max_iter=131, penalty=l1, solver=lbfgs, warm_s



[CV] END C=0.42000000000000004, max_iter=107, penalty=l1, solver=saga, warm_start=False; total time=   1.1s




[CV] END C=0.42000000000000004, max_iter=107, penalty=l1, solver=saga, warm_start=False; total time=   1.3s




[CV] END C=0.42000000000000004, max_iter=107, penalty=l1, solver=saga, warm_start=False; total time=   1.2s




[CV] END C=0.42000000000000004, max_iter=107, penalty=l1, solver=saga, warm_start=False; total time=   1.2s
[CV] END C=0.15000000000000002, max_iter=327, penalty=l2, solver=saga, warm_start=False; total time=   0.0s
[CV] END C=0.15000000000000002, max_iter=327, penalty=l2, solver=saga, warm_start=False; total time=   0.0s
[CV] END C=0.15000000000000002, max_iter=327, penalty=l2, solver=saga, warm_start=False; total time=   0.0s
[CV] END C=0.15000000000000002, max_iter=327, penalty=l2, solver=saga, warm_start=False; total time=   0.0s
[CV] END C=0.15000000000000002, max_iter=327, penalty=l2, solver=saga, warm_start=False; total time=   0.0s
[CV] END C=0.54, max_iter=168, penalty=l1, solver=saga, warm_start=True; total time=   1.7s
[CV] END C=0.54, max_iter=168, penalty=l1, solver=saga, warm_start=True; total time=   1.7s
[CV] END C=0.54, max_iter=168, penalty=l1, solver=saga, warm_start=True; total time=   1.5s
[CV] END C=0.54, max_iter=168, penalty=l1, solver=saga, warm_start=True; tot



[CV] END C=0.54, max_iter=168, penalty=l1, solver=saga, warm_start=True; total time=   2.0s
[CV] END C=0.74, max_iter=111, penalty=l1, solver=lbfgs, warm_start=True; total time=   0.0s
[CV] END C=0.74, max_iter=111, penalty=l1, solver=lbfgs, warm_start=True; total time=   0.0s
[CV] END C=0.74, max_iter=111, penalty=l1, solver=lbfgs, warm_start=True; total time=   0.0s
[CV] END C=0.74, max_iter=111, penalty=l1, solver=lbfgs, warm_start=True; total time=   0.0s
[CV] END C=0.74, max_iter=111, penalty=l1, solver=lbfgs, warm_start=True; total time=   0.0s
[CV] END C=0.89, max_iter=321, penalty=l1, solver=lbfgs, warm_start=True; total time=   0.0s
[CV] END C=0.89, max_iter=321, penalty=l1, solver=lbfgs, warm_start=True; total time=   0.0s
[CV] END C=0.89, max_iter=321, penalty=l1, solver=lbfgs, warm_start=True; total time=   0.0s
[CV] END C=0.89, max_iter=321, penalty=l1, solver=lbfgs, warm_start=True; total time=   0.0s
[CV] END C=0.89, max_iter=321, penalty=l1, solver=lbfgs, warm_start=Tru



[CV] END C=0.77, max_iter=422, penalty=l1, solver=saga, warm_start=False; total time=   7.0s
[CV] END C=0.67, max_iter=478, penalty=l1, solver=saga, warm_start=False; total time=   2.2s
[CV] END C=0.67, max_iter=478, penalty=l1, solver=saga, warm_start=False; total time=   2.1s
[CV] END C=0.67, max_iter=478, penalty=l1, solver=saga, warm_start=False; total time=   1.8s
[CV] END C=0.67, max_iter=478, penalty=l1, solver=saga, warm_start=False; total time=   1.9s
[CV] END C=0.67, max_iter=478, penalty=l1, solver=saga, warm_start=False; total time=   5.3s
[CV] END C=0.6900000000000001, max_iter=220, penalty=l1, solver=newton-cg, warm_start=False; total time=   0.0s
[CV] END C=0.6900000000000001, max_iter=220, penalty=l1, solver=newton-cg, warm_start=False; total time=   0.0s
[CV] END C=0.6900000000000001, max_iter=220, penalty=l1, solver=newton-cg, warm_start=False; total time=   0.0s
[CV] END C=0.6900000000000001, max_iter=220, penalty=l1, solver=newton-cg, warm_start=False; total time=  



[CV] END C=0.6900000000000001, max_iter=206, penalty=l1, solver=saga, warm_start=True; total time=   2.6s
[CV] END C=0.46, max_iter=176, penalty=l1, solver=lbfgs, warm_start=False; total time=   0.0s
[CV] END C=0.46, max_iter=176, penalty=l1, solver=lbfgs, warm_start=False; total time=   0.0s
[CV] END C=0.46, max_iter=176, penalty=l1, solver=lbfgs, warm_start=False; total time=   0.0s
[CV] END C=0.46, max_iter=176, penalty=l1, solver=lbfgs, warm_start=False; total time=   0.0s
[CV] END C=0.46, max_iter=176, penalty=l1, solver=lbfgs, warm_start=False; total time=   0.0s
[CV] END C=0.34, max_iter=539, penalty=l2, solver=lbfgs, warm_start=False; total time=   0.1s
[CV] END C=0.34, max_iter=539, penalty=l2, solver=lbfgs, warm_start=False; total time=   0.2s
[CV] END C=0.34, max_iter=539, penalty=l2, solver=lbfgs, warm_start=False; total time=   0.1s
[CV] END C=0.34, max_iter=539, penalty=l2, solver=lbfgs, warm_start=False; total time=   0.1s
[CV] END C=0.34, max_iter=539, penalty=l2, solve



[CV] END C=0.39, max_iter=112, penalty=l1, solver=saga, warm_start=True; total time=   1.2s




[CV] END C=0.39, max_iter=112, penalty=l1, solver=saga, warm_start=True; total time=   1.2s




[CV] END C=0.39, max_iter=112, penalty=l1, solver=saga, warm_start=True; total time=   1.2s




[CV] END C=0.39, max_iter=112, penalty=l1, solver=saga, warm_start=True; total time=   1.2s
[CV] END C=0.68, max_iter=283, penalty=l1, solver=saga, warm_start=True; total time=   2.1s
[CV] END C=0.68, max_iter=283, penalty=l1, solver=saga, warm_start=True; total time=   2.2s
[CV] END C=0.68, max_iter=283, penalty=l1, solver=saga, warm_start=True; total time=   1.8s
[CV] END C=0.68, max_iter=283, penalty=l1, solver=saga, warm_start=True; total time=   1.9s




[CV] END C=0.68, max_iter=283, penalty=l1, solver=saga, warm_start=True; total time=   4.4s
[CV] END C=0.81, max_iter=164, penalty=elasticnet, solver=lbfgs, warm_start=False; total time=   0.0s
[CV] END C=0.81, max_iter=164, penalty=elasticnet, solver=lbfgs, warm_start=False; total time=   0.0s
[CV] END C=0.81, max_iter=164, penalty=elasticnet, solver=lbfgs, warm_start=False; total time=   0.0s
[CV] END C=0.81, max_iter=164, penalty=elasticnet, solver=lbfgs, warm_start=False; total time=   0.0s
[CV] END C=0.81, max_iter=164, penalty=elasticnet, solver=lbfgs, warm_start=False; total time=   0.0s
[CV] END C=0.45, max_iter=571, penalty=elasticnet, solver=lbfgs, warm_start=True; total time=   0.0s
[CV] END C=0.45, max_iter=571, penalty=elasticnet, solver=lbfgs, warm_start=True; total time=   0.0s
[CV] END C=0.45, max_iter=571, penalty=elasticnet, solver=lbfgs, warm_start=True; total time=   0.0s
[CV] END C=0.45, max_iter=571, penalty=elasticnet, solver=lbfgs, warm_start=True; total time=  



[CV] END C=0.42000000000000004, max_iter=177, penalty=l1, solver=saga, warm_start=True; total time=   2.1s
[CV] END C=0.13, max_iter=391, penalty=elasticnet, solver=saga, warm_start=False; total time=   0.0s
[CV] END C=0.13, max_iter=391, penalty=elasticnet, solver=saga, warm_start=False; total time=   0.0s
[CV] END C=0.13, max_iter=391, penalty=elasticnet, solver=saga, warm_start=False; total time=   0.0s
[CV] END C=0.13, max_iter=391, penalty=elasticnet, solver=saga, warm_start=False; total time=   0.0s
[CV] END C=0.13, max_iter=391, penalty=elasticnet, solver=saga, warm_start=False; total time=   0.0s
[CV] END C=0.49, max_iter=553, penalty=l1, solver=newton-cg, warm_start=True; total time=   0.0s
[CV] END C=0.49, max_iter=553, penalty=l1, solver=newton-cg, warm_start=True; total time=   0.0s
[CV] END C=0.49, max_iter=553, penalty=l1, solver=newton-cg, warm_start=True; total time=   0.0s
[CV] END C=0.49, max_iter=553, penalty=l1, solver=newton-cg, warm_start=True; total time=   0.0s


600 fits failed out of a total of 1000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
110 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/suz/anaconda3/envs/conda4/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/suz/anaconda3/envs/conda4/lib/python3.11/site-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/suz/anaconda3/envs/conda4/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py", line 1193, in fit
    solver = _check_solver(self.solver, self.penalty, self.

In [47]:
source_search.best_params_

{'warm_start': False,
 'solver': 'saga',
 'penalty': 'l2',
 'max_iter': 326,
 'C': 0.99}

In [48]:

# Step 2: Clean and transform using your fitted vectorizer
X_new = vectorizer.transform(source_test['source'].to_list())

# Step 3: Predict with your trained classifier
preds = source_search.predict(X_new)

# Step 4: Convert predictions back to original labels
predicted_labels = label_encoder.inverse_transform(preds)

In [49]:
print(classification_report(predicted_labels,source_test['bias_rating'].to_list() ))

              precision    recall  f1-score   support

      center       0.99      0.98      0.98       860
        left       0.99      0.96      0.97      2123
       right       0.94      0.99      0.96      1366

    accuracy                           0.97      4349
   macro avg       0.97      0.98      0.97      4349
weighted avg       0.97      0.97      0.97      4349



In [50]:
# Double Check for test Count
print(len(test['bias_rating']))

4351


In [51]:
print(len(source_test['bias_rating']))

4349


In [52]:
print(len(heading_test['bias_rating']))

4351


In [53]:
print(len(df_test['bias_rating']))

4351
