In [18]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords # Import the stop word list
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from imblearn.pipeline import make_pipeline

In [3]:
# Relative path to the exported headlines CSV.
HEADLINES_CSV = '../part_01/Data/united_states_headlines_last_decade.csv'
df = pd.read_csv(HEADLINES_CSV)


In [4]:
# Preview the freshly loaded frame (pandas renders a truncated head/tail view).
df

Unnamed: 0.1,Unnamed: 0,abstract,headline,pub_date,year,section_name,news_desk,keyword
0,0,"With the New Start treaty nearly wrapped up, P...",The Next Treaties,2011-01-01T02:42:37+0000,2011,Opinion,Editorial,United States
1,1,A response to a plea by Israel for the release...,Plea for Pollard’s Release,2011-01-01T02:42:37+0000,2011,Opinion,Letters,United States
2,2,Letters to the editor regarding tensions betwe...,Where Do Doctors Learn Best?,2011-01-03T03:03:04+0000,2011,Opinion,Letters,United States
3,3,Canada beat the United States in the semifinal...,Canada Beats United States in World Junior Sem...,2011-01-04T04:01:03+0000,2011,Sports,Sports,United States
4,4,"Kristine Lilly, who holds the world record wit...","Lilly, Iron Lady of U.S. Soccer, Retires at 39",2011-01-05T22:19:53+0000,2011,Sports,,United States
...,...,...,...,...,...,...,...,...
25505,25505,Unconscious bias in the medical profession may...,How Bias Endangers Pregnant Black Women,2023-12-29T16:39:00+0000,2023,Health,Health,United States
25506,25506,The Upshot staff shares a selection of their f...,10 Data Points and Documents That Made Us 🤔 in...,2023-12-30T05:05:46+0000,2023,The Upshot,The Upshot,United States
25507,25507,The move comes as Israel presses on with its o...,Biden Administration Again Bypasses Congress f...,2023-12-30T11:33:49+0000,2023,World,Foreign,United States
25508,25508,President Biden seems nervous about sealing a ...,"Electrify All the Big, Noisy, Belching Trucks",2023-12-30T12:00:14+0000,2023,Opinion,OpEd,United States


In [5]:
# Fraction of all articles that falls in each section (proportions, not counts).
section_shares = df['section_name'].value_counts(normalize=True)
section_shares

section_name
U.S.                    0.364239
Opinion                 0.234201
World                   0.131253
Business Day            0.080877
Sports                  0.027325
Health                  0.024385
Science                 0.011722
Podcasts                0.011173
Technology              0.009918
The Upshot              0.009801
New York                0.008821
Arts                    0.008115
Briefing                0.008037
Magazine                0.006547
Travel                  0.006508
Blogs                   0.006469
Multimedia/Photos       0.005920
Books                   0.005645
Food                    0.005606
Climate                 0.005449
Automobiles             0.003999
Your Money              0.003999
Real Estate             0.003371
Style                   0.003175
Sunday Review           0.001999
Well                    0.001490
Fashion & Style         0.001372
Education               0.001255
Times Insider           0.001176
Movies                  0.0010

#### Idea:  Only Keep Opinion Pieces

In [6]:
# Rows whose section is exactly 'Opinion'.
opinion_mask = df['section_name'].eq('Opinion')
USA_op_pieces = df[opinion_mask]

In [7]:
# List the column labels available on the Opinion-only subset.
USA_op_pieces.columns

Index(['Unnamed: 0', 'abstract', 'headline', 'pub_date', 'year',
       'section_name', 'news_desk', 'keyword'],
      dtype='object')

In [8]:
# Columns that are neither model inputs (abstract, headline) nor the target
# (section_name). Each label is listed exactly once.
columns_to_drop = ['Unnamed: 0', 'keyword', 'news_desk', 'pub_date', 'year']

In [9]:
# Remove the unused columns. NOTE: this rebinds `df`, so running the cell a
# second time raises KeyError because the labels are no longer present.
df = df.drop(labels=columns_to_drop, axis='columns')

In [10]:
# Inspect the frame after the unused columns have been dropped.
df

Unnamed: 0,abstract,headline,section_name
0,"With the New Start treaty nearly wrapped up, P...",The Next Treaties,Opinion
1,A response to a plea by Israel for the release...,Plea for Pollard’s Release,Opinion
2,Letters to the editor regarding tensions betwe...,Where Do Doctors Learn Best?,Opinion
3,Canada beat the United States in the semifinal...,Canada Beats United States in World Junior Sem...,Sports
4,"Kristine Lilly, who holds the world record wit...","Lilly, Iron Lady of U.S. Soccer, Retires at 39",Sports
...,...,...,...
25505,Unconscious bias in the medical profession may...,How Bias Endangers Pregnant Black Women,Health
25506,The Upshot staff shares a selection of their f...,10 Data Points and Documents That Made Us 🤔 in...,The Upshot
25507,The move comes as Israel presses on with its o...,Biden Administration Again Bypasses Congress f...,World
25508,President Biden seems nervous about sealing a ...,"Electrify All the Big, Noisy, Belching Trucks",Opinion


#### First step is correctly identifying whether it's an opinion-based article or not.

##### '1' denotes the positive class (an Opinion piece); '0' denotes a non-Opinion piece.



In [11]:
# Binarise the target: 1 for Opinion articles, 0 for every other section.
# NOTE: the column is overwritten in place, so re-running this cell compares
# the 0/1 values against 'Opinion' and turns every label into 0.
is_opinion = df['section_name'] == 'Opinion'
df['section_name'] = np.where(is_opinion, 1, 0)

In [12]:
# Check that section_name now holds the binary 0/1 label.
df

Unnamed: 0,abstract,headline,section_name
0,"With the New Start treaty nearly wrapped up, P...",The Next Treaties,1
1,A response to a plea by Israel for the release...,Plea for Pollard’s Release,1
2,Letters to the editor regarding tensions betwe...,Where Do Doctors Learn Best?,1
3,Canada beat the United States in the semifinal...,Canada Beats United States in World Junior Sem...,0
4,"Kristine Lilly, who holds the world record wit...","Lilly, Iron Lady of U.S. Soccer, Retires at 39",0
...,...,...,...
25505,Unconscious bias in the medical profession may...,How Bias Endangers Pregnant Black Women,0
25506,The Upshot staff shares a selection of their f...,10 Data Points and Documents That Made Us 🤔 in...,0
25507,The move comes as Israel presses on with its o...,Biden Administration Again Bypasses Congress f...,0
25508,President Biden seems nervous about sealing a ...,"Electrify All the Big, Noisy, Belching Trucks",1


#### Once the identification is made, evaluate how well the classifier separates Opinion pieces from the rest; then, for the Opinion pieces, compute a bias score based on their language and return that score.

##### That is where the AI aspect comes in 

# Train Test Split and Cleaning

In [13]:
# Text inputs and the binary Opinion label.
features = df[['abstract', 'headline']]
target = df['section_name']

# 80/20 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    train_size=0.80,
    random_state=42,
)

In [14]:
# Lazily-built cache of the English stop-word set, so the list is converted to a
# set once instead of on every call to filtered_words.
_ENGLISH_STOPWORDS = None


def filtered_words(train_data, stops=None):
    """Normalise one raw text (a headline or an abstract) into cleaned tokens.

    Every character that is not an ASCII letter is replaced by a space, the
    text is lower-cased and split on whitespace, stop words are removed, and
    the surviving words are joined back together with single spaces.

    Parameters
    ----------
    train_data : str
        The raw text to clean.
    stops : iterable of str, optional
        Words to discard. When omitted, the NLTK English stop-word list is
        used (built on first use and then reused on later calls).

    Returns
    -------
    str
        The cleaned text; an empty string when no word survives.
    """
    global _ENGLISH_STOPWORDS

    if stops is None:
        if _ENGLISH_STOPWORDS is None:
            # Set membership tests are much faster than scanning a list.
            _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
        stops = _ENGLISH_STOPWORDS
    elif not isinstance(stops, (set, frozenset)):
        stops = set(stops)

    # 1. Remove non-letters.
    letters_only = re.sub("[^a-zA-Z]", " ", train_data)

    # 2. Convert to lower case and split into individual words.
    words = letters_only.lower().split()

    # 3. Drop stop words and re-join with single spaces.
    return " ".join(w for w in words if w not in stops)

def _clean_column(texts, total, start=0, step=5000):
    """Clean every entry of ``texts`` with ``filtered_words``, printing progress.

    Parameters
    ----------
    texts : iterable
        Raw abstracts or headlines (one value per article).
    total : int
        Total number of articles, used only in the progress message.
    start : int
        Number of articles already processed before this call; the progress
        counter continues from here.
    step : int
        A progress line is printed whenever the running count is a multiple
        of ``step``.

    Returns
    -------
    list of str
        The cleaned texts, in the same order as ``texts``.
    """
    cleaned = []
    for position, text in enumerate(texts, start=start + 1):
        # A missing value becomes an empty document; str(NaN) would otherwise
        # inject the literal word "nan" into the cleaned text.
        raw_text = "" if pd.isna(text) else str(text)
        cleaned.append(filtered_words(raw_text))
        if position % step == 0:
            print(f'Post {position} of {total}.')
    return cleaned


total_abs_headlines = df.shape[0]
print(f'There are {total_abs_headlines} posts.')

# Abstracts: the test-set counter continues where the training set stopped.
print("Cleaning and parsing the training set abstracts...")
clean_train_abs = _clean_column(X_train['abstract'], total_abs_headlines, step=5000)

print("Cleaning and parsing the testing set abstracts...")
clean_test_abs = _clean_column(X_test['abstract'], total_abs_headlines,
                               start=len(clean_train_abs), step=500)

# Headlines: same procedure, with the counter restarted from zero.
print("Cleaning and parsing the training set headlines...")
clean_train_lines = _clean_column(X_train['headline'], total_abs_headlines, step=5000)

print("Cleaning and parsing the testing set headlines...")
clean_test_lines = _clean_column(X_test['headline'], total_abs_headlines,
                                 start=len(clean_train_lines), step=500)


There are 25510 posts.
Cleaning and parsing the training set abstracts...
Post 5000 of 25510.
Post 10000 of 25510.
Post 15000 of 25510.
Post 20000 of 25510.
Cleaning and parsing the testing set abstracts...
Post 20500 of 25510.
Post 21000 of 25510.
Post 21500 of 25510.
Post 22000 of 25510.
Post 22500 of 25510.
Post 23000 of 25510.
Post 23500 of 25510.
Post 24000 of 25510.
Post 24500 of 25510.
Post 25000 of 25510.
Post 25500 of 25510.
Cleaning and parsing the training set headlines...
Post 5000 of 25510.
Post 10000 of 25510.
Post 15000 of 25510.
Post 20000 of 25510.
Cleaning and parsing the testing set headlines...
Post 20500 of 25510.
Post 21000 of 25510.
Post 21500 of 25510.
Post 22000 of 25510.
Post 22500 of 25510.
Post 23000 of 25510.
Post 23500 of 25510.
Post 24000 of 25510.
Post 24500 of 25510.
Post 25000 of 25510.
Post 25500 of 25510.


# Preliminary Modeling Section

# Baseline accuracy
Calculate the baseline accuracy in order to tell whether a model is better than the null model (always predicting the plurality class).

In [15]:
# Overall class shares of the binary label; the larger share is the accuracy
# of a model that always predicts the majority class.
class_shares = df['section_name'].value_counts(normalize=True)
class_shares

section_name
0    0.765817
1    0.234183
Name: proportion, dtype: float64

In [16]:
# Class balance of the training labels (1 = Opinion, 0 = not Opinion);
# the majority-class share is the baseline accuracy a model has to beat.
y_train.value_counts(normalize=True)

section_name
0    0.765729
1    0.234271
Name: proportion, dtype: float64

In [17]:
# Gap between the Opinion share in the full data and in the training split;
# a value near zero means the split preserved the class balance.
overall_rate = np.mean(df['section_name'])
train_rate = np.mean(y_train)
rate_gap = np.abs(overall_rate - train_rate)
print(f"Difference metric | {round(rate_gap, 5)}")

Difference metric | 9e-05


# Classification Modeling Section