# Generative AI Text Summarization

In [26]:
# imports

import opendatasets as od
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from openai import OpenAI
import os
from dotenv import load_dotenv
import s3fs
import fs_s3fs
import fsspec
import json
from llama_index.core import TreeIndex, SimpleDirectoryReader
from sklearn.metrics import f1_score, accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import SGDClassifier
import tensorflow as tf
import keras
import transformers
import mlflow
import hyperopt as hp
import sphinx
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
# download stopwords

# nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nickr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [19]:
pd.set_option('display.max_colwidth', None)

In [27]:
# Download dataset from Kaggle

dataset = "https://www.kaggle.com/datasets/amananandrai/ag-news-classification-dataset/data"
od.download(dataset)

Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username:

  Nicholas Royal


Your Kaggle Key:

  ········


Dataset URL: https://www.kaggle.com/datasets/amananandrai/ag-news-classification-dataset
Downloading ag-news-classification-dataset.zip to .\ag-news-classification-dataset


100%|█████████████████████████████████████████████████████████████████████████████| 11.4M/11.4M [00:00<00:00, 42.6MB/s]





In [30]:
# Read dataset, import only 30000 rows of data

df = pd.read_csv(r'C:\Users\nickr\OneDrive\Documents\GitHub\generative-ai-text-summarization\data\ag-news-classification-dataset\ag_news.csv',nrows=30000)

In [32]:
df.shape

(30000, 3)

In [33]:
df.head() # Confirm importation

Unnamed: 0,Class Index,Title,Description
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again."
1,3,Carlyle Looks Toward Commercial Aerospace (Reuters),"Reuters - Private investment firm Carlyle Group,\which has a reputation for making well-timed and occasionally\controversial plays in the defense industry, has quietly placed\its bets on another part of the market."
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\about the economy and the outlook for earnings are expected to\hang over the stock market next week during the depth of the\summer doldrums.
3,3,Iraq Halts Oil Exports from Main Southern Pipeline (Reuters),"Reuters - Authorities have halted oil export\flows from the main pipeline in southern Iraq after\intelligence showed a rebel militia could strike\infrastructure, an oil official said on Saturday."
4,3,"Oil prices soar to all-time record, posing new menace to US economy (AFP)","AFP - Tearaway world oil prices, toppling records and straining wallets, present a new economic menace barely three months before the US presidential elections."


# Data Cleaning and Preprocessing

In [34]:
# find null values and datatypes

df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Class Index  30000 non-null  int64 
 1   Title        30000 non-null  object
 2   Description  30000 non-null  object
dtypes: int64(1), object(2)
memory usage: 10.3 MB


There are no null values in the df_train dataset.

In [35]:
# check for duplicates

df.duplicated().sum()

0

There are no duplicate values in the df_train dataset.

In [36]:
# Cleaning data set html, special, and non-textual characters

def cleaning_text(text):
    # Remove HTML tags
    cleaning_text = re.sub('<.*?>', '', text)
    # Remove special characters and non-textual 
    cleaning_text = re.sub(r'([^a-zA-Z\s]|\\b[A-Za-z] \\b|\\b [A-Za-z]\\b)', ' ', cleaning_text) # checks plain text for given characters
    return cleaning_text

In [37]:
# apply text cleaning to text in both Description and Title

df['Description'] = df['Description'].apply(cleaning_text)
df['Title'] = df['Title'].apply(cleaning_text)

In [38]:
df.head(7)

Unnamed: 0,Class Index,Title,Description
0,3,Wall St Bears Claw Back Into the Black Reuters,Reuters Short sellers Wall Street s dwindling band of ultra cynics are seeing green again
1,3,Carlyle Looks Toward Commercial Aerospace Reuters,Reuters Private investment firm Carlyle Group which has a reputation for making well timed and occasionally controversial plays in the defense industry has quietly placed its bets on another part of the market
2,3,Oil and Economy Cloud Stocks Outlook Reuters,Reuters Soaring crude prices plus worries about the economy and the outlook for earnings are expected to hang over the stock market next week during the depth of the summer doldrums
3,3,Iraq Halts Oil Exports from Main Southern Pipeline Reuters,Reuters Authorities have halted oil export flows from the main pipeline in southern Iraq after intelligence showed a rebel militia could strike infrastructure an oil official said on Saturday
4,3,Oil prices soar to all time record posing new menace to US economy AFP,AFP Tearaway world oil prices toppling records and straining wallets present a new economic menace barely three months before the US presidential elections
5,3,Stocks End Up But Near Year Lows Reuters,Reuters Stocks ended slightly higher on Friday but stayed near lows for the year as oil prices surged past a barrel offsetting a positive outlook from computer maker Dell Inc DELL O
6,3,Money Funds Fell in Latest Week AP,AP Assets of the nation s retail money market mutual funds fell by billion in the latest week to trillion the Investment Company Institute said Thursday


In [39]:
# Create a function to remove stop words

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    tokens = word_tokenize(text.lower())  # Tokenization and lowercasing
    tokens = [word for word in tokens if word not in stop_words]  # Stop word removal
    return ' '.join(tokens)

In [40]:
# apply preprocessing to text in both Description and Title

df['Description'] = df['Description'].apply(preprocess_text)
df['Title'] = df['Title'].apply(preprocess_text)

In [41]:
df.head(7)

Unnamed: 0,Class Index,Title,Description
0,3,wall st bears claw back black reuters,reuters short sellers wall street dwindling band ultra cynics seeing green
1,3,carlyle looks toward commercial aerospace reuters,reuters private investment firm carlyle group reputation making well timed occasionally controversial plays defense industry quietly placed bets another part market
2,3,oil economy cloud stocks outlook reuters,reuters soaring crude prices plus worries economy outlook earnings expected hang stock market next week depth summer doldrums
3,3,iraq halts oil exports main southern pipeline reuters,reuters authorities halted oil export flows main pipeline southern iraq intelligence showed rebel militia could strike infrastructure oil official said saturday
4,3,oil prices soar time record posing new menace us economy afp,afp tearaway world oil prices toppling records straining wallets present new economic menace barely three months us presidential elections
5,3,stocks end near year lows reuters,reuters stocks ended slightly higher friday stayed near lows year oil prices surged past barrel offsetting positive outlook computer maker dell inc dell
6,3,money funds fell latest week ap,ap assets nation retail money market mutual funds fell billion latest week trillion investment company institute said thursday


In [42]:
# convert to CSV for ease of use in future

cleaned_data_file = r'C:\Users\nickr\OneDrive\Desktop\CapstoneTechX\ag_news_cleaned\cleaned_ag_news.csv'
df.to_csv(cleaned_data_file, index=False)

# Data Splitting

In [43]:
# Split training data into training and validation data 

df_train, df_test = train_test_split(df, test_size=.15, random_state=42)

In [44]:
# Create csv file for train and test data

df_train.to_csv(os.path.join(r'C:\Users\nickr\OneDrive\Desktop\CapstoneTechX\ag_news_cleaned', 'train.csv'), index=False)
df_test.to_csv(os.path.join(r'C:\Users\nickr\OneDrive\Desktop\CapstoneTechX\ag_news_cleaned', 'test.csv'), index=False)

In [45]:
# Create feature data directory

feature_data_dir = r'C:\Users\nickr\OneDrive\Desktop\CapstoneTechX\features'
os.makedirs(feature_data_dir, exist_ok=True)

In [46]:
# TF-IDF Vectorization for Description

tfidf_vectorizer = TfidfVectorizer(max_features=5000)  #we can play around with this. This was an arbitrary value
train_desc_features = tfidf_vectorizer.fit_transform(df_train['Description'])
test_desc_features = tfidf_vectorizer.transform(df_test['Description'])

In [47]:
# TF-IDF Vectorization for Title

tfidf_vectorizer = TfidfVectorizer(max_features=5000)  #we can play around with this. This was an arbitrary value
train_title_features = tfidf_vectorizer.fit_transform(df_train['Title'])
test_title_features = tfidf_vectorizer.transform(df_test['Title'])

In [48]:
print(train_desc_features)

  (0, 2225)	0.33036356267479217
  (0, 3733)	0.4184211803722469
  (0, 1750)	0.3583351578674814
  (0, 2716)	0.39671023064914784
  (0, 1881)	0.3918390748743369
  (0, 94)	0.3785117392696691
  (0, 179)	0.3649541648533077
  (1, 890)	0.23617821913580844
  (1, 1530)	0.24670624310566927
  (1, 3928)	0.2598949916429091
  (1, 4724)	0.14947068143830217
  (1, 233)	0.28935003331476816
  (1, 3648)	0.3071704444233021
  (1, 1673)	0.20373694738379866
  (1, 3696)	0.2912849075171654
  (1, 1740)	0.2306024110131076
  (1, 4553)	0.18091189277274572
  (1, 1364)	0.22020085775839038
  (1, 106)	0.22894752638880386
  (1, 1878)	0.22778402968916106
  (1, 1544)	0.20752450349361823
  (1, 3017)	0.23216621531217477
  (1, 4007)	0.23565503871857377
  (1, 3480)	0.2159983367549507
  (1, 1891)	0.22623557593271407
  :	:
  (25498, 225)	0.21773479405857907
  (25498, 920)	0.3903686330619651
  (25498, 2952)	0.11657380795674555
  (25499, 4150)	0.2799812745611309
  (25499, 4151)	0.26327930512428127
  (25499, 3592)	0.2620197031963991

Note, the vectorizer produces a value for a specific word on a scale of 0 to 1. The closer the number is to 1, the more unique that word is.

In [49]:
# Save the TF-IDF feature matrices

pd.DataFrame(train_desc_features.toarray()).to_csv(os.path.join(feature_data_dir, 'train_desc_features.csv'), index=False)
pd.DataFrame(test_desc_features.toarray()).to_csv(os.path.join(feature_data_dir, 'test_desc_featuress.csv'), index=False)
pd.DataFrame(train_title_features.toarray()).to_csv(os.path.join(feature_data_dir, 'train_title_features.csv'), index=False)
pd.DataFrame(test_title_features.toarray()).to_csv(os.path.join(feature_data_dir, 'test_title_featuress.csv'), index=False)