# Initial Setup

In [None]:
# imports

import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from openai import OpenAI
import os
from dotenv import load_dotenv
import s3fs
import fs_s3fs
import fsspec
import json
from llama_index.core import TreeIndex, SimpleDirectoryReader
from sklearn.metrics import f1_score, accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import SGDClassifier
import tensorflow as tf
import keras
import torch
import transformers
import mlflow
import hyperopt as hp
import sphinx
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [None]:
# Download the NLTK resources this notebook relies on: the WordNet corpus
# (backing WordNetLemmatizer), the Punkt tokenizer models (backing
# word_tokenize), and the stop-word lists.

nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')

In [None]:
# Show full cell contents when displaying DataFrames (no column-width truncation)
pd.options.display.max_colwidth = None

In [None]:
# Set up the Python virtual environment (one-off command, intentionally left commented out)

# !python -m venv C:\Users\nickr\OneDrive\Documents\GitHub\generative-ai-text-summarization\config

In [None]:
# Read the training CSV from its local path into a DataFrame

df_train = pd.read_csv(
    filepath_or_buffer=r"C:\Users\nickr\OneDrive\Desktop\CapstoneTechX\train.csv",
)

In [None]:
# Confirm the import worked by previewing the first rows (head() defaults to 5)
df_train.head()

# Data Cleaning and Preprocessing

In [None]:
# Report each column's dtype and non-null count so missing values stand out;
# memory_usage='deep' makes pandas measure the real memory of object columns.

df_train.info(memory_usage='deep')

There are no null values in the df_train dataset.

In [None]:
# Count duplicated rows: duplicated() flags every row that repeats an earlier
# row across all columns, and sum() totals those flags.

df_train.duplicated().sum()

There are no duplicate rows in the df_train dataset.

In [None]:
# Clean the text: strip HTML tags, special characters, and other non-alphabetic characters

def cleaning_text(text: str) -> str:
    """Strip HTML tags and all non-letter, non-whitespace characters.

    Args:
        text: The raw string to clean.

    Returns:
        The input with every ``<...>`` tag deleted and each remaining
        character that is neither an ASCII letter nor whitespace replaced
        by a single space. Runs of spaces are not collapsed.
    """
    # Remove HTML tags; the non-greedy match stops at the first closing '>'
    # so adjacent tags are removed one at a time.
    without_tags = re.sub(r'<.*?>', '', text)
    # Replace anything that is not an ASCII letter or whitespace with a space.
    # The earlier pattern also listed `\\b[A-Za-z] \\b` and `\\b [A-Za-z]\\b`,
    # but inside a raw string `\\b` is a literal backslash followed by "b"
    # (not a word boundary), and a backslash is already consumed by this
    # character class, so those alternatives could never match. Dropping
    # them leaves the output unchanged.
    # NOTE(review): if the intent was to strip isolated single letters, that
    # needs an explicit `\b[A-Za-z]\b` pass -- confirm before adding, since it
    # would change the cleaned data.
    return re.sub(r'[^a-zA-Z\s]', ' ', without_tags)

In [None]:
# Run cleaning_text over both free-text columns (Description first, then Title)

for text_column in ('Description', 'Title'):
    df_train[text_column] = df_train[text_column].apply(cleaning_text)

In [None]:
# Preview the first seven rows to spot-check the Description and Title text
df_train.head(7)

In [None]:
# Create a function to remove stop words

# NOTE(review): the lemmatizer is instantiated here but preprocess_text below
# does not call it -- confirm whether lemmatization was meant to be applied.
lemmatizer = WordNetLemmatizer()
# English stop words held in a set for constant-time membership checks
stop_words = set(stopwords.words('english'))


def preprocess_text(text):
    """Lowercase *text*, tokenize it, and drop English stop words.

    Args:
        text: The string to preprocess.

    Returns:
        The surviving tokens joined back together with single spaces.
    """
    return ' '.join(
        token
        for token in word_tokenize(text.lower())
        if token not in stop_words
    )

In [None]:
# Run preprocess_text over both free-text columns (Description first, then Title)

for text_column in ('Description', 'Title'):
    df_train[text_column] = df_train[text_column].apply(preprocess_text)

In [None]:
# Preview the first seven rows to spot-check the preprocessed text columns
df_train.head(7)

In [None]:
# Persist the cleaned DataFrame as CSV (without the index column) for later reuse

cleaned_data_file = r'C:\Users\nickr\OneDrive\Documents\GitHub\generative-ai-text-summarization\data\cleaned_ag_news.csv'
df_train.to_csv(path_or_buf=cleaned_data_file, index=False)

# Data Splitting

In [None]:
# Hold out 15% of the rows as a separate set (named df_test) and keep the
# remaining 85% as df_train; random_state fixes the shuffle so the split is
# reproducible.
# NOTE: df_train is rebound to the smaller subset, so re-running this cell
# without reloading the data splits the already-reduced frame again.

df_train, df_test = train_test_split(
    df_train,
    test_size=0.15,
    random_state=42,
)