In [4]:
import pandas as pd
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import string

# Step 1: Load the raw articles CSV into a DataFrame
articles_csv_path = '/content/bbc_articles.csv'
df = pd.read_csv(articles_csv_path)

# Step 2: Tokenize and preprocess text data
_STOP_WORDS = None  # lazily-filled cache of the English stopword list


def _english_stop_words():
    """Return the NLTK English stopwords as a frozenset, loading them once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def preprocess_text(text):
    """Normalize one document into a space-joined string of kept tokens.

    The text is tokenized with ``nltk.word_tokenize``; tokens that are not
    purely alphanumeric (per ``str.isalnum``) are dropped, the remaining
    tokens are lowercased, and English stopwords are removed.

    Args:
        text: Raw document text. ``None`` and float NaN (e.g. the value
            pandas yields for an empty CSV cell) are treated as an empty
            document instead of being passed to the tokenizer.

    Returns:
        The surviving tokens joined by single spaces, or ``''`` when the
        input is missing or no token survives the filters.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Fetch the cached set instead of rebuilding it for every document.
    stop_words = _english_stop_words()

    # Filter on the original token, then lowercase, then drop stopwords.
    lowered = (
        token.lower()
        for token in nltk.word_tokenize(text)
        if token.isalnum()
    )
    return ' '.join(token for token in lowered if token not in stop_words)

# Clean every article by running preprocess_text over the 'text' column
df['clean_text'] = df['text'].apply(preprocess_text)

# Step 3: Fit a TF-IDF model on the cleaned text and score every document
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['clean_text'])

# Expand the scores into a DataFrame with one column per vocabulary term
vocabulary_terms = tfidf_vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(data=tfidf_matrix.toarray(), columns=vocabulary_terms)

# Step 4: Place the 'category' label beside the features and write a new CSV
# NOTE(review): if the fitted vocabulary contains the term 'category', the
# result has two columns with that name -- confirm downstream readers cope.
numerical_features = pd.concat(objs=[tfidf_df, df['category']], axis='columns')
numerical_features.to_csv(path_or_buf='numerical_features.csv', index=False)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
