In [1]:
import os
import csv
import zipfile

# Step 1: Unzip the data.zip file
with zipfile.ZipFile('data.zip', 'r') as zip_ref:
    zip_ref.extractall('.')

# Step 2: Create a CSV file and write headers
csv_filename = 'bbc_articles.csv'
with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['article_id', 'text', 'category'])

# Step 3: Read text files and write to CSV
articles_folder = 'BBC_articles'
for filename in os.listdir(articles_folder):
    if filename.endswith('.txt'):
        article_id, category = filename.split('_')
        with open(os.path.join(articles_folder, filename), 'r', encoding='utf-8') as file:
            text = file.read()
            with open(csv_filename, 'a', newline='', encoding='utf-8') as csvfile:
                writer = csv.writer(csvfile)
                writer.writerow([article_id, text, category])


In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

df=pd.read_csv('bbc_articles.csv')

In [6]:
df

Unnamed: 0,article_id,text,category
0,1003,jamelia s return to the top r&b star jamelia h...,entertainment.txt
1,1004,microsoft releases bumper patches microsoft ha...,tech.txt
2,1005,critics back aviator for oscars martin scorses...,entertainment.txt
3,1007,winemaker rejects foster s offer australian wi...,business.txt
4,1008,child access law shake-up planned parents who ...,politics.txt
...,...,...,...
1485,992,china s shanda buys stake in sina chinese onli...,business.txt
1486,993,concern over rfid tags consumers are very conc...,tech.txt
1487,995,jones medals must go if guilty world anti-do...,sport.txt
1488,997,benitez joy as reds take control liverpool bos...,sport.txt


In [7]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import string

In [8]:
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\anirb\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\anirb\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
def preprocess_text(text):
    # Tokenization
    tokens = word_tokenize(text)
    # Lowercasing, removing punctuation and stopwords, and stemming
    tokens = [ps.stem(token.lower()) for token in tokens if token.isalpha() and token.lower() not in stop_words]
    return ' '.join(tokens)

In [10]:
df['processed_text'] = df['text'].apply(preprocess_text)

# Step 3: Vectorize the text using TF-IDF
tfidf_vectorizer = TfidfVectorizer()
tfidf_features = tfidf_vectorizer.fit_transform(df['processed_text'])

In [11]:
# Step 4: Combine numerical features with category labels
features_df = pd.DataFrame(tfidf_features.toarray(), columns=tfidf_vectorizer.get_feature_names())
features_df['category'] = df['category']

# Step 5: Write the new CSV file
features_df.to_csv('bbc_articles_numerical_features.csv', index=False)



In [12]:
pd.read_csv('bbc_articles_numerical_features.csv').shape

(1490, 15723)

In [14]:
pd.read_csv('bbc_articles_numerical_features.csv').info

<bound method DataFrame.info of        aa  aaa  aac  aadc  aaliyah  aaltra  aamir  aaron  abacu  abandon  ...  \
0     0.0  0.0  0.0   0.0      0.0     0.0    0.0    0.0    0.0      0.0  ...   
1     0.0  0.0  0.0   0.0      0.0     0.0    0.0    0.0    0.0      0.0  ...   
2     0.0  0.0  0.0   0.0      0.0     0.0    0.0    0.0    0.0      0.0  ...   
3     0.0  0.0  0.0   0.0      0.0     0.0    0.0    0.0    0.0      0.0  ...   
4     0.0  0.0  0.0   0.0      0.0     0.0    0.0    0.0    0.0      0.0  ...   
...   ...  ...  ...   ...      ...     ...    ...    ...    ...      ...  ...   
1485  0.0  0.0  0.0   0.0      0.0     0.0    0.0    0.0    0.0      0.0  ...   
1486  0.0  0.0  0.0   0.0      0.0     0.0    0.0    0.0    0.0      0.0  ...   
1487  0.0  0.0  0.0   0.0      0.0     0.0    0.0    0.0    0.0      0.0  ...   
1488  0.0  0.0  0.0   0.0      0.0     0.0    0.0    0.0    0.0      0.0  ...   
1489  0.0  0.0  0.0   0.0      0.0     0.0    0.0    0.0    0.0      0.0  ...