# **Task 1**

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import json
from time import time


In [3]:
import sklearn
from sklearn.datasets import load_files
from sklearn.datasets import fetch_20newsgroups


In [4]:

newsgroups = fetch_20newsgroups(subset='all', shuffle=True, random_state=42)

df = pd.DataFrame({'data': newsgroups.data, 'target': newsgroups.target})

print(df.head())


                                                data  target
0  From: Mamatha Devineni Ratnam <mr47+@andrew.cm...      10
1  From: mblawson@midway.ecn.uoknor.edu (Matthew ...       3
2  From: hilmi-er@dsv.su.se (Hilmi Eren)\nSubject...      17
3  From: guyd@austin.ibm.com (Guy Dawson)\nSubjec...       3
4  From: Alexander Samuel McDiarmid <am2o+@andrew...       4


# Task 3 :
Display the length of all categories in the input dataset

In [5]:
target_names = newsgroups.target_names

category_lengths = [len(newsgroups.data[newsgroups.target == target_names[i]]) for i in range(len(target_names))]

# Print the length of each category
for i, name in enumerate(target_names):
    print(f"{name}: {category_lengths[i]}")

alt.atheism: 902
comp.graphics: 902
comp.os.ms-windows.misc: 902
comp.sys.ibm.pc.hardware: 902
comp.sys.mac.hardware: 902
comp.windows.x: 902
misc.forsale: 902
rec.autos: 902
rec.motorcycles: 902
rec.sport.baseball: 902
rec.sport.hockey: 902
sci.crypt: 902
sci.electronics: 902
sci.med: 902
sci.space: 902
soc.religion.christian: 902
talk.politics.guns: 902
talk.politics.mideast: 902
talk.politics.misc: 902
talk.religion.misc: 902


# Task 4 
Clean the data and save it as a single dataframe


In [6]:
import re

In [7]:
import re
from nltk.corpus import stopwords

In [8]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [12]:
stop_words = set(stopwords.words('english'))
clean_data = []

for doc in df['data']:
# Remove special characters and convert to lowercase
    cleaned_doc = re.sub(r'[^a-zA-Z]', ' ', doc).lower()
    
# Remove stop words
    cleaned_doc = ' '.join([word for word in cleaned_doc.split() if word not in stop_words])
    clean_data.append(cleaned_doc)

df['cleaned_data']=pd.DataFrame(clean_data)



In [13]:
df.head()

Unnamed: 0,data,target,cleaned_data
0,From: Mamatha Devineni Ratnam <mr47+@andrew.cm...,10,mamatha devineni ratnam mr andrew cmu edu subj...
1,From: mblawson@midway.ecn.uoknor.edu (Matthew ...,3,mblawson midway ecn uoknor edu matthew b lawso...
2,From: hilmi-er@dsv.su.se (Hilmi Eren)\nSubject...,17,hilmi er dsv su se hilmi eren subject armenia ...
3,From: guyd@austin.ibm.com (Guy Dawson)\nSubjec...,3,guyd austin ibm com guy dawson subject ide vs ...
4,From: Alexander Samuel McDiarmid <am2o+@andrew...,4,alexander samuel mcdiarmid andrew cmu edu subj...


# Task 5 
Split the dataframe into train and test.

In [14]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(df['data'],df['target'],test_size=0.2, random_state=13)

In [15]:
x_train.shape,x_test.shape,y_train.shape,y_test.shape

((15076,), (3770,), (15076,), (3770,))

In [16]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

print(f"Number of documents in train set: {len(train_df)}")
print(f"Number of documents in test set: {len(test_df)}")

Number of documents in train set: 15076
Number of documents in test set: 3770


# Task 6 
Perform multi- label classification with fastText

In [17]:
!pip install fasttext
import fasttext

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [18]:
clean_data = []
with open('train.txt', 'w') as f:
    for index, row in train_df.iterrows():
        f.write('__label__' + str(row['target']) + ' ' + row['cleaned_data'] + '\n')

# Save test data to file
with open('test.txt', 'w') as f:
    for index, row in test_df.iterrows():
        f.write('__label__' + str(row['target']) + ' ' + row['cleaned_data'] + '\n')

In [19]:
import fasttext

# Train the model
model = fasttext.train_supervised('train.txt')

# Evaluate the model on the test data
test_results = model.test('test.txt')

In [20]:
from sklearn.metrics import classification_report

with open('test.txt', 'r') as f:
    test_data = f.readlines()
predicted_labels = [model.predict(doc.strip())[0] for doc in test_data]
# Extract the predicted and true labels from the predicted_labels list
predicted_labels = [pred[0].replace('__label__', '') for pred in predicted_labels]
true_labels = [doc.strip().split()[0].replace('__label__', '') for doc in test_data]
print(classification_report(true_labels, predicted_labels))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       151
           1       0.00      0.00      0.00       202
          10       0.28      0.80      0.41       198
          11       0.44      0.33      0.38       201
          12       0.00      0.00      0.00       202
          13       0.00      0.00      0.00       194
          14       1.00      0.01      0.02       189
          15       0.16      0.92      0.27       202
          16       0.07      0.02      0.03       188
          17       0.00      0.00      0.00       182
          18       0.00      0.00      0.00       159
          19       0.00      0.00      0.00       136
           2       0.29      0.39      0.33       195
           3       0.09      0.04      0.06       183
           4       0.00      0.00      0.00       205
           5       0.34      0.72      0.46       215
           6       0.20      0.79      0.32       193
           7       0.00    

# Task 7

In [21]:
from sklearn.metrics import f1_score

f1 = f1_score(true_labels, predicted_labels, average='weighted')
print(f"F1-score: {f1*100:.2f}%")

F1-score: 13.11%


# Interpretation 
Fasttext is considered to be the quickest and smoothest text classification algorithms for huge datasets.