In [1]:
!pip install fasttext

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.8/68.8 KB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pybind11>=2.2
  Using cached pybind11-2.10.3-py3-none-any.whl (222 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp39-cp39-linux_x86_64.whl size=4391744 sha256=0e80a763521b20928e60aaebd1dc0e028e2fe4135fd79a3d858d727c236d8ecc
  Stored in directory: /root/.cache/pip/wheels/64/57/bc/1741406019061d5664914b070bd3e71f6244648732bc96109e
Successfully built fasttext
Installing collected packages: pybind11, fasttext
Successfully installed fasttext-0.9.2 pybind11-2.10.3


In [2]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
import fasttext

In [3]:
# task 1

from sklearn.datasets import fetch_20newsgroups

In [4]:
# task 2

newsgroups = fetch_20newsgroups()
X = newsgroups.data
Y = newsgroups.target

df = pd.concat([pd.DataFrame(X).rename(columns = ({0: 'Text'})) , pd.DataFrame(Y).rename(columns = ({0: 'Target'}))], axis = 1) 
df

Unnamed: 0,Text,Target
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,1
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14
...,...,...
11309,From: jim.zisfein@factory.com (Jim Zisfein) \n...,13
11310,From: ebodin@pearl.tufts.edu\nSubject: Screen ...,4
11311,From: westes@netcom.com (Will Estes)\nSubject:...,3
11312,From: steve@hcrlgw (Steven Collins)\nSubject: ...,1


In [5]:
# task 3

df['Target'].value_counts()

10    600
15    599
8     598
9     597
11    595
7     594
13    594
5     593
14    593
2     591
12    591
3     590
6     585
1     584
4     578
17    564
16    546
0     480
18    465
19    377
Name: Target, dtype: int64

In [6]:
# task 4

def clean_text(text):
    re.sub("[^0-9A-Za-z\-]+"," ",text)  
    re.sub("(?<!\w)\d+","",text)       
    re.sub("-(?<!\w)| (?<!\w)-",'',text) 
    text = " ".join(text.split())
    text = text.lower()             
    return text

In [7]:
df['Text'] = df['Text'].apply(clean_text)
df['Target'] = pd.Categorical.from_codes(newsgroups.target,newsgroups.target_names)

In [8]:
df

Unnamed: 0,Text,Target
0,from: lerxst@wam.umd.edu (where's my thing) su...,rec.autos
1,from: guykuo@carson.u.washington.edu (guy kuo)...,comp.sys.mac.hardware
2,from: twillis@ec.ecn.purdue.edu (thomas e will...,comp.sys.mac.hardware
3,from: jgreen@amber (joe green) subject: re: we...,comp.graphics
4,from: jcm@head-cfa.harvard.edu (jonathan mcdow...,sci.space
...,...,...
11309,from: jim.zisfein@factory.com (jim zisfein) su...,sci.med
11310,from: ebodin@pearl.tufts.edu subject: screen d...,comp.sys.mac.hardware
11311,from: westes@netcom.com (will estes) subject: ...,comp.sys.ibm.pc.hardware
11312,from: steve@hcrlgw (steven collins) subject: r...,comp.graphics


In [9]:
# task 5

#X_train,X_test,y_train,y_test = train_test_split(df['Text'], df['Target'],train_size=0.8,random_state=42)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=13)

In [10]:
# task 6

with open('concept_train.txt','w+', encoding="utf-8") as train_file:
    for i in train_df.index:
        labels = ''
        for genre in train_df['Target'][i]:
            labels = labels + '__label__' + genre.replace(' ', '_') + ' '
        line = labels + ' '+train_df['Text'][i]
        train_file.write(line+'\n')

with open('concept_test.txt','w+', encoding="utf-8") as test_file:
    for i in test_df.index:
        labels = ''
        for genre in test_df['Target'][i]:
            labels = labels + '__label__'+ genre.replace(' ', '_') + ' '
        line = labels + ' '+test_df['Text'][i]
        test_file.write(line+'\n')

model_sup = fasttext.train_supervised(input='concept_train.txt', lr=0.5, epoch=20, wordNgrams=3, bucket=20000)
test_num, precision, recall = model_sup.test('concept_test.txt', k=1, threshold=0.1)
f_score = (2*precision*recall)/(precision+recall)

for i in zip(model_sup.words, model_sup.labels):
    print(i)

('the', '__label__.')
('to', '__label__s')
('of', '__label__c')
('a', '__label__i')
('and', '__label__o')
('in', '__label__a')
('is', '__label__r')
('i', '__label__e')
('that', '__label__t')
('>', '__label__m')
('for', '__label__p')
('it', '__label__l')
('you', '__label__n')
('on', '__label__w')
('be', '__label__d')
('are', '__label__h')
('this', '__label__y')
('have', '__label__k')
('not', '__label__g')
('with', '__label__b')
('as', '__label__u')
('or', '__label__f')
('was', '__label__-')
('if', '__label__x')


In [11]:
# task 7

test_num, precision, recall = model_sup.test('concept_test.txt', k=1, threshold=0.1)
f1_score = (2*precision*recall)/(precision+recall)
print(f1_score)

0.11685417393872675


# task 8

FastText and One-vs-Rest (OvR) classifiers can have different performance depending on the specific implementation and the size of the dataset. However, in general, FastText is considered to be faster than OvR classifiers for text classification tasks.