In [59]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Perceptron
from sklearn.metrics import classification_report

# **1. Loading data set**

In [58]:
corpus = pd.read_csv('C:\\Users\\DELL\\OneDrive\\Bureau\\M2-RFIA\\Text Mining\\TP\\text-mining\\data\\bbc-text.csv', encoding='utf-8')
corpus.head() # print the first 5 rows

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


### **1.1 Checking for missing values**

In [20]:
corpus.isnull().sum() # check for missing values
# case of missing values in the corpus we can use corpus.fillna(method='ffill') to fill them with the previous value

category    0
text        0
dtype: int64

### **1.2 Unique Counts in 'category' and 'text' Columns**

In [21]:
corpus['category'].nunique(), corpus.text.nunique() # to print unique values ( for columns ) we can use nunique() function

(5, 2126)

### **1.3 Counting Occurrences of Unique Text**

In [22]:
corpus.groupby('text').size().reset_index(name='counts')

Unnamed: 0,text,counts
0,$1m payoff for former shell boss shell is to p...,1
1,&#163;1.8m indecency fine for viacom media gia...,1
2,2004: an irish athletics year 2004 won t be re...,1
3,2d metal slug offers retro fun like some drill...,2
4,a decade of good website design the web looks ...,1
...,...,...
2121,yukos seeks court action on sale yukos will re...,1
2122,yukos sues four firms for $20bn russian oil fi...,1
2123,yukos unit buyer faces loan claim the owners o...,1
2124,yukos unit fetches $9bn at auction a little-kn...,1


# **2. Preparing Data for Classification**

### **2.1 Extracting Texts**

In [23]:
text = corpus.drop('category', axis=1) # axis=1 means that we are referring to a column, not a row
text.head() # to print the first 5 rows

Unnamed: 0,text
0,tv future in the hands of viewers with home th...
1,worldcom boss left books alone former worldc...
2,tigers wary of farrell gamble leicester say ...
3,yeading face newcastle in fa cup premiership s...
4,ocean s twelve raids box office ocean s twelve...


In [40]:
text.columns # to print columns names

Index(['text'], dtype='object')

### **2.1.1 Text Data Vectorization with DictVectorizer**

In this section, we'll use the scikit-learn library to vectorize text data using the DictVectorizer. 
This process involves converting a DataFrame containing text into a numerical format suitable for machine learning models.

In [37]:
v = DictVectorizer(sparse=False) # to convert text to numerical values 
# ( expected output to be a dense matrix (NumPy array) rather than a sparse matrix )
texts = v.fit_transform(text.to_dict('records'))
# to convert the output to a dataframe
texts.shape 
# to print the shape of the dataframe

(2225, 2126)

### **2.2 Extracting Categories**

In [26]:
categories = corpus.category.values

### **2.3 Data Preparation**

In [27]:

classes = np.unique(categories) # to get unique values of categories

In [28]:
classes = classes.tolist() # to convert numpy array to list

In [35]:
classes

['business', 'entertainment', 'politics', 'sport', 'tech']

In [29]:

texts.shape, categories.shape 
# to print the shape of texts and categories

((2225, 2126), (2225,))

### **2.4 Train-Test Split**

In [30]:
text_features_train, text_features_test, categories_labels_train, categories_labels_test = train_test_split(texts, categories, test_size = 0.33, random_state=0) 
# to split the data into training and testing sets

In [31]:
text_features_train.shape, categories_labels_train.shape
# to print the shape of training data

((1490, 2126), (1490,))

### **2.5 Partial Fit of Perceptron Model**

In [None]:

per = Perceptron(verbose=10, n_jobs=-1, max_iter=5)
per.partial_fit(text_features_train, categories_labels_train, classes=classes)

# **3. Classification Report for Text Classification Model**

In [36]:

print(classification_report(y_pred=per.predict(text_features_test), y_true=categories_labels_test, labels=classes))

               precision    recall  f1-score   support

     business       0.24      1.00      0.39       168
entertainment       1.00      0.07      0.12       123
     politics       1.00      0.02      0.05       124
        sport       1.00      0.02      0.03       190
         tech       1.00      0.18      0.30       130

     accuracy                           0.28       735
    macro avg       0.85      0.26      0.18       735
 weighted avg       0.83      0.28      0.18       735

