# **Importing Packages**

In [18]:
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
!pip install langdetect
from langdetect import detect
import pickle
nltk.download("stopwords")

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# **Loading the dataset**

In [19]:
#BBC News Classification
AG = pd.read_csv("ag.csv")

In [20]:
# Assuming your dataset is stored in a DataFrame called 'df'
AG_df = pd.DataFrame(AG, columns=['Class Index', 'Title', 'Description'])

# Change column names
AG_df = AG_df.rename(columns={'Class Index': 'category', 'Description': 'text'})

# Drop 'Title' column
AG_df = AG_df.drop('Title', axis=1)


# **Exploring the dataset**

In [5]:
print(list(AG_df.columns))

['category', 'text']


In [None]:
print(AG.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120000 entries, 0 to 119999
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   Class Index  120000 non-null  int64 
 1   Title        120000 non-null  object
 2   Description  120000 non-null  object
dtypes: int64(1), object(2)
memory usage: 2.7+ MB
None


In [None]:
print(f"Dataframe Lenght: {len(AG_df)}")

Dataframe Lenght: 120000


In [21]:
#shuffling the data frame
AG_df = AG_df.sample(frac=1).reset_index(drop=True)

In [22]:
AG_df.head()

Unnamed: 0,category,text
0,1,The Army's intelligence chief said that he has...
1,3,US sales of previously owned houses rose in Se...
2,4,The new transfer policy for inter-registrar do...
3,3,Singapore #39;s unemployment rate fell in the ...
4,4,A prominent lineup of advertising talent is wo...


In [None]:
categories = list(np.unique(AG_df['category']))

In [None]:
print(categories)

[1, 2, 3, 4]


# **Preprocessing**


In [25]:
preprocessed_df = AG_df.copy()

cv = CountVectorizer(max_features = 5000)
corpus = []
ps = PorterStemmer()
for i in range(len(preprocessed_df)):
  news = re.sub('[^a-zA-Z]', ' ', preprocessed_df.iloc[i]['text']) #Regex to replace each not alphabetical char with a space
  news = news.lower().split()
  news = [ps.stem(word) for word in news if not word in set(stopwords.words('english'))] #Stemming and stopword removal
  news = ' '.join(news) #Joining the words of the array into a string 
  preprocessed_df.iloc[i]['text'] = news 



# The code snippet provided demonstrates the usage of the CountVectorizer class from scikit-learn (cv = CountVectorizer(max_features=5000)) along with some additional data preprocessing steps using NLTK and regular expressions.

# Let's break down the code step by step:

# cv = CountVectorizer(max_features=5000): An instance of CountVectorizer is created with the max_features parameter set to 5000. 
# This sets a limit on the maximum number of features (words) to consider when constructing the feature matrix.
# ---------------------------------------------------------------------------------------------------------------------------
# corpus = []: refers to a collection of text documents that are used for analysis or training of language models.
# ---------------------------------------------------------------------------------------------------------------------------
# ps = PorterStemmer(): An instance of PorterStemmer from NLTK is created.
# The PorterStemmer is used for stemming words, reducing them to their base or root form.
# ---------------------------------------------------------------------------------------------------------------
# A loop is set up to iterate over each row of the preprocessed_df DataFrame
# (assuming preprocessed_df is a DataFrame containing preprocessed text data).
# ---------------------------------------------------------------------------------------------------------------------------
# news = re.sub('[^a-zA-Z]', ' ', preprocessed_df.iloc[i]['text']):
# The re.sub() function from the re module is used to replace any non-alphabetical characters in the text with spaces.
# This regex pattern replaces all characters except alphabets [a-zA-Z] with a space: such as digits, punctuation marks, and special characters
# ---------------------------------------------------------------------------------------------------------------------------
# news = news.lower().split(): The text is converted to lowercase and then split into a list of words.
# ---------------------------------------------------------------------------------------------------------------------------
# news = [ps.stem(word) for word in news if not word in set(stopwords.words('english'))]: 
# The list comprehension iterates over each word in the news list and applies stemming using the PorterStemmer. 
# Additionally, it checks if the word is not present in the set of English stopwords obtained from NLTK (stopwords.words('english')). 
# This step removes stopwords from the text.
# ---------------------------------------------------------------------------------------------------------------------------
# news = ' '.join(news): The list of stemmed and filtered words is joined back into a single string, separated by spaces.
# ---------------------------------------------------------------------------------------------------------------------------
# preprocessed_df.iloc[i]['text'] = news: The preprocessed text (news) is assigned back to the 'text' column of the preprocessed_df DataFrame for the current row.
# ---------------------------------------------------------------------------------------------------------------------------
# The code processes the text data in the preprocessed_df DataFrame by applying the steps of replacing non-alphabetical characters, converting to lowercase, 
# splitting into words, stemming, removing stopwords, and finally joining the words back into a string. 
# This prepares the text data for further analysis or modeling using the CountVectorizer with a limit of 5000 features.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  preprocessed_df.iloc[i]['text'] = news


In [None]:
#data after preprocessed
preprocessed_df['text'].head()

0    Reuters - Short-sellers, Wall Street's dwindli...
1    Reuters - Private investment firm Carlyle Grou...
2    Reuters - Soaring crude prices plus worries\ab...
3    Reuters - Authorities have halted oil export\f...
4    AFP - Tearaway world oil prices, toppling reco...
Name: text, dtype: object

In [26]:
# Creating a bag of words
preprocessed_df['text'] = list(cv.fit_transform(list(preprocessed_df['text'])).toarray())

''' 
cv.fit_transform(list(preprocessed_df['Text'])): The fit_transform() method of the CountVectorizer object (cv) is applied to the 'Text' column of the preprocessed_df DataFrame. 
This method fits the vectorizer to the text data and transforms the text documents into a matrix of token counts.

toarray(): The toarray() method is called on the transformed matrix to convert it from a sparse matrix representation to a dense array.

list(cv.fit_transform(list(preprocessed_df['Text'])).toarray()): The dense array representation of the transformed matrix is converted to a list. 
This creates a list of arrays, where each array corresponds to a row in the transformed matrix.

preprocessed_df['Text'] = ...: The 'Text' column in the preprocessed_df DataFrame is assigned the list of arrays obtained from the previous step. 
This replaces the original 'Text' column with the bag of words representation.

In summary, the code applies the CountVectorizer object (cv) to the 'Text' column of the preprocessed_df DataFrame, converting the text data into a bag of words representation. 
The resulting bag of words representation is then assigned back to the 'Text' column in the DataFrame.

Note that the bag of words representation will contain the count of each word in the text documents, 
where each column corresponds to a specific word and each row corresponds to a document in the DataFrame.

'''

" \ncv.fit_transform(list(preprocessed_df['Text'])): The fit_transform() method of the CountVectorizer object (cv) is applied to the 'Text' column of the preprocessed_df DataFrame. \nThis method fits the vectorizer to the text data and transforms the text documents into a matrix of token counts.\n\ntoarray(): The toarray() method is called on the transformed matrix to convert it from a sparse matrix representation to a dense array.\n\nlist(cv.fit_transform(list(preprocessed_df['Text'])).toarray()): The dense array representation of the transformed matrix is converted to a list. \nThis creates a list of arrays, where each array corresponds to a row in the transformed matrix.\n\npreprocessed_df['Text'] = ...: The 'Text' column in the preprocessed_df DataFrame is assigned the list of arrays obtained from the previous step. \nThis replaces the original 'Text' column with the bag of words representation.\n\nIn summary, the code applies the CountVectorizer object (cv) to the 'Text' column of

In [27]:
preprocessed_df.head()

Unnamed: 0,category,text
0,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [28]:
# Label the data
categories = ['World','Sports','Business','Sci/Tech']
preprocessed_df['category'] = [ category-1 for category in preprocessed_df['category'] ]

'''

categories = list(np.unique(preprocessed_df['category'])): This line retrieves the unique categories present in the 'category' column of the DataFrame and stores them as a list in the categories variable. 
The np.unique() function from the NumPy library is used to obtain the unique values.

preprocessed_df['category'] = [categories.index(category) for category in preprocessed_df['category']]: This line assigns numerical labels to the categories in the 'category' column of the DataFrame. 
It uses a list comprehension to iterate over each category in the 'category' column.

For each category, categories.index(category) is used to find the index of the category in the categories list. 
This index represents the numerical label for that category.

The resulting numerical labels are then assigned back to the 'category' column of the DataFrame, replacing the original categorical values.

'''

"\n\ncategories = list(np.unique(preprocessed_df['category'])): This line retrieves the unique categories present in the 'category' column of the DataFrame and stores them as a list in the categories variable. \nThe np.unique() function from the NumPy library is used to obtain the unique values.\n\npreprocessed_df['category'] = [categories.index(category) for category in preprocessed_df['category']]: This line assigns numerical labels to the categories in the 'category' column of the DataFrame. \nIt uses a list comprehension to iterate over each category in the 'category' column.\n\nFor each category, categories.index(category) is used to find the index of the category in the categories list. \nThis index represents the numerical label for that category.\n\nThe resulting numerical labels are then assigned back to the 'category' column of the DataFrame, replacing the original categorical values.\n\n"

# **Train and test splits**

In [43]:
# Train and test splits
x_train, x_test, y_train, y_test = train_test_split(preprocessed_df['text'], preprocessed_df['category'], test_size=0.1, random_state=42)

# **Training the model**


In [44]:
model = LogisticRegression(max_iter=2000)
model.fit(list(x_train), y_train)

# **Evaluation**

In [45]:
num = 1000
y_preds = model.predict(list(x_test)[:num])
Y_test = y_test[:num]

accuracy = accuracy_score(Y_test, y_preds)
precision = precision_score(Y_test, y_preds, average='weighted')
recall = recall_score(Y_test, y_preds, average='weighted')
f1 = f1_score(Y_test, y_preds, average='weighted')
print('Accuracy: {:.2f}%'.format(accuracy*100))
print('Precision: {:.2f}%'.format(precision*100))
print('Recall: {:.2f}%'.format(recall*100))
print('F1-score: {:.2f}%'.format(f1*100))

Accuracy: 89.60%
Precision: 89.63%
Recall: 89.60%
F1-score: 89.60%


# **Making Prediction**

In [46]:
test_news = """ 
connors boost for british tennis former world number one jimmy connors is planning a long-term relationship with the lawn tennis association to help unearth the next tim henman.  the american spent three days at the lta s annual elite performance winter camp in la manga earlier this week.  britain has the right attitude   said connors.  the more involved i can be with the lta  the better.  a short-term arrangement is just confusing. the kids will ask:  what am i doing there    lta chief executive  john crowther  added:  the relationship that jimmy s already started to develop with the coaches and the players has said to us that we d like some more of it.  we want to use jimmy for a number of weeks a year and we hope this is the beginning of a good long-term relationship.   the camp played host to more than 30 leading senior and junior players  including greg rusedski  arvind parmar and anne keothavong.  la manga is an amazing site to take a bunch of kids who want to be the best   said connors  speaking at queen s club in london.  what impressed me most was not only the coaches but the way the kids went about their workouts and the feeling they put into every practice they had.  it was interesting to me to see kids of 15  16  17  with that desire and passion  and that can only be brought about by the coaches surrounding them.  instilling the importance of work and practice is something you can t buy.  they know what s been given to them and all they have to do is give back the effort  and every minute of practice they were doing that.   speaking from la manga  lta performance director david felgate told bbc sport:  jimmy was fantastic with the players and the coaches  and very humble considering what he s achieved.  he worked through the coaches and hopefully it will grow and he ll get to have more of an individual relationship with some of the players and get to know them.  he made it clear from the word go he didn t want it to be short-term. this is a 52-week-a-year job for me  it s my life and my passion and it s the same with the coaches.  he respects that but he wants to be involved and have real input. and why would he stake his reputation on something that s not going to be successful   connors has also agreed to commentate for the bbc at next year s wimbledon championships. he will work during the second week of the tournament.
"""

test_news = re.sub('[^a-zA-Z]', ' ', test_news)
test_news = test_news.lower()
test_news = test_news.split()
test_news = [ps.stem(word) for word in test_news if not word in set(stopwords.words('english'))]
test_news = ' '.join(test_news)

test_X = cv.transform([test_news]).toarray()

preds = model.predict(test_X)
print(preds)
result = categories[preds[0]]
print("Predicted categories: ", result)

[1]
Predicted categories:  Sports


# **Exporting and loading the trained model and vectorizor**

In [47]:
# Export the trained model and CountVectorizer instance
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)
with open('cv.pkl', 'wb') as f:
    pickle.dump(cv, f)

# Load the saved model and CountVectorizer instance, we will use this at the python app 
# with open('model.pkl', 'rb') as f:
#     model = pickle.load(f)
# with open('cv.pkl', 'rb') as f:
#     cv = pickle.load(f)

# **Text Validation Checking**

In [None]:
def is_valid_text(text):
    english_chars = set(string.printable) 
    is_english = all(char in english_chars for char in text)
    return is_english

In [None]:
import string
string.printable

'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c'

In [None]:
from langdetect import detect

def is_english(sentence):
    try:
        lang = detect(sentence)
        print(len(lang))
        return lang == 'en'
    except:
        return False


In [None]:
is_english("i love playing football")

2


True