In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from wordcloud import WordCloud ,STOPWORDS
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from nltk.tokenize import word_tokenize 

import re
import warnings
warnings.filterwarnings("ignore")

In [None]:
!pip install nltk

In [None]:
import nltk

# Resources used by process_text: the English stop-word list and the Punkt
# sentence/word tokenizer models behind word_tokenize.
nltk.download('stopwords')
nltk.download('punkt')
# Newer NLTK releases load the tokenizer from 'punkt_tab' instead of 'punkt';
# fetching both keeps word_tokenize working regardless of the installed version.
nltk.download('punkt_tab')

In [None]:
# Location of the labelled news dataset (CSV hosted on GitHub).
DATA_URL = "https://raw.githubusercontent.com/07NG/newsclassification/master/data.csv"

# Load the whole file into a DataFrame; every later cell works on `data`.
data = pd.read_csv(DATA_URL)

In [None]:
# Preview the first rows of the freshly loaded dataset.
data.head()

# Understanding Features and Target Variables

In [None]:
# Distinct values of the target column, i.e. the classes to be predicted.
data['Category'].unique()

In [None]:
# (number of rows, number of columns) of the dataset.
data.shape

In [None]:
# Column dtypes as inferred by read_csv.
data.dtypes

# Checking for NULL values

In [None]:
# One boolean per column: True if that column contains at least one missing value.
data.isnull().any()

# Countplot of Target Variable(Category)

In [None]:
# Class balance of the target variable. The column is passed by keyword
# (`x=` together with `data=`), which is seaborn's documented calling form.
sns.countplot(x='Category', data=data).set_title('News category distribution');

In [None]:
# Character count of every article, stored as a new column on `data`.
news_lengths = data['Text'].str.len()
data['News_length'] = news_lengths
print(data['News_length'])

# Distribution Plot

In [None]:
# sns.distplot is deprecated; histplot with a KDE overlay and density scaling
# is the replacement that draws the same kind of figure (histogram + density curve).
sns.histplot(data=data, x='News_length', kde=True, stat='density').set_title('News length distribution');


# WordCloud

In [None]:
def create_wordcloud(words):
    """Draw a word cloud for one string of text and show it inline.

    Parameters
    ----------
    words : str
        Text handed to ``WordCloud.generate``.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``; nothing is returned.
    """
    cloud = WordCloud(
        width=800,
        height=500,
        random_state=21,
        max_font_size=110,
    )
    image = cloud.generate(words)

    plt.figure(figsize=(10, 7))
    plt.imshow(image, interpolation="bilinear")
    # Axes carry no meaning for a word cloud, so hide them.
    plt.axis('off')
    plt.show()

In [None]:
# One word cloud per category, in the same order as before, produced by a
# single loop instead of five copy-pasted cells that differed only in the label.
for category in ["business", "entertainment", "politics", "sport", "tech"]:
    # Concatenate every article of this category into one long string.
    category_text = " ".join(data.loc[data['Category'] == category, 'Text'])
    # All figures now share one output area, so label each one.
    print(f"Word cloud for category: {category}")
    create_wordcloud(category_text)

# Feature Engineering

Removing the special characters<br>
1) \r<br>
2) \n<br>

Removing Punctuations and Stopwords

In [None]:
# Lazily populated by _english_stop_words(); None until the first call.
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return NLTK's English stop words as a set, reading the corpus only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = set(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def process_text(text):
    """Normalise one article for vectorisation.

    Steps, in order:
      1. lower-case; replace ``\\n`` with a space, drop ``\\r``; strip the ends;
      2. collapse runs of spaces into a single space;
      3. delete every character that is neither a word character nor
         whitespace (punctuation is removed, not replaced by a space, so
         "ex-boss" becomes "exboss");
      4. tokenise with ``word_tokenize`` and drop English stop words.

    Parameters
    ----------
    text : str
        Raw article text.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    text = text.lower().replace('\n', ' ').replace('\r', '').strip()
    text = re.sub(' +', ' ', text)
    text = re.sub(r'[^\w\s]', '', text)

    # The stop-word set is cached so it is not reloaded for every row when this
    # function is applied over the whole DataFrame.
    stop_words = _english_stop_words()

    # Single filtering pass (the original built this list twice and discarded
    # the first result).
    kept_tokens = [w for w in word_tokenize(text) if w not in stop_words]
    return " ".join(kept_tokens)

In [None]:
# Clean every article with process_text; the result is kept in a new column so the raw 'Text' stays available.
data['Text_parsed'] = data['Text'].apply(process_text)

In [None]:
# Compare raw 'Text' with cleaned 'Text_parsed' on the first rows.
data.head()

Unnamed: 0,ArticleId,Text,Category,News_length,Text_parsed
0,1833,worldcom ex-boss launches defence lawyers defe...,business,1866,worldcom exboss launches defence lawyers defen...
1,154,german business confidence slides german busin...,business,2016,german business confidence slides german busin...
2,1101,bbc poll indicates economic gloom citizens in ...,business,3104,bbc poll indicates economic gloom citizens maj...
3,1976,lifestyle governs mobile choice faster bett...,tech,3618,lifestyle governs mobile choice faster better ...
4,917,enron bosses in $168m payout eighteen former e...,business,2190,enron bosses 168m payout eighteen former enron...


# Label Encoding

In [None]:
from sklearn import preprocessing

# Turn the category names into integer class ids. The fitted encoder is kept
# so predictions can later be mapped back to names with inverse_transform.
label_encoder = preprocessing.LabelEncoder()
category_ids = label_encoder.fit_transform(data['Category'])
data['Category_target'] = category_ids

In [None]:
# Check that the new 'Category_target' column sits next to 'Category'.
data.head()

Unnamed: 0,ArticleId,Text,Category,News_length,Text_parsed,Category_target
0,1833,worldcom ex-boss launches defence lawyers defe...,business,1866,worldcom exboss launches defence lawyers defen...,0
1,154,german business confidence slides german busin...,business,2016,german business confidence slides german busin...,0
2,1101,bbc poll indicates economic gloom citizens in ...,business,3104,bbc poll indicates economic gloom citizens maj...,0
3,1976,lifestyle governs mobile choice faster bett...,tech,3618,lifestyle governs mobile choice faster better ...,4
4,917,enron bosses in $168m payout eighteen former e...,business,2190,enron bosses 168m payout eighteen former enron...,0


In [None]:
# Show the original (string) category column.
data.Category

0            business
1            business
2            business
3                tech
4            business
            ...      
1485    entertainment
1486    entertainment
1487         business
1488             tech
1489             tech
Name: Category, Length: 1490, dtype: object

In [None]:
# Encoded class ids, listed in order of first appearance in the column.
data['Category_target'].unique()


array([0, 4, 2, 3, 1])

In [None]:
# Category names in order of first appearance; position-by-position this lines up
# with the ids shown by data['Category_target'].unique().
data['Category'].unique()

array(['business', 'tech', 'politics', 'sport', 'entertainment'],
      dtype=object)

# Split the data in Training and testing

In [None]:
# Hold out 20% of the articles for testing; random_state is fixed so the
# split is the same on every run.
split_parts = train_test_split(
    data['Text_parsed'],
    data['Category_target'],
    test_size=0.2,
    random_state=8,
)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Settings passed to TfidfVectorizer in the next cell.
ngram_range = (1, 2)    # range of n-gram sizes: unigrams and bigrams
min_df = 10             # minimum document frequency for a term to be kept
max_df = 1.0            # maximum document frequency (as a proportion)
max_features = 300      # cap on the vocabulary size

In [None]:
tfidf = TfidfVectorizer(
    encoding='utf-8',
    ngram_range=ngram_range,
    stop_words=None,    # stop words were already removed by process_text
    lowercase=False,    # text was already lower-cased by process_text
    max_df=max_df,
    min_df=min_df,
    max_features=max_features,
    norm='l2',
    sublinear_tf=True,
)

# Fit the vocabulary and IDF weights on the training split only ...
train_matrix = tfidf.fit_transform(X_train)
features_train = train_matrix.toarray()
labels_train = y_train
print(features_train.shape)

# ... and reuse the fitted vectorizer, unchanged, for the test split.
test_matrix = tfidf.transform(X_test)
features_test = test_matrix.toarray()
labels_test = y_test
print(features_test.shape)





(1192, 300)
(298, 300)


# Building Models

# Random Forest

# Logistic Regression

In [None]:
# Baseline: logistic regression with scikit-learn's default hyper-parameters.
model = LogisticRegression()
print(model.get_params())

model.fit(features_train, labels_train)
model_predictions = model.predict(features_test)

# Compare train and test accuracy to gauge over-fitting.
train_accuracy = model.score(features_train, labels_train)
print('Accuracy of training model: ', train_accuracy)
test_accuracy = model.score(features_test, labels_test)
print('Accuracy of testing model: ', test_accuracy)

{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
Accuracy of training model:  0.9840604026845637
Accuracy of testing model:  0.9429530201342282


# Hyper Parameter Tuning (Logistic Regression)

In [None]:
# NOTE(review): C=1 equals the default reported by get_params() for the
# baseline model, so this fit reproduces the baseline rather than a tuned model.
model = LogisticRegression(C=1)
model.fit(features_train, labels_train)
model_predictions = model.predict(features_test)

test_accuracy = accuracy_score(labels_test, model_predictions)
print('Accuracy: ', test_accuracy)

# Per-class precision / recall / F1 on the held-out split.
report = classification_report(labels_test, model_predictions)
print(report)

Accuracy:  0.9429530201342282
              precision    recall  f1-score   support

           0       0.92      0.92      0.92        76
           1       0.98      0.98      0.98        47
           2       0.96      0.87      0.91        55
           3       0.96      0.98      0.97        65
           4       0.91      0.96      0.94        55

    accuracy                           0.94       298
   macro avg       0.95      0.94      0.94       298
weighted avg       0.94      0.94      0.94       298



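Accuracy remains the same after hyperparameter tuning. This is expected: `C=1` is already the default value of `LogisticRegression` (see the `get_params()` output of the baseline model), so this model is identical to the baseline and no other values of `C` have been tried yet.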

In [None]:
# Sanity check: labels, feature matrices and predictions must agree on row counts.
for array_like in (labels_test, labels_train, features_test, features_train, model_predictions):
    print(array_like.shape)

(298,)
(1192,)
(298, 300)
(1192, 300)
(298,)


In [None]:
# Clean an unseen snippet with the same process_text used on the training data.
pt=process_text("The bank has paid the amount to the wife of Uday Kumar Sah, who was an account holder at the bank and passed away due to an accident. Dipesh Singh Thakuri, Madhesh State head of Sanima Bank, handed over the cheque for the insurance amount in conjunction with insurance provider company Siddharth Insurance Ltd, as per a press release. ")

In [None]:
# Vectorise the cleaned snippet with the already-fitted tfidf (transform only, no refit);
# the single string is wrapped in a list because transform expects an iterable of documents.
p2t=tfidf.transform([pt]).toarray()
print(p2t)
print(p2t.shape)

[[0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.32745454 0.         0.         0.83932653
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.28474203 0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.32745454 0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.      

In [None]:
# Predict the encoded class id for the single vectorised snippet.
predictedtext = model.predict(p2t)
print(predictedtext)
print(predictedtext.shape)

[0]
(1,)


In [None]:
def classify_text(text1):
    """Classify one raw news text and return its category name.

    The text is cleaned with ``process_text``, vectorised with the fitted
    ``tfidf``, scored by the trained ``model`` and the predicted class id is
    mapped back to its name with ``label_encoder``. Both the id and the name
    are printed along the way.

    Parameters
    ----------
    text1 : str
        Raw, unprocessed text.

    Returns
    -------
    The category label produced by ``label_encoder.inverse_transform``.
    """
    cleaned = process_text(text1)

    # Same vectorizer as in training; transform only, wrapped in a list
    # because a collection of documents is expected.
    features = tfidf.transform([cleaned]).toarray()

    predicted_ids = model.predict(features)
    category_idx = predicted_ids[0]
    print(category_idx)

    # Translate the integer id back into the original category string.
    decoded = label_encoder.inverse_transform([category_idx])
    category_label = decoded[0]
    print(category_label)
    return category_label

In [None]:
# End-to-end check of classify_text on an unseen football-league snippet;
# the returned value is the category name.
pred_text = "Nepal Police Club defeated FC Khumaltar 3-1, while Friends Club salvaged a 1-1 draw with APF Football Club in the Martyrs Memorial A Division League here today. "
predict=classify_text(pred_text)

print("prediction:",predict )

3
sport
()
prediction: sport
