<a href="https://colab.research.google.com/github/Ayushrawat651/Ayushrawat651/blob/main/My_projects/Summarizer/news_summarizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
import pickle

In [None]:
# Load the article dataset (bbc.csv) from the Colab /content directory
DATA_PATH = "/content/bbc.csv"
train_df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the loaded frame (displayed as the cell's last expression)
train_df

Unnamed: 0,ArticleId,Text,Category
0,0,UK economy facing 'major risks'\n \n The UK ma...,business
1,1,Aids and climate top Davos agenda\n \n Climate...,business
2,2,Asian quake hits European shares\n \n Shares i...,business
3,3,India power shares jump on debut\n \n Shares i...,business
4,4,Lacroix label bought by US firm\n \n Luxury go...,business
...,...,...,...
2220,2220,Warning over Windows Word files\n \n Writing a...,tech
2221,2221,Fast lifts rise into record books\n \n Two hig...,tech
2222,2222,Nintendo adds media playing to DS\n \n Nintend...,tech
2223,2223,Fast moving phone viruses appear\n \n Security...,tech


In [None]:
# Encode each category as an integer code; factorize numbers the categories
# in order of first appearance in the column.
label_codes, _category_uniques = pd.factorize(train_df["Category"])
train_df["Label_Encoding"] = label_codes

In [None]:
# Re-display the frame to check the newly added Label_Encoding column
train_df

Unnamed: 0,ArticleId,Text,Category,Label_Encoding
0,0,UK economy facing 'major risks'\n \n The UK ma...,business,0
1,1,Aids and climate top Davos agenda\n \n Climate...,business,0
2,2,Asian quake hits European shares\n \n Shares i...,business,0
3,3,India power shares jump on debut\n \n Shares i...,business,0
4,4,Lacroix label bought by US firm\n \n Luxury go...,business,0
...,...,...,...,...
2220,2220,Warning over Windows Word files\n \n Writing a...,tech,4
2221,2221,Fast lifts rise into record books\n \n Two hig...,tech,4
2222,2222,Nintendo adds media playing to DS\n \n Nintend...,tech,4
2223,2223,Fast moving phone viruses appear\n \n Security...,tech,4


In [None]:
# Class frequencies, first by category name and then by integer code
for column in ("Category", "Label_Encoding"):
    print(train_df[column].value_counts())


# The per-class counts are close to each other, so the data is treated as
# balanced and no class-imbalance handling is applied.

sport            511
business         510
politics         417
tech             401
entertainment    386
Name: Category, dtype: int64
3    511
0    510
2    417
4    401
1    386
Name: Label_Encoding, dtype: int64


In [None]:
# Category name <-> integer code lookup tables.
# The codes are written out by hand; they line up with the Label_Encoding
# values shown for the training frame and must be kept in sync with them.
category_labels_to_id = {
    "business": 0,
    "entertainment": 1,
    "politics": 2,
    "sport": 3,
    "tech": 4,
}
# Reverse lookup built from the forward table so the two cannot drift apart
id_to_category = {code: name for name, code in category_labels_to_id.items()}

In [None]:
# Count missing values per column
train_df.isna().sum()

ArticleId         0
Text              0
Category          0
Label_Encoding    0
dtype: int64

In [None]:
# TF-IDF vectorizer configuration
tfidf = TfidfVectorizer(
    encoding='utf-8',
    lowercase=True,
    stop_words='english',   # drop common English words
    ngram_range=(1, 3),     # unigrams, bigrams and trigrams
    min_df=7,               # ignore terms that appear in fewer than 7 documents
    sublinear_tf=True,      # use 1 + log(tf) instead of the raw term count
    norm='l2',              # scale each document vector to unit length
)

In [None]:
# Learn the vocabulary and IDF weights from every article, then build the feature matrix
# NOTE(review): the vectorizer is fitted on all rows, including those that are
# later held out as the test split, so the reported test scores may be optimistic.
tfidf_sparse = tfidf.fit_transform(train_df["Text"])
tfidf_feature = tfidf_sparse.toarray()   # dense array, used by the cells below

In [None]:
# Save the fitted vectorizer to disk
with open('news_classification_tfidf_vectorizer', 'wb') as output:
    pickle.dump(tfidf, output, protocol=pickle.DEFAULT_PROTOCOL)

In [None]:
N = 5  # Number of top-scoring n-grams to report per category and per n-gram size
labels = train_df.Label_Encoding

# The fitted vocabulary is the same for every category, so fetch it once
# instead of rebuilding the array on each loop iteration.
all_feature_names = np.array(tfidf.get_feature_names_out())

# For each category, find the n-grams that are most strongly associated with it
for category, category_id in sorted(category_labels_to_id.items()):
  # One-vs-rest chi-squared test: rows of this category against all other rows.
  # chi2 returns (statistics, p-values); only the statistics are used below.
  features_chi2 = chi2(tfidf_feature, labels == category_id)
  indices = np.argsort(features_chi2[0])        # Feature indices in increasing order of chi-squared statistic
  feature_names = all_feature_names[indices]    # Feature names in that same increasing order
  # Split the ranked names by n-gram size; each list keeps the increasing order
  unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
  bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
  trigrams = [v for v in feature_names if len(v.split(' ')) == 3]
  print("# '{}':".format(category))
  # The last N entries of each list are the N highest-scoring n-grams of that size
  print("  . Most correlated unigrams:\n       . {}".format('\n       . '.join(unigrams[-N:])))
  print("  . Most correlated bigrams:\n       . {}".format('\n       . '.join(bigrams[-N:])))
  print("  . Most correlated Trigrams:\n       . {}".format('\n       . '.join(trigrams[-N:])))

# 'business':
  . Most correlated unigrams:
       . economy
       . oil
       . bank
       . growth
       . shares
  . Most correlated bigrams:
       . chief executive
       . oil prices
       . analysts said
       . economic growth
       . stock market
  . Most correlated Trigrams:
       . reuters news agency
       . high oil prices
       . gross domestic product
       . london stock exchange
       . securities exchange commission
# 'entertainment':
  . Most correlated unigrams:
       . album
       . awards
       . singer
       . actor
       . film
  . Most correlated bigrams:
       . million dollar
       . best film
       . los angeles
       . film festival
       . box office
  . Most correlated Trigrams:
       . celebrity big brother
       . best supporting actress
       . best supporting actor
       . berlin film festival
       . million dollar baby
# 'politics':
  . Most correlated unigrams:
       . tory
       . party
       . blair
       . electio

In [None]:
# Model setup (imports for splitting, the classifier and cross-validation)
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Seed the forest so that retraining on the same data gives the same model and
# the same scores; 0 matches the seed used for the train/test split.
model = RandomForestClassifier(random_state=0)

In [None]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_feature,
    labels,
    test_size=0.25,
    random_state=0,
)

In [None]:
# Train the classifier on the training split (the fitted estimator is the cell output)
model.fit(X=X_train, y=y_train)

In [None]:
# Save the trained classifier to disk
with open('news_classification_rf_model', 'wb') as output:
    pickle.dump(model, output, protocol=pickle.DEFAULT_PROTOCOL)

In [None]:
# Predict on both splits so the training fit and the held-out performance can be compared
predicted_train = model.predict(X=X_train)
predicted_test = model.predict(X=X_test)

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Per-class precision / recall / F1 on the held-out test split
print(classification_report(y_true=y_test, y_pred=predicted_test))

              precision    recall  f1-score   support

           0       0.94      0.97      0.95       134
           1       0.97      0.95      0.96        94
           2       0.95      0.93      0.94       107
           3       0.99      0.99      0.99       134
           4       0.97      0.94      0.95        88

    accuracy                           0.96       557
   macro avg       0.96      0.96      0.96       557
weighted avg       0.96      0.96      0.96       557



In [None]:
# Same report on the training split, for comparison with the test-split scores
print(classification_report(y_true=y_train, y_pred=predicted_train))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       376
           1       1.00      1.00      1.00       292
           2       1.00      1.00      1.00       310
           3       1.00      1.00      1.00       377
           4       1.00      1.00      1.00       313

    accuracy                           1.00      1668
   macro avg       1.00      1.00      1.00      1668
weighted avg       1.00      1.00      1.00      1668



In [None]:
# Hand-written sample article used to try out the trained classifier
test_article = (
    "Iron man actor rober junior came for promotion. "
    "The film is getting lot of attention from movie lovers across the globe. "
    "Its gonna be interesting to see how this movie performs on box-office."
)

In [None]:
# Lower-case the sample text (the vectorizer is also configured with lowercase=True)
test_article = test_article.lower()

In [None]:
# Wrap the sample in a one-row frame with the same "Text" column as the training data
test_frame = pd.DataFrame([test_article], columns=["Text"])
print(test_frame)

                                                Text
0  iron man actor rober junior came for promotion...


In [None]:

# Vectorize the sample with the already-fitted TF-IDF vocabulary (transform only, no refit)
sample_sparse = tfidf.transform(test_frame["Text"])
test_feature = sample_sparse.toarray()

In [None]:
# Classify the vectorized sample; the result is an array of integer category codes
prediction = model.predict(X=test_feature)

In [None]:
# Show the predicted integer code; id_to_category maps it back to a category name
print(prediction)

[1]
