In [1]:
import pandas as pd
import numpy as np
import nltk
#for model-building
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.metrics import roc_curve, auc, roc_auc_score
# bag of words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
#for word embedding
#import gensim
#from gensim.models import Word2Vec

In [2]:
#reading the cleaned dataset
df_start = pd.read_csv('final_processed_df.csv').drop(columns=['Unnamed: 0'])
print(df_start.columns)

#SPLITTING THE TRAINING DATASET INTO TRAIN AND TEST
X_train, X_test, y_train, y_test = train_test_split(df_start["Final_Text"],df_start["Category"],test_size=0.2,shuffle=True)
X_train.head()

Index(['Final_Text', 'Category'], dtype='object')


10283    fleabag succession chernobyl win top golden gl...
13067        egypt aim final arabtec housing deal midmarch
7271     cinegouna extends submission deadline film pos...
3551     art alert gypsy jazz project perform live cair...
13479    cyprus gas field produce 8 bcm year pipeline e...
Name: Final_Text, dtype: object

In [3]:
#Word2Vec
# Word2Vec runs on tokenized sentences
X_train_tok= [nltk.word_tokenize(i) for i in X_train]  
X_test_tok= [nltk.word_tokenize(i) for i in X_test]
X_train_tok

[['fleabag',
  'succession',
  'chernobyl',
  'win',
  'top',
  'golden',
  'globe',
  'egyptianamerican',
  'actor',
  'ramy',
  'youssef',
  'farewell',
  'star',
  'awkwafina',
  'among',
  'people',
  'color',
  'win'],
 ['egypt', 'aim', 'final', 'arabtec', 'housing', 'deal', 'midmarch'],
 ['cinegouna',
  'extends',
  'submission',
  'deadline',
  'film',
  'postproduction',
  'filmmaker',
  'film',
  'postproduction',
  'still',
  'opportunity',
  'compete',
  'elgouna',
  'film',
  'festivalâ\x80\x99s',
  'cash',
  'award'],
 ['art',
  'alert',
  'gypsy',
  'jazz',
  'project',
  'perform',
  'live',
  'cairo',
  'bandâ\x80\x99s',
  'music',
  'influenced',
  'renowned',
  'jazz',
  'artist',
  'django',
  'reinhardt',
  'stephane',
  'grappelli'],
 ['cyprus',
  'gas',
  'field',
  'produce',
  '8',
  'bcm',
  'year',
  'pipeline',
  'egypt'],
 ['egypt',
  'sisi',
  'transfer',
  'agricultural',
  'development',
  'credit',
  'bank',
  'cbe',
  'supervision',
  'newly',
  'named'

In [8]:
#Tf-Idf
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
X_train_vectors_tfidf = tfidf_vectorizer.fit_transform(X_train) 
X_test_vectors_tfidf = tfidf_vectorizer.transform(X_test)


In [12]:
#FITTING THE CLASSIFICATION MODEL using Logistic Regression(tf-idf)
lr_tfidf=LogisticRegression(solver = 'liblinear', C=10, penalty = 'l2')
lr_tfidf.fit(X_train_vectors_tfidf, y_train)  
#Predict y value for test dataset
y_predict = lr_tfidf.predict(X_test_vectors_tfidf)
y_prob = lr_tfidf.predict_proba(X_test_vectors_tfidf)[:,1]
print(classification_report(y_test,y_predict))
print('Confusion Matrix: \n',confusion_matrix(y_test, y_predict))

              precision    recall  f1-score   support

         Art       0.99      0.99      0.99      2043
     economy       0.99      0.99      0.99      2008
      sports       0.99      0.99      0.99      2037

    accuracy                           0.99      6088
   macro avg       0.99      0.99      0.99      6088
weighted avg       0.99      0.99      0.99      6088

Confusion Matrix: 
 [[2023   10   10]
 [  11 1993    4]
 [  10   12 2015]]


In [14]:
#FITTING THE CLASSIFICATION MODEL using Naive Bayes(tf-idf)
nb_tfidf = MultinomialNB()
nb_tfidf.fit(X_train_vectors_tfidf, y_train)  
#Predict y value for test dataset
y_predict = nb_tfidf.predict(X_test_vectors_tfidf)
y_prob = nb_tfidf.predict_proba(X_test_vectors_tfidf)[:,1]
print(classification_report(y_test,y_predict))
print('Confusion Matrix: \n',confusion_matrix(y_test, y_predict))


              precision    recall  f1-score   support

         Art       0.99      0.98      0.99      2043
     economy       0.98      0.99      0.99      2008
      sports       0.99      0.99      0.99      2037

    accuracy                           0.99      6088
   macro avg       0.99      0.99      0.99      6088
weighted avg       0.99      0.99      0.99      6088

Confusion Matrix: 
 [[2011   20   12]
 [  17 1990    1]
 [   9   15 2013]]


In [18]:
df_test = pd.DataFrame(X_test)
df_test

Unnamed: 0,Final_Text
20264,eu provides â¬3 billion fund egyptâs water ...
4661,programmetrailers 5th cairo video festival med...
14762,telecom egypt see 107pct jump net profit 2015
9168,saudi artist seen pushing boundary
5437,egyptian director yousry nasrallah say idea ne...
...,...
11447,egypt stock rise wednesday despite cairo unive...
27980,one fa cup semi used test event return crowd
26554,neymar eclipse ronaldo peru hattrick brings 64...
16553,egypt plan establish free economic zone south ...


In [204]:
#first step
import string
string.punctuation
from nltk.tokenize import word_tokenize

def remove_punctuation(text):
    punctuationfree="".join([i for i in text if i not in string.punctuation])
    return punctuationfree

stopwords = nltk.corpus.stopwords.words('english')
def remove_stopwords(text):
    output= [i for i in text if i not in stopwords]
    return output


# preporcessing new text input
def finalpreprocess(sentence):
    punc = remove_punctuation(sentence)
    #print(punc)
    low = punc.lower()
    #tokens = word_tokenize(low)
    #stops=remove_stopwords(low)
    vectors = tfidf_vectorizer.transform([low])
    return vectors

#test_user_input = finalpreprocess("iconic actor adel emam celebration birthday public")
#test_user_input = finalpreprocess("we won the match after they were losing 2-0, they end up winning very well")
test_user_input = finalpreprocess("UAE President issues Federal Law to maintain 2022 budget balance")

input_predict = lr_tfidf.predict(test_user_input)
prob = lr_tfidf.predict_proba(test_user_input)

print("Predicted Category is : ",(input_predict[0]),"\nWords Probabilties : ", prob)
labels=["Art","Economy","Sports"]
labels[np.argmax(prob)]

Predicted Category is :  economy 
Words Probabilties :  [[0.01779189 0.86018548 0.12202263]]


'Economy'

In [205]:
input_predict = (nb_tfidf.predict(test_user_input))
prob = nb_tfidf.predict_proba(test_user_input)[:,1]
print(input_predict[0])


economy
