## Fake News Detector

#### 1. Importing Libraries


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

#### 2. Loading the Dataset

We load the dataset containing text data and labels. The dataset has two columns:

- `label`: Indicates whether the news article is real (1) or fake (0).
- `text`: Contains the news article text.


In [2]:
data = pd.read_csv('training_data_lowercase.csv', sep='\t', names=['label', 'text'])

# view data
print(data.shape)
print (data.head(10))

(34152, 2)
   label                                               text
0      0  donald trump sends out embarrassing new year‚s...
1      0  drunk bragging trump staffer started russian c...
2      0  sheriff david clarke becomes an internet joke ...
3      0  trump is so obsessed he even has obama‚s name ...
4      0  pope francis just called out donald trump duri...
5      0  racist alabama cops brutalize black boy while ...
6      0                          fresh off the golf course
7      0  trump said some insanely racist stuff inside t...
8      0   former cia director slams trump over un bullying
9      0  brand-new pro-trump ad features so much a** ki...


#### 3. Divide the data into training and test


In [3]:
data_train, data_val = train_test_split(data, test_size=0.2, random_state=44)

#### 4. Data Preprocessing

We apply several preprocessing steps to clean and prepare the text data for modeling. This includes converting text to lowercase, removing punctuation, digits, and stopwords, and normalizing accented characters.


In [4]:
from utils import remove_and_convert

# Apply the function to the data 
data_train['remove_and_convert'] = data_train['text'].apply(remove_and_convert)
data_val['remove_and_convert'] = data_val['text'].apply(remove_and_convert)

# Check if the data has been cleaned correctly
data_train.head(10)

Unnamed: 0,label,text,remove_and_convert
10131,0,canada sends troops to u.s. border to deal wit...,canada sends troops to us border to deal with ...
5354,0,trump supporter just mocked hispanics by insul...,trump supporter just mocked hispanics by insul...
14075,0,oops! donald trump‚s name missing from ballots...,oops donald trumps name missing from ballots i...
21401,1,fbi has sufficient resources for russia invest...,fbi has sufficient resources for russia invest...
5557,0,white man arrested unscathed after pointing sh...,white man arrested unscathed after pointing sh...
10801,0,liberal smackdown! sean spicer zings reporter ...,liberal smackdown sean spicer zings reporter o...
30223,1,aftershocks likely from september test detecte...,aftershocks likely from september test detecte...
28791,1,white house says iran's progress on nuclear de...,white house says irans progress on nuclear dea...
3751,0,tonight‚s ‚saturday night live‚ is sure to mak...,tonights saturday night live is sure to make t...
6256,0,ca senate passes ambitious gun control package...,ca senate passes ambitious gun control package...


4.1. Remove Stopwords

Common English stopwords are removed from the text to focus on important words that carry more meaning.


In [5]:
from utils import remove_stopwords

data_train['remove_stopwords'] = data_train['remove_and_convert'].apply(remove_stopwords)
data_val['remove_stopwords'] = data_val['remove_and_convert'].apply(remove_stopwords)

4.2. Tokenization

We tokenize the text by splitting each sentence into words. This step converts the raw text into a list of words (tokens), which will allow further text processing like stemming or lemmatization.


In [6]:
from utils import tokenizer

# Apply the function to the data 
data_train['tokenize'] = data_train['remove_stopwords'].apply(tokenizer)
data_val['tokenize'] = data_val['remove_stopwords'].apply(tokenizer)
data_train.head(10)

Unnamed: 0,label,text,remove_and_convert,remove_stopwords,tokenize
10131,0,canada sends troops to u.s. border to deal wit...,canada sends troops to us border to deal with ...,canada sends troops us border deal illegals as...,"[canada, sends, troops, us, border, deal, ille..."
5354,0,trump supporter just mocked hispanics by insul...,trump supporter just mocked hispanics by insul...,trump supporter mocked hispanics insulting spa...,"[trump, supporter, mocked, hispanics, insultin..."
14075,0,oops! donald trump‚s name missing from ballots...,oops donald trumps name missing from ballots i...,oops donald trumps name missing ballots florida,"[oops, donald, trumps, name, missing, ballots,..."
21401,1,fbi has sufficient resources for russia invest...,fbi has sufficient resources for russia invest...,fbi sufficient resources russia investigation ...,"[fbi, sufficient, resources, russia, investiga..."
5557,0,white man arrested unscathed after pointing sh...,white man arrested unscathed after pointing sh...,white man arrested unscathed pointing shotgun ...,"[white, man, arrested, unscathed, pointing, sh..."
10801,0,liberal smackdown! sean spicer zings reporter ...,liberal smackdown sean spicer zings reporter o...,liberal smackdown sean spicer zings reporter t...,"[liberal, smackdown, sean, spicer, zings, repo..."
30223,1,aftershocks likely from september test detecte...,aftershocks likely from september test detecte...,aftershocks likely september test detected nor...,"[aftershocks, likely, september, test, detecte..."
28791,1,white house says iran's progress on nuclear de...,white house says irans progress on nuclear dea...,white house says irans progress nuclear deal m...,"[white, house, says, irans, progress, nuclear,..."
3751,0,tonight‚s ‚saturday night live‚ is sure to mak...,tonights saturday night live is sure to make t...,tonights saturday night live sure make trump f...,"[tonights, saturday, night, live, sure, make, ..."
6256,0,ca senate passes ambitious gun control package...,ca senate passes ambitious gun control package...,ca senate passes ambitious gun control package...,"[ca, senate, passes, ambitious, gun, control, ..."


4.3. Stemming

We apply stemming to reduce words to their root forms. This helps standardize the text.


In [7]:
from utils import stem_words

# Apply stemming 
data_train['stem_words'] = data_train['tokenize'].apply(stem_words)
data_val['stem_words'] = data_val['tokenize'].apply(stem_words)

data_train.head(20)

Unnamed: 0,label,text,remove_and_convert,remove_stopwords,tokenize,stem_words
10131,0,canada sends troops to u.s. border to deal wit...,canada sends troops to us border to deal with ...,canada sends troops us border deal illegals as...,"[canada, sends, troops, us, border, deal, ille...",canada send troop us border deal illeg asylum ...
5354,0,trump supporter just mocked hispanics by insul...,trump supporter just mocked hispanics by insul...,trump supporter mocked hispanics insulting spa...,"[trump, supporter, mocked, hispanics, insultin...",trump support mock hispan insult spanish langu...
14075,0,oops! donald trump‚s name missing from ballots...,oops donald trumps name missing from ballots i...,oops donald trumps name missing ballots florida,"[oops, donald, trumps, name, missing, ballots,...",oop donald trump name miss ballot florida
21401,1,fbi has sufficient resources for russia invest...,fbi has sufficient resources for russia invest...,fbi sufficient resources russia investigation ...,"[fbi, sufficient, resources, russia, investiga...",fbi suffici resourc russia investig mccabe
5557,0,white man arrested unscathed after pointing sh...,white man arrested unscathed after pointing sh...,white man arrested unscathed pointing shotgun ...,"[white, man, arrested, unscathed, pointing, sh...",white man arrest unscath point shotgun car sho...
10801,0,liberal smackdown! sean spicer zings reporter ...,liberal smackdown sean spicer zings reporter o...,liberal smackdown sean spicer zings reporter t...,"[liberal, smackdown, sean, spicer, zings, repo...",liber smackdown sean spicer zing report trump ...
30223,1,aftershocks likely from september test detecte...,aftershocks likely from september test detecte...,aftershocks likely september test detected nor...,"[aftershocks, likely, september, test, detecte...",aftershock like septemb test detect north kore...
28791,1,white house says iran's progress on nuclear de...,white house says irans progress on nuclear dea...,white house says irans progress nuclear deal m...,"[white, house, says, irans, progress, nuclear,...",white hous say iran progress nuclear deal must...
3751,0,tonight‚s ‚saturday night live‚ is sure to mak...,tonights saturday night live is sure to make t...,tonights saturday night live sure make trump f...,"[tonights, saturday, night, live, sure, make, ...",tonight saturday night live sure make trump fo...
6256,0,ca senate passes ambitious gun control package...,ca senate passes ambitious gun control package...,ca senate passes ambitious gun control package...,"[ca, senate, passes, ambitious, gun, control, ...",ca senat pass ambiti gun control packag that g...


4.4. Lemmatization


In [8]:
from utils import lemmatize_words

# Apply lemmatization
data_train['lemmatize_words'] = data_train['tokenize'].apply(lemmatize_words)
data_val['lemmatize_words'] = data_val['tokenize'].apply(lemmatize_words)

data_train.head(20)

Unnamed: 0,label,text,remove_and_convert,remove_stopwords,tokenize,stem_words,lemmatize_words
10131,0,canada sends troops to u.s. border to deal wit...,canada sends troops to us border to deal with ...,canada sends troops us border deal illegals as...,"[canada, sends, troops, us, border, deal, ille...",canada send troop us border deal illeg asylum ...,canada sends troop u border deal illegals asyl...
5354,0,trump supporter just mocked hispanics by insul...,trump supporter just mocked hispanics by insul...,trump supporter mocked hispanics insulting spa...,"[trump, supporter, mocked, hispanics, insultin...",trump support mock hispan insult spanish langu...,trump supporter mocked hispanic insulting span...
14075,0,oops! donald trump‚s name missing from ballots...,oops donald trumps name missing from ballots i...,oops donald trumps name missing ballots florida,"[oops, donald, trumps, name, missing, ballots,...",oop donald trump name miss ballot florida,oops donald trump name missing ballot florida
21401,1,fbi has sufficient resources for russia invest...,fbi has sufficient resources for russia invest...,fbi sufficient resources russia investigation ...,"[fbi, sufficient, resources, russia, investiga...",fbi suffici resourc russia investig mccabe,fbi sufficient resource russia investigation m...
5557,0,white man arrested unscathed after pointing sh...,white man arrested unscathed after pointing sh...,white man arrested unscathed pointing shotgun ...,"[white, man, arrested, unscathed, pointing, sh...",white man arrest unscath point shotgun car sho...,white man arrested unscathed pointing shotgun ...
10801,0,liberal smackdown! sean spicer zings reporter ...,liberal smackdown sean spicer zings reporter o...,liberal smackdown sean spicer zings reporter t...,"[liberal, smackdown, sean, spicer, zings, repo...",liber smackdown sean spicer zing report trump ...,liberal smackdown sean spicer zing reporter tr...
30223,1,aftershocks likely from september test detecte...,aftershocks likely from september test detecte...,aftershocks likely september test detected nor...,"[aftershocks, likely, september, test, detecte...",aftershock like septemb test detect north kore...,aftershock likely september test detected nort...
28791,1,white house says iran's progress on nuclear de...,white house says irans progress on nuclear dea...,white house says irans progress nuclear deal m...,"[white, house, says, irans, progress, nuclear,...",white hous say iran progress nuclear deal must...,white house say iran progress nuclear deal mus...
3751,0,tonight‚s ‚saturday night live‚ is sure to mak...,tonights saturday night live is sure to make t...,tonights saturday night live sure make trump f...,"[tonights, saturday, night, live, sure, make, ...",tonight saturday night live sure make trump fo...,tonight saturday night live sure make trump fo...
6256,0,ca senate passes ambitious gun control package...,ca senate passes ambitious gun control package...,ca senate passes ambitious gun control package...,"[ca, senate, passes, ambitious, gun, control, ...",ca senat pass ambiti gun control packag that g...,ca senate pass ambitious gun control package t...


#### 5. Exploratory Analysis

Word Frequency Analysis in Real and Fake News

In this section, we analyze the most common words in real and fake news articles. We first split the dataset into real and fake news based on their labels, then count the frequency of words in each category. Finally, we display the top 20 most common words for both real and fake news.


In [9]:
from collections import Counter

# Split the data into real_news and fake_news messages
real_news = data_train[data_train['label'] == 0]['lemmatize_words']
fake_news = data_train[data_train['label'] == 1]['lemmatize_words']

# Count the frequency of each word in the real_news 
real_words = Counter(' '.join(real_news).split())

# Count the frequency of each word in the fake_news 
fake_words = Counter(' '.join(fake_news).split())

# Get the top 20 words in the ham messages
top_real_words = real_words.most_common(20)

# Get the top 20 words in the spam messages
top_fake_words = fake_words.most_common(20)

# Print the results
print("Top 20 words in real news:")
for word, count in top_real_words:
    print(f"{word}: {count}")

print("\nTop 20 words in fake news:")
for word, count in top_fake_words:
    print(f"{word}: {count}")

Top 20 words in real news:
trump: 5685
video: 4428
hillary: 1192
obama: 1061
clinton: 665
president: 652
republican: 628
gop: 557
get: 557
donald: 554
tweet: 546
breaking: 505
new: 481
white: 474
black: 460
u: 458
news: 457
watch: 429
make: 404
democrat: 401

Top 20 words in fake news:
trump: 4144
u: 2810
say: 1941
house: 1126
republican: 772
white: 631
russia: 600
senate: 582
bill: 539
new: 529
clinton: 511
tax: 508
state: 496
court: 478
obama: 465
china: 432
north: 429
korea: 402
call: 365
official: 352


#### 6. Feature Engineering

6.1 Vectorization

Convert data into vectors so that we can apply the it to a classifier, then vectorize the dataset.


In [10]:
# To make Bag of Words work with Count Vectorizer, we will have to combine the messages so that we can convert it to a vector so that we can apply the it to a classifier.
# We will join every message in a link them with blank spaces. ".iloc" is Purely integer-location based indexing for selection by position. 

headlines = []
for row in range(0,len(data_train.index)):
    headlines.append(' '.join(str(x) for x in data_train.iloc[row,1:2])) # change index to choose column 


# Predict for the Test Dataset
test_transform= []
for row in range(0,len(data_val.index)):
    test_transform.append(' '.join(str(x) for x in data_val.iloc[row,1:2]))


print (headlines [1])
print (test_transform [1])

trump supporter just mocked hispanics by insulting the spanish language on cnn
donald trump just got a big ‚f*** you‚ from goldman sachs (tweets)


6.2 Bag of Words with CountVectorizer


In [11]:
# implement BAG OF WORDS with CountVectorizer
bow_countvector=CountVectorizer(ngram_range=(1,2)) # (ngram_range=(n,n))

# Vectorize the messages dataset
bow_traindataset=bow_countvector.fit_transform(headlines)
bow_test_dataset = bow_countvector.transform(test_transform)

# Print the shape of the vectorized messages
print(bow_traindataset.shape)

(27321, 171423)


6.3 TD-IDF


In [12]:
# Create a Vectorizer  for TF-IDF
tfidfvector = TfidfVectorizer(ngram_range=(1,2))

# Vectorize the messages dataset
tdidf_traindataset = tfidfvector.fit_transform(headlines)
tdidf_test_dataset = tfidfvector.transform(test_transform)

# Print the shape of the vectorized messages
print(tdidf_test_dataset.shape)

(6831, 171423)


#### 7. Implement Classifiers


7.1 Logistic Regression

- **Logistic Regression**: A linear model used for binary classification tasks.


In [13]:
# Fit the Logistic Regression model with bow
logreg=LogisticRegression(C=6, penalty='l2', max_iter=150, tol=0.00001, solver='lbfgs')
logreg.fit(bow_traindataset,data_train['label'])

In [14]:
bow_predictions = logreg.predict(bow_test_dataset)
print ('Logistic Regression with BOW')

# Evaluate the model
matrix=confusion_matrix(data_val['label'],bow_predictions)
print(matrix)
score=accuracy_score(data_val['label'],bow_predictions)
print(f'Accuracy score for Logistic Regression with BOW: {score}') 
report=classification_report(data_val['label'],bow_predictions)
print(report)

Logistic Regression with BOW
[[3278  173]
 [ 152 3228]]
Accuracy score for Logistic Regression with BOW: 0.952422778509735
              precision    recall  f1-score   support

           0       0.96      0.95      0.95      3451
           1       0.95      0.96      0.95      3380

    accuracy                           0.95      6831
   macro avg       0.95      0.95      0.95      6831
weighted avg       0.95      0.95      0.95      6831



In [15]:
# Set up Stratified K-Fold to ensure that class distribution 
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Perform cross-validation (evaluating accuracy)
scores = cross_val_score(logreg, bow_traindataset, data_train['label'], cv=cv, scoring='accuracy')

# Output the accuracy scores and their mean
print(f'Accuracy scores for each of the 10 cross-validation folds: {scores}')
print(f'Mean accuracy score: {scores.mean()}')

Accuracy scores for each of the 10 cross-validation folds: [0.94914014 0.95168375 0.94070278 0.94326501 0.94253294 0.94582723
 0.9454612  0.94765739 0.95021962 0.9465593 ]
Mean accuracy score: 0.9463049363053058


In [16]:
# Fit the Logistic Regression model with TF-IDF
logreg.fit(tdidf_traindataset,data_train['label'])

In [17]:
tfidf_predictions = logreg.predict(tdidf_test_dataset) 
print ('Logistic Regression with TF-IDF') 

# Evaluate the model
matrix=confusion_matrix(data_val['label'],tfidf_predictions)
print(matrix)
score=accuracy_score(data_val['label'],tfidf_predictions)
print(f' Accuracy score for Logistic Regression with TF-IDF: {score}') 
report=classification_report(data_val['label'],tfidf_predictions)
print(report)

Logistic Regression with TF-IDF
[[3293  158]
 [ 165 3215]]
 Accuracy score for Logistic Regression with TF-IDF: 0.9527155614112136
              precision    recall  f1-score   support

           0       0.95      0.95      0.95      3451
           1       0.95      0.95      0.95      3380

    accuracy                           0.95      6831
   macro avg       0.95      0.95      0.95      6831
weighted avg       0.95      0.95      0.95      6831



7.2. Naive Bayes Classifier

1. CountVectorizer
2. TF-IDF


In [18]:
# Fit the Naive Bayes model with bow
naive=MultinomialNB()
naive.fit(bow_traindataset,data_train['label'])

In [19]:
bow_predictions = naive.predict(bow_test_dataset)
print ('Naive Bayes with BOW')

# Evaluate the model
matrix=confusion_matrix(data_val['label'],bow_predictions)
print(matrix)
score=accuracy_score(data_val['label'],bow_predictions)
print(f'Accuracy score for Naive Bayes with BOW: {score}')
report=classification_report(data_val['label'],bow_predictions)
print(report)

Naive Bayes with BOW
[[3335  116]
 [ 237 3143]]
Accuracy score for Naive Bayes with BOW: 0.9483238178890353
              precision    recall  f1-score   support

           0       0.93      0.97      0.95      3451
           1       0.96      0.93      0.95      3380

    accuracy                           0.95      6831
   macro avg       0.95      0.95      0.95      6831
weighted avg       0.95      0.95      0.95      6831



In [20]:
# Fit the Naive Bayes model with TF-IDF
naive.fit(tdidf_traindataset,data_train['label'])

In [21]:
tfidf_predictions = naive.predict(tdidf_test_dataset)
print ('Naive Bayes with TF-IDF')

# Evaluate the model
matrix=confusion_matrix(data_val['label'],tfidf_predictions)
print(matrix)
score=accuracy_score(data_val['label'],tfidf_predictions)
print(f'Accuracy score for Naive Bayes with TF-IDF: {score}')
report=classification_report(data_val['label'],tfidf_predictions)
print(report)

Naive Bayes with TF-IDF
[[3335  116]
 [ 269 3111]]
Accuracy score for Naive Bayes with TF-IDF: 0.9436392914653784
              precision    recall  f1-score   support

           0       0.93      0.97      0.95      3451
           1       0.96      0.92      0.94      3380

    accuracy                           0.94      6831
   macro avg       0.94      0.94      0.94      6831
weighted avg       0.94      0.94      0.94      6831



7.3. Random Forest Classifier

1. CountVectorizer
2. TF-IDF


In [22]:
# Fit the Random Forest model with bow
rf = RandomForestClassifier()
rf.fit(bow_traindataset,data_train['label'])

In [23]:
bow_predictions = rf.predict(bow_test_dataset)
print ('Random Forest with BOW')

# Evaluate the model
matrix=confusion_matrix(data_val['label'],bow_predictions)
print(matrix)
score=accuracy_score(data_val['label'],bow_predictions)
print(f' Accuracy score for Random Forest with BOW: {score}') 
report=classification_report(data_val['label'],bow_predictions)
print(report)

Random Forest with BOW
[[3275  176]
 [ 295 3085]]
 Accuracy score for Random Forest with BOW: 0.9310496267018006
              precision    recall  f1-score   support

           0       0.92      0.95      0.93      3451
           1       0.95      0.91      0.93      3380

    accuracy                           0.93      6831
   macro avg       0.93      0.93      0.93      6831
weighted avg       0.93      0.93      0.93      6831



In [24]:
# Fit the Random Forest model with TF-IDF
rf.fit(tdidf_traindataset,data_train['label'])

In [25]:
tfidf_predictions = rf.predict(tdidf_test_dataset)
print ('Random Forest with TF-IDF')
# Evaluate the model
matrix=confusion_matrix(data_val['label'],tfidf_predictions)
print(matrix)
score=accuracy_score(data_val['label'],tfidf_predictions)
print(f' Accuracy score for Random Forest with TF-IDF: {score}') 
report=classification_report(data_val['label'],tfidf_predictions)
print(report)

Random Forest with TF-IDF
[[3066  385]
 [ 228 3152]]
 Accuracy score for Random Forest with TF-IDF: 0.9102620406968233
              precision    recall  f1-score   support

           0       0.93      0.89      0.91      3451
           1       0.89      0.93      0.91      3380

    accuracy                           0.91      6831
   macro avg       0.91      0.91      0.91      6831
weighted avg       0.91      0.91      0.91      6831



7.4. Decision Tree Classifier

1. CountVectorizer
2. TF-IDF


In [26]:
# Fit the Decision Tree model with bow
dt = DecisionTreeClassifier()
dt.fit(bow_traindataset,data_train['label'])

In [27]:
bow_predictions = dt.predict(bow_test_dataset)
print ('Decision Tree with BOW')

# Evaluate the model
matrix=confusion_matrix(data_val['label'],bow_predictions)
print(matrix)
score=accuracy_score(data_val['label'],bow_predictions)
print(f'Accuracy score for Decision Tree with BOW: {score}')
report=classification_report(data_val['label'],bow_predictions)
print(report)

Decision Tree with BOW
[[3095  356]
 [ 434 2946]]
Accuracy score for Decision Tree with BOW: 0.8843507539159713
              precision    recall  f1-score   support

           0       0.88      0.90      0.89      3451
           1       0.89      0.87      0.88      3380

    accuracy                           0.88      6831
   macro avg       0.88      0.88      0.88      6831
weighted avg       0.88      0.88      0.88      6831



In [28]:
# Fit the Decision Tree model with TF-IDF
dt.fit(tdidf_traindataset,data_train['label'])

In [29]:
tfidf_predictions = dt.predict(tdidf_test_dataset)
print ('Decision Tree with TF-IDF')

# Evaluate the model
matrix=confusion_matrix(data_val['label'],tfidf_predictions)
print(matrix)
score=accuracy_score(data_val['label'],tfidf_predictions)
print(f'Accuracy score for Decision Tree with TF-IDF: {score}') #print(score)
report=classification_report(data_val['label'],tfidf_predictions)
print(report)

Decision Tree with TF-IDF
[[2883  568]
 [ 924 2456]]
Accuracy score for Decision Tree with TF-IDF: 0.781583955496999
              precision    recall  f1-score   support

           0       0.76      0.84      0.79      3451
           1       0.81      0.73      0.77      3380

    accuracy                           0.78      6831
   macro avg       0.78      0.78      0.78      6831
weighted avg       0.78      0.78      0.78      6831



7.5. KNN Classifier

1. CountVectorizer
2. TF-IDF


In [30]:
# Fit the KNN model with bow
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(bow_traindataset,data_train['label'])

In [31]:
bow_predictions = knn.predict(bow_test_dataset)
print ('KNN with BOW')

# Evaluate the model
matrix=confusion_matrix(data_val['label'],bow_predictions)
print(matrix)
score=accuracy_score(data_val['label'],bow_predictions)
print(f' Accuracy score for KNN with BOW: {score}') 
report=classification_report(data_val['label'],bow_predictions)
print(report)

KNN with BOW
[[3444    7]
 [3274  106]]
 Accuracy score for KNN with BOW: 0.5196896501244327
              precision    recall  f1-score   support

           0       0.51      1.00      0.68      3451
           1       0.94      0.03      0.06      3380

    accuracy                           0.52      6831
   macro avg       0.73      0.51      0.37      6831
weighted avg       0.72      0.52      0.37      6831



In [32]:
# Fit the KNN model with TF-IDF
knn.fit(tdidf_traindataset,data_train['label'])

In [33]:
tfidf_predictions = knn.predict(tdidf_test_dataset)
print ('KNN with TF-IDF')
# Evaluate the model
matrix=confusion_matrix(data_val['label'],tfidf_predictions)
print(matrix)
score=accuracy_score(data_val['label'],tfidf_predictions)
print(f' Accuracy score for KNN with TF-IDF: {score}')
report=classification_report(data_val['label'],tfidf_predictions)
print(report)

KNN with TF-IDF
[[2953  498]
 [ 342 3038]]
 Accuracy score for KNN with TF-IDF: 0.8770311813790075
              precision    recall  f1-score   support

           0       0.90      0.86      0.88      3451
           1       0.86      0.90      0.88      3380

    accuracy                           0.88      6831
   macro avg       0.88      0.88      0.88      6831
weighted avg       0.88      0.88      0.88      6831



7.6. SVM Classifier

1. CountVectorizer
2. TF-IDF


In [34]:
# Fit the SVM model with TF-IDF
svm = SVC()
svm.fit(tdidf_traindataset,data_train['label'])

In [35]:
bow_predictions = svm.predict(bow_test_dataset)
print ('SVM with BOW')
# Evaluate the model
matrix=confusion_matrix(data_val['label'],bow_predictions) # best prediction 
print(matrix)
score=accuracy_score(data_val['label'],bow_predictions)
print(f'Accuracy score for SVM with BOW: {score}') 
report=classification_report(data_val['label'],bow_predictions)
print(report)

SVM with BOW
[[  19 3432]
 [   0 3380]]
Accuracy score for SVM with BOW: 0.4975845410628019
              precision    recall  f1-score   support

           0       1.00      0.01      0.01      3451
           1       0.50      1.00      0.66      3380

    accuracy                           0.50      6831
   macro avg       0.75      0.50      0.34      6831
weighted avg       0.75      0.50      0.33      6831



In [36]:
# Fit the SVM model with TF-IDF
svm = SVC()
svm.fit(tdidf_traindataset,data_train['label'])

In [37]:
tfidf_predictions = svm.predict(tdidf_test_dataset)
print ('SVM with TF-IDF')
# Evaluate the model
matrix=confusion_matrix(data_val['label'],tfidf_predictions)
print(matrix)
score=accuracy_score(data_val['label'],tfidf_predictions)
print(f' Accuracy score for SVM with TF-IDF: {score}') 
report=classification_report(data_val['label'],tfidf_predictions)
print(report)

SVM with TF-IDF
[[3276  175]
 [ 152 3228]]
 Accuracy score for SVM with TF-IDF: 0.9521299956082565
              precision    recall  f1-score   support

           0       0.96      0.95      0.95      3451
           1       0.95      0.96      0.95      3380

    accuracy                           0.95      6831
   macro avg       0.95      0.95      0.95      6831
weighted avg       0.95      0.95      0.95      6831



7.7. Super Gradient Boost Classifier

1. CountVectorizer
2. TF-IDF


In [38]:
# Fit the SGBC Classifier with BOW
sgbc = GradientBoostingClassifier()
sgbc.fit(bow_traindataset,data_train['label'])

In [39]:
bow_predictions = sgbc.predict(bow_test_dataset)
print ('SGBC with BOW')

# Evaluate the model
matrix=confusion_matrix(data_val['label'],bow_predictions)
print(matrix)
score=accuracy_score(data_val['label'],bow_predictions)
print(f' Accuracy score for SGBC with BOW: {score}') 
report=classification_report(data_val['label'],bow_predictions)
print(report)

SGBC with BOW
[[2634  817]
 [ 182 3198]]
 Accuracy score for SGBC with BOW: 0.8537549407114624
              precision    recall  f1-score   support

           0       0.94      0.76      0.84      3451
           1       0.80      0.95      0.86      3380

    accuracy                           0.85      6831
   macro avg       0.87      0.85      0.85      6831
weighted avg       0.87      0.85      0.85      6831



In [40]:
# Fit the SGBC Classifier with TF-IDF
sgbc = GradientBoostingClassifier()
sgbc.fit(tdidf_traindataset,data_train['label'])

In [41]:
tfidf_predictions = sgbc.predict(tdidf_test_dataset)
print ('SGBC with TF-IDF')

# Evaluate the model
matrix=confusion_matrix(data_val['label'],tfidf_predictions)
print(matrix)
score=accuracy_score(data_val['label'],tfidf_predictions)
print(f' Accuracy score for SGBC with TF-IDF: {score}')
report=classification_report(data_val['label'],tfidf_predictions)
print(report)

SGBC with TF-IDF
[[2588  863]
 [ 331 3049]]
 Accuracy score for SGBC with TF-IDF: 0.8252086078173034
              precision    recall  f1-score   support

           0       0.89      0.75      0.81      3451
           1       0.78      0.90      0.84      3380

    accuracy                           0.83      6831
   macro avg       0.83      0.83      0.82      6831
weighted avg       0.83      0.83      0.82      6831



#### 8. Save Predictions on best model

In [48]:
# 1. Load the test data
data_val = pd.read_csv('testing_data_lowercase_nolabels.csv', sep='\t', names=['label', 'text'])

# 2. Preprocess the test data (transform using the already fitted tfidfvector)
test_transform = []
for row in range(0, len(data_val.index)):
    test_transform.append(' '.join(str(x) for x in data_val.iloc[row, 1:2]))  # Adjust column selection as needed

tdidf_test_dataset = tfidfvector.transform(test_transform)

# 3. Predict using the already trained model
predictions = logreg.predict(tdidf_test_dataset) 
data_val ['label'] = predictions
# 4. Save predictions to a CSV file
data_val.to_csv('test_predictions.csv', index=False)

# 5. Print the shape of the vectorized test dataset
print(tdidf_test_dataset.shape)
data_val

(9984, 171423)


Unnamed: 0,label,text
0,0,copycat muslim terrorist arrested with assault...
1,0,wow! chicago protester caught on camera admits...
2,1,germany's fdp look to fill schaeuble's big shoes
3,0,mi school sends welcome back packet warning ki...
4,1,u.n. seeks 'massive' aid boost amid rohingya '...
...,...,...
9979,0,boom! fox news leftist chris wallace attempts ...
9980,0,here it is: list of democrat hypocrites who vo...
9981,1,new fires ravage rohingya villages in northwes...
9982,0,meals on wheels shuts the lyin‚ lefties up wit...
